/* xref: /petsc/src/sys/utils/mpishm.c (revision a32e93adbc24f554b14767e95a73419304826960) */
#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator data when a communicator is freed.

   This is called by MPI, not by users. It is invoked by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}
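
/*
   Sketch (an assumption, not code from this file): a delete callback such as Petsc_DelComm_Shm is
   attached to communicators through an MPI attribute keyval so that MPI_Comm_free() invokes it
   automatically. The actual keyval (Petsc_ShmComm_keyval) is created during PETSc initialization;
   the calls below only illustrate the MPI API involved.

     PetscMPIInt keyval;
     ierr = MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_DelComm_Shm,&keyval,NULL);CHKERRQ(ierr);
     ierr = MPI_Comm_set_attr(comm,keyval,(void*)pshmcomm);CHKERRQ(ierr);
*/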

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all MPI ranks that share a common memory

    Collective on globcomm

    Input Parameter:
.   globcomm - the MPI communicator

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a communicator obtained with PetscCommDuplicate()

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
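
/*
   Example usage sketch (hypothetical caller, not part of PETSc): obtain the shared memory
   communicator associated with a PETSc communicator and query how many ranks share the node.
   The communicator passed to PetscShmCommGet() must carry the PETSc counter attribute, e.g.
   one produced by PetscCommDuplicate() as noted above.

     MPI_Comm     pcomm,shmcomm;
     PetscShmComm pshmcomm;
     PetscMPIInt  shmsize;
     ierr = PetscCommDuplicate(PETSC_COMM_WORLD,&pcomm,NULL);CHKERRQ(ierr);
     ierr = PetscShmCommGet(pcomm,&pshmcomm);CHKERRQ(ierr);
     ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Comm_size(shmcomm,&shmsize);CHKERRQ(ierr);
     ierr = PetscCommDestroy(&pcomm);CHKERRQ(ierr);
*/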

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes that pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}
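
/*
   Example sketch (hypothetical caller): check whether some global rank grank lives on the same
   node as this process. Here pshmcomm is the object obtained from PetscShmCommGet().

     PetscMPIInt lrank;
     ierr = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);

   A result of MPI_PROC_NULL means grank lives on a different node and cannot share memory with
   this process.
*/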

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

    Concepts: MPI subcomm^numbering
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}
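
/*
   Example sketch (hypothetical caller): the reverse translation. For an lrank previously obtained
   from PetscShmCommGlobalToLocal(), the round trip returns the original global rank.

     PetscMPIInt grank2;
     ierr = PetscShmCommLocalToGlobal(pshmcomm,lrank,&grank2);CHKERRQ(ierr);
*/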

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
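
/*
   Example sketch (hypothetical caller): the returned communicator can be handed directly to MPI-3
   shared memory routines, for instance to allocate a window whose memory every rank on the node
   can address; this is the same pattern used by PetscOmpCtrlCreateBarrier() later in this file.
   The buffer size below is only illustrative.

     MPI_Comm shmcomm;
     MPI_Win  win;
     double   *base;
     ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Win_allocate_shared(100*sizeof(double),1,MPI_INFO_NULL,shmcomm,&base,&win);CHKERRQ(ierr);
     ierr = MPI_Win_free(&win);CHKERRQ(ierr);
*/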

#if defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) && defined(PETSC_HAVE_HWLOC)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of omp_comm (the omp master) */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of the omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of the cpu bindings of all ranks in omp_comm */
};

/* Allocate a shared pthread_barrier_t object in ctrl->omp_comm, and set ctrl->barrier */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  PetscMPIInt           disp_unit;
  void                  *baseptr;
  pthread_barrierattr_t attr;

  PetscFunctionBegin;
  /* only the master contributes memory to the window; every rank then queries the master's base address */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work across processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* the MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy ctrl->barrier */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the MPI_Barrier makes sure slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Create a PETSc OpenMP controller, which manages PETSc's interaction with the OpenMP runtime */
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. The OMP masters make up a new comm called
    omp_master_comm, which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that rank ordering does not change in the new comm.
   */
  color = (PetscMPIInt)(shm_rank / nthreads);
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* so that slaves get omp_master_comm = MPI_COMM_NULL from MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and get a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed within PetscOmpCtrlCreate()\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* the main hwloc developer said new APIs hwloc_bitmap_{nr,to,from}_ulongs would be added in 2.1 to help simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed within PetscOmpCtrlCreate()\n");
    if (nr_cpu_ulongs == 1) {
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) { ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr); }
    }
  }

  /* all ranks wait for the master to finish initialization before using the barrier */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
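
/*
   Example usage sketch (hypothetical driver, not part of this file): create a controller that groups
   every 4 ranks of a node into one omp_comm, pass omp_master_comm to an OpenMP-enabled third party
   library on master ranks, and destroy the controller when done. The thread count 4 is only
   illustrative, and petsc_comm must satisfy the requirements of PetscShmCommGet() above.

     PetscOmpCtrl ctrl;
     MPI_Comm     omp_comm,master_comm;
     PetscBool    is_master;
     ierr = PetscOmpCtrlCreate(petsc_comm,4,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&master_comm,&is_master);CHKERRQ(ierr);
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/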

PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(*pctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PetscOmpCtrl

    Input Parameter:
.   ctrl - a PetscOmpCtrl

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and its slave ranks
.   omp_master_comm  - on master ranks, a communicator made up of the master ranks of all omp_comms;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/* A barrier in the scope of an omp_comm. We do not use MPI_Barrier() since it keeps polling and does not free the CPUs that OpenMP wants to use */
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}

/* Call this on master ranks before calling a library that uses OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override OMP_NUM_THREADS set in the environment */
  PetscFunctionReturn(0);
}

/* Call this on master ranks after leaving a library that uses OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
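
/*
   Sketch of the intended calling pattern (an assumption inferred from the comments above, not code
   copied from a PETSc example): slave ranks go straight to the pthread barrier and sleep, releasing
   their CPUs; the master widens its CPU binding, runs the OpenMP-enabled third party code, restores
   its binding, and then reaches the barrier, waking the slaves. third_party_solve() is a hypothetical
   library call.

     if (is_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
       third_party_solve(master_comm);
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
*/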

#endif /* defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) && defined(PETSC_HAVE_HWLOC) */