#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;        /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt  shmsize;          /* size of the shared memory communicator */
  MPI_Comm     globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. It is invoked by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}

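/*
   Illustrative sketch (an assumption about code living elsewhere in PETSc, not in this file):
   a keyval such as Petsc_ShmComm_keyval is created with this routine as its delete callback,
   roughly as

     MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_DelComm_Shm,&Petsc_ShmComm_keyval,NULL);

   PetscShmCommGet() below attaches a PetscShmComm to a communicator with MPI_Comm_set_attr(),
   and MPI then invokes Petsc_DelComm_Shm() automatically when that communicator is freed.
*/
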
/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share a common memory

    Collective on globcomm.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

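/*
   Usage sketch (illustrative only; "mat" is a hypothetical PETSc object, not something defined here):

     PetscShmComm scomm;
     MPI_Comm     shmcomm;
     ierr = PetscShmCommGet(PetscObjectComm((PetscObject)mat),&scomm);CHKERRQ(ierr);
     ierr = PetscShmCommGetMpiShmComm(scomm,&shmcomm);CHKERRQ(ierr);

   The PetscShmComm is cached as an attribute of the communicator, so repeated calls are cheap,
   and the caller must not free it; it is destroyed by Petsc_DelComm_Shm() when the communicator
   itself is freed.
*/
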
/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  /* binary search narrows the range; a short linear scan finishes the lookup */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

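/*
   Usage sketch (illustrative only): test whether a given global rank lives on the same node,
   i.e., in the same shared memory communicator, as the calling process

     PetscMPIInt lrank;
     ierr = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);
     if (lrank == MPI_PROC_NULL) { ... grank is not on my node; use regular MPI communication ... }
     else                        { ... lrank is grank's rank within the shared memory communicator ... }
*/
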
/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

    Concepts: MPI subcomm^numbering
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

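/*
   A typical (hypothetical) use of the communicator returned above is to allocate memory that
   every rank on the node can load/store directly; "n" below is an assumed array length:

     MPI_Comm shmcomm;
     MPI_Win  win;
     double   *array;
     ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Win_allocate_shared(n*sizeof(double),sizeof(double),MPI_INFO_NULL,shmcomm,&array,&win);CHKERRQ(ierr);

   PetscOmpCtrlCreateBarrier() below uses this same MPI_Win_allocate_shared()/MPI_Win_shared_query()
   pattern to place a pthread barrier in memory shared by the ranks of an omp_comm.
*/
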
#if defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) && defined(PETSC_HAVE_HWLOC)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of an omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of the omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate a shared pthread_barrier_t object in ctrl->omp_comm and set ctrl->barrier */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  PetscMPIInt           disp_unit;
  void                  *baseptr;
  pthread_barrierattr_t attr;

  PetscFunctionBegin;
  /* only the master contributes memory to the window; all ranks then query the master's segment */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* the MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy ctrl->barrier */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the MPI_Barrier makes sure slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

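/*
   Intended lifecycle of the shared barrier (a sketch; the actual callers are
   PetscOmpCtrlCreate(), PetscOmpCtrlBarrier() and PetscOmpCtrlDestroy() below):

     ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);     collective over ctrl->omp_comm
     ...
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);           each rank blocks in pthread_barrier_wait(),
                                                               leaving its CPU idle instead of spin-polling
     ...
     ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);    collective over ctrl->omp_comm
*/
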
/* Create a PETSc OpenMP controller, which manages PETSc's interaction with the OpenMP runtime */
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl;
  unsigned long  *cpu_ulongs=NULL;
  PetscInt       i,nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   =================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"Number of OpenMP threads %D cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering does not change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   =================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
   =================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  ctrl->cpuset = hwloc_bitmap_alloc();
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* the hwloc main developers said they would add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  /* compute the union (MPI_BOR) of the packed cpu bitmaps of all ranks in omp_comm, with the result on the master */
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed within PetscOmpCtrlCreate()\n");
    if (nr_cpu_ulongs == 1) {
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) { ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr); }
    }
  }

  /* all wait for the master to finish the initialization before using the barrier */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

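/*
   Hypothetical driver sketch for the controller created above (all functions named here are
   defined in this file; the surrounding variables and control flow are only an illustration):

     PetscOmpCtrl ctrl;
     MPI_Comm     omp_master_comm;
     PetscBool    is_master;

     ierr = PetscOmpCtrlCreate(PETSC_COMM_WORLD,nthreads,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,&omp_master_comm,&is_master);CHKERRQ(ierr);
     if (is_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);   bind to the union cpuset, set the thread count
       ... call a third party OpenMP library, handing it omp_master_comm ...
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);     restore the master's own binding
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);                    slaves sleep here until the master arrives
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/
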
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PetscOmpCtrl

    Input Parameter:
.   ctrl - a PetscOmpCtrl

    Output Parameters:
+   omp_comm        - a communicator that includes a master rank and slave ranks
.   omp_master_comm - on master ranks, a communicator made up of the master ranks of all omp_comms;
                      on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master   - true if the calling process is an OMP master rank

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/* A barrier in the scope of an omp_comm. MPI_Barrier is not used since it keeps polling and does not free the CPUs OMP wants to use */
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}

/* Call this on master ranks before calling a library that uses OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override OMP_NUM_THREADS in the environment */
  PetscFunctionReturn(0);
}

/* Call this on master ranks after leaving a library that uses OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#endif /* defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) && defined(PETSC_HAVE_HWLOC) */