15f7487a0SJunchao Zhang #include <petscsys.h> /*I "petscsys.h" I*/ 25f7487a0SJunchao Zhang #include <petsc/private/petscimpl.h> 35f7487a0SJunchao Zhang 45f7487a0SJunchao Zhang struct _n_PetscShmComm { 55f7487a0SJunchao Zhang PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */ 65f7487a0SJunchao Zhang PetscMPIInt shmsize; /* size of the shared memory communicator */ 75f7487a0SJunchao Zhang MPI_Comm globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */ 85f7487a0SJunchao Zhang }; 95f7487a0SJunchao Zhang 105f7487a0SJunchao Zhang /* 115f7487a0SJunchao Zhang Private routine to delete internal tag/name shared memory communicator when a communicator is freed. 125f7487a0SJunchao Zhang 135f7487a0SJunchao Zhang This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed. 145f7487a0SJunchao Zhang 155f7487a0SJunchao Zhang Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval() 165f7487a0SJunchao Zhang 175f7487a0SJunchao Zhang */ 185f7487a0SJunchao Zhang PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state) 195f7487a0SJunchao Zhang { 205f7487a0SJunchao Zhang PetscErrorCode ierr; 215f7487a0SJunchao Zhang PetscShmComm p = (PetscShmComm)val; 225f7487a0SJunchao Zhang 235f7487a0SJunchao Zhang PetscFunctionBegin; 245f7487a0SJunchao Zhang ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr); 255f7487a0SJunchao Zhang ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr); 265f7487a0SJunchao Zhang ierr = PetscFree(p->globranks);CHKERRMPI(ierr); 275f7487a0SJunchao Zhang ierr = PetscFree(val);CHKERRMPI(ierr); 285f7487a0SJunchao Zhang PetscFunctionReturn(MPI_SUCCESS); 295f7487a0SJunchao Zhang } 305f7487a0SJunchao Zhang 315f7487a0SJunchao Zhang /*@C 325f7487a0SJunchao Zhang PetscShmCommGet - Given a PETSc communicator returns a communicator of all ranks that share a common memory 335f7487a0SJunchao Zhang 345f7487a0SJunchao Zhang 355f7487a0SJunchao Zhang Collective on comm. 365f7487a0SJunchao Zhang 375f7487a0SJunchao Zhang Input Parameter: 385f7487a0SJunchao Zhang . globcomm - MPI_Comm 395f7487a0SJunchao Zhang 405f7487a0SJunchao Zhang Output Parameter: 415f7487a0SJunchao Zhang . pshmcomm - the PETSc shared memory communicator object 425f7487a0SJunchao Zhang 435f7487a0SJunchao Zhang Level: developer 445f7487a0SJunchao Zhang 455f7487a0SJunchao Zhang Notes: 465f7487a0SJunchao Zhang This should be called only with an PetscCommDuplicate() communictor 475f7487a0SJunchao Zhang 485f7487a0SJunchao Zhang When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis 495f7487a0SJunchao Zhang 505f7487a0SJunchao Zhang Concepts: MPI subcomm^numbering 515f7487a0SJunchao Zhang 525f7487a0SJunchao Zhang @*/ 535f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm) 545f7487a0SJunchao Zhang { 555f7487a0SJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY 565f7487a0SJunchao Zhang PetscErrorCode ierr; 575f7487a0SJunchao Zhang MPI_Group globgroup,shmgroup; 585f7487a0SJunchao Zhang PetscMPIInt *shmranks,i,flg; 595f7487a0SJunchao Zhang PetscCommCounter *counter; 605f7487a0SJunchao Zhang 615f7487a0SJunchao Zhang PetscFunctionBegin; 625f7487a0SJunchao Zhang ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr); 635f7487a0SJunchao Zhang if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator"); 645f7487a0SJunchao Zhang 655f7487a0SJunchao Zhang ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr); 665f7487a0SJunchao Zhang if (flg) PetscFunctionReturn(0); 675f7487a0SJunchao Zhang 685f7487a0SJunchao Zhang ierr = PetscNew(pshmcomm);CHKERRQ(ierr); 695f7487a0SJunchao Zhang (*pshmcomm)->globcomm = globcomm; 705f7487a0SJunchao Zhang 715f7487a0SJunchao Zhang ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr); 725f7487a0SJunchao Zhang 735f7487a0SJunchao Zhang ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr); 745f7487a0SJunchao Zhang ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRQ(ierr); 755f7487a0SJunchao Zhang ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRQ(ierr); 765f7487a0SJunchao Zhang ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr); 775f7487a0SJunchao Zhang ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr); 785f7487a0SJunchao Zhang for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i; 795f7487a0SJunchao Zhang ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRQ(ierr); 805f7487a0SJunchao Zhang ierr = PetscFree(shmranks);CHKERRQ(ierr); 815f7487a0SJunchao Zhang ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr); 825f7487a0SJunchao Zhang ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr); 835f7487a0SJunchao Zhang 845f7487a0SJunchao Zhang for (i=0; i<(*pshmcomm)->shmsize; i++) { 855f7487a0SJunchao Zhang ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr); 865f7487a0SJunchao Zhang } 875f7487a0SJunchao Zhang ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr); 885f7487a0SJunchao Zhang PetscFunctionReturn(0); 895f7487a0SJunchao Zhang #else 905f7487a0SJunchao Zhang SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich."); 915f7487a0SJunchao Zhang #endif 925f7487a0SJunchao Zhang } 935f7487a0SJunchao Zhang 945f7487a0SJunchao Zhang /*@C 955f7487a0SJunchao Zhang PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator 965f7487a0SJunchao Zhang 975f7487a0SJunchao Zhang Input Parameters: 985f7487a0SJunchao Zhang + pshmcomm - the shared memory communicator object 995f7487a0SJunchao Zhang - grank - the global rank 1005f7487a0SJunchao Zhang 1015f7487a0SJunchao Zhang Output Parameter: 1025f7487a0SJunchao Zhang . lrank - the local rank, or MPI_PROC_NULL if it does not exist 1035f7487a0SJunchao Zhang 1045f7487a0SJunchao Zhang Level: developer 1055f7487a0SJunchao Zhang 1065f7487a0SJunchao Zhang Developer Notes: 1075f7487a0SJunchao Zhang Assumes the pshmcomm->globranks[] is sorted 1085f7487a0SJunchao Zhang 1095f7487a0SJunchao Zhang It may be better to rewrite this to map multiple global ranks to local in the same function call 1105f7487a0SJunchao Zhang 1115f7487a0SJunchao Zhang Concepts: MPI subcomm^numbering 1125f7487a0SJunchao Zhang 1135f7487a0SJunchao Zhang @*/ 1145f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank) 1155f7487a0SJunchao Zhang { 1165f7487a0SJunchao Zhang PetscMPIInt low,high,t,i; 1175f7487a0SJunchao Zhang PetscBool flg = PETSC_FALSE; 1185f7487a0SJunchao Zhang PetscErrorCode ierr; 1195f7487a0SJunchao Zhang 1205f7487a0SJunchao Zhang PetscFunctionBegin; 1215f7487a0SJunchao Zhang *lrank = MPI_PROC_NULL; 1225f7487a0SJunchao Zhang if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0); 1235f7487a0SJunchao Zhang if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0); 1245f7487a0SJunchao Zhang ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr); 1255f7487a0SJunchao Zhang if (flg) PetscFunctionReturn(0); 1265f7487a0SJunchao Zhang low = 0; 1275f7487a0SJunchao Zhang high = pshmcomm->shmsize; 1285f7487a0SJunchao Zhang while (high-low > 5) { 1295f7487a0SJunchao Zhang t = (low+high)/2; 1305f7487a0SJunchao Zhang if (pshmcomm->globranks[t] > grank) high = t; 1315f7487a0SJunchao Zhang else low = t; 1325f7487a0SJunchao Zhang } 1335f7487a0SJunchao Zhang for (i=low; i<high; i++) { 1345f7487a0SJunchao Zhang if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0); 1355f7487a0SJunchao Zhang if (pshmcomm->globranks[i] == grank) { 1365f7487a0SJunchao Zhang *lrank = i; 1375f7487a0SJunchao Zhang PetscFunctionReturn(0); 1385f7487a0SJunchao Zhang } 1395f7487a0SJunchao Zhang } 1405f7487a0SJunchao Zhang PetscFunctionReturn(0); 1415f7487a0SJunchao Zhang } 1425f7487a0SJunchao Zhang 1435f7487a0SJunchao Zhang /*@C 1445f7487a0SJunchao Zhang PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank 1455f7487a0SJunchao Zhang 1465f7487a0SJunchao Zhang Input Parameters: 1475f7487a0SJunchao Zhang + pshmcomm - the shared memory communicator object 1485f7487a0SJunchao Zhang - lrank - the local rank in the shared memory communicator 1495f7487a0SJunchao Zhang 1505f7487a0SJunchao Zhang Output Parameter: 1515f7487a0SJunchao Zhang . grank - the global rank in the global communicator where the shared memory communicator is built 1525f7487a0SJunchao Zhang 1535f7487a0SJunchao Zhang Level: developer 1545f7487a0SJunchao Zhang 1555f7487a0SJunchao Zhang Concepts: MPI subcomm^numbering 1565f7487a0SJunchao Zhang @*/ 1575f7487a0SJunchao Zhang PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank) 1585f7487a0SJunchao Zhang { 1595f7487a0SJunchao Zhang PetscFunctionBegin; 1605f7487a0SJunchao Zhang #ifdef PETSC_USE_DEBUG 1615f7487a0SJunchao Zhang { 1625f7487a0SJunchao Zhang PetscErrorCode ierr; 1635f7487a0SJunchao Zhang if (lrank < 0 || lrank >= pshmcomm->shmsize) { SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %D in the shared memory communicator",lrank);CHKERRQ(ierr); } 1645f7487a0SJunchao Zhang } 1655f7487a0SJunchao Zhang #endif 1665f7487a0SJunchao Zhang *grank = pshmcomm->globranks[lrank]; 1675f7487a0SJunchao Zhang PetscFunctionReturn(0); 1685f7487a0SJunchao Zhang } 1695f7487a0SJunchao Zhang 1705f7487a0SJunchao Zhang /*@C 1715f7487a0SJunchao Zhang PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory 1725f7487a0SJunchao Zhang 1735f7487a0SJunchao Zhang Input Parameter: 1745f7487a0SJunchao Zhang . pshmcomm - PetscShmComm object obtained with PetscShmCommGet() 1755f7487a0SJunchao Zhang 1765f7487a0SJunchao Zhang Output Parameter: 1775f7487a0SJunchao Zhang . comm - the MPI communicator 1785f7487a0SJunchao Zhang 1795f7487a0SJunchao Zhang Level: developer 1805f7487a0SJunchao Zhang 1815f7487a0SJunchao Zhang @*/ 1825f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm) 1835f7487a0SJunchao Zhang { 1845f7487a0SJunchao Zhang PetscFunctionBegin; 1855f7487a0SJunchao Zhang *comm = pshmcomm->shmcomm; 1865f7487a0SJunchao Zhang PetscFunctionReturn(0); 1875f7487a0SJunchao Zhang } 1885f7487a0SJunchao Zhang 189*4df5c2c7SJunchao Zhang #if defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && (defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) || defined(PETSC_HAVE_MMAP)) && defined(PETSC_HAVE_HWLOC) 190a32e93adSJunchao Zhang #include <pthread.h> 191a32e93adSJunchao Zhang #include <hwloc.h> 192a32e93adSJunchao Zhang #include <omp.h> 193a32e93adSJunchao Zhang 194*4df5c2c7SJunchao Zhang /* Use mmap() to allocate shared mmeory (for the pthread_barrierattr_t object) if it is available, 195*4df5c2c7SJunchao Zhang otherwise use MPI_Win_allocate_shared. They should have the same effect besides MPI-3 is much 196*4df5c2c7SJunchao Zhang simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance 197*4df5c2c7SJunchao Zhang by 50%. Until the reason is found out, we use mmap() instead. 198*4df5c2c7SJunchao Zhang */ 199*4df5c2c7SJunchao Zhang #define USE_MMAP_ALLOCATE_SHARED_MEMORY 200*4df5c2c7SJunchao Zhang 201*4df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 202*4df5c2c7SJunchao Zhang #include <sys/mman.h> 203*4df5c2c7SJunchao Zhang #include <sys/types.h> 204*4df5c2c7SJunchao Zhang #include <sys/stat.h> 205*4df5c2c7SJunchao Zhang #include <fcntl.h> 206*4df5c2c7SJunchao Zhang #endif 207*4df5c2c7SJunchao Zhang 208a32e93adSJunchao Zhang struct _n_PetscOmpCtrl { 209a32e93adSJunchao Zhang MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */ 210a32e93adSJunchao Zhang MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */ 211a32e93adSJunchao Zhang PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */ 212a32e93adSJunchao Zhang PetscBool is_omp_master; /* rank 0's in omp_comm */ 213a32e93adSJunchao Zhang MPI_Win omp_win; /* a shared memory window containing a barrier */ 214a32e93adSJunchao Zhang pthread_barrier_t *barrier; /* pointer to the barrier */ 215a32e93adSJunchao Zhang hwloc_topology_t topology; 216a32e93adSJunchao Zhang hwloc_cpuset_t cpuset; /* cpu bindings of omp master */ 217a32e93adSJunchao Zhang hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */ 218a32e93adSJunchao Zhang }; 219a32e93adSJunchao Zhang 220*4df5c2c7SJunchao Zhang 221a32e93adSJunchao Zhang /* Allocate a shared pthread_barrier_t object in ctrl->omp_comm, set ctrl->barrier */ 222a32e93adSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) 223a32e93adSJunchao Zhang { 224a32e93adSJunchao Zhang PetscErrorCode ierr; 225a32e93adSJunchao Zhang MPI_Aint size; 226a32e93adSJunchao Zhang void *baseptr; 227a32e93adSJunchao Zhang pthread_barrierattr_t attr; 228a32e93adSJunchao Zhang 229*4df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 230*4df5c2c7SJunchao Zhang PetscInt fd; 231*4df5c2c7SJunchao Zhang PetscChar pathname[PETSC_MAX_PATH_LEN]; 232*4df5c2c7SJunchao Zhang #else 233*4df5c2c7SJunchao Zhang PetscMPIInt disp_unit; 234*4df5c2c7SJunchao Zhang #endif 235*4df5c2c7SJunchao Zhang 236*4df5c2c7SJunchao Zhang PetscFunctionBegin; 237*4df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 238*4df5c2c7SJunchao Zhang size = sizeof(pthread_barrier_t); 239*4df5c2c7SJunchao Zhang if (ctrl->is_omp_master) { 240*4df5c2c7SJunchao Zhang /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the unfinished pathname to slaves */ 241*4df5c2c7SJunchao Zhang ierr = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr); 242*4df5c2c7SJunchao Zhang ierr = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr); 243*4df5c2c7SJunchao Zhang /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */ 244*4df5c2c7SJunchao Zhang fd = mkstemp(pathname); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n", pathname); 245*4df5c2c7SJunchao Zhang ierr = ftruncate(fd,size);CHKERRQ(ierr); 246*4df5c2c7SJunchao Zhang baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n"); 247*4df5c2c7SJunchao Zhang ierr = close(fd);CHKERRQ(ierr); 248*4df5c2c7SJunchao Zhang ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr); 249*4df5c2c7SJunchao Zhang /* this MPI_Barrier is to wait slaves open the file before master unlinks it */ 250*4df5c2c7SJunchao Zhang ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr); 251*4df5c2c7SJunchao Zhang ierr = unlink(pathname);CHKERRQ(ierr); 252*4df5c2c7SJunchao Zhang } else { 253*4df5c2c7SJunchao Zhang ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr); 254*4df5c2c7SJunchao Zhang fd = open(pathname,O_RDWR); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n", pathname); 255*4df5c2c7SJunchao Zhang baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n"); 256*4df5c2c7SJunchao Zhang ierr = close(fd);CHKERRQ(ierr); 257*4df5c2c7SJunchao Zhang ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr); 258*4df5c2c7SJunchao Zhang } 259*4df5c2c7SJunchao Zhang #else 260a32e93adSJunchao Zhang size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0; 261a32e93adSJunchao Zhang ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr); 262a32e93adSJunchao Zhang ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr); 263*4df5c2c7SJunchao Zhang #endif 264a32e93adSJunchao Zhang ctrl->barrier = (pthread_barrier_t*)baseptr; 265a32e93adSJunchao Zhang 266a32e93adSJunchao Zhang /* omp master initializes the barrier */ 267a32e93adSJunchao Zhang if (ctrl->is_omp_master) { 268a32e93adSJunchao Zhang ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr); 269a32e93adSJunchao Zhang ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr); 270a32e93adSJunchao Zhang ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */ 271a32e93adSJunchao Zhang ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr); 272a32e93adSJunchao Zhang ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr); 273a32e93adSJunchao Zhang } 274a32e93adSJunchao Zhang 275*4df5c2c7SJunchao Zhang /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */ 276*4df5c2c7SJunchao Zhang ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr); 277a32e93adSJunchao Zhang PetscFunctionReturn(0); 278a32e93adSJunchao Zhang } 279a32e93adSJunchao Zhang 280a32e93adSJunchao Zhang /* Destroy ctrl->barrier */ 281a32e93adSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) 282a32e93adSJunchao Zhang { 283a32e93adSJunchao Zhang PetscErrorCode ierr; 284a32e93adSJunchao Zhang 285*4df5c2c7SJunchao Zhang PetscFunctionBegin; 286*4df5c2c7SJunchao Zhang /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */ 287a32e93adSJunchao Zhang ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr); 288a32e93adSJunchao Zhang if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); } 289*4df5c2c7SJunchao Zhang 290*4df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 291*4df5c2c7SJunchao Zhang ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr); 292*4df5c2c7SJunchao Zhang #else 293a32e93adSJunchao Zhang ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr); 294*4df5c2c7SJunchao Zhang #endif 295a32e93adSJunchao Zhang PetscFunctionReturn(0); 296a32e93adSJunchao Zhang } 297a32e93adSJunchao Zhang 298a32e93adSJunchao Zhang /* create a PETSc OpenMP controler, which manages PETSc's interaction with OpenMP runtime */ 299a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl) 300a32e93adSJunchao Zhang { 301a32e93adSJunchao Zhang PetscErrorCode ierr; 302a32e93adSJunchao Zhang PetscOmpCtrl ctrl; 303a32e93adSJunchao Zhang unsigned long *cpu_ulongs=NULL; 304a32e93adSJunchao Zhang PetscInt i,nr_cpu_ulongs; 305a32e93adSJunchao Zhang PetscShmComm pshmcomm; 306a32e93adSJunchao Zhang MPI_Comm shm_comm; 307a32e93adSJunchao Zhang PetscMPIInt shm_rank,shm_comm_size,omp_rank,color; 308a32e93adSJunchao Zhang 309a32e93adSJunchao Zhang PetscFunctionBegin; 310a32e93adSJunchao Zhang ierr = PetscNew(&ctrl);CHKERRQ(ierr); 311a32e93adSJunchao Zhang 312a32e93adSJunchao Zhang /*================================================================================= 313a32e93adSJunchao Zhang Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to 314a32e93adSJunchao Zhang physically shared memory. Rank 0 of each omp_comm is called an OMP master, and 315a32e93adSJunchao Zhang others are called slaves. OMP Masters make up a new comm called omp_master_comm, 316a32e93adSJunchao Zhang which is usually passed to third party libraries. 317a32e93adSJunchao Zhang ==================================================================================*/ 318a32e93adSJunchao Zhang 319a32e93adSJunchao Zhang /* fetch the stored shared memory communicator */ 320a32e93adSJunchao Zhang ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr); 321a32e93adSJunchao Zhang ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr); 322a32e93adSJunchao Zhang 323a32e93adSJunchao Zhang ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr); 324a32e93adSJunchao Zhang ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr); 325a32e93adSJunchao Zhang 326a32e93adSJunchao Zhang if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %d can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size); 327a32e93adSJunchao Zhang if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %d is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); } 328a32e93adSJunchao Zhang 329a32e93adSJunchao Zhang /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if 330a32e93adSJunchao Zhang shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get 331a32e93adSJunchao Zhang color 1. They are put in two omp_comms. Note that petsc_ranks may or may not 332a32e93adSJunchao Zhang be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1. 333a32e93adSJunchao Zhang Use 0 as key so that rank ordering wont change in new comm. 334a32e93adSJunchao Zhang */ 335a32e93adSJunchao Zhang color = shm_rank / nthreads; 3363ab56b82SJunchao Zhang ierr = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr); 337a32e93adSJunchao Zhang 338a32e93adSJunchao Zhang /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */ 339a32e93adSJunchao Zhang ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr); 340a32e93adSJunchao Zhang if (!omp_rank) { 341a32e93adSJunchao Zhang ctrl->is_omp_master = PETSC_TRUE; /* master */ 342a32e93adSJunchao Zhang color = 0; 343a32e93adSJunchao Zhang } else { 344a32e93adSJunchao Zhang ctrl->is_omp_master = PETSC_FALSE; /* slave */ 345a32e93adSJunchao Zhang color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */ 346a32e93adSJunchao Zhang } 3473ab56b82SJunchao Zhang ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */ 348a32e93adSJunchao Zhang 349a32e93adSJunchao Zhang /*================================================================================= 350a32e93adSJunchao Zhang Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put 351a32e93adSJunchao Zhang slave ranks in sleep and idle their CPU, so that the master can fork OMP threads 352a32e93adSJunchao Zhang and run them on the idle CPUs. 353a32e93adSJunchao Zhang ==================================================================================*/ 354a32e93adSJunchao Zhang ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr); 355a32e93adSJunchao Zhang 356a32e93adSJunchao Zhang /*================================================================================= 357a32e93adSJunchao Zhang omp master logs its cpu binding (i.e., cpu set) and computes a new binding that 358a32e93adSJunchao Zhang is the union of the bindings of all ranks in the omp_comm 359a32e93adSJunchao Zhang =================================================================================*/ 360a32e93adSJunchao Zhang ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr); 361a32e93adSJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000 362a32e93adSJunchao Zhang /* to filter out unneeded info and have faster hwloc_topology_load */ 363a32e93adSJunchao Zhang ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr); 364a32e93adSJunchao Zhang ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr); 365a32e93adSJunchao Zhang #endif 366a32e93adSJunchao Zhang ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr); 367a32e93adSJunchao Zhang 3683ab56b82SJunchao Zhang ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n"); 369a32e93adSJunchao Zhang ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr); 370a32e93adSJunchao Zhang 3713ab56b82SJunchao Zhang /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */ 372a32e93adSJunchao Zhang nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8; 373a32e93adSJunchao Zhang ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr); 374a32e93adSJunchao Zhang if (nr_cpu_ulongs == 1) { 375a32e93adSJunchao Zhang cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset); 376a32e93adSJunchao Zhang } else { 377a32e93adSJunchao Zhang for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i); 378a32e93adSJunchao Zhang } 379a32e93adSJunchao Zhang 380a32e93adSJunchao Zhang ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr); 381a32e93adSJunchao Zhang 382a32e93adSJunchao Zhang if (ctrl->is_omp_master) { 3833ab56b82SJunchao Zhang ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n"); 384a32e93adSJunchao Zhang if (nr_cpu_ulongs == 1) { 3853ab56b82SJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000 386a32e93adSJunchao Zhang ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr); 3873ab56b82SJunchao Zhang #else 3883ab56b82SJunchao Zhang hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]); 3893ab56b82SJunchao Zhang #endif 390a32e93adSJunchao Zhang } else { 3913ab56b82SJunchao Zhang for (i=0; i<nr_cpu_ulongs; i++) { 3923ab56b82SJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000 3933ab56b82SJunchao Zhang ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr); 3943ab56b82SJunchao Zhang #else 3953ab56b82SJunchao Zhang hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]); 3963ab56b82SJunchao Zhang #endif 3973ab56b82SJunchao Zhang } 398a32e93adSJunchao Zhang } 399a32e93adSJunchao Zhang } 400a32e93adSJunchao Zhang 401a32e93adSJunchao Zhang /* all wait for the master to finish the initialization before using the barrier */ 402a32e93adSJunchao Zhang ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr); 403a32e93adSJunchao Zhang ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr); 404a32e93adSJunchao Zhang *pctrl = ctrl; 405a32e93adSJunchao Zhang PetscFunctionReturn(0); 406a32e93adSJunchao Zhang } 407a32e93adSJunchao Zhang 408a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) 409a32e93adSJunchao Zhang { 410a32e93adSJunchao Zhang PetscErrorCode ierr; 411a32e93adSJunchao Zhang PetscOmpCtrl ctrl = *pctrl; 412a32e93adSJunchao Zhang 413a32e93adSJunchao Zhang PetscFunctionBegin; 414a32e93adSJunchao Zhang hwloc_bitmap_free(ctrl->cpuset); 415a32e93adSJunchao Zhang hwloc_topology_destroy(ctrl->topology); 416a32e93adSJunchao Zhang PetscOmpCtrlDestroyBarrier(ctrl); 417a32e93adSJunchao Zhang ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr); 418a32e93adSJunchao Zhang if (ctrl->is_omp_master) { 419a32e93adSJunchao Zhang hwloc_bitmap_free(ctrl->omp_cpuset); 420a32e93adSJunchao Zhang ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr); 421a32e93adSJunchao Zhang } 422a32e93adSJunchao Zhang ierr = PetscFree(ctrl);CHKERRQ(ierr); 423a32e93adSJunchao Zhang PetscFunctionReturn(0); 424a32e93adSJunchao Zhang } 425a32e93adSJunchao Zhang 426a32e93adSJunchao Zhang /*@C 427a32e93adSJunchao Zhang PetscOmpCtrlGetOmpComms - Get MPI communicators from a PetscOmpCtrl 428a32e93adSJunchao Zhang 429a32e93adSJunchao Zhang Input Parameter: 430a32e93adSJunchao Zhang . ctrl - a PetscOmpCtrl 431a32e93adSJunchao Zhang 432a32e93adSJunchao Zhang Output Parameter: 433a32e93adSJunchao Zhang + omp_comm - a communicator that includes a master rank and slave ranks. 434a32e93adSJunchao Zhang . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm; 435a32e93adSJunchao Zhang on slave ranks, MPI_COMM_NULL will be return in reality. 436a32e93adSJunchao Zhang - is_omp_master - true if the calling process is an OMP master rank. 437a32e93adSJunchao Zhang 438a32e93adSJunchao Zhang Level: developer 439a32e93adSJunchao Zhang @*/ 440a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master) 441a32e93adSJunchao Zhang { 442a32e93adSJunchao Zhang PetscFunctionBegin; 443a32e93adSJunchao Zhang if (omp_comm) *omp_comm = ctrl->omp_comm; 444a32e93adSJunchao Zhang if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm; 445a32e93adSJunchao Zhang if (is_omp_master) *is_omp_master = ctrl->is_omp_master; 446a32e93adSJunchao Zhang PetscFunctionReturn(0); 447a32e93adSJunchao Zhang } 448a32e93adSJunchao Zhang 449a32e93adSJunchao Zhang /* a barrier in the scope of an omp_comm. Not using MPI_Barrier since it keeps polling and does not free CPUs OMP wants to use */ 450a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) 451a32e93adSJunchao Zhang { 452a32e93adSJunchao Zhang PetscErrorCode ierr; 453a32e93adSJunchao Zhang 454a32e93adSJunchao Zhang PetscFunctionBegin; 455a32e93adSJunchao Zhang ierr = pthread_barrier_wait(ctrl->barrier); 456a32e93adSJunchao Zhang if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %D\n", ierr); 457a32e93adSJunchao Zhang PetscFunctionReturn(0); 458a32e93adSJunchao Zhang } 459a32e93adSJunchao Zhang 460a32e93adSJunchao Zhang /* call this on master ranks before calling a library using OpenMP */ 461a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) 462a32e93adSJunchao Zhang { 463a32e93adSJunchao Zhang PetscErrorCode ierr; 464a32e93adSJunchao Zhang 465a32e93adSJunchao Zhang PetscFunctionBegin; 466a32e93adSJunchao Zhang ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr); 467a32e93adSJunchao Zhang omp_set_num_threads(ctrl->omp_comm_size); /* may override OMP_NUM_THREAD in environment */ 468a32e93adSJunchao Zhang PetscFunctionReturn(0); 469a32e93adSJunchao Zhang } 470a32e93adSJunchao Zhang 471a32e93adSJunchao Zhang /* call this on master ranks after leaving a library using OpenMP */ 472a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) 473a32e93adSJunchao Zhang { 474a32e93adSJunchao Zhang PetscErrorCode ierr; 475a32e93adSJunchao Zhang 476a32e93adSJunchao Zhang PetscFunctionBegin; 477a32e93adSJunchao Zhang ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr); 478a32e93adSJunchao Zhang PetscFunctionReturn(0); 479a32e93adSJunchao Zhang } 480a32e93adSJunchao Zhang 481*4df5c2c7SJunchao Zhang #undef USE_MMAP_ALLOCATE_SHARED_MEMORY 482a32e93adSJunchao Zhang #endif /* defined(PETSC_HAVE_PTHREAD) && .. */ 483