1*632d0f97SHong Zhang /*$Id: sbaijov.c,v 1.65 2001/08/06 21:15:42 bsmith Exp $*/ 2*632d0f97SHong Zhang 3*632d0f97SHong Zhang /* 4*632d0f97SHong Zhang Routines to compute overlapping regions of a parallel MPI matrix. 5*632d0f97SHong Zhang Used for finding submatrices that were shared across processors. 6*632d0f97SHong Zhang */ 7*632d0f97SHong Zhang #include "src/mat/impls/sbaij/mpi/mpisbaij.h" 8*632d0f97SHong Zhang #include "petscbt.h" 9*632d0f97SHong Zhang 10*632d0f97SHong Zhang static int MatIncreaseOverlap_MPISBAIJ_Once(Mat,int,IS *); 11*632d0f97SHong Zhang static int MatIncreaseOverlap_MPISBAIJ_Local(Mat,int,char **,int*,int**); 12*632d0f97SHong Zhang static int MatIncreaseOverlap_MPISBAIJ_Receive(Mat,int,int **,int**,int*); 13*632d0f97SHong Zhang 14*632d0f97SHong Zhang /* this function is sasme as MatCompressIndicesGeneral_MPIBAIJ -- should be removed! */ 15*632d0f97SHong Zhang #undef __FUNCT__ 16*632d0f97SHong Zhang #define __FUNCT__ "MatCompressIndicesGeneral_MPISBAIJ" 17*632d0f97SHong Zhang static int MatCompressIndicesGeneral_MPISBAIJ(Mat C,int imax,const IS is_in[],IS is_out[]) 18*632d0f97SHong Zhang { 19*632d0f97SHong Zhang Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)C->data; 20*632d0f97SHong Zhang int ierr,isz,bs = baij->bs,n,i,j,*idx,ival; 21*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 22*632d0f97SHong Zhang PetscTable gid1_lid1; 23*632d0f97SHong Zhang int tt, gid1, *nidx; 24*632d0f97SHong Zhang PetscTablePosition tpos; 25*632d0f97SHong Zhang #else 26*632d0f97SHong Zhang int Nbs,*nidx; 27*632d0f97SHong Zhang PetscBT table; 28*632d0f97SHong Zhang #endif 29*632d0f97SHong Zhang 30*632d0f97SHong Zhang PetscFunctionBegin; 31*632d0f97SHong Zhang /* printf(" ...MatCompressIndicesGeneral_MPISBAIJ is called ...\n"); */ 32*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 33*632d0f97SHong Zhang ierr = PetscTableCreate(baij->mbs,&gid1_lid1);CHKERRQ(ierr); 34*632d0f97SHong Zhang #else 35*632d0f97SHong Zhang Nbs = baij->Nbs; 36*632d0f97SHong Zhang ierr = PetscMalloc((Nbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 37*632d0f97SHong Zhang ierr = PetscBTCreate(Nbs,table);CHKERRQ(ierr); 38*632d0f97SHong Zhang #endif 39*632d0f97SHong Zhang for (i=0; i<imax; i++) { 40*632d0f97SHong Zhang isz = 0; 41*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 42*632d0f97SHong Zhang ierr = PetscTableRemoveAll(gid1_lid1);CHKERRQ(ierr); 43*632d0f97SHong Zhang #else 44*632d0f97SHong Zhang ierr = PetscBTMemzero(Nbs,table);CHKERRQ(ierr); 45*632d0f97SHong Zhang #endif 46*632d0f97SHong Zhang ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 47*632d0f97SHong Zhang ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr); 48*632d0f97SHong Zhang for (j=0; j<n ; j++) { 49*632d0f97SHong Zhang ival = idx[j]/bs; /* convert the indices into block indices */ 50*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 51*632d0f97SHong Zhang ierr = PetscTableFind(gid1_lid1,ival+1,&tt);CHKERRQ(ierr); 52*632d0f97SHong Zhang if (!tt) { 53*632d0f97SHong Zhang ierr = PetscTableAdd(gid1_lid1,ival+1,isz+1);CHKERRQ(ierr); 54*632d0f97SHong Zhang isz++; 55*632d0f97SHong Zhang } 56*632d0f97SHong Zhang #else 57*632d0f97SHong Zhang if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim"); 58*632d0f97SHong Zhang if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;} 59*632d0f97SHong Zhang #endif 60*632d0f97SHong Zhang } 61*632d0f97SHong Zhang ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 62*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 63*632d0f97SHong Zhang ierr = PetscMalloc((isz+1)*sizeof(int),&nidx);CHKERRQ(ierr); 64*632d0f97SHong Zhang ierr = PetscTableGetHeadPosition(gid1_lid1,&tpos);CHKERRQ(ierr); 65*632d0f97SHong Zhang j = 0; 66*632d0f97SHong Zhang while (tpos) { 67*632d0f97SHong Zhang ierr = PetscTableGetNext(gid1_lid1,&tpos,&gid1,&tt);CHKERRQ(ierr); 68*632d0f97SHong Zhang if (tt-- > isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than array-dim"); } 69*632d0f97SHong Zhang nidx[tt] = gid1 - 1; 70*632d0f97SHong Zhang j++; 71*632d0f97SHong Zhang } 72*632d0f97SHong Zhang if (j != isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"table error: jj != isz"); } 73*632d0f97SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));CHKERRQ(ierr); 74*632d0f97SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 75*632d0f97SHong Zhang #else 76*632d0f97SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));CHKERRQ(ierr); 77*632d0f97SHong Zhang #endif 78*632d0f97SHong Zhang } 79*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 80*632d0f97SHong Zhang ierr = PetscTableDelete(gid1_lid1);CHKERRQ(ierr); 81*632d0f97SHong Zhang #else 82*632d0f97SHong Zhang ierr = PetscBTDestroy(table);CHKERRQ(ierr); 83*632d0f97SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 84*632d0f97SHong Zhang #endif 85*632d0f97SHong Zhang PetscFunctionReturn(0); 86*632d0f97SHong Zhang } 87*632d0f97SHong Zhang 88*632d0f97SHong Zhang #undef __FUNCT__ 89*632d0f97SHong Zhang #define __FUNCT__ "MatCompressIndicesSorted_MPISBAIJ" 90*632d0f97SHong Zhang static int MatCompressIndicesSorted_MPISBAIJ(Mat C,int imax,const IS is_in[],IS is_out[]) 91*632d0f97SHong Zhang { 92*632d0f97SHong Zhang Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)C->data; 93*632d0f97SHong Zhang int ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,*idx_local; 94*632d0f97SHong Zhang PetscTruth flg; 95*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 96*632d0f97SHong Zhang int maxsz; 97*632d0f97SHong Zhang #else 98*632d0f97SHong Zhang int Nbs=baij->Nbs; 99*632d0f97SHong Zhang #endif 100*632d0f97SHong Zhang PetscFunctionBegin; 101*632d0f97SHong Zhang printf(" ... MatCompressIndicesSorted_MPISBAIJ is called ...\n"); 102*632d0f97SHong Zhang for (i=0; i<imax; i++) { 103*632d0f97SHong Zhang ierr = ISSorted(is_in[i],&flg);CHKERRQ(ierr); 104*632d0f97SHong Zhang if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Indices are not sorted"); 105*632d0f97SHong Zhang } 106*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 107*632d0f97SHong Zhang /* Now check max size */ 108*632d0f97SHong Zhang for (i=0,maxsz=0; i<imax; i++) { 109*632d0f97SHong Zhang ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 110*632d0f97SHong Zhang ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr); 111*632d0f97SHong Zhang if (n%bs !=0) SETERRQ(1,"Indices are not block ordered"); 112*632d0f97SHong Zhang n = n/bs; /* The reduced index size */ 113*632d0f97SHong Zhang if (n > maxsz) maxsz = n; 114*632d0f97SHong Zhang } 115*632d0f97SHong Zhang ierr = PetscMalloc((maxsz+1)*sizeof(int),&nidx);CHKERRQ(ierr); 116*632d0f97SHong Zhang #else 117*632d0f97SHong Zhang ierr = PetscMalloc((Nbs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 118*632d0f97SHong Zhang #endif 119*632d0f97SHong Zhang /* Now check if the indices are in block order */ 120*632d0f97SHong Zhang for (i=0; i<imax; i++) { 121*632d0f97SHong Zhang ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 122*632d0f97SHong Zhang ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr); 123*632d0f97SHong Zhang if (n%bs !=0) SETERRQ(1,"Indices are not block ordered"); 124*632d0f97SHong Zhang 125*632d0f97SHong Zhang n = n/bs; /* The reduced index size */ 126*632d0f97SHong Zhang idx_local = idx; 127*632d0f97SHong Zhang for (j=0; j<n ; j++) { 128*632d0f97SHong Zhang val = idx_local[0]; 129*632d0f97SHong Zhang if (val%bs != 0) SETERRQ(1,"Indices are not block ordered"); 130*632d0f97SHong Zhang for (k=0; k<bs; k++) { 131*632d0f97SHong Zhang if (val+k != idx_local[k]) SETERRQ(1,"Indices are not block ordered"); 132*632d0f97SHong Zhang } 133*632d0f97SHong Zhang nidx[j] = val/bs; 134*632d0f97SHong Zhang idx_local +=bs; 135*632d0f97SHong Zhang } 136*632d0f97SHong Zhang ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 137*632d0f97SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));CHKERRQ(ierr); 138*632d0f97SHong Zhang } 139*632d0f97SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 140*632d0f97SHong Zhang 141*632d0f97SHong Zhang PetscFunctionReturn(0); 142*632d0f97SHong Zhang } 143*632d0f97SHong Zhang 144*632d0f97SHong Zhang #undef __FUNCT__ 145*632d0f97SHong Zhang #define __FUNCT__ "MatExpandIndices_MPISBAIJ" 146*632d0f97SHong Zhang static int MatExpandIndices_MPISBAIJ(Mat C,int imax,const IS is_in[],IS is_out[]) 147*632d0f97SHong Zhang { 148*632d0f97SHong Zhang Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)C->data; 149*632d0f97SHong Zhang int ierr,bs = baij->bs,n,i,j,k,*idx,*nidx; 150*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 151*632d0f97SHong Zhang int maxsz; 152*632d0f97SHong Zhang #else 153*632d0f97SHong Zhang int Nbs = baij->Nbs; 154*632d0f97SHong Zhang #endif 155*632d0f97SHong Zhang 156*632d0f97SHong Zhang PetscFunctionBegin; 157*632d0f97SHong Zhang /* printf(" ... MatExpandIndices_MPISBAIJ is called ...\n"); */ 158*632d0f97SHong Zhang #if defined (PETSC_USE_CTABLE) 159*632d0f97SHong Zhang /* Now check max size */ 160*632d0f97SHong Zhang for (i=0,maxsz=0; i<imax; i++) { 161*632d0f97SHong Zhang ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 162*632d0f97SHong Zhang ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr); 163*632d0f97SHong Zhang if (n*bs > maxsz) maxsz = n*bs; 164*632d0f97SHong Zhang } 165*632d0f97SHong Zhang ierr = PetscMalloc((maxsz+1)*sizeof(int),&nidx);CHKERRQ(ierr); 166*632d0f97SHong Zhang #else 167*632d0f97SHong Zhang ierr = PetscMalloc((Nbs*bs+1)*sizeof(int),&nidx);CHKERRQ(ierr); 168*632d0f97SHong Zhang #endif 169*632d0f97SHong Zhang 170*632d0f97SHong Zhang for (i=0; i<imax; i++) { 171*632d0f97SHong Zhang ierr = ISGetIndices(is_in[i],&idx);CHKERRQ(ierr); 172*632d0f97SHong Zhang ierr = ISGetLocalSize(is_in[i],&n);CHKERRQ(ierr); 173*632d0f97SHong Zhang for (j=0; j<n ; ++j){ 174*632d0f97SHong Zhang for (k=0; k<bs; k++) 175*632d0f97SHong Zhang nidx[j*bs+k] = idx[j]*bs+k; 176*632d0f97SHong Zhang } 177*632d0f97SHong Zhang ierr = ISRestoreIndices(is_in[i],&idx);CHKERRQ(ierr); 178*632d0f97SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);CHKERRQ(ierr); 179*632d0f97SHong Zhang } 180*632d0f97SHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 181*632d0f97SHong Zhang PetscFunctionReturn(0); 182*632d0f97SHong Zhang } 183*632d0f97SHong Zhang 184*632d0f97SHong Zhang 185*632d0f97SHong Zhang #undef __FUNCT__ 186*632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ" 187*632d0f97SHong Zhang int MatIncreaseOverlap_MPISBAIJ(Mat C,int imax,IS is[],int ov) 188*632d0f97SHong Zhang { 189*632d0f97SHong Zhang int i,ierr; 190*632d0f97SHong Zhang IS *is_new; 191*632d0f97SHong Zhang 192*632d0f97SHong Zhang PetscFunctionBegin; 193*632d0f97SHong Zhang ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr); 194*632d0f97SHong Zhang /* Convert the indices into block format */ 195*632d0f97SHong Zhang ierr = MatCompressIndicesGeneral_MPISBAIJ(C,imax,is,is_new);CHKERRQ(ierr); 196*632d0f97SHong Zhang if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");} 197*632d0f97SHong Zhang for (i=0; i<ov; ++i) { 198*632d0f97SHong Zhang ierr = MatIncreaseOverlap_MPISBAIJ_Once(C,imax,is_new);CHKERRQ(ierr); 199*632d0f97SHong Zhang } 200*632d0f97SHong Zhang for (i=0; i<imax; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);} 201*632d0f97SHong Zhang ierr = MatExpandIndices_MPISBAIJ(C,imax,is_new,is);CHKERRQ(ierr); 202*632d0f97SHong Zhang for (i=0; i<imax; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);} 203*632d0f97SHong Zhang ierr = PetscFree(is_new);CHKERRQ(ierr); 204*632d0f97SHong Zhang PetscFunctionReturn(0); 205*632d0f97SHong Zhang } 206*632d0f97SHong Zhang 207*632d0f97SHong Zhang #undef __FUNCT__ 208*632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Once" 209*632d0f97SHong Zhang static int MatIncreaseOverlap_MPISBAIJ_Once(Mat C,int imax,IS is[]) 210*632d0f97SHong Zhang { 211*632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 212*632d0f97SHong Zhang int **idx,*n,len,*idx_i; 213*632d0f97SHong Zhang int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,*outdat,*indat; 214*632d0f97SHong Zhang int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2,flag,proc_id; 215*632d0f97SHong Zhang PetscBT *table; 216*632d0f97SHong Zhang MPI_Comm comm; 217*632d0f97SHong Zhang MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 218*632d0f97SHong Zhang MPI_Status *s_status,r_status; 219*632d0f97SHong Zhang 220*632d0f97SHong Zhang PetscFunctionBegin; 221*632d0f97SHong Zhang 222*632d0f97SHong Zhang comm = C->comm; 223*632d0f97SHong Zhang size = c->size; 224*632d0f97SHong Zhang rank = c->rank; 225*632d0f97SHong Zhang Mbs = c->Mbs; 226*632d0f97SHong Zhang 227*632d0f97SHong Zhang /* 1. Send is[] to all other processors */ 228*632d0f97SHong Zhang /*--------------------------------------*/ 229*632d0f97SHong Zhang /* This processor sends its is[] to all other processors in the format: 230*632d0f97SHong Zhang outdat[0] = is_max, no of is in this processor 231*632d0f97SHong Zhang outdat[1] = n[0], size of is[0] 232*632d0f97SHong Zhang ... 233*632d0f97SHong Zhang outdat[is_max] = n[is_max-1], size of is[is_max-1] 234*632d0f97SHong Zhang outdat[is_max + 1] = data(is[0]) 235*632d0f97SHong Zhang ... 236*632d0f97SHong Zhang outdat[is_max + i] = data(is[i]) 237*632d0f97SHong Zhang ... 238*632d0f97SHong Zhang */ 239*632d0f97SHong Zhang len = (imax+1)*sizeof(int*)+ (imax)*sizeof(int); 240*632d0f97SHong Zhang ierr = PetscMalloc(len,&idx);CHKERRQ(ierr); 241*632d0f97SHong Zhang n = (int*)(idx + imax); 242*632d0f97SHong Zhang 243*632d0f97SHong Zhang /* Allocate Memory for outgoing messages */ 244*632d0f97SHong Zhang len = 1 + imax; 245*632d0f97SHong Zhang for (i=0; i<imax; i++) { 246*632d0f97SHong Zhang ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 247*632d0f97SHong Zhang ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 248*632d0f97SHong Zhang len += n[i]; 249*632d0f97SHong Zhang } 250*632d0f97SHong Zhang ierr = PetscMalloc(len*sizeof(int),&outdat);CHKERRQ(ierr); 251*632d0f97SHong Zhang 252*632d0f97SHong Zhang /* Form the outgoing messages */ 253*632d0f97SHong Zhang outdat[0] = imax; 254*632d0f97SHong Zhang for (i=0; i<imax; i++) { 255*632d0f97SHong Zhang outdat[i+1] = n[i]; 256*632d0f97SHong Zhang } 257*632d0f97SHong Zhang k = imax + 1; 258*632d0f97SHong Zhang for (i=0; i<imax; i++) { /* for is[i] */ 259*632d0f97SHong Zhang idx_i = idx[i]; 260*632d0f97SHong Zhang for (j=0; j<n[i]; j++){ 261*632d0f97SHong Zhang outdat[k] = *(idx_i); 262*632d0f97SHong Zhang /* if (!rank) printf(" outdat[%d] = %d\n",k,outdat[k] ); */ 263*632d0f97SHong Zhang k++; idx_i++; 264*632d0f97SHong Zhang } 265*632d0f97SHong Zhang /* printf(" [%d] n[%d]=%d, k: %d, \n",rank,i,n[i],k); */ 266*632d0f97SHong Zhang } 267*632d0f97SHong Zhang if (k != len) SETERRQ3(1,"[%d] Error on forming the outgoing messages: k %d != len %d",rank,k,len); 268*632d0f97SHong Zhang 269*632d0f97SHong Zhang /* Now post the sends */ 270*632d0f97SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 271*632d0f97SHong Zhang 272*632d0f97SHong Zhang k = 0; 273*632d0f97SHong Zhang for (i=0; i<size; ++i) { /* send outdat to processor [i] */ 274*632d0f97SHong Zhang if (i != rank){ 275*632d0f97SHong Zhang ierr = MPI_Isend(outdat,len,MPI_INT,i,rank,comm,&s_waits1[k]);CHKERRQ(ierr); 276*632d0f97SHong Zhang printf(" [%d] send %d msg to [%d] \n",rank,len,i); 277*632d0f97SHong Zhang k++; 278*632d0f97SHong Zhang } 279*632d0f97SHong Zhang } 280*632d0f97SHong Zhang 281*632d0f97SHong Zhang /* 2. Do local work */ 282*632d0f97SHong Zhang /*------------------*/ 283*632d0f97SHong Zhang 284*632d0f97SHong Zhang /* No longer need the original indices*/ 285*632d0f97SHong Zhang for (i=0; i<imax; ++i) { 286*632d0f97SHong Zhang ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 287*632d0f97SHong Zhang } 288*632d0f97SHong Zhang ierr = PetscFree(idx);CHKERRQ(ierr); 289*632d0f97SHong Zhang 290*632d0f97SHong Zhang /* 3. Receive other's is[] and process. Then send back */ 291*632d0f97SHong Zhang /*----------------------------------------------------*/ 292*632d0f97SHong Zhang /* Send is done */ 293*632d0f97SHong Zhang nrqs = size-1; 294*632d0f97SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 295*632d0f97SHong Zhang ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr); 296*632d0f97SHong Zhang ierr = PetscFree(outdat);CHKERRQ(ierr); 297*632d0f97SHong Zhang 298*632d0f97SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Request),&r_waits1);CHKERRQ(ierr); 299*632d0f97SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 300*632d0f97SHong Zhang k = 0; 301*632d0f97SHong Zhang do { 302*632d0f97SHong Zhang /* Receive messages */ 303*632d0f97SHong Zhang ierr = MPI_Iprobe(MPI_ANY_SOURCE,MPI_ANY_TAG,comm,&flag,&r_status); 304*632d0f97SHong Zhang if (flag){ 305*632d0f97SHong Zhang ierr = MPI_Get_count(&r_status,MPI_INT,&len); 306*632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 307*632d0f97SHong Zhang ierr = ierr = PetscMalloc(len*sizeof(int),&indat);CHKERRQ(ierr); 308*632d0f97SHong Zhang ierr = MPI_Irecv(indat,len,MPI_INT,proc_id,r_status.MPI_TAG,comm,&r_waits1[k]); 309*632d0f97SHong Zhang printf(" [%d] recv %d msg from [%d] \n",rank,len,proc_id); 310*632d0f97SHong Zhang 311*632d0f97SHong Zhang /* Process messages -- not done yet */ 312*632d0f97SHong Zhang len = indat[0]; 313*632d0f97SHong Zhang ierr = PetscMalloc(len*sizeof(int),&outdat);CHKERRQ(ierr); 314*632d0f97SHong Zhang for (i=0; i<len; i++){outdat[i] = indat[i+1];} 315*632d0f97SHong Zhang 316*632d0f97SHong Zhang /* Send messages back */ 317*632d0f97SHong Zhang printf(" [%d] send %d msg back to [%d] \n",rank,len,proc_id); 318*632d0f97SHong Zhang ierr = MPI_Isend(outdat,len,MPI_INT,proc_id,rank,comm,&s_waits2[k]);CHKERRQ(ierr); 319*632d0f97SHong Zhang 320*632d0f97SHong Zhang k++; 321*632d0f97SHong Zhang ierr = PetscFree(outdat);CHKERRQ(ierr); 322*632d0f97SHong Zhang ierr = PetscFree(indat);CHKERRQ(ierr); 323*632d0f97SHong Zhang } 324*632d0f97SHong Zhang } while (k < nrqs); 325*632d0f97SHong Zhang 326*632d0f97SHong Zhang /* 4. Receive work done on other processors, then process */ 327*632d0f97SHong Zhang /*--------------------------------------------------------*/ 328*632d0f97SHong Zhang ierr = MPI_Waitall(nrqs,s_waits2,s_status);CHKERRQ(ierr); 329*632d0f97SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 330*632d0f97SHong Zhang k = 0; 331*632d0f97SHong Zhang do { 332*632d0f97SHong Zhang /* Receive messages */ 333*632d0f97SHong Zhang ierr = MPI_Iprobe(MPI_ANY_SOURCE,MPI_ANY_TAG,comm,&flag,&r_status); 334*632d0f97SHong Zhang if (flag){ 335*632d0f97SHong Zhang ierr = MPI_Get_count(&r_status,MPI_INT,&len); 336*632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 337*632d0f97SHong Zhang ierr = ierr = PetscMalloc(len*sizeof(int),&indat);CHKERRQ(ierr); 338*632d0f97SHong Zhang ierr = MPI_Irecv(indat,len,MPI_INT,proc_id,r_status.MPI_TAG,comm,&r_waits2[k]); 339*632d0f97SHong Zhang printf(" [%d] recv %d msg from [%d] \n",rank,len,proc_id); 340*632d0f97SHong Zhang 341*632d0f97SHong Zhang /* Process messages -- not done yet */ 342*632d0f97SHong Zhang 343*632d0f97SHong Zhang 344*632d0f97SHong Zhang k++; 345*632d0f97SHong Zhang ierr = PetscFree(indat);CHKERRQ(ierr); 346*632d0f97SHong Zhang } 347*632d0f97SHong Zhang } while (k < nrqs); 348*632d0f97SHong Zhang 349*632d0f97SHong Zhang #ifdef OLD 350*632d0f97SHong Zhang for (i=0; i<imax; ++i) { 351*632d0f97SHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr); 352*632d0f97SHong Zhang } 353*632d0f97SHong Zhang 354*632d0f97SHong Zhang 355*632d0f97SHong Zhang ierr = PetscFree(onodes2);CHKERRQ(ierr); 356*632d0f97SHong Zhang ierr = PetscFree(olengths2);CHKERRQ(ierr); 357*632d0f97SHong Zhang 358*632d0f97SHong Zhang ierr = PetscFree(pa);CHKERRQ(ierr); 359*632d0f97SHong Zhang ierr = PetscFree(rbuf2);CHKERRQ(ierr); 360*632d0f97SHong Zhang ierr = PetscFree(s_waits1);CHKERRQ(ierr); 361*632d0f97SHong Zhang ierr = PetscFree(r_waits1);CHKERRQ(ierr); 362*632d0f97SHong Zhang ierr = PetscFree(s_waits2);CHKERRQ(ierr); 363*632d0f97SHong Zhang ierr = PetscFree(r_waits2);CHKERRQ(ierr); 364*632d0f97SHong Zhang ierr = PetscFree(table);CHKERRQ(ierr); 365*632d0f97SHong Zhang ierr = PetscFree(s_status);CHKERRQ(ierr); 366*632d0f97SHong Zhang ierr = PetscFree(recv_status);CHKERRQ(ierr); 367*632d0f97SHong Zhang ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 368*632d0f97SHong Zhang ierr = PetscFree(xdata);CHKERRQ(ierr); 369*632d0f97SHong Zhang ierr = PetscFree(isz1);CHKERRQ(ierr); 370*632d0f97SHong Zhang #endif /* OLD */ 371*632d0f97SHong Zhang ierr = PetscFree(s_waits1);CHKERRQ(ierr); 372*632d0f97SHong Zhang ierr = PetscFree(r_waits1);CHKERRQ(ierr); 373*632d0f97SHong Zhang ierr = PetscFree(s_waits2);CHKERRQ(ierr); 374*632d0f97SHong Zhang ierr = PetscFree(r_waits2);CHKERRQ(ierr); 375*632d0f97SHong Zhang ierr = PetscFree(s_status);CHKERRQ(ierr); 376*632d0f97SHong Zhang PetscFunctionReturn(0); 377*632d0f97SHong Zhang } 378*632d0f97SHong Zhang 379*632d0f97SHong Zhang #undef __FUNCT__ 380*632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Local" 381*632d0f97SHong Zhang /* 382*632d0f97SHong Zhang MatIncreaseOverlap_MPISBAIJ_Local - Called by MatincreaseOverlap, to do 383*632d0f97SHong Zhang the work on the local processor. 384*632d0f97SHong Zhang 385*632d0f97SHong Zhang Inputs: 386*632d0f97SHong Zhang C - MAT_MPISBAIJ; 387*632d0f97SHong Zhang imax - total no of index sets processed at a time; 388*632d0f97SHong Zhang table - an array of char - size = Mbs bits. 389*632d0f97SHong Zhang 390*632d0f97SHong Zhang Output: 391*632d0f97SHong Zhang isz - array containing the count of the solution elements correspondign 392*632d0f97SHong Zhang to each index set; 393*632d0f97SHong Zhang data - pointer to the solutions 394*632d0f97SHong Zhang */ 395*632d0f97SHong Zhang static int MatIncreaseOverlap_MPISBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data) 396*632d0f97SHong Zhang { 397*632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 398*632d0f97SHong Zhang Mat A = c->A,B = c->B; 399*632d0f97SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 400*632d0f97SHong Zhang Mat_SeqBAIJ *b = (Mat_SeqBAIJ*)B->data; 401*632d0f97SHong Zhang int start,end,val,max,rstart,cstart,*ai,*aj; 402*632d0f97SHong Zhang int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 403*632d0f97SHong Zhang PetscBT table_i; 404*632d0f97SHong Zhang 405*632d0f97SHong Zhang PetscFunctionBegin; 406*632d0f97SHong Zhang rstart = c->rstart; 407*632d0f97SHong Zhang cstart = c->cstart; 408*632d0f97SHong Zhang ai = a->i; 409*632d0f97SHong Zhang aj = a->j; 410*632d0f97SHong Zhang bi = b->i; 411*632d0f97SHong Zhang bj = b->j; 412*632d0f97SHong Zhang garray = c->garray; 413*632d0f97SHong Zhang 414*632d0f97SHong Zhang 415*632d0f97SHong Zhang for (i=0; i<imax; i++) { 416*632d0f97SHong Zhang data_i = data[i]; 417*632d0f97SHong Zhang table_i = table[i]; 418*632d0f97SHong Zhang isz_i = isz[i]; 419*632d0f97SHong Zhang for (j=0,max=isz[i]; j<max; j++) { 420*632d0f97SHong Zhang row = data_i[j] - rstart; 421*632d0f97SHong Zhang start = ai[row]; 422*632d0f97SHong Zhang end = ai[row+1]; 423*632d0f97SHong Zhang for (k=start; k<end; k++) { /* Amat */ 424*632d0f97SHong Zhang val = aj[k] + cstart; 425*632d0f97SHong Zhang if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 426*632d0f97SHong Zhang } 427*632d0f97SHong Zhang start = bi[row]; 428*632d0f97SHong Zhang end = bi[row+1]; 429*632d0f97SHong Zhang for (k=start; k<end; k++) { /* Bmat */ 430*632d0f97SHong Zhang val = garray[bj[k]]; 431*632d0f97SHong Zhang if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 432*632d0f97SHong Zhang } 433*632d0f97SHong Zhang } 434*632d0f97SHong Zhang isz[i] = isz_i; 435*632d0f97SHong Zhang } 436*632d0f97SHong Zhang PetscFunctionReturn(0); 437*632d0f97SHong Zhang } 438*632d0f97SHong Zhang #undef __FUNCT__ 439*632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Receive" 440*632d0f97SHong Zhang /* 441*632d0f97SHong Zhang MatIncreaseOverlap_MPISBAIJ_Receive - Process the recieved messages, 442*632d0f97SHong Zhang and return the output 443*632d0f97SHong Zhang 444*632d0f97SHong Zhang Input: 445*632d0f97SHong Zhang C - the matrix 446*632d0f97SHong Zhang nrqr - no of messages being processed. 447*632d0f97SHong Zhang rbuf - an array of pointers to the recieved requests 448*632d0f97SHong Zhang 449*632d0f97SHong Zhang Output: 450*632d0f97SHong Zhang xdata - array of messages to be sent back 451*632d0f97SHong Zhang isz1 - size of each message 452*632d0f97SHong Zhang 453*632d0f97SHong Zhang For better efficiency perhaps we should malloc seperately each xdata[i], 454*632d0f97SHong Zhang then if a remalloc is required we need only copy the data for that one row 455*632d0f97SHong Zhang rather then all previous rows as it is now where a single large chunck of 456*632d0f97SHong Zhang memory is used. 457*632d0f97SHong Zhang 458*632d0f97SHong Zhang */ 459*632d0f97SHong Zhang static int MatIncreaseOverlap_MPISBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1) 460*632d0f97SHong Zhang { 461*632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 462*632d0f97SHong Zhang Mat A = c->A,B = c->B; 463*632d0f97SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 464*632d0f97SHong Zhang Mat_SeqBAIJ *b = (Mat_SeqBAIJ*)B->data; 465*632d0f97SHong Zhang int rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 466*632d0f97SHong Zhang int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 467*632d0f97SHong Zhang int val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr; 468*632d0f97SHong Zhang int *rbuf_i,kmax,rbuf_0,ierr; 469*632d0f97SHong Zhang PetscBT xtable; 470*632d0f97SHong Zhang 471*632d0f97SHong Zhang PetscFunctionBegin; 472*632d0f97SHong Zhang rank = c->rank; 473*632d0f97SHong Zhang Mbs = c->Mbs; 474*632d0f97SHong Zhang rstart = c->rstart; 475*632d0f97SHong Zhang cstart = c->cstart; 476*632d0f97SHong Zhang ai = a->i; 477*632d0f97SHong Zhang aj = a->j; 478*632d0f97SHong Zhang bi = b->i; 479*632d0f97SHong Zhang bj = b->j; 480*632d0f97SHong Zhang garray = c->garray; 481*632d0f97SHong Zhang 482*632d0f97SHong Zhang 483*632d0f97SHong Zhang for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 484*632d0f97SHong Zhang rbuf_i = rbuf[i]; 485*632d0f97SHong Zhang rbuf_0 = rbuf_i[0]; 486*632d0f97SHong Zhang ct += rbuf_0; 487*632d0f97SHong Zhang for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 488*632d0f97SHong Zhang } 489*632d0f97SHong Zhang 490*632d0f97SHong Zhang if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs; 491*632d0f97SHong Zhang else max1 = 1; 492*632d0f97SHong Zhang mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 493*632d0f97SHong Zhang ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr); 494*632d0f97SHong Zhang ++no_malloc; 495*632d0f97SHong Zhang ierr = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr); 496*632d0f97SHong Zhang ierr = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr); 497*632d0f97SHong Zhang 498*632d0f97SHong Zhang ct3 = 0; 499*632d0f97SHong Zhang for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 500*632d0f97SHong Zhang rbuf_i = rbuf[i]; 501*632d0f97SHong Zhang rbuf_0 = rbuf_i[0]; 502*632d0f97SHong Zhang ct1 = 2*rbuf_0+1; 503*632d0f97SHong Zhang ct2 = ct1; 504*632d0f97SHong Zhang ct3 += ct1; 505*632d0f97SHong Zhang for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 506*632d0f97SHong Zhang ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr); 507*632d0f97SHong Zhang oct2 = ct2; 508*632d0f97SHong Zhang kmax = rbuf_i[2*j]; 509*632d0f97SHong Zhang for (k=0; k<kmax; k++,ct1++) { 510*632d0f97SHong Zhang row = rbuf_i[ct1]; 511*632d0f97SHong Zhang if (!PetscBTLookupSet(xtable,row)) { 512*632d0f97SHong Zhang if (!(ct3 < mem_estimate)) { 513*632d0f97SHong Zhang new_estimate = (int)(1.5*mem_estimate)+1; 514*632d0f97SHong Zhang ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr); 515*632d0f97SHong Zhang ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 516*632d0f97SHong Zhang ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 517*632d0f97SHong Zhang xdata[0] = tmp; 518*632d0f97SHong Zhang mem_estimate = new_estimate; ++no_malloc; 519*632d0f97SHong Zhang for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 520*632d0f97SHong Zhang } 521*632d0f97SHong Zhang xdata[i][ct2++] = row; 522*632d0f97SHong Zhang ct3++; 523*632d0f97SHong Zhang } 524*632d0f97SHong Zhang } 525*632d0f97SHong Zhang for (k=oct2,max2=ct2; k<max2; k++) { 526*632d0f97SHong Zhang row = xdata[i][k] - rstart; 527*632d0f97SHong Zhang start = ai[row]; 528*632d0f97SHong Zhang end = ai[row+1]; 529*632d0f97SHong Zhang for (l=start; l<end; l++) { 530*632d0f97SHong Zhang val = aj[l] + cstart; 531*632d0f97SHong Zhang if (!PetscBTLookupSet(xtable,val)) { 532*632d0f97SHong Zhang if (!(ct3 < mem_estimate)) { 533*632d0f97SHong Zhang new_estimate = (int)(1.5*mem_estimate)+1; 534*632d0f97SHong Zhang ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr); 535*632d0f97SHong Zhang ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 536*632d0f97SHong Zhang ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 537*632d0f97SHong Zhang xdata[0] = tmp; 538*632d0f97SHong Zhang mem_estimate = new_estimate; ++no_malloc; 539*632d0f97SHong Zhang for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 540*632d0f97SHong Zhang } 541*632d0f97SHong Zhang xdata[i][ct2++] = val; 542*632d0f97SHong Zhang ct3++; 543*632d0f97SHong Zhang } 544*632d0f97SHong Zhang } 545*632d0f97SHong Zhang start = bi[row]; 546*632d0f97SHong Zhang end = bi[row+1]; 547*632d0f97SHong Zhang for (l=start; l<end; l++) { 548*632d0f97SHong Zhang val = garray[bj[l]]; 549*632d0f97SHong Zhang if (!PetscBTLookupSet(xtable,val)) { 550*632d0f97SHong Zhang if (!(ct3 < mem_estimate)) { 551*632d0f97SHong Zhang new_estimate = (int)(1.5*mem_estimate)+1; 552*632d0f97SHong Zhang ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);CHKERRQ(ierr); 553*632d0f97SHong Zhang ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr); 554*632d0f97SHong Zhang ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 555*632d0f97SHong Zhang xdata[0] = tmp; 556*632d0f97SHong Zhang mem_estimate = new_estimate; ++no_malloc; 557*632d0f97SHong Zhang for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 558*632d0f97SHong Zhang } 559*632d0f97SHong Zhang xdata[i][ct2++] = val; 560*632d0f97SHong Zhang ct3++; 561*632d0f97SHong Zhang } 562*632d0f97SHong Zhang } 563*632d0f97SHong Zhang } 564*632d0f97SHong Zhang /* Update the header*/ 565*632d0f97SHong Zhang xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 566*632d0f97SHong Zhang xdata[i][2*j-1] = rbuf_i[2*j-1]; 567*632d0f97SHong Zhang } 568*632d0f97SHong Zhang xdata[i][0] = rbuf_0; 569*632d0f97SHong Zhang xdata[i+1] = xdata[i] + ct2; 570*632d0f97SHong Zhang isz1[i] = ct2; /* size of each message */ 571*632d0f97SHong Zhang } 572*632d0f97SHong Zhang ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 573*632d0f97SHong Zhang PetscLogInfo(0,"MatIncreaseOverlap_MPISBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc); 574*632d0f97SHong Zhang PetscFunctionReturn(0); 575*632d0f97SHong Zhang } 576*632d0f97SHong Zhang 577*632d0f97SHong Zhang 578