1*d5b485abSSatish Balay #ifndef lint 2*d5b485abSSatish Balay static char vcid[] = "$Id: mpiov.c,v 1.26.1.29 1996/04/26 00:04:17 balay Exp $"; 3*d5b485abSSatish Balay #endif 4*d5b485abSSatish Balay /* 5*d5b485abSSatish Balay Routines to compute overlapping regions of a parallel MPI matrix 6*d5b485abSSatish Balay and to find submatrices that were shared across processors. 7*d5b485abSSatish Balay */ 8*d5b485abSSatish Balay #include "mpiaij.h" 9*d5b485abSSatish Balay #include "src/inline/bitarray.h" 10*d5b485abSSatish Balay 11*d5b485abSSatish Balay static int MatIncreaseOverlap_MPIAIJ_Once(Mat, int, IS *); 12*d5b485abSSatish Balay static int MatIncreaseOverlap_MPIAIJ_Local(Mat , int , char **,int*, int**); 13*d5b485abSSatish Balay static int MatIncreaseOverlap_MPIAIJ_Receive(Mat , int, int **, int**, int* ); 14*d5b485abSSatish Balay extern int MatGetRow_MPIAIJ(Mat,int,int*,int**,Scalar**); 15*d5b485abSSatish Balay extern int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,Scalar**); 16*d5b485abSSatish Balay 17*d5b485abSSatish Balay int MatIncreaseOverlap_MPIAIJ(Mat C, int imax, IS *is, int ov) 18*d5b485abSSatish Balay { 19*d5b485abSSatish Balay int i, ierr; 20*d5b485abSSatish Balay if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIAIJ:Negative overlap specified\n");} 21*d5b485abSSatish Balay for ( i=0; i<ov; ++i ) { 22*d5b485abSSatish Balay ierr = MatIncreaseOverlap_MPIAIJ_Once(C, imax, is); CHKERRQ(ierr); 23*d5b485abSSatish Balay } 24*d5b485abSSatish Balay return 0; 25*d5b485abSSatish Balay } 26*d5b485abSSatish Balay 27*d5b485abSSatish Balay /* 28*d5b485abSSatish Balay Sample message format: 29*d5b485abSSatish Balay If a processor A wants processor B to process some elements corresponding 30*d5b485abSSatish Balay to index sets 1s[1], is[5] 31*d5b485abSSatish Balay mesg [0] = 2 ( no of index sets in the mesg) 32*d5b485abSSatish Balay ----------- 33*d5b485abSSatish Balay mesg [1] = 1 => is[1] 34*d5b485abSSatish Balay mesg [2] = sizeof(is[1]); 35*d5b485abSSatish Balay ----------- 36*d5b485abSSatish Balay mesg [5] = 5 => is[5] 37*d5b485abSSatish Balay mesg [6] = sizeof(is[5]); 38*d5b485abSSatish Balay ----------- 39*d5b485abSSatish Balay mesg [7] 40*d5b485abSSatish Balay mesg [n] datas[1] 41*d5b485abSSatish Balay ----------- 42*d5b485abSSatish Balay mesg[n+1] 43*d5b485abSSatish Balay mesg[m] data(is[5]) 44*d5b485abSSatish Balay ----------- 45*d5b485abSSatish Balay 46*d5b485abSSatish Balay Notes: 47*d5b485abSSatish Balay nrqs - no of requests sent (or to be sent out) 48*d5b485abSSatish Balay nrqr - no of requests recieved (which have to be or which have been processed 49*d5b485abSSatish Balay */ 50*d5b485abSSatish Balay static int MatIncreaseOverlap_MPIAIJ_Once(Mat C, int imax, IS *is) 51*d5b485abSSatish Balay { 52*d5b485abSSatish Balay Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 53*d5b485abSSatish Balay int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data,len,*idx_i; 54*d5b485abSSatish Balay int size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr; 55*d5b485abSSatish Balay int *ctr,*pa,tag,*tmp,bsz,nrqr,*isz,*isz1,**xdata,bsz1,**rbuf2; 56*d5b485abSSatish Balay char **table; 57*d5b485abSSatish Balay MPI_Comm comm; 58*d5b485abSSatish Balay MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 59*d5b485abSSatish Balay MPI_Status *s_status,*recv_status; 60*d5b485abSSatish Balay 61*d5b485abSSatish Balay comm = C->comm; 62*d5b485abSSatish Balay tag = C->tag; 63*d5b485abSSatish Balay size = c->size; 64*d5b485abSSatish Balay rank = c->rank; 65*d5b485abSSatish Balay m = c->M; 66*d5b485abSSatish Balay 67*d5b485abSSatish Balay len = (imax+1)*sizeof(int *) + (imax + m)*sizeof(int); 68*d5b485abSSatish Balay idx = (int **) PetscMalloc(len); CHKPTRQ(idx); 69*d5b485abSSatish Balay n = (int *) (idx + imax); 70*d5b485abSSatish Balay rtable = n + imax; 71*d5b485abSSatish Balay 72*d5b485abSSatish Balay for ( i=0; i<imax; i++ ) { 73*d5b485abSSatish Balay ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr); 74*d5b485abSSatish Balay ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr); 75*d5b485abSSatish Balay } 76*d5b485abSSatish Balay 77*d5b485abSSatish Balay /* Create hash table for the mapping :row -> proc*/ 78*d5b485abSSatish Balay for ( i=0,j=0; i<size; i++ ) { 79*d5b485abSSatish Balay len = c->rowners[i+1]; 80*d5b485abSSatish Balay for ( ; j<len; j++ ) { 81*d5b485abSSatish Balay rtable[j] = i; 82*d5b485abSSatish Balay } 83*d5b485abSSatish Balay } 84*d5b485abSSatish Balay 85*d5b485abSSatish Balay /* evaluate communication - mesg to who, length of mesg, and buffer space 86*d5b485abSSatish Balay required. Based on this, buffers are allocated, and data copied into them*/ 87*d5b485abSSatish Balay w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1);/* mesg size */ 88*d5b485abSSatish Balay w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 89*d5b485abSSatish Balay w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 90*d5b485abSSatish Balay w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 91*d5b485abSSatish Balay PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 92*d5b485abSSatish Balay for ( i=0; i<imax; i++ ) { 93*d5b485abSSatish Balay PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 94*d5b485abSSatish Balay idx_i = idx[i]; 95*d5b485abSSatish Balay len = n[i]; 96*d5b485abSSatish Balay for ( j=0; j<len; j++ ) { 97*d5b485abSSatish Balay row = idx_i[j]; 98*d5b485abSSatish Balay proc = rtable[row]; 99*d5b485abSSatish Balay w4[proc]++; 100*d5b485abSSatish Balay } 101*d5b485abSSatish Balay for ( j=0; j<size; j++ ){ 102*d5b485abSSatish Balay if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 103*d5b485abSSatish Balay } 104*d5b485abSSatish Balay } 105*d5b485abSSatish Balay 106*d5b485abSSatish Balay nrqs = 0; /* no of outgoing messages */ 107*d5b485abSSatish Balay msz = 0; /* total mesg length (for all proc */ 108*d5b485abSSatish Balay w1[rank] = 0; /* no mesg sent to intself */ 109*d5b485abSSatish Balay w3[rank] = 0; 110*d5b485abSSatish Balay for ( i=0; i<size; i++ ) { 111*d5b485abSSatish Balay if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 112*d5b485abSSatish Balay } 113*d5b485abSSatish Balay /* pa - is list of processors to communicate with */ 114*d5b485abSSatish Balay pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); 115*d5b485abSSatish Balay for ( i=0,j=0; i<size; i++ ) { 116*d5b485abSSatish Balay if (w1[i]) {pa[j] = i; j++;} 117*d5b485abSSatish Balay } 118*d5b485abSSatish Balay 119*d5b485abSSatish Balay /* Each message would have a header = 1 + 2*(no of IS) + data */ 120*d5b485abSSatish Balay for ( i=0; i<nrqs; i++ ) { 121*d5b485abSSatish Balay j = pa[i]; 122*d5b485abSSatish Balay w1[j] += w2[j] + 2*w3[j]; 123*d5b485abSSatish Balay msz += w1[j]; 124*d5b485abSSatish Balay } 125*d5b485abSSatish Balay 126*d5b485abSSatish Balay 127*d5b485abSSatish Balay /* Do a global reduction to determine how many messages to expect*/ 128*d5b485abSSatish Balay { 129*d5b485abSSatish Balay int *rw1, *rw2; 130*d5b485abSSatish Balay rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 131*d5b485abSSatish Balay rw2 = rw1+size; 132*d5b485abSSatish Balay MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 133*d5b485abSSatish Balay bsz = rw1[rank]; 134*d5b485abSSatish Balay MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 135*d5b485abSSatish Balay nrqr = rw2[rank]; 136*d5b485abSSatish Balay PetscFree(rw1); 137*d5b485abSSatish Balay } 138*d5b485abSSatish Balay 139*d5b485abSSatish Balay /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 140*d5b485abSSatish Balay len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 141*d5b485abSSatish Balay rbuf = (int**) PetscMalloc(len); CHKPTRQ(rbuf); 142*d5b485abSSatish Balay rbuf[0] = (int *) (rbuf + nrqr); 143*d5b485abSSatish Balay for ( i=1; i<nrqr; ++i ) rbuf[i] = rbuf[i-1] + bsz; 144*d5b485abSSatish Balay 145*d5b485abSSatish Balay /* Post the receives */ 146*d5b485abSSatish Balay r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 147*d5b485abSSatish Balay CHKPTRQ(r_waits1); 148*d5b485abSSatish Balay for ( i=0; i<nrqr; ++i ){ 149*d5b485abSSatish Balay MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 150*d5b485abSSatish Balay } 151*d5b485abSSatish Balay 152*d5b485abSSatish Balay /* Allocate Memory for outgoing messages */ 153*d5b485abSSatish Balay len = 2*size*sizeof(int*) + (size+msz)*sizeof(int); 154*d5b485abSSatish Balay outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat); 155*d5b485abSSatish Balay ptr = outdat + size; /* Pointers to the data in outgoing buffers */ 156*d5b485abSSatish Balay PetscMemzero(outdat,2*size*sizeof(int*)); 157*d5b485abSSatish Balay tmp = (int *) (outdat + 2*size); 158*d5b485abSSatish Balay ctr = tmp + msz; 159*d5b485abSSatish Balay 160*d5b485abSSatish Balay { 161*d5b485abSSatish Balay int *iptr = tmp,ict = 0; 162*d5b485abSSatish Balay for ( i=0; i<nrqs; i++ ) { 163*d5b485abSSatish Balay j = pa[i]; 164*d5b485abSSatish Balay iptr += ict; 165*d5b485abSSatish Balay outdat[j] = iptr; 166*d5b485abSSatish Balay ict = w1[j]; 167*d5b485abSSatish Balay } 168*d5b485abSSatish Balay } 169*d5b485abSSatish Balay 170*d5b485abSSatish Balay /* Form the outgoing messages */ 171*d5b485abSSatish Balay /*plug in the headers*/ 172*d5b485abSSatish Balay for ( i=0; i<nrqs; i++ ) { 173*d5b485abSSatish Balay j = pa[i]; 174*d5b485abSSatish Balay outdat[j][0] = 0; 175*d5b485abSSatish Balay PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int)); 176*d5b485abSSatish Balay ptr[j] = outdat[j] + 2*w3[j] + 1; 177*d5b485abSSatish Balay } 178*d5b485abSSatish Balay 179*d5b485abSSatish Balay /* Memory for doing local proc's work*/ 180*d5b485abSSatish Balay { 181*d5b485abSSatish Balay int *d_p; 182*d5b485abSSatish Balay char *t_p; 183*d5b485abSSatish Balay 184*d5b485abSSatish Balay len = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) + 185*d5b485abSSatish Balay (m+1)*imax*sizeof(int) + (m/BITSPERBYTE+1)*imax*sizeof(char); 186*d5b485abSSatish Balay table = (char **)PetscMalloc(len); CHKPTRQ(table); 187*d5b485abSSatish Balay data = (int **)(table + imax); 188*d5b485abSSatish Balay data[0] = (int *)(data + imax); 189*d5b485abSSatish Balay isz = data[0] + (m+1)*imax; 190*d5b485abSSatish Balay table[0] = (char *)(isz + imax); 191*d5b485abSSatish Balay d_p = data[0]; t_p = table[0]; 192*d5b485abSSatish Balay for ( i=1; i<imax; i++ ) { 193*d5b485abSSatish Balay table[i] = t_p + (m/BITSPERBYTE+1)*i; 194*d5b485abSSatish Balay data[i] = d_p + (m+1)*i; 195*d5b485abSSatish Balay } 196*d5b485abSSatish Balay } 197*d5b485abSSatish Balay PetscMemzero(*table,(m/BITSPERBYTE+1)*imax); 198*d5b485abSSatish Balay PetscMemzero(isz,imax*sizeof(int)); 199*d5b485abSSatish Balay 200*d5b485abSSatish Balay /* Parse the IS and update local tables and the outgoing buf with the data*/ 201*d5b485abSSatish Balay { 202*d5b485abSSatish Balay int n_i,*data_i,isz_i,*outdat_j,ctr_j; 203*d5b485abSSatish Balay char *table_i; 204*d5b485abSSatish Balay 205*d5b485abSSatish Balay for ( i=0; i<imax; i++ ) { 206*d5b485abSSatish Balay PetscMemzero(ctr,size*sizeof(int)); 207*d5b485abSSatish Balay n_i = n[i]; 208*d5b485abSSatish Balay table_i = table[i]; 209*d5b485abSSatish Balay idx_i = idx[i]; 210*d5b485abSSatish Balay data_i = data[i]; 211*d5b485abSSatish Balay isz_i = isz[i]; 212*d5b485abSSatish Balay for ( j=0; j<n_i; j++ ) { /* parse the indices of each IS */ 213*d5b485abSSatish Balay row = idx_i[j]; 214*d5b485abSSatish Balay proc = rtable[row]; 215*d5b485abSSatish Balay if (proc != rank) { /* copy to the outgoing buffer */ 216*d5b485abSSatish Balay ctr[proc]++; 217*d5b485abSSatish Balay *ptr[proc] = row; 218*d5b485abSSatish Balay ptr[proc]++; 219*d5b485abSSatish Balay } 220*d5b485abSSatish Balay else { /* Update the local table */ 221*d5b485abSSatish Balay if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 222*d5b485abSSatish Balay } 223*d5b485abSSatish Balay } 224*d5b485abSSatish Balay /* Update the headers for the current IS */ 225*d5b485abSSatish Balay for ( j=0; j<size; j++ ) { /* Can Optimise this loop by using pa[] */ 226*d5b485abSSatish Balay if ((ctr_j = ctr[j])) { 227*d5b485abSSatish Balay outdat_j = outdat[j]; 228*d5b485abSSatish Balay k = ++outdat_j[0]; 229*d5b485abSSatish Balay outdat_j[2*k] = ctr_j; 230*d5b485abSSatish Balay outdat_j[2*k-1] = i; 231*d5b485abSSatish Balay } 232*d5b485abSSatish Balay } 233*d5b485abSSatish Balay isz[i] = isz_i; 234*d5b485abSSatish Balay } 235*d5b485abSSatish Balay } 236*d5b485abSSatish Balay 237*d5b485abSSatish Balay 238*d5b485abSSatish Balay 239*d5b485abSSatish Balay /* Now post the sends */ 240*d5b485abSSatish Balay s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 241*d5b485abSSatish Balay CHKPTRQ(s_waits1); 242*d5b485abSSatish Balay for ( i=0; i<nrqs; ++i ) { 243*d5b485abSSatish Balay j = pa[i]; 244*d5b485abSSatish Balay MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 245*d5b485abSSatish Balay } 246*d5b485abSSatish Balay 247*d5b485abSSatish Balay /* No longer need the original indices*/ 248*d5b485abSSatish Balay for ( i=0; i<imax; ++i ) { 249*d5b485abSSatish Balay ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr); 250*d5b485abSSatish Balay } 251*d5b485abSSatish Balay PetscFree(idx); 252*d5b485abSSatish Balay 253*d5b485abSSatish Balay for ( i=0; i<imax; ++i ) { 254*d5b485abSSatish Balay ierr = ISDestroy(is[i]); CHKERRQ(ierr); 255*d5b485abSSatish Balay } 256*d5b485abSSatish Balay 257*d5b485abSSatish Balay /* Do Local work*/ 258*d5b485abSSatish Balay ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 259*d5b485abSSatish Balay 260*d5b485abSSatish Balay /* Receive messages*/ 261*d5b485abSSatish Balay { 262*d5b485abSSatish Balay int index; 263*d5b485abSSatish Balay 264*d5b485abSSatish Balay recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 265*d5b485abSSatish Balay CHKPTRQ(recv_status); 266*d5b485abSSatish Balay for ( i=0; i<nrqr; ++i ) { 267*d5b485abSSatish Balay MPI_Waitany(nrqr, r_waits1, &index, recv_status+i); 268*d5b485abSSatish Balay } 269*d5b485abSSatish Balay 270*d5b485abSSatish Balay s_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 271*d5b485abSSatish Balay CHKPTRQ(s_status); 272*d5b485abSSatish Balay MPI_Waitall(nrqs,s_waits1,s_status); 273*d5b485abSSatish Balay } 274*d5b485abSSatish Balay 275*d5b485abSSatish Balay /* Phase 1 sends are complete - deallocate buffers */ 276*d5b485abSSatish Balay PetscFree(outdat); 277*d5b485abSSatish Balay PetscFree(w1); 278*d5b485abSSatish Balay 279*d5b485abSSatish Balay xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata); 280*d5b485abSSatish Balay isz1 = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1); 281*d5b485abSSatish Balay ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 282*d5b485abSSatish Balay PetscFree(rbuf); 283*d5b485abSSatish Balay 284*d5b485abSSatish Balay /* Send the data back*/ 285*d5b485abSSatish Balay /* Do a global reduction to know the buffer space req for incoming messages*/ 286*d5b485abSSatish Balay { 287*d5b485abSSatish Balay int *rw1, *rw2; 288*d5b485abSSatish Balay 289*d5b485abSSatish Balay rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 290*d5b485abSSatish Balay PetscMemzero(rw1,2*size*sizeof(int)); 291*d5b485abSSatish Balay rw2 = rw1+size; 292*d5b485abSSatish Balay for ( i=0; i<nrqr; ++i ) { 293*d5b485abSSatish Balay proc = recv_status[i].MPI_SOURCE; 294*d5b485abSSatish Balay rw1[proc] = isz1[i]; 295*d5b485abSSatish Balay } 296*d5b485abSSatish Balay 297*d5b485abSSatish Balay MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm); 298*d5b485abSSatish Balay bsz1 = rw2[rank]; 299*d5b485abSSatish Balay PetscFree(rw1); 300*d5b485abSSatish Balay } 301*d5b485abSSatish Balay 302*d5b485abSSatish Balay /* Allocate buffers*/ 303*d5b485abSSatish Balay 304*d5b485abSSatish Balay /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */ 305*d5b485abSSatish Balay len = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int); 306*d5b485abSSatish Balay rbuf2 = (int**) PetscMalloc(len); CHKPTRQ(rbuf2); 307*d5b485abSSatish Balay rbuf2[0] = (int *) (rbuf2 + nrqs); 308*d5b485abSSatish Balay for ( i=1; i<nrqs; ++i ) rbuf2[i] = rbuf2[i-1] + bsz1; 309*d5b485abSSatish Balay 310*d5b485abSSatish Balay /* Post the receives */ 311*d5b485abSSatish Balay r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 312*d5b485abSSatish Balay CHKPTRQ(r_waits2); 313*d5b485abSSatish Balay for ( i=0; i<nrqs; ++i ) { 314*d5b485abSSatish Balay MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i); 315*d5b485abSSatish Balay } 316*d5b485abSSatish Balay 317*d5b485abSSatish Balay /* Now post the sends */ 318*d5b485abSSatish Balay s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 319*d5b485abSSatish Balay CHKPTRQ(s_waits2); 320*d5b485abSSatish Balay for ( i=0; i<nrqr; ++i ) { 321*d5b485abSSatish Balay j = recv_status[i].MPI_SOURCE; 322*d5b485abSSatish Balay MPI_Isend( xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i); 323*d5b485abSSatish Balay } 324*d5b485abSSatish Balay 325*d5b485abSSatish Balay /* receive work done on other processors*/ 326*d5b485abSSatish Balay { 327*d5b485abSSatish Balay int index, is_no, ct1, max,*rbuf2_i,isz_i,*data_i,jmax; 328*d5b485abSSatish Balay char *table_i; 329*d5b485abSSatish Balay MPI_Status *status2; 330*d5b485abSSatish Balay 331*d5b485abSSatish Balay status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(status2); 332*d5b485abSSatish Balay 333*d5b485abSSatish Balay for ( i=0; i<nrqs; ++i ) { 334*d5b485abSSatish Balay MPI_Waitany(nrqs, r_waits2, &index, status2+i); 335*d5b485abSSatish Balay /* Process the message*/ 336*d5b485abSSatish Balay rbuf2_i = rbuf2[index]; 337*d5b485abSSatish Balay ct1 = 2*rbuf2_i[0]+1; 338*d5b485abSSatish Balay jmax = rbuf2[index][0]; 339*d5b485abSSatish Balay for ( j=1; j<=jmax; j++ ) { 340*d5b485abSSatish Balay max = rbuf2_i[2*j]; 341*d5b485abSSatish Balay is_no = rbuf2_i[2*j-1]; 342*d5b485abSSatish Balay isz_i = isz[is_no]; 343*d5b485abSSatish Balay data_i = data[is_no]; 344*d5b485abSSatish Balay table_i = table[is_no]; 345*d5b485abSSatish Balay for ( k=0; k<max; k++,ct1++ ) { 346*d5b485abSSatish Balay row = rbuf2_i[ct1]; 347*d5b485abSSatish Balay if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row;} 348*d5b485abSSatish Balay } 349*d5b485abSSatish Balay isz[is_no] = isz_i; 350*d5b485abSSatish Balay } 351*d5b485abSSatish Balay } 352*d5b485abSSatish Balay MPI_Waitall(nrqr,s_waits2,status2); 353*d5b485abSSatish Balay PetscFree(status2); 354*d5b485abSSatish Balay } 355*d5b485abSSatish Balay 356*d5b485abSSatish Balay for ( i=0; i<imax; ++i ) { 357*d5b485abSSatish Balay ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr); 358*d5b485abSSatish Balay } 359*d5b485abSSatish Balay 360*d5b485abSSatish Balay PetscFree(pa); 361*d5b485abSSatish Balay PetscFree(rbuf2); 362*d5b485abSSatish Balay PetscFree(s_waits1); 363*d5b485abSSatish Balay PetscFree(r_waits1); 364*d5b485abSSatish Balay PetscFree(s_waits2); 365*d5b485abSSatish Balay PetscFree(r_waits2); 366*d5b485abSSatish Balay PetscFree(table); 367*d5b485abSSatish Balay PetscFree(s_status); 368*d5b485abSSatish Balay PetscFree(recv_status); 369*d5b485abSSatish Balay PetscFree(xdata[0]); 370*d5b485abSSatish Balay PetscFree(xdata); 371*d5b485abSSatish Balay PetscFree(isz1); 372*d5b485abSSatish Balay return 0; 373*d5b485abSSatish Balay } 374*d5b485abSSatish Balay 375*d5b485abSSatish Balay /* 376*d5b485abSSatish Balay MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 377*d5b485abSSatish Balay the work on the local processor. 378*d5b485abSSatish Balay 379*d5b485abSSatish Balay Inputs: 380*d5b485abSSatish Balay C - MAT_MPIAIJ; 381*d5b485abSSatish Balay imax - total no of index sets processed at a time; 382*d5b485abSSatish Balay table - an array of char - size = m bits. 383*d5b485abSSatish Balay 384*d5b485abSSatish Balay Output: 385*d5b485abSSatish Balay isz - array containing the count of the solution elements correspondign 386*d5b485abSSatish Balay to each index set; 387*d5b485abSSatish Balay data - pointer to the solutions 388*d5b485abSSatish Balay */ 389*d5b485abSSatish Balay static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,char **table,int *isz, 390*d5b485abSSatish Balay int **data) 391*d5b485abSSatish Balay { 392*d5b485abSSatish Balay Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 393*d5b485abSSatish Balay Mat A = c->A, B = c->B; 394*d5b485abSSatish Balay Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 395*d5b485abSSatish Balay int start, end, val, max, rstart,cstart, ashift, bshift,*ai, *aj; 396*d5b485abSSatish Balay int *bi, *bj, *garray, i, j, k, row,*data_i,isz_i; 397*d5b485abSSatish Balay char *table_i; 398*d5b485abSSatish Balay 399*d5b485abSSatish Balay rstart = c->rstart; 400*d5b485abSSatish Balay cstart = c->cstart; 401*d5b485abSSatish Balay ashift = a->indexshift; 402*d5b485abSSatish Balay ai = a->i; 403*d5b485abSSatish Balay aj = a->j +ashift; 404*d5b485abSSatish Balay bshift = b->indexshift; 405*d5b485abSSatish Balay bi = b->i; 406*d5b485abSSatish Balay bj = b->j +bshift; 407*d5b485abSSatish Balay garray = c->garray; 408*d5b485abSSatish Balay 409*d5b485abSSatish Balay 410*d5b485abSSatish Balay for ( i=0; i<imax; i++ ) { 411*d5b485abSSatish Balay data_i = data[i]; 412*d5b485abSSatish Balay table_i = table[i]; 413*d5b485abSSatish Balay isz_i = isz[i]; 414*d5b485abSSatish Balay for ( j=0, max=isz[i]; j<max; j++ ) { 415*d5b485abSSatish Balay row = data_i[j] - rstart; 416*d5b485abSSatish Balay start = ai[row]; 417*d5b485abSSatish Balay end = ai[row+1]; 418*d5b485abSSatish Balay for ( k=start; k<end; k++ ) { /* Amat */ 419*d5b485abSSatish Balay val = aj[k] + ashift + cstart; 420*d5b485abSSatish Balay if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 421*d5b485abSSatish Balay } 422*d5b485abSSatish Balay start = bi[row]; 423*d5b485abSSatish Balay end = bi[row+1]; 424*d5b485abSSatish Balay for ( k=start; k<end; k++ ) { /* Bmat */ 425*d5b485abSSatish Balay val = garray[bj[k]+bshift]; 426*d5b485abSSatish Balay if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val;} 427*d5b485abSSatish Balay } 428*d5b485abSSatish Balay } 429*d5b485abSSatish Balay isz[i] = isz_i; 430*d5b485abSSatish Balay } 431*d5b485abSSatish Balay return 0; 432*d5b485abSSatish Balay } 433*d5b485abSSatish Balay /* 434*d5b485abSSatish Balay MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 435*d5b485abSSatish Balay and return the output 436*d5b485abSSatish Balay 437*d5b485abSSatish Balay Input: 438*d5b485abSSatish Balay C - the matrix 439*d5b485abSSatish Balay nrqr - no of messages being processed. 440*d5b485abSSatish Balay rbuf - an array of pointers to the recieved requests 441*d5b485abSSatish Balay 442*d5b485abSSatish Balay Output: 443*d5b485abSSatish Balay xdata - array of messages to be sent back 444*d5b485abSSatish Balay isz1 - size of each message 445*d5b485abSSatish Balay 446*d5b485abSSatish Balay For better efficiency perhaps we should malloc seperately each xdata[i], 447*d5b485abSSatish Balay then if a remalloc is required we need only copy the data for that one row 448*d5b485abSSatish Balay rather then all previous rows as it is now where a single large chunck of 449*d5b485abSSatish Balay memory is used. 450*d5b485abSSatish Balay 451*d5b485abSSatish Balay */ 452*d5b485abSSatish Balay static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf, 453*d5b485abSSatish Balay int **xdata, int * isz1) 454*d5b485abSSatish Balay { 455*d5b485abSSatish Balay Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 456*d5b485abSSatish Balay Mat A = c->A, B = c->B; 457*d5b485abSSatish Balay Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 458*d5b485abSSatish Balay int rstart,cstart, ashift, bshift,*ai, *aj, *bi, *bj, *garray, i, j, k; 459*d5b485abSSatish Balay int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 460*d5b485abSSatish Balay int val, max1, max2, rank, m, no_malloc =0, *tmp, new_estimate, ctr; 461*d5b485abSSatish Balay int *rbuf_i,kmax,rbuf_0; 462*d5b485abSSatish Balay char *xtable; 463*d5b485abSSatish Balay 464*d5b485abSSatish Balay rank = c->rank; 465*d5b485abSSatish Balay m = c->M; 466*d5b485abSSatish Balay rstart = c->rstart; 467*d5b485abSSatish Balay cstart = c->cstart; 468*d5b485abSSatish Balay ashift = a->indexshift; 469*d5b485abSSatish Balay ai = a->i; 470*d5b485abSSatish Balay aj = a->j +ashift; 471*d5b485abSSatish Balay bshift = b->indexshift; 472*d5b485abSSatish Balay bi = b->i; 473*d5b485abSSatish Balay bj = b->j +bshift; 474*d5b485abSSatish Balay garray = c->garray; 475*d5b485abSSatish Balay 476*d5b485abSSatish Balay 477*d5b485abSSatish Balay for ( i=0,ct=0,total_sz=0; i<nrqr; ++i ) { 478*d5b485abSSatish Balay rbuf_i = rbuf[i]; 479*d5b485abSSatish Balay rbuf_0 = rbuf_i[0]; 480*d5b485abSSatish Balay ct += rbuf_0; 481*d5b485abSSatish Balay for ( j=1; j<=rbuf_0; j++ ) { total_sz += rbuf_i[2*j]; } 482*d5b485abSSatish Balay } 483*d5b485abSSatish Balay 484*d5b485abSSatish Balay max1 = ct*(a->nz +b->nz)/c->m; 485*d5b485abSSatish Balay mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 486*d5b485abSSatish Balay xdata[0] = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]); 487*d5b485abSSatish Balay ++no_malloc; 488*d5b485abSSatish Balay xtable = (char *)PetscMalloc((m/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable); 489*d5b485abSSatish Balay PetscMemzero(isz1,nrqr*sizeof(int)); 490*d5b485abSSatish Balay 491*d5b485abSSatish Balay ct3 = 0; 492*d5b485abSSatish Balay for ( i=0; i<nrqr; i++ ) { /* for easch mesg from proc i */ 493*d5b485abSSatish Balay rbuf_i = rbuf[i]; 494*d5b485abSSatish Balay rbuf_0 = rbuf_i[0]; 495*d5b485abSSatish Balay ct1 = 2*rbuf_0+1; 496*d5b485abSSatish Balay ct2 = ct1; 497*d5b485abSSatish Balay ct3 += ct1; 498*d5b485abSSatish Balay for ( j=1; j<=rbuf_0; j++ ) { /* for each IS from proc i*/ 499*d5b485abSSatish Balay PetscMemzero(xtable,(m/BITSPERBYTE+1)*sizeof(char)); 500*d5b485abSSatish Balay oct2 = ct2; 501*d5b485abSSatish Balay kmax = rbuf_i[2*j]; 502*d5b485abSSatish Balay for ( k=0; k<kmax; k++, ct1++ ) { 503*d5b485abSSatish Balay row = rbuf_i[ct1]; 504*d5b485abSSatish Balay if (!BT_LOOKUP(xtable,row)) { 505*d5b485abSSatish Balay if (!(ct3 < mem_estimate)) { 506*d5b485abSSatish Balay new_estimate = (int)(1.5*mem_estimate)+1; 507*d5b485abSSatish Balay tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 508*d5b485abSSatish Balay PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 509*d5b485abSSatish Balay PetscFree(xdata[0]); 510*d5b485abSSatish Balay xdata[0] = tmp; 511*d5b485abSSatish Balay mem_estimate = new_estimate; ++no_malloc; 512*d5b485abSSatish Balay for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 513*d5b485abSSatish Balay } 514*d5b485abSSatish Balay xdata[i][ct2++] = row; 515*d5b485abSSatish Balay ct3++; 516*d5b485abSSatish Balay } 517*d5b485abSSatish Balay } 518*d5b485abSSatish Balay for ( k=oct2,max2=ct2; k<max2; k++ ) { 519*d5b485abSSatish Balay row = xdata[i][k] - rstart; 520*d5b485abSSatish Balay start = ai[row]; 521*d5b485abSSatish Balay end = ai[row+1]; 522*d5b485abSSatish Balay for ( l=start; l<end; l++ ) { 523*d5b485abSSatish Balay val = aj[l] + ashift + cstart; 524*d5b485abSSatish Balay if (!BT_LOOKUP(xtable,val)) { 525*d5b485abSSatish Balay if (!(ct3 < mem_estimate)) { 526*d5b485abSSatish Balay new_estimate = (int)(1.5*mem_estimate)+1; 527*d5b485abSSatish Balay tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 528*d5b485abSSatish Balay PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 529*d5b485abSSatish Balay PetscFree(xdata[0]); 530*d5b485abSSatish Balay xdata[0] = tmp; 531*d5b485abSSatish Balay mem_estimate = new_estimate; ++no_malloc; 532*d5b485abSSatish Balay for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 533*d5b485abSSatish Balay } 534*d5b485abSSatish Balay xdata[i][ct2++] = val; 535*d5b485abSSatish Balay ct3++; 536*d5b485abSSatish Balay } 537*d5b485abSSatish Balay } 538*d5b485abSSatish Balay start = bi[row]; 539*d5b485abSSatish Balay end = bi[row+1]; 540*d5b485abSSatish Balay for ( l=start; l<end; l++ ) { 541*d5b485abSSatish Balay val = garray[bj[l]+bshift]; 542*d5b485abSSatish Balay if (!BT_LOOKUP(xtable,val)) { 543*d5b485abSSatish Balay if (!(ct3 < mem_estimate)) { 544*d5b485abSSatish Balay new_estimate = (int)(1.5*mem_estimate)+1; 545*d5b485abSSatish Balay tmp = (int*) PetscMalloc(new_estimate * sizeof(int));CHKPTRQ(tmp); 546*d5b485abSSatish Balay PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int)); 547*d5b485abSSatish Balay PetscFree(xdata[0]); 548*d5b485abSSatish Balay xdata[0] = tmp; 549*d5b485abSSatish Balay mem_estimate = new_estimate; ++no_malloc; 550*d5b485abSSatish Balay for ( ctr =1; ctr <=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 551*d5b485abSSatish Balay } 552*d5b485abSSatish Balay xdata[i][ct2++] = val; 553*d5b485abSSatish Balay ct3++; 554*d5b485abSSatish Balay } 555*d5b485abSSatish Balay } 556*d5b485abSSatish Balay } 557*d5b485abSSatish Balay /* Update the header*/ 558*d5b485abSSatish Balay xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 559*d5b485abSSatish Balay xdata[i][2*j-1] = rbuf_i[2*j-1]; 560*d5b485abSSatish Balay } 561*d5b485abSSatish Balay xdata[i][0] = rbuf_0; 562*d5b485abSSatish Balay xdata[i+1] = xdata[i] + ct2; 563*d5b485abSSatish Balay isz1[i] = ct2; /* size of each message */ 564*d5b485abSSatish Balay } 565*d5b485abSSatish Balay PetscFree(xtable); 566*d5b485abSSatish Balay PLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc); 567*d5b485abSSatish Balay return 0; 568*d5b485abSSatish Balay } 569*d5b485abSSatish Balay 570*d5b485abSSatish Balay /* -------------------------------------------------------------------------*/ 571*d5b485abSSatish Balay int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,IS *isrow,IS *iscol, 572*d5b485abSSatish Balay MatGetSubMatrixCall scall,Mat **submat) 573*d5b485abSSatish Balay { 574*d5b485abSSatish Balay Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 575*d5b485abSSatish Balay Mat A = c->A,*submats = *submat; 576*d5b485abSSatish Balay Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)c->B->data, *mat; 577*d5b485abSSatish Balay int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 578*d5b485abSSatish Balay int **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc; 579*d5b485abSSatish Balay int nrqs, msz, **ptr,index,*req_size,*ctr,*pa,tag,*tmp,tcol,bsz,nrqr; 580*d5b485abSSatish Balay int **rbuf3,*req_source,**sbuf_aj, ashift, **rbuf2, max1,max2,**rmap; 581*d5b485abSSatish Balay int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 582*d5b485abSSatish Balay int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 583*d5b485abSSatish Balay int *rmap_i; 584*d5b485abSSatish Balay MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 585*d5b485abSSatish Balay MPI_Request *r_waits4,*s_waits3,*s_waits4; 586*d5b485abSSatish Balay MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 587*d5b485abSSatish Balay MPI_Status *r_status3,*r_status4,*s_status4; 588*d5b485abSSatish Balay MPI_Comm comm; 589*d5b485abSSatish Balay Scalar **rbuf4, **sbuf_aa, *vals, *mat_a, *sbuf_aa_i; 590*d5b485abSSatish Balay 591*d5b485abSSatish Balay comm = C->comm; 592*d5b485abSSatish Balay tag = C->tag; 593*d5b485abSSatish Balay size = c->size; 594*d5b485abSSatish Balay rank = c->rank; 595*d5b485abSSatish Balay m = c->M; 596*d5b485abSSatish Balay ashift = a->indexshift; 597*d5b485abSSatish Balay 598*d5b485abSSatish Balay /* Check if the col indices are sorted */ 599*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 600*d5b485abSSatish Balay ierr = ISSorted(iscol[i],(PetscTruth*)&j); 601*d5b485abSSatish Balay if (!j) SETERRQ(1,"MatGetSubmatrices_MPIAIJ:IS is not sorted"); 602*d5b485abSSatish Balay } 603*d5b485abSSatish Balay 604*d5b485abSSatish Balay len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (m+1)*sizeof(int); 605*d5b485abSSatish Balay irow = (int **)PetscMalloc(len); CHKPTRQ(irow); 606*d5b485abSSatish Balay icol = irow + ismax; 607*d5b485abSSatish Balay nrow = (int *) (icol + ismax); 608*d5b485abSSatish Balay ncol = nrow + ismax; 609*d5b485abSSatish Balay rtable = ncol + ismax; 610*d5b485abSSatish Balay 611*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 612*d5b485abSSatish Balay ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 613*d5b485abSSatish Balay ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 614*d5b485abSSatish Balay ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 615*d5b485abSSatish Balay ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 616*d5b485abSSatish Balay } 617*d5b485abSSatish Balay 618*d5b485abSSatish Balay /* Create hash table for the mapping :row -> proc*/ 619*d5b485abSSatish Balay for ( i=0,j=0; i<size; i++ ) { 620*d5b485abSSatish Balay jmax = c->rowners[i+1]; 621*d5b485abSSatish Balay for ( ; j<jmax; j++ ) { 622*d5b485abSSatish Balay rtable[j] = i; 623*d5b485abSSatish Balay } 624*d5b485abSSatish Balay } 625*d5b485abSSatish Balay 626*d5b485abSSatish Balay /* evaluate communication - mesg to who, length of mesg, and buffer space 627*d5b485abSSatish Balay required. Based on this, buffers are allocated, and data copied into them*/ 628*d5b485abSSatish Balay w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 629*d5b485abSSatish Balay w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 630*d5b485abSSatish Balay w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 631*d5b485abSSatish Balay w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 632*d5b485abSSatish Balay PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/ 633*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 634*d5b485abSSatish Balay PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/ 635*d5b485abSSatish Balay jmax = nrow[i]; 636*d5b485abSSatish Balay irow_i = irow[i]; 637*d5b485abSSatish Balay for ( j=0; j<jmax; j++ ) { 638*d5b485abSSatish Balay row = irow_i[j]; 639*d5b485abSSatish Balay proc = rtable[row]; 640*d5b485abSSatish Balay w4[proc]++; 641*d5b485abSSatish Balay } 642*d5b485abSSatish Balay for ( j=0; j<size; j++ ) { 643*d5b485abSSatish Balay if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 644*d5b485abSSatish Balay } 645*d5b485abSSatish Balay } 646*d5b485abSSatish Balay 647*d5b485abSSatish Balay nrqs = 0; /* no of outgoing messages */ 648*d5b485abSSatish Balay msz = 0; /* total mesg length (for all proc */ 649*d5b485abSSatish Balay w1[rank] = 0; /* no mesg sent to intself */ 650*d5b485abSSatish Balay w3[rank] = 0; 651*d5b485abSSatish Balay for ( i=0; i<size; i++ ) { 652*d5b485abSSatish Balay if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 653*d5b485abSSatish Balay } 654*d5b485abSSatish Balay pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 655*d5b485abSSatish Balay for ( i=0, j=0; i<size; i++ ) { 656*d5b485abSSatish Balay if (w1[i]) { pa[j] = i; j++; } 657*d5b485abSSatish Balay } 658*d5b485abSSatish Balay 659*d5b485abSSatish Balay /* Each message would have a header = 1 + 2*(no of IS) + data */ 660*d5b485abSSatish Balay for ( i=0; i<nrqs; i++ ) { 661*d5b485abSSatish Balay j = pa[i]; 662*d5b485abSSatish Balay w1[j] += w2[j] + 2* w3[j]; 663*d5b485abSSatish Balay msz += w1[j]; 664*d5b485abSSatish Balay } 665*d5b485abSSatish Balay /* Do a global reduction to determine how many messages to expect*/ 666*d5b485abSSatish Balay { 667*d5b485abSSatish Balay int *rw1, *rw2; 668*d5b485abSSatish Balay rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 669*d5b485abSSatish Balay rw2 = rw1+size; 670*d5b485abSSatish Balay MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm); 671*d5b485abSSatish Balay bsz = rw1[rank]; 672*d5b485abSSatish Balay MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm); 673*d5b485abSSatish Balay nrqr = rw2[rank]; 674*d5b485abSSatish Balay PetscFree(rw1); 675*d5b485abSSatish Balay } 676*d5b485abSSatish Balay 677*d5b485abSSatish Balay /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 678*d5b485abSSatish Balay len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 679*d5b485abSSatish Balay rbuf1 = (int**) PetscMalloc(len); CHKPTRQ(rbuf1); 680*d5b485abSSatish Balay rbuf1[0] = (int *) (rbuf1 + nrqr); 681*d5b485abSSatish Balay for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz; 682*d5b485abSSatish Balay 683*d5b485abSSatish Balay /* Post the receives */ 684*d5b485abSSatish Balay r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 685*d5b485abSSatish Balay CHKPTRQ(r_waits1); 686*d5b485abSSatish Balay for ( i=0; i<nrqr; ++i ) { 687*d5b485abSSatish Balay MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i); 688*d5b485abSSatish Balay } 689*d5b485abSSatish Balay 690*d5b485abSSatish Balay /* Allocate Memory for outgoing messages */ 691*d5b485abSSatish Balay len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 692*d5b485abSSatish Balay sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1); 693*d5b485abSSatish Balay ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 694*d5b485abSSatish Balay PetscMemzero(sbuf1,2*size*sizeof(int*)); 695*d5b485abSSatish Balay /* allocate memory for outgoing data + buf to receive the first reply */ 696*d5b485abSSatish Balay tmp = (int *) (ptr + size); 697*d5b485abSSatish Balay ctr = tmp + 2*msz; 698*d5b485abSSatish Balay 699*d5b485abSSatish Balay { 700*d5b485abSSatish Balay int *iptr = tmp,ict = 0; 701*d5b485abSSatish Balay for ( i=0; i<nrqs; i++ ) { 702*d5b485abSSatish Balay j = pa[i]; 703*d5b485abSSatish Balay iptr += ict; 704*d5b485abSSatish Balay sbuf1[j] = iptr; 705*d5b485abSSatish Balay ict = w1[j]; 706*d5b485abSSatish Balay } 707*d5b485abSSatish Balay } 708*d5b485abSSatish Balay 709*d5b485abSSatish Balay /* Form the outgoing messages */ 710*d5b485abSSatish Balay /* Initialise the header space */ 711*d5b485abSSatish Balay for ( i=0; i<nrqs; i++ ) { 712*d5b485abSSatish Balay j = pa[i]; 713*d5b485abSSatish Balay sbuf1[j][0] = 0; 714*d5b485abSSatish Balay PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int)); 715*d5b485abSSatish Balay ptr[j] = sbuf1[j] + 2*w3[j] + 1; 716*d5b485abSSatish Balay } 717*d5b485abSSatish Balay 718*d5b485abSSatish Balay /* Parse the isrow and copy data into outbuf */ 719*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 720*d5b485abSSatish Balay PetscMemzero(ctr,size*sizeof(int)); 721*d5b485abSSatish Balay irow_i = irow[i]; 722*d5b485abSSatish Balay jmax = nrow[i]; 723*d5b485abSSatish Balay for ( j=0; j<jmax; j++ ) { /* parse the indices of each IS */ 724*d5b485abSSatish Balay row = irow_i[j]; 725*d5b485abSSatish Balay proc = rtable[row]; 726*d5b485abSSatish Balay if (proc != rank) { /* copy to the outgoing buf*/ 727*d5b485abSSatish Balay ctr[proc]++; 728*d5b485abSSatish Balay *ptr[proc] = row; 729*d5b485abSSatish Balay ptr[proc]++; 730*d5b485abSSatish Balay } 731*d5b485abSSatish Balay } 732*d5b485abSSatish Balay /* Update the headers for the current IS */ 733*d5b485abSSatish Balay for ( j=0; j<size; j++ ) { /* Can Optimise this loop too */ 734*d5b485abSSatish Balay if ((ctr_j = ctr[j])) { 735*d5b485abSSatish Balay sbuf1_j = sbuf1[j]; 736*d5b485abSSatish Balay k = ++sbuf1_j[0]; 737*d5b485abSSatish Balay sbuf1_j[2*k] = ctr_j; 738*d5b485abSSatish Balay sbuf1_j[2*k-1] = i; 739*d5b485abSSatish Balay } 740*d5b485abSSatish Balay } 741*d5b485abSSatish Balay } 742*d5b485abSSatish Balay 743*d5b485abSSatish Balay /* Now post the sends */ 744*d5b485abSSatish Balay s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 745*d5b485abSSatish Balay CHKPTRQ(s_waits1); 746*d5b485abSSatish Balay for ( i=0; i<nrqs; ++i ) { 747*d5b485abSSatish Balay j = pa[i]; 748*d5b485abSSatish Balay /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 749*d5b485abSSatish Balay MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i); 750*d5b485abSSatish Balay } 751*d5b485abSSatish Balay 752*d5b485abSSatish Balay /* Post Recieves to capture the buffer size */ 753*d5b485abSSatish Balay r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 754*d5b485abSSatish Balay CHKPTRQ(r_waits2); 755*d5b485abSSatish Balay rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 756*d5b485abSSatish Balay rbuf2[0] = tmp + msz; 757*d5b485abSSatish Balay for ( i=1; i<nrqs; ++i ) { 758*d5b485abSSatish Balay j = pa[i]; 759*d5b485abSSatish Balay rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 760*d5b485abSSatish Balay } 761*d5b485abSSatish Balay for ( i=0; i<nrqs; ++i ) { 762*d5b485abSSatish Balay j = pa[i]; 763*d5b485abSSatish Balay MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag+1, comm, r_waits2+i); 764*d5b485abSSatish Balay } 765*d5b485abSSatish Balay 766*d5b485abSSatish Balay /* Send to other procs the buf size they should allocate */ 767*d5b485abSSatish Balay 768*d5b485abSSatish Balay 769*d5b485abSSatish Balay /* Receive messages*/ 770*d5b485abSSatish Balay s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 771*d5b485abSSatish Balay CHKPTRQ(s_waits2); 772*d5b485abSSatish Balay r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 773*d5b485abSSatish Balay CHKPTRQ(r_status1); 774*d5b485abSSatish Balay len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 775*d5b485abSSatish Balay sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 776*d5b485abSSatish Balay req_size = (int *) (sbuf2 + nrqr); 777*d5b485abSSatish Balay req_source = req_size + nrqr; 778*d5b485abSSatish Balay 779*d5b485abSSatish Balay { 780*d5b485abSSatish Balay Mat_SeqAIJ *sA = (Mat_SeqAIJ*) c->A->data, *sB = (Mat_SeqAIJ*) c->B->data; 781*d5b485abSSatish Balay int *sAi = sA->i, *sBi = sB->i, id, rstart = c->rstart; 782*d5b485abSSatish Balay int *sbuf2_i; 783*d5b485abSSatish Balay 784*d5b485abSSatish Balay for ( i=0; i<nrqr; ++i ) { 785*d5b485abSSatish Balay MPI_Waitany(nrqr, r_waits1, &index, r_status1+i); 786*d5b485abSSatish Balay req_size[index] = 0; 787*d5b485abSSatish Balay rbuf1_i = rbuf1[index]; 788*d5b485abSSatish Balay start = 2*rbuf1_i[0] + 1; 789*d5b485abSSatish Balay MPI_Get_count(r_status1+i,MPI_INT, &end); 790*d5b485abSSatish Balay sbuf2[index] = (int *)PetscMalloc(end*sizeof(int));CHKPTRQ(sbuf2[index]); 791*d5b485abSSatish Balay sbuf2_i = sbuf2[index]; 792*d5b485abSSatish Balay for ( j=start; j<end; j++ ) { 793*d5b485abSSatish Balay id = rbuf1_i[j] - rstart; 794*d5b485abSSatish Balay ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 795*d5b485abSSatish Balay sbuf2_i[j] = ncols; 796*d5b485abSSatish Balay req_size[index] += ncols; 797*d5b485abSSatish Balay } 798*d5b485abSSatish Balay req_source[index] = r_status1[i].MPI_SOURCE; 799*d5b485abSSatish Balay /* form the header */ 800*d5b485abSSatish Balay sbuf2_i[0] = req_size[index]; 801*d5b485abSSatish Balay for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; } 802*d5b485abSSatish Balay MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag+1,comm,s_waits2+i); 803*d5b485abSSatish Balay } 804*d5b485abSSatish Balay } 805*d5b485abSSatish Balay PetscFree(r_status1); PetscFree(r_waits1); 806*d5b485abSSatish Balay 807*d5b485abSSatish Balay /* recv buffer sizes */ 808*d5b485abSSatish Balay /* Receive messages*/ 809*d5b485abSSatish Balay 810*d5b485abSSatish Balay rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3); 811*d5b485abSSatish Balay rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 812*d5b485abSSatish Balay r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 813*d5b485abSSatish Balay CHKPTRQ(r_waits3); 814*d5b485abSSatish Balay r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 815*d5b485abSSatish Balay CHKPTRQ(r_waits4); 816*d5b485abSSatish Balay r_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 817*d5b485abSSatish Balay CHKPTRQ(r_status2); 818*d5b485abSSatish Balay 819*d5b485abSSatish Balay for ( i=0; i<nrqs; ++i ) { 820*d5b485abSSatish Balay MPI_Waitany(nrqs, r_waits2, &index, r_status2+i); 821*d5b485abSSatish Balay rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 822*d5b485abSSatish Balay CHKPTRQ(rbuf3[index]); 823*d5b485abSSatish Balay rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*sizeof(Scalar)); 824*d5b485abSSatish Balay CHKPTRQ(rbuf4[index]); 825*d5b485abSSatish Balay MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 826*d5b485abSSatish Balay r_status2[i].MPI_SOURCE, tag+2, comm, r_waits3+index); 827*d5b485abSSatish Balay MPI_Irecv(rbuf4[index],rbuf2[index][0], MPIU_SCALAR, 828*d5b485abSSatish Balay r_status2[i].MPI_SOURCE, tag+3, comm, r_waits4+index); 829*d5b485abSSatish Balay } 830*d5b485abSSatish Balay PetscFree(r_status2); PetscFree(r_waits2); 831*d5b485abSSatish Balay 832*d5b485abSSatish Balay /* Wait on sends1 and sends2 */ 833*d5b485abSSatish Balay s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 834*d5b485abSSatish Balay CHKPTRQ(s_status1); 835*d5b485abSSatish Balay s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 836*d5b485abSSatish Balay CHKPTRQ(s_status2); 837*d5b485abSSatish Balay 838*d5b485abSSatish Balay MPI_Waitall(nrqs,s_waits1,s_status1); 839*d5b485abSSatish Balay MPI_Waitall(nrqr,s_waits2,s_status2); 840*d5b485abSSatish Balay PetscFree(s_status1); PetscFree(s_status2); 841*d5b485abSSatish Balay PetscFree(s_waits1); PetscFree(s_waits2); 842*d5b485abSSatish Balay 843*d5b485abSSatish Balay /* Now allocate buffers for a->j, and send them off */ 844*d5b485abSSatish Balay sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); 845*d5b485abSSatish Balay for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 846*d5b485abSSatish Balay sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 847*d5b485abSSatish Balay for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 848*d5b485abSSatish Balay 849*d5b485abSSatish Balay s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 850*d5b485abSSatish Balay CHKPTRQ(s_waits3); 851*d5b485abSSatish Balay { 852*d5b485abSSatish Balay int nzA, nzB, *a_i = a->i, *b_i = b->i, imark; 853*d5b485abSSatish Balay int *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray; 854*d5b485abSSatish Balay int *a_j = a->j, *b_j = b->j, shift = a->indexshift,ctmp, *t_cols; 855*d5b485abSSatish Balay 856*d5b485abSSatish Balay for ( i=0; i<nrqr; i++ ) { 857*d5b485abSSatish Balay rbuf1_i = rbuf1[i]; 858*d5b485abSSatish Balay sbuf_aj_i = sbuf_aj[i]; 859*d5b485abSSatish Balay ct1 = 2*rbuf1_i[0] + 1; 860*d5b485abSSatish Balay ct2 = 0; 861*d5b485abSSatish Balay for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 862*d5b485abSSatish Balay kmax = rbuf1[i][2*j]; 863*d5b485abSSatish Balay for ( k=0; k<kmax; k++,ct1++ ) { 864*d5b485abSSatish Balay row = rbuf1_i[ct1] - cstart; 865*d5b485abSSatish Balay nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 866*d5b485abSSatish Balay ncols = nzA + nzB; 867*d5b485abSSatish Balay cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 868*d5b485abSSatish Balay 869*d5b485abSSatish Balay /* load the column indices for this row into cols*/ 870*d5b485abSSatish Balay cols = sbuf_aj_i + ct2; 871*d5b485abSSatish Balay if (!shift) { 872*d5b485abSSatish Balay for ( l=0; l<nzB; l++ ) { 873*d5b485abSSatish Balay if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 874*d5b485abSSatish Balay else break; 875*d5b485abSSatish Balay } 876*d5b485abSSatish Balay imark = l; 877*d5b485abSSatish Balay for ( l=0; l<nzA; l++ ) cols[imark+l] = cstart + cworkA[l]; 878*d5b485abSSatish Balay for ( l=imark; l<nzB; l++ ) cols[nzA+l] = bmap[cworkB[l]]; 879*d5b485abSSatish Balay } 880*d5b485abSSatish Balay else { 881*d5b485abSSatish Balay ierr = MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0); CHKERRQ(ierr); 882*d5b485abSSatish Balay PetscMemcpy(cols, t_cols, ncols*sizeof(int)); 883*d5b485abSSatish Balay ierr = MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0); CHKERRQ(ierr); 884*d5b485abSSatish Balay 885*d5b485abSSatish Balay } 886*d5b485abSSatish Balay 887*d5b485abSSatish Balay ct2 += ncols; 888*d5b485abSSatish Balay } 889*d5b485abSSatish Balay } 890*d5b485abSSatish Balay MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag+2,comm,s_waits3+i); 891*d5b485abSSatish Balay } 892*d5b485abSSatish Balay } 893*d5b485abSSatish Balay r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 894*d5b485abSSatish Balay CHKPTRQ(r_status3); 895*d5b485abSSatish Balay s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 896*d5b485abSSatish Balay CHKPTRQ(s_status3); 897*d5b485abSSatish Balay 898*d5b485abSSatish Balay /* Allocate buffers for a->a, and send them off */ 899*d5b485abSSatish Balay sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 900*d5b485abSSatish Balay for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 901*d5b485abSSatish Balay sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 902*d5b485abSSatish Balay for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 903*d5b485abSSatish Balay 904*d5b485abSSatish Balay s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 905*d5b485abSSatish Balay CHKPTRQ(s_waits4); 906*d5b485abSSatish Balay { 907*d5b485abSSatish Balay int nzA, nzB, *a_i = a->i, *b_i = b->i, imark; 908*d5b485abSSatish Balay int *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray; 909*d5b485abSSatish Balay int *a_j = a->j, *b_j = b->j,shift = a->indexshift; 910*d5b485abSSatish Balay Scalar *vworkA, *vworkB, *a_a = a->a, *b_a = b->a,*t_vals; 911*d5b485abSSatish Balay 912*d5b485abSSatish Balay for ( i=0; i<nrqr; i++ ) { 913*d5b485abSSatish Balay rbuf1_i = rbuf1[i]; 914*d5b485abSSatish Balay sbuf_aa_i = sbuf_aa[i]; 915*d5b485abSSatish Balay ct1 = 2*rbuf1_i[0]+1; 916*d5b485abSSatish Balay ct2 = 0; 917*d5b485abSSatish Balay for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 918*d5b485abSSatish Balay kmax = rbuf1_i[2*j]; 919*d5b485abSSatish Balay for ( k=0; k<kmax; k++,ct1++ ) { 920*d5b485abSSatish Balay row = rbuf1_i[ct1] - cstart; 921*d5b485abSSatish Balay nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 922*d5b485abSSatish Balay ncols = nzA + nzB; 923*d5b485abSSatish Balay cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 924*d5b485abSSatish Balay vworkA = a_a + a_i[row]; vworkB = b_a + b_i[row]; 925*d5b485abSSatish Balay 926*d5b485abSSatish Balay /* load the column values for this row into vals*/ 927*d5b485abSSatish Balay vals = sbuf_aa_i+ct2; 928*d5b485abSSatish Balay if (!shift) { 929*d5b485abSSatish Balay for ( l=0; l<nzB; l++ ) { 930*d5b485abSSatish Balay if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l]; 931*d5b485abSSatish Balay else break; 932*d5b485abSSatish Balay } 933*d5b485abSSatish Balay imark = l; 934*d5b485abSSatish Balay for ( l=0; l<nzA; l++ ) vals[imark+l] = vworkA[l]; 935*d5b485abSSatish Balay for ( l=imark; l<nzB; l++ ) vals[nzA+l] = vworkB[l]; 936*d5b485abSSatish Balay } 937*d5b485abSSatish Balay else { 938*d5b485abSSatish Balay ierr = MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals); CHKERRQ(ierr); 939*d5b485abSSatish Balay PetscMemcpy(vals, t_vals, ncols*sizeof(Scalar)); 940*d5b485abSSatish Balay ierr = MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals); CHKERRQ(ierr); 941*d5b485abSSatish Balay } 942*d5b485abSSatish Balay ct2 += ncols; 943*d5b485abSSatish Balay } 944*d5b485abSSatish Balay } 945*d5b485abSSatish Balay MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag+3,comm,s_waits4+i); 946*d5b485abSSatish Balay } 947*d5b485abSSatish Balay } 948*d5b485abSSatish Balay r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status)); 949*d5b485abSSatish Balay CHKPTRQ(r_status4); 950*d5b485abSSatish Balay s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status)); 951*d5b485abSSatish Balay CHKPTRQ(s_status4); 952*d5b485abSSatish Balay PetscFree(rbuf1); 953*d5b485abSSatish Balay 954*d5b485abSSatish Balay /* Form the matrix */ 955*d5b485abSSatish Balay /* create col map */ 956*d5b485abSSatish Balay { 957*d5b485abSSatish Balay int *icol_i; 958*d5b485abSSatish Balay 959*d5b485abSSatish Balay len = (1+ismax)*sizeof(int *) + ismax*c->N*sizeof(int); 960*d5b485abSSatish Balay cmap = (int **)PetscMalloc(len); CHKPTRQ(cmap); 961*d5b485abSSatish Balay cmap[0] = (int *)(cmap + ismax); 962*d5b485abSSatish Balay PetscMemzero(cmap[0],(1+ismax*c->N)*sizeof(int)); 963*d5b485abSSatish Balay for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->N; } 964*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 965*d5b485abSSatish Balay jmax = ncol[i]; 966*d5b485abSSatish Balay icol_i = icol[i]; 967*d5b485abSSatish Balay cmap_i = cmap[i]; 968*d5b485abSSatish Balay for ( j=0; j<jmax; j++ ) { 969*d5b485abSSatish Balay cmap_i[icol_i[j]] = j+1; 970*d5b485abSSatish Balay } 971*d5b485abSSatish Balay } 972*d5b485abSSatish Balay } 973*d5b485abSSatish Balay 974*d5b485abSSatish Balay 975*d5b485abSSatish Balay /* Create lens which is required for MatCreate... */ 976*d5b485abSSatish Balay for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; } 977*d5b485abSSatish Balay len = (1+ismax)*sizeof(int *) + j*sizeof(int); 978*d5b485abSSatish Balay lens = (int **)PetscMalloc(len); CHKPTRQ(lens); 979*d5b485abSSatish Balay lens[0] = (int *)(lens + ismax); 980*d5b485abSSatish Balay PetscMemzero(lens[0], j*sizeof(int)); 981*d5b485abSSatish Balay for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; } 982*d5b485abSSatish Balay 983*d5b485abSSatish Balay /* Update lens from local data */ 984*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 985*d5b485abSSatish Balay jmax = nrow[i]; 986*d5b485abSSatish Balay cmap_i = cmap[i]; 987*d5b485abSSatish Balay irow_i = irow[i]; 988*d5b485abSSatish Balay lens_i = lens[i]; 989*d5b485abSSatish Balay for ( j=0; j<jmax; j++ ) { 990*d5b485abSSatish Balay row = irow_i[j]; 991*d5b485abSSatish Balay proc = rtable[row]; 992*d5b485abSSatish Balay if (proc == rank) { 993*d5b485abSSatish Balay ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0); CHKERRQ(ierr); 994*d5b485abSSatish Balay for ( k=0; k<ncols; k++ ) { 995*d5b485abSSatish Balay if (cmap_i[cols[k]]) { lens_i[j]++;} 996*d5b485abSSatish Balay } 997*d5b485abSSatish Balay ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0); CHKERRQ(ierr); 998*d5b485abSSatish Balay } 999*d5b485abSSatish Balay } 1000*d5b485abSSatish Balay } 1001*d5b485abSSatish Balay 1002*d5b485abSSatish Balay /* Create row map*/ 1003*d5b485abSSatish Balay len = (1+ismax)*sizeof(int *) + ismax*c->M*sizeof(int); 1004*d5b485abSSatish Balay rmap = (int **)PetscMalloc(len); CHKPTRQ(rmap); 1005*d5b485abSSatish Balay rmap[0] = (int *)(rmap + ismax); 1006*d5b485abSSatish Balay PetscMemzero(rmap[0],ismax*c->M*sizeof(int)); 1007*d5b485abSSatish Balay for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->M;} 1008*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 1009*d5b485abSSatish Balay rmap_i = rmap[i]; 1010*d5b485abSSatish Balay irow_i = irow[i]; 1011*d5b485abSSatish Balay jmax = nrow[i]; 1012*d5b485abSSatish Balay for ( j=0; j<jmax; j++ ) { 1013*d5b485abSSatish Balay rmap_i[irow_i[j]] = j; 1014*d5b485abSSatish Balay } 1015*d5b485abSSatish Balay } 1016*d5b485abSSatish Balay 1017*d5b485abSSatish Balay /* Update lens from offproc data */ 1018*d5b485abSSatish Balay { 1019*d5b485abSSatish Balay int *rbuf2_i, *rbuf3_i, *sbuf1_i; 1020*d5b485abSSatish Balay 1021*d5b485abSSatish Balay for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1022*d5b485abSSatish Balay MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2); 1023*d5b485abSSatish Balay index = pa[i]; 1024*d5b485abSSatish Balay sbuf1_i = sbuf1[index]; 1025*d5b485abSSatish Balay jmax = sbuf1_i[0]; 1026*d5b485abSSatish Balay ct1 = 2*jmax+1; 1027*d5b485abSSatish Balay ct2 = 0; 1028*d5b485abSSatish Balay rbuf2_i = rbuf2[i]; 1029*d5b485abSSatish Balay rbuf3_i = rbuf3[i]; 1030*d5b485abSSatish Balay for ( j=1; j<=jmax; j++ ) { 1031*d5b485abSSatish Balay is_no = sbuf1_i[2*j-1]; 1032*d5b485abSSatish Balay max1 = sbuf1_i[2*j]; 1033*d5b485abSSatish Balay lens_i = lens[is_no]; 1034*d5b485abSSatish Balay cmap_i = cmap[is_no]; 1035*d5b485abSSatish Balay rmap_i = rmap[is_no]; 1036*d5b485abSSatish Balay for ( k=0; k<max1; k++,ct1++ ) { 1037*d5b485abSSatish Balay row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1038*d5b485abSSatish Balay max2 = rbuf2_i[ct1]; 1039*d5b485abSSatish Balay for ( l=0; l<max2; l++,ct2++ ) { 1040*d5b485abSSatish Balay if (cmap_i[rbuf3_i[ct2]]) { 1041*d5b485abSSatish Balay lens_i[row]++; 1042*d5b485abSSatish Balay } 1043*d5b485abSSatish Balay } 1044*d5b485abSSatish Balay } 1045*d5b485abSSatish Balay } 1046*d5b485abSSatish Balay } 1047*d5b485abSSatish Balay } 1048*d5b485abSSatish Balay PetscFree(r_status3); PetscFree(r_waits3); 1049*d5b485abSSatish Balay MPI_Waitall(nrqr,s_waits3,s_status3); 1050*d5b485abSSatish Balay PetscFree(s_status3); PetscFree(s_waits3); 1051*d5b485abSSatish Balay 1052*d5b485abSSatish Balay /* Create the submatrices */ 1053*d5b485abSSatish Balay if (scall == MAT_REUSE_MATRIX) { 1054*d5b485abSSatish Balay /* 1055*d5b485abSSatish Balay Assumes new rows are same length as the old rows, hence bug! 1056*d5b485abSSatish Balay */ 1057*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 1058*d5b485abSSatish Balay mat = (Mat_SeqAIJ *)(submats[i]->data); 1059*d5b485abSSatish Balay if ((mat->m != nrow[i]) || (mat->n != ncol[i])) { 1060*d5b485abSSatish Balay SETERRQ(1,"MatGetSubmatrices_MPIAIJ:Cannot reuse matrix. wrong size"); 1061*d5b485abSSatish Balay } 1062*d5b485abSSatish Balay if (PetscMemcmp(mat->ilen,lens[i], mat->m *sizeof(int))) { 1063*d5b485abSSatish Balay SETERRQ(1,"MatGetSubmatrices_MPIAIJ:Cannot reuse matrix. wrong no of nonzeros"); 1064*d5b485abSSatish Balay } 1065*d5b485abSSatish Balay /* Initial matrix as if empty */ 1066*d5b485abSSatish Balay PetscMemzero(mat->ilen,mat->m*sizeof(int)); 1067*d5b485abSSatish Balay } 1068*d5b485abSSatish Balay } 1069*d5b485abSSatish Balay else { 1070*d5b485abSSatish Balay *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats); 1071*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 1072*d5b485abSSatish Balay ierr = MatCreateSeqAIJ(comm,nrow[i],ncol[i],0,lens[i],submats+i);CHKERRQ(ierr); 1073*d5b485abSSatish Balay } 1074*d5b485abSSatish Balay } 1075*d5b485abSSatish Balay 1076*d5b485abSSatish Balay /* Assemble the matrices */ 1077*d5b485abSSatish Balay /* First assemble the local rows */ 1078*d5b485abSSatish Balay { 1079*d5b485abSSatish Balay int ilen_row,*imat_ilen, *imat_j, *imat_i,old_row; 1080*d5b485abSSatish Balay Scalar *imat_a; 1081*d5b485abSSatish Balay 1082*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 1083*d5b485abSSatish Balay mat = (Mat_SeqAIJ *) submats[i]->data; 1084*d5b485abSSatish Balay imat_ilen = mat->ilen; 1085*d5b485abSSatish Balay imat_j = mat->j; 1086*d5b485abSSatish Balay imat_i = mat->i; 1087*d5b485abSSatish Balay imat_a = mat->a; 1088*d5b485abSSatish Balay cmap_i = cmap[i]; 1089*d5b485abSSatish Balay rmap_i = rmap[i]; 1090*d5b485abSSatish Balay irow_i = irow[i]; 1091*d5b485abSSatish Balay jmax = nrow[i]; 1092*d5b485abSSatish Balay for ( j=0; j<jmax; j++ ) { 1093*d5b485abSSatish Balay row = irow_i[j]; 1094*d5b485abSSatish Balay proc = rtable[row]; 1095*d5b485abSSatish Balay if (proc == rank) { 1096*d5b485abSSatish Balay old_row = row; 1097*d5b485abSSatish Balay row = rmap_i[row]; 1098*d5b485abSSatish Balay ilen_row = imat_ilen[row]; 1099*d5b485abSSatish Balay ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr); 1100*d5b485abSSatish Balay mat_i = imat_i[row] + ashift; 1101*d5b485abSSatish Balay mat_a = imat_a + mat_i; 1102*d5b485abSSatish Balay mat_j = imat_j + mat_i; 1103*d5b485abSSatish Balay for ( k=0; k<ncols; k++ ) { 1104*d5b485abSSatish Balay if ((tcol = cmap_i[cols[k]])) { 1105*d5b485abSSatish Balay *mat_j++ = tcol - (!ashift); 1106*d5b485abSSatish Balay *mat_a++ = vals[k]; 1107*d5b485abSSatish Balay ilen_row++; 1108*d5b485abSSatish Balay } 1109*d5b485abSSatish Balay } 1110*d5b485abSSatish Balay ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr); 1111*d5b485abSSatish Balay imat_ilen[row] = ilen_row; 1112*d5b485abSSatish Balay } 1113*d5b485abSSatish Balay 1114*d5b485abSSatish Balay } 1115*d5b485abSSatish Balay } 1116*d5b485abSSatish Balay } 1117*d5b485abSSatish Balay 1118*d5b485abSSatish Balay /* Now assemble the off proc rows*/ 1119*d5b485abSSatish Balay { 1120*d5b485abSSatish Balay int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1121*d5b485abSSatish Balay int *imat_j,*imat_i; 1122*d5b485abSSatish Balay Scalar *imat_a,*rbuf4_i; 1123*d5b485abSSatish Balay 1124*d5b485abSSatish Balay for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 1125*d5b485abSSatish Balay MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2); 1126*d5b485abSSatish Balay index = pa[i]; 1127*d5b485abSSatish Balay sbuf1_i = sbuf1[index]; 1128*d5b485abSSatish Balay jmax = sbuf1_i[0]; 1129*d5b485abSSatish Balay ct1 = 2*jmax + 1; 1130*d5b485abSSatish Balay ct2 = 0; 1131*d5b485abSSatish Balay rbuf2_i = rbuf2[i]; 1132*d5b485abSSatish Balay rbuf3_i = rbuf3[i]; 1133*d5b485abSSatish Balay rbuf4_i = rbuf4[i]; 1134*d5b485abSSatish Balay for ( j=1; j<=jmax; j++ ) { 1135*d5b485abSSatish Balay is_no = sbuf1_i[2*j-1]; 1136*d5b485abSSatish Balay rmap_i = rmap[is_no]; 1137*d5b485abSSatish Balay cmap_i = cmap[is_no]; 1138*d5b485abSSatish Balay mat = (Mat_SeqAIJ *) submats[is_no]->data; 1139*d5b485abSSatish Balay imat_ilen = mat->ilen; 1140*d5b485abSSatish Balay imat_j = mat->j; 1141*d5b485abSSatish Balay imat_i = mat->i; 1142*d5b485abSSatish Balay imat_a = mat->a; 1143*d5b485abSSatish Balay max1 = sbuf1_i[2*j]; 1144*d5b485abSSatish Balay for ( k=0; k<max1; k++, ct1++ ) { 1145*d5b485abSSatish Balay row = sbuf1_i[ct1]; 1146*d5b485abSSatish Balay row = rmap_i[row]; 1147*d5b485abSSatish Balay ilen = imat_ilen[row]; 1148*d5b485abSSatish Balay mat_i = imat_i[row] + ashift; 1149*d5b485abSSatish Balay mat_a = imat_a + mat_i; 1150*d5b485abSSatish Balay mat_j = imat_j + mat_i; 1151*d5b485abSSatish Balay max2 = rbuf2_i[ct1]; 1152*d5b485abSSatish Balay for ( l=0; l<max2; l++,ct2++ ) { 1153*d5b485abSSatish Balay if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1154*d5b485abSSatish Balay *mat_j++ = tcol - (!ashift); 1155*d5b485abSSatish Balay *mat_a++ = rbuf4_i[ct2]; 1156*d5b485abSSatish Balay ilen++; 1157*d5b485abSSatish Balay } 1158*d5b485abSSatish Balay } 1159*d5b485abSSatish Balay imat_ilen[row] = ilen; 1160*d5b485abSSatish Balay } 1161*d5b485abSSatish Balay } 1162*d5b485abSSatish Balay } 1163*d5b485abSSatish Balay } 1164*d5b485abSSatish Balay PetscFree(r_status4); PetscFree(r_waits4); 1165*d5b485abSSatish Balay MPI_Waitall(nrqr,s_waits4,s_status4); 1166*d5b485abSSatish Balay PetscFree(s_waits4); PetscFree(s_status4); 1167*d5b485abSSatish Balay 1168*d5b485abSSatish Balay /* Restore the indices */ 1169*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 1170*d5b485abSSatish Balay ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1171*d5b485abSSatish Balay ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1172*d5b485abSSatish Balay } 1173*d5b485abSSatish Balay 1174*d5b485abSSatish Balay /* Destroy allocated memory */ 1175*d5b485abSSatish Balay PetscFree(irow); 1176*d5b485abSSatish Balay PetscFree(w1); 1177*d5b485abSSatish Balay PetscFree(pa); 1178*d5b485abSSatish Balay 1179*d5b485abSSatish Balay PetscFree(sbuf1); 1180*d5b485abSSatish Balay PetscFree(rbuf2); 1181*d5b485abSSatish Balay for ( i=0; i<nrqr; ++i ) { 1182*d5b485abSSatish Balay PetscFree(sbuf2[i]); 1183*d5b485abSSatish Balay } 1184*d5b485abSSatish Balay for ( i=0; i<nrqs; ++i ) { 1185*d5b485abSSatish Balay PetscFree(rbuf3[i]); 1186*d5b485abSSatish Balay PetscFree(rbuf4[i]); 1187*d5b485abSSatish Balay } 1188*d5b485abSSatish Balay 1189*d5b485abSSatish Balay PetscFree(sbuf2); 1190*d5b485abSSatish Balay PetscFree(rbuf3); 1191*d5b485abSSatish Balay PetscFree(rbuf4 ); 1192*d5b485abSSatish Balay PetscFree(sbuf_aj[0]); 1193*d5b485abSSatish Balay PetscFree(sbuf_aj); 1194*d5b485abSSatish Balay PetscFree(sbuf_aa[0]); 1195*d5b485abSSatish Balay PetscFree(sbuf_aa); 1196*d5b485abSSatish Balay 1197*d5b485abSSatish Balay PetscFree(cmap); 1198*d5b485abSSatish Balay PetscFree(rmap); 1199*d5b485abSSatish Balay PetscFree(lens); 1200*d5b485abSSatish Balay 1201*d5b485abSSatish Balay for ( i=0; i<ismax; i++ ) { 1202*d5b485abSSatish Balay ierr = MatAssemblyBegin(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1203*d5b485abSSatish Balay ierr = MatAssemblyEnd(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1204*d5b485abSSatish Balay } 1205*d5b485abSSatish Balay 1206*d5b485abSSatish Balay return 0; 1207*d5b485abSSatish Balay } 1208*d5b485abSSatish Balay 1209*d5b485abSSatish Balay 1210*d5b485abSSatish Balay 1211*d5b485abSSatish Balay 1212