1a5eb4965SSatish Balay #ifdef PETSC_RCS_HEADER 2*e26ad7d8SSatish Balay static char vcid[] = "$Id: mmdense.c,v 1.13 1997/10/19 03:25:11 bsmith Exp balay $"; 30500aeceSBarry Smith #endif 40500aeceSBarry Smith 50500aeceSBarry Smith /* 60500aeceSBarry Smith Support for the parallel dense matrix vector multiply 70500aeceSBarry Smith */ 870f55243SBarry Smith #include "src/mat/impls/dense/mpi/mpidense.h" 9f5eb4b81SSatish Balay #include "src/vec/vecimpl.h" 100500aeceSBarry Smith 115615d1e5SSatish Balay #undef __FUNC__ 125615d1e5SSatish Balay #define __FUNC__ "MatSetUpMultiply_MPIDense" 130500aeceSBarry Smith int MatSetUpMultiply_MPIDense(Mat mat) 140500aeceSBarry Smith { 150500aeceSBarry Smith Mat_MPIDense *mdn = (Mat_MPIDense *) mat->data; 16ed3cc1f0SBarry Smith int ierr,n; 17d9f96c7cSLois Curfman McInnes IS tofrom; 180500aeceSBarry Smith Vec gvec; 190500aeceSBarry Smith 203a40ed3dSBarry Smith PetscFunctionBegin; 210500aeceSBarry Smith /* Create local vector that is used to scatter into */ 22029af93fSBarry Smith ierr = VecCreateSeq(PETSC_COMM_SELF,mdn->N,&mdn->lvec); CHKERRQ(ierr); 230500aeceSBarry Smith 240500aeceSBarry Smith /* Create temporary index set for building scatter gather */ 25029af93fSBarry Smith ierr = ISCreateStride(PETSC_COMM_SELF,mdn->N,0,1,&tofrom); CHKERRQ(ierr); 260500aeceSBarry Smith 270500aeceSBarry Smith /* Create temporary global vector to generate scatter context */ 28ed3cc1f0SBarry Smith n = mdn->cowners[mdn->rank+1] - mdn->cowners[mdn->rank]; 29ed3cc1f0SBarry Smith ierr = VecCreateMPI(mat->comm,n,mdn->N,&gvec); CHKERRQ(ierr); 30e19c7942SLois Curfman McInnes 310500aeceSBarry Smith /* Generate the scatter context */ 32d9f96c7cSLois Curfman McInnes ierr = VecScatterCreate(gvec,tofrom,mdn->lvec,tofrom,&mdn->Mvctx); CHKERRQ(ierr); 330500aeceSBarry Smith PLogObjectParent(mat,mdn->Mvctx); 340500aeceSBarry Smith PLogObjectParent(mat,mdn->lvec); 35d9f96c7cSLois Curfman McInnes PLogObjectParent(mat,tofrom); 36d9f96c7cSLois Curfman McInnes PLogObjectParent(mat,gvec); 37e19c7942SLois Curfman McInnes 38d9f96c7cSLois Curfman McInnes ierr = ISDestroy(tofrom); CHKERRQ(ierr); 39d9f96c7cSLois Curfman McInnes ierr = VecDestroy(gvec); CHKERRQ(ierr); 403a40ed3dSBarry Smith PetscFunctionReturn(0); 410500aeceSBarry Smith } 420500aeceSBarry Smith 43*e26ad7d8SSatish Balay #if defined (__JUNK__) 44*e26ad7d8SSatish Balay 45*e26ad7d8SSatish Balay int MatGetSubMatrices_MPIDense_Local(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat*); 46*e26ad7d8SSatish Balay #undef __FUNC__ 47*e26ad7d8SSatish Balay #define __FUNC__ "MatGetSubMatrices_MPIDense" 48*e26ad7d8SSatish Balay int MatGetSubMatrices_MPIDense(Mat C,int ismax,IS *isrow,IS *iscol, 49*e26ad7d8SSatish Balay MatGetSubMatrixCall scall,Mat **submat) 50*e26ad7d8SSatish Balay { 51*e26ad7d8SSatish Balay Mat_MPIDense *c = (Mat_MPIDense *) C->data; 52*e26ad7d8SSatish Balay int nmax,nstages_local,nstages,i,pos,max_no,ierr; 53*e26ad7d8SSatish Balay 54*e26ad7d8SSatish Balay PetscFunctionBegin; 55*e26ad7d8SSatish Balay /* Allocate memory to hold all the submatrices */ 56*e26ad7d8SSatish Balay if (scall != MAT_REUSE_MATRIX) { 57*e26ad7d8SSatish Balay *submat = (Mat *)PetscMalloc((ismax+1)*sizeof(Mat));CHKPTRQ(*submat); 58*e26ad7d8SSatish Balay } 59*e26ad7d8SSatish Balay /* Determine the number of stages through which submatrices are done */ 60*e26ad7d8SSatish Balay nmax = 20*1000000 / (c->N * sizeof(int)); 61*e26ad7d8SSatish Balay if (nmax == 0) nmax = 1; 62*e26ad7d8SSatish Balay nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 63*e26ad7d8SSatish Balay 64*e26ad7d8SSatish Balay /* Make sure every processor loops through the nstages */ 65*e26ad7d8SSatish Balay ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr); 660500aeceSBarry Smith 670500aeceSBarry Smith 68*e26ad7d8SSatish Balay for ( i=0,pos=0; i<nstages; i++ ) { 69*e26ad7d8SSatish Balay if (pos+nmax <= ismax) max_no = nmax; 70*e26ad7d8SSatish Balay else if (pos == ismax) max_no = 0; 71*e26ad7d8SSatish Balay else max_no = ismax-pos; 72*e26ad7d8SSatish Balay ierr = MatGetSubMatrices_MPIDense_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 73*e26ad7d8SSatish Balay pos += max_no; 74*e26ad7d8SSatish Balay } 75*e26ad7d8SSatish Balay PetscFunctionReturn(0); 76*e26ad7d8SSatish Balay } 77*e26ad7d8SSatish Balay /* -------------------------------------------------------------------------*/ 78*e26ad7d8SSatish Balay #undef __FUNC__ 79*e26ad7d8SSatish Balay #define __FUNC__ "MatGetSubMatrices_MPIDense_Local" 80*e26ad7d8SSatish Balay int MatGetSubMatrices_MPIDense_Local(Mat C,int ismax,IS *isrow,IS *iscol, 81*e26ad7d8SSatish Balay MatGetSubMatrixCall scall,Mat *submats) 82*e26ad7d8SSatish Balay { 83*e26ad7d8SSatish Balay Mat_MPIDense *c = (Mat_MPIDense *) C->data; 84*e26ad7d8SSatish Balay Mat A = c->A; 85*e26ad7d8SSatish Balay Mat_SeqDense *a = (Mat_SeqDense*)A->data, *mat; 86*e26ad7d8SSatish Balay int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size; 87*e26ad7d8SSatish Balay int **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc; 88*e26ad7d8SSatish Balay int nrqs, msz, **ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr; 89*e26ad7d8SSatish Balay int **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap; 90*e26ad7d8SSatish Balay int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i; 91*e26ad7d8SSatish Balay int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 92*e26ad7d8SSatish Balay int *rmap_i,tag0,tag1,tag2,tag3; 93*e26ad7d8SSatish Balay MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 94*e26ad7d8SSatish Balay MPI_Request *r_waits4,*s_waits3,*s_waits4; 95*e26ad7d8SSatish Balay MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 96*e26ad7d8SSatish Balay MPI_Status *r_status3,*r_status4,*s_status4; 97*e26ad7d8SSatish Balay MPI_Comm comm; 98*e26ad7d8SSatish Balay Scalar **rbuf4, **sbuf_aa, *vals, *mat_a, *sbuf_aa_i; 99*e26ad7d8SSatish Balay 100*e26ad7d8SSatish Balay PetscFunctionBegin; 101*e26ad7d8SSatish Balay comm = C->comm; 102*e26ad7d8SSatish Balay tag0 = C->tag; 103*e26ad7d8SSatish Balay size = c->size; 104*e26ad7d8SSatish Balay rank = c->rank; 105*e26ad7d8SSatish Balay m = c->M; 106*e26ad7d8SSatish Balay 107*e26ad7d8SSatish Balay /* Get some new tags to keep the communication clean */ 108*e26ad7d8SSatish Balay ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 109*e26ad7d8SSatish Balay ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 110*e26ad7d8SSatish Balay ierr = PetscObjectGetNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 111*e26ad7d8SSatish Balay 112*e26ad7d8SSatish Balay /* Check if the col indices are sorted */ 113*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 114*e26ad7d8SSatish Balay ierr = ISSorted(isrow[i],(PetscTruth*)&j); 115*e26ad7d8SSatish Balay if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"ISrow is not sorted"); 116*e26ad7d8SSatish Balay ierr = ISSorted(iscol[i],(PetscTruth*)&j); 117*e26ad7d8SSatish Balay if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IScol is not sorted"); 118*e26ad7d8SSatish Balay } 119*e26ad7d8SSatish Balay 120*e26ad7d8SSatish Balay len = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (m+1)*sizeof(int); 121*e26ad7d8SSatish Balay irow = (int **)PetscMalloc(len); CHKPTRQ(irow); 122*e26ad7d8SSatish Balay icol = irow + ismax; 123*e26ad7d8SSatish Balay nrow = (int *) (icol + ismax); 124*e26ad7d8SSatish Balay ncol = nrow + ismax; 125*e26ad7d8SSatish Balay rtable = ncol + ismax; 126*e26ad7d8SSatish Balay 127*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 128*e26ad7d8SSatish Balay ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 129*e26ad7d8SSatish Balay ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 130*e26ad7d8SSatish Balay ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 131*e26ad7d8SSatish Balay ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 132*e26ad7d8SSatish Balay } 133*e26ad7d8SSatish Balay 134*e26ad7d8SSatish Balay /* Create hash table for the mapping :row -> proc*/ 135*e26ad7d8SSatish Balay for ( i=0,j=0; i<size; i++ ) { 136*e26ad7d8SSatish Balay jmax = c->rowners[i+1]; 137*e26ad7d8SSatish Balay for ( ; j<jmax; j++ ) { 138*e26ad7d8SSatish Balay rtable[j] = i; 139*e26ad7d8SSatish Balay } 140*e26ad7d8SSatish Balay } 141*e26ad7d8SSatish Balay 142*e26ad7d8SSatish Balay /* evaluate communication - mesg to who, length of mesg, and buffer space 143*e26ad7d8SSatish Balay required. Based on this, buffers are allocated, and data copied into them*/ 144*e26ad7d8SSatish Balay w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */ 145*e26ad7d8SSatish Balay w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 146*e26ad7d8SSatish Balay w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 147*e26ad7d8SSatish Balay w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 148*e26ad7d8SSatish Balay PetscMemzero(w1,size*3*sizeof(int)); /* initialize work vector*/ 149*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 150*e26ad7d8SSatish Balay PetscMemzero(w4,size*sizeof(int)); /* initialize work vector*/ 151*e26ad7d8SSatish Balay jmax = nrow[i]; 152*e26ad7d8SSatish Balay irow_i = irow[i]; 153*e26ad7d8SSatish Balay for ( j=0; j<jmax; j++ ) { 154*e26ad7d8SSatish Balay row = irow_i[j]; 155*e26ad7d8SSatish Balay proc = rtable[row]; 156*e26ad7d8SSatish Balay w4[proc]++; 157*e26ad7d8SSatish Balay } 158*e26ad7d8SSatish Balay for ( j=0; j<size; j++ ) { 159*e26ad7d8SSatish Balay if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 160*e26ad7d8SSatish Balay } 161*e26ad7d8SSatish Balay } 162*e26ad7d8SSatish Balay 163*e26ad7d8SSatish Balay nrqs = 0; /* no of outgoing messages */ 164*e26ad7d8SSatish Balay msz = 0; /* total mesg length (for all procs) */ 165*e26ad7d8SSatish Balay w1[rank] = 0; /* no mesg sent to self */ 166*e26ad7d8SSatish Balay w3[rank] = 0; 167*e26ad7d8SSatish Balay for ( i=0; i<size; i++ ) { 168*e26ad7d8SSatish Balay if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 169*e26ad7d8SSatish Balay } 170*e26ad7d8SSatish Balay pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/ 171*e26ad7d8SSatish Balay for ( i=0, j=0; i<size; i++ ) { 172*e26ad7d8SSatish Balay if (w1[i]) { pa[j] = i; j++; } 173*e26ad7d8SSatish Balay } 174*e26ad7d8SSatish Balay 175*e26ad7d8SSatish Balay /* Each message would have a header = 1 + 2*(no of IS) + data */ 176*e26ad7d8SSatish Balay for ( i=0; i<nrqs; i++ ) { 177*e26ad7d8SSatish Balay j = pa[i]; 178*e26ad7d8SSatish Balay w1[j] += w2[j] + 2* w3[j]; 179*e26ad7d8SSatish Balay msz += w1[j]; 180*e26ad7d8SSatish Balay } 181*e26ad7d8SSatish Balay /* Do a global reduction to determine how many messages to expect*/ 182*e26ad7d8SSatish Balay { 183*e26ad7d8SSatish Balay int *rw1, *rw2; 184*e26ad7d8SSatish Balay rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 185*e26ad7d8SSatish Balay rw2 = rw1+size; 186*e26ad7d8SSatish Balay ierr = MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr); 187*e26ad7d8SSatish Balay bsz = rw1[rank]; 188*e26ad7d8SSatish Balay ierr = MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);CHKERRQ(ierr); 189*e26ad7d8SSatish Balay nrqr = rw2[rank]; 190*e26ad7d8SSatish Balay PetscFree(rw1); 191*e26ad7d8SSatish Balay } 192*e26ad7d8SSatish Balay 193*e26ad7d8SSatish Balay /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 194*e26ad7d8SSatish Balay len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int); 195*e26ad7d8SSatish Balay rbuf1 = (int**) PetscMalloc(len); CHKPTRQ(rbuf1); 196*e26ad7d8SSatish Balay rbuf1[0] = (int *) (rbuf1 + nrqr); 197*e26ad7d8SSatish Balay for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz; 198*e26ad7d8SSatish Balay 199*e26ad7d8SSatish Balay /* Post the receives */ 200*e26ad7d8SSatish Balay r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1); 201*e26ad7d8SSatish Balay for ( i=0; i<nrqr; ++i ) { 202*e26ad7d8SSatish Balay ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr); 203*e26ad7d8SSatish Balay } 204*e26ad7d8SSatish Balay 205*e26ad7d8SSatish Balay /* Allocate Memory for outgoing messages */ 206*e26ad7d8SSatish Balay len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int); 207*e26ad7d8SSatish Balay sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1); 208*e26ad7d8SSatish Balay ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */ 209*e26ad7d8SSatish Balay PetscMemzero(sbuf1,2*size*sizeof(int*)); 210*e26ad7d8SSatish Balay /* allocate memory for outgoing data + buf to receive the first reply */ 211*e26ad7d8SSatish Balay tmp = (int *) (ptr + size); 212*e26ad7d8SSatish Balay ctr = tmp + 2*msz; 213*e26ad7d8SSatish Balay 214*e26ad7d8SSatish Balay { 215*e26ad7d8SSatish Balay int *iptr = tmp,ict = 0; 216*e26ad7d8SSatish Balay for ( i=0; i<nrqs; i++ ) { 217*e26ad7d8SSatish Balay j = pa[i]; 218*e26ad7d8SSatish Balay iptr += ict; 219*e26ad7d8SSatish Balay sbuf1[j] = iptr; 220*e26ad7d8SSatish Balay ict = w1[j]; 221*e26ad7d8SSatish Balay } 222*e26ad7d8SSatish Balay } 223*e26ad7d8SSatish Balay 224*e26ad7d8SSatish Balay /* Form the outgoing messages */ 225*e26ad7d8SSatish Balay /* Initialize the header space */ 226*e26ad7d8SSatish Balay for ( i=0; i<nrqs; i++ ) { 227*e26ad7d8SSatish Balay j = pa[i]; 228*e26ad7d8SSatish Balay sbuf1[j][0] = 0; 229*e26ad7d8SSatish Balay PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int)); 230*e26ad7d8SSatish Balay ptr[j] = sbuf1[j] + 2*w3[j] + 1; 231*e26ad7d8SSatish Balay } 232*e26ad7d8SSatish Balay 233*e26ad7d8SSatish Balay /* Parse the isrow and copy data into outbuf */ 234*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 235*e26ad7d8SSatish Balay PetscMemzero(ctr,size*sizeof(int)); 236*e26ad7d8SSatish Balay irow_i = irow[i]; 237*e26ad7d8SSatish Balay jmax = nrow[i]; 238*e26ad7d8SSatish Balay for ( j=0; j<jmax; j++ ) { /* parse the indices of each IS */ 239*e26ad7d8SSatish Balay row = irow_i[j]; 240*e26ad7d8SSatish Balay proc = rtable[row]; 241*e26ad7d8SSatish Balay if (proc != rank) { /* copy to the outgoing buf*/ 242*e26ad7d8SSatish Balay ctr[proc]++; 243*e26ad7d8SSatish Balay *ptr[proc] = row; 244*e26ad7d8SSatish Balay ptr[proc]++; 245*e26ad7d8SSatish Balay } 246*e26ad7d8SSatish Balay } 247*e26ad7d8SSatish Balay /* Update the headers for the current IS */ 248*e26ad7d8SSatish Balay for ( j=0; j<size; j++ ) { /* Can Optimise this loop too */ 249*e26ad7d8SSatish Balay if ((ctr_j = ctr[j])) { 250*e26ad7d8SSatish Balay sbuf1_j = sbuf1[j]; 251*e26ad7d8SSatish Balay k = ++sbuf1_j[0]; 252*e26ad7d8SSatish Balay sbuf1_j[2*k] = ctr_j; 253*e26ad7d8SSatish Balay sbuf1_j[2*k-1] = i; 254*e26ad7d8SSatish Balay } 255*e26ad7d8SSatish Balay } 256*e26ad7d8SSatish Balay } 257*e26ad7d8SSatish Balay 258*e26ad7d8SSatish Balay /* Now post the sends */ 259*e26ad7d8SSatish Balay s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1); 260*e26ad7d8SSatish Balay for ( i=0; i<nrqs; ++i ) { 261*e26ad7d8SSatish Balay j = pa[i]; 262*e26ad7d8SSatish Balay /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 263*e26ad7d8SSatish Balay ierr = MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag0, comm, s_waits1+i);CHKERRQ(ierr); 264*e26ad7d8SSatish Balay } 265*e26ad7d8SSatish Balay 266*e26ad7d8SSatish Balay /* Post Receives to capture the buffer size */ 267*e26ad7d8SSatish Balay r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2); 268*e26ad7d8SSatish Balay rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2); 269*e26ad7d8SSatish Balay rbuf2[0] = tmp + msz; 270*e26ad7d8SSatish Balay for ( i=1; i<nrqs; ++i ) { 271*e26ad7d8SSatish Balay j = pa[i]; 272*e26ad7d8SSatish Balay rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 273*e26ad7d8SSatish Balay } 274*e26ad7d8SSatish Balay for ( i=0; i<nrqs; ++i ) { 275*e26ad7d8SSatish Balay j = pa[i]; 276*e26ad7d8SSatish Balay ierr = MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag1, comm, r_waits2+i);CHKERRQ(ierr); 277*e26ad7d8SSatish Balay } 278*e26ad7d8SSatish Balay 279*e26ad7d8SSatish Balay /* Send to other procs the buf size they should allocate */ 280*e26ad7d8SSatish Balay 281*e26ad7d8SSatish Balay 282*e26ad7d8SSatish Balay /* Receive messages*/ 283*e26ad7d8SSatish Balay s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2); 284*e26ad7d8SSatish Balay r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(r_status1); 285*e26ad7d8SSatish Balay len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*); 286*e26ad7d8SSatish Balay sbuf2 = (int**) PetscMalloc(len);CHKPTRQ(sbuf2); 287*e26ad7d8SSatish Balay req_size = (int *) (sbuf2 + nrqr); 288*e26ad7d8SSatish Balay req_source = req_size + nrqr; 289*e26ad7d8SSatish Balay 290*e26ad7d8SSatish Balay { 291*e26ad7d8SSatish Balay int *sbuf2_i; 292*e26ad7d8SSatish Balay 293*e26ad7d8SSatish Balay for ( i=0; i<nrqr; ++i ) { 294*e26ad7d8SSatish Balay ierr = MPI_Waitany(nrqr, r_waits1, &index, r_status1+i);CHKERRQ(ierr); 295*e26ad7d8SSatish Balay req_size[index] = 0; 296*e26ad7d8SSatish Balay rbuf1_i = rbuf1[index]; 297*e26ad7d8SSatish Balay start = 2*rbuf1_i[0] + 1; 298*e26ad7d8SSatish Balay MPI_Get_count(r_status1+i,MPI_INT, &end); 299*e26ad7d8SSatish Balay sbuf2[index] = (int *)PetscMalloc((end+1)*sizeof(int));CHKPTRQ(sbuf2[index]); 300*e26ad7d8SSatish Balay sbuf2_i = sbuf2[index]; 301*e26ad7d8SSatish Balay for ( j=start; j<end; j++ ) { 302*e26ad7d8SSatish Balay /** ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; */ 303*e26ad7d8SSatish Balay /** ncols is now the whole row in the dense case */ 304*e26ad7d8SSatish Balay /** Can get rid of this unnecessary communication */ 305*e26ad7d8SSatish Balay ncols = c->N; 306*e26ad7d8SSatish Balay sbuf2_i[j] = ncols; 307*e26ad7d8SSatish Balay req_size[index] += ncols; 308*e26ad7d8SSatish Balay } 309*e26ad7d8SSatish Balay req_source[index] = r_status1[i].MPI_SOURCE; 310*e26ad7d8SSatish Balay /* form the header */ 311*e26ad7d8SSatish Balay sbuf2_i[0] = req_size[index]; 312*e26ad7d8SSatish Balay for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; } 313*e26ad7d8SSatish Balay ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i); CHKERRQ(ierr); 314*e26ad7d8SSatish Balay } 315*e26ad7d8SSatish Balay } 316*e26ad7d8SSatish Balay PetscFree(r_status1); PetscFree(r_waits1); 317*e26ad7d8SSatish Balay 318*e26ad7d8SSatish Balay /* recv buffer sizes */ 319*e26ad7d8SSatish Balay /* Receive messages*/ 320*e26ad7d8SSatish Balay 321*e26ad7d8SSatish Balay rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3); 322*e26ad7d8SSatish Balay rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4); 323*e26ad7d8SSatish Balay r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits3); 324*e26ad7d8SSatish Balay r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits4); 325*e26ad7d8SSatish Balay r_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status2); 326*e26ad7d8SSatish Balay 327*e26ad7d8SSatish Balay for ( i=0; i<nrqs; ++i ) { 328*e26ad7d8SSatish Balay ierr = MPI_Waitany(nrqs, r_waits2, &index, r_status2+i);CHKERRQ(ierr); 329*e26ad7d8SSatish Balay rbuf3[index] = (int *)PetscMalloc((rbuf2[index][0]+1)*sizeof(int));CHKPTRQ(rbuf3[index]); 330*e26ad7d8SSatish Balay rbuf4[index] = (Scalar *)PetscMalloc((rbuf2[index][0]+1)*sizeof(Scalar));CHKPTRQ(rbuf4[index]); 331*e26ad7d8SSatish Balay ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT, 332*e26ad7d8SSatish Balay r_status2[i].MPI_SOURCE, tag2, comm, r_waits3+index); CHKERRQ(ierr); 333*e26ad7d8SSatish Balay ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0], MPIU_SCALAR, 334*e26ad7d8SSatish Balay r_status2[i].MPI_SOURCE, tag3, comm, r_waits4+index); CHKERRQ(ierr); 335*e26ad7d8SSatish Balay } 336*e26ad7d8SSatish Balay PetscFree(r_status2); PetscFree(r_waits2); 337*e26ad7d8SSatish Balay 338*e26ad7d8SSatish Balay /* Wait on sends1 and sends2 */ 339*e26ad7d8SSatish Balay s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status1); 340*e26ad7d8SSatish Balay s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status2); 341*e26ad7d8SSatish Balay 342*e26ad7d8SSatish Balay ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 343*e26ad7d8SSatish Balay ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 344*e26ad7d8SSatish Balay PetscFree(s_status1); PetscFree(s_status2); 345*e26ad7d8SSatish Balay PetscFree(s_waits1); PetscFree(s_waits2); 346*e26ad7d8SSatish Balay 347*e26ad7d8SSatish Balay /* Now allocate buffers for a->j, and send them off */ 348*e26ad7d8SSatish Balay /** No space required for a->j... as the complete row is packed and sent */ 349*e26ad7d8SSatish Balay /** sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); */ 350*e26ad7d8SSatish Balay /** for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; */ 351*e26ad7d8SSatish Balay /** sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); */ 352*e26ad7d8SSatish Balay /** for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; */ 353*e26ad7d8SSatish Balay 354*e26ad7d8SSatish Balay s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits3); 355*e26ad7d8SSatish Balay { 356*e26ad7d8SSatish Balay int nzA, nzB, *a_i = a->i, *b_i = b->i, imark; 357*e26ad7d8SSatish Balay int *cworkA, *cworkB, cstart = c->cstart, rstart = c->rstart, *bmap = c->garray; 358*e26ad7d8SSatish Balay int *a_j = a->j, *b_j = b->j, ctmp, *t_cols; 359*e26ad7d8SSatish Balay 360*e26ad7d8SSatish Balay for ( i=0; i<nrqr; i++ ) { 361*e26ad7d8SSatish Balay rbuf1_i = rbuf1[i]; 362*e26ad7d8SSatish Balay sbuf_aj_i = sbuf_aj[i]; 363*e26ad7d8SSatish Balay ct1 = 2*rbuf1_i[0] + 1; 364*e26ad7d8SSatish Balay ct2 = 0; 365*e26ad7d8SSatish Balay for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 366*e26ad7d8SSatish Balay kmax = rbuf1[i][2*j]; 367*e26ad7d8SSatish Balay for ( k=0; k<kmax; k++,ct1++ ) { 368*e26ad7d8SSatish Balay row = rbuf1_i[ct1] - rstart; 369*e26ad7d8SSatish Balay nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 370*e26ad7d8SSatish Balay ncols = nzA + nzB; 371*e26ad7d8SSatish Balay cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 372*e26ad7d8SSatish Balay 373*e26ad7d8SSatish Balay /* load the column indices for this row into cols*/ 374*e26ad7d8SSatish Balay cols = sbuf_aj_i + ct2; 375*e26ad7d8SSatish Balay for ( l=0; l<nzB; l++ ) { 376*e26ad7d8SSatish Balay if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 377*e26ad7d8SSatish Balay else break; 378*e26ad7d8SSatish Balay } 379*e26ad7d8SSatish Balay imark = l; 380*e26ad7d8SSatish Balay for ( l=0; l<nzA; l++ ) cols[imark+l] = cstart + cworkA[l]; 381*e26ad7d8SSatish Balay for ( l=imark; l<nzB; l++ ) cols[nzA+l] = bmap[cworkB[l]]; 382*e26ad7d8SSatish Balay 383*e26ad7d8SSatish Balay ct2 += ncols; 384*e26ad7d8SSatish Balay } 385*e26ad7d8SSatish Balay } 386*e26ad7d8SSatish Balay ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 387*e26ad7d8SSatish Balay } 388*e26ad7d8SSatish Balay } 389*e26ad7d8SSatish Balay r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status3); 390*e26ad7d8SSatish Balay s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status3); 391*e26ad7d8SSatish Balay 392*e26ad7d8SSatish Balay /* Allocate buffers for a->a, and send them off */ 393*e26ad7d8SSatish Balay sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa); 394*e26ad7d8SSatish Balay for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; 395*e26ad7d8SSatish Balay sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]); 396*e26ad7d8SSatish Balay for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 397*e26ad7d8SSatish Balay 398*e26ad7d8SSatish Balay s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits4); 399*e26ad7d8SSatish Balay { 400*e26ad7d8SSatish Balay int nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, imark; 401*e26ad7d8SSatish Balay int cstart = c->cstart, rstart = c->rstart, *bmap = c->garray; 402*e26ad7d8SSatish Balay int *b_j = b->j; 403*e26ad7d8SSatish Balay Scalar *vworkA, *vworkB, *a_a = a->a, *b_a = b->a,*t_vals; 404*e26ad7d8SSatish Balay 405*e26ad7d8SSatish Balay for ( i=0; i<nrqr; i++ ) { 406*e26ad7d8SSatish Balay rbuf1_i = rbuf1[i]; 407*e26ad7d8SSatish Balay sbuf_aa_i = sbuf_aa[i]; 408*e26ad7d8SSatish Balay ct1 = 2*rbuf1_i[0]+1; 409*e26ad7d8SSatish Balay ct2 = 0; 410*e26ad7d8SSatish Balay for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) { 411*e26ad7d8SSatish Balay kmax = rbuf1_i[2*j]; 412*e26ad7d8SSatish Balay for ( k=0; k<kmax; k++,ct1++ ) { 413*e26ad7d8SSatish Balay row = rbuf1_i[ct1] - rstart; 414*e26ad7d8SSatish Balay nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 415*e26ad7d8SSatish Balay ncols = nzA + nzB; 416*e26ad7d8SSatish Balay cworkB = b_j + b_i[row]; 417*e26ad7d8SSatish Balay vworkA = a_a + a_i[row]; 418*e26ad7d8SSatish Balay vworkB = b_a + b_i[row]; 419*e26ad7d8SSatish Balay 420*e26ad7d8SSatish Balay /* load the column values for this row into vals*/ 421*e26ad7d8SSatish Balay vals = sbuf_aa_i+ct2; 422*e26ad7d8SSatish Balay for ( l=0; l<nzB; l++ ) { 423*e26ad7d8SSatish Balay if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l]; 424*e26ad7d8SSatish Balay else break; 425*e26ad7d8SSatish Balay } 426*e26ad7d8SSatish Balay imark = l; 427*e26ad7d8SSatish Balay for ( l=0; l<nzA; l++ ) vals[imark+l] = vworkA[l]; 428*e26ad7d8SSatish Balay for ( l=imark; l<nzB; l++ ) vals[nzA+l] = vworkB[l]; 429*e26ad7d8SSatish Balay ct2 += ncols; 430*e26ad7d8SSatish Balay } 431*e26ad7d8SSatish Balay } 432*e26ad7d8SSatish Balay ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 433*e26ad7d8SSatish Balay } 434*e26ad7d8SSatish Balay } 435*e26ad7d8SSatish Balay r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status4); 436*e26ad7d8SSatish Balay s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status4); 437*e26ad7d8SSatish Balay PetscFree(rbuf1); 438*e26ad7d8SSatish Balay 439*e26ad7d8SSatish Balay /* Form the matrix */ 440*e26ad7d8SSatish Balay /* create col map */ 441*e26ad7d8SSatish Balay { 442*e26ad7d8SSatish Balay int *icol_i; 443*e26ad7d8SSatish Balay 444*e26ad7d8SSatish Balay len = (1+ismax)*sizeof(int *) + ismax*c->N*sizeof(int); 445*e26ad7d8SSatish Balay cmap = (int **)PetscMalloc(len); CHKPTRQ(cmap); 446*e26ad7d8SSatish Balay cmap[0] = (int *)(cmap + ismax); 447*e26ad7d8SSatish Balay PetscMemzero(cmap[0],(1+ismax*c->N)*sizeof(int)); 448*e26ad7d8SSatish Balay for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->N; } 449*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 450*e26ad7d8SSatish Balay jmax = ncol[i]; 451*e26ad7d8SSatish Balay icol_i = icol[i]; 452*e26ad7d8SSatish Balay cmap_i = cmap[i]; 453*e26ad7d8SSatish Balay for ( j=0; j<jmax; j++ ) { 454*e26ad7d8SSatish Balay cmap_i[icol_i[j]] = j+1; 455*e26ad7d8SSatish Balay } 456*e26ad7d8SSatish Balay } 457*e26ad7d8SSatish Balay } 458*e26ad7d8SSatish Balay 459*e26ad7d8SSatish Balay /* Create lens which is required for MatCreate... */ 460*e26ad7d8SSatish Balay for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; } 461*e26ad7d8SSatish Balay len = (1+ismax)*sizeof(int *) + j*sizeof(int); 462*e26ad7d8SSatish Balay lens = (int **)PetscMalloc(len); CHKPTRQ(lens); 463*e26ad7d8SSatish Balay lens[0] = (int *)(lens + ismax); 464*e26ad7d8SSatish Balay PetscMemzero(lens[0], j*sizeof(int)); 465*e26ad7d8SSatish Balay for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; } 466*e26ad7d8SSatish Balay 467*e26ad7d8SSatish Balay /* Update lens from local data */ 468*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 469*e26ad7d8SSatish Balay jmax = nrow[i]; 470*e26ad7d8SSatish Balay cmap_i = cmap[i]; 471*e26ad7d8SSatish Balay irow_i = irow[i]; 472*e26ad7d8SSatish Balay lens_i = lens[i]; 473*e26ad7d8SSatish Balay for ( j=0; j<jmax; j++ ) { 474*e26ad7d8SSatish Balay row = irow_i[j]; 475*e26ad7d8SSatish Balay proc = rtable[row]; 476*e26ad7d8SSatish Balay if (proc == rank) { 477*e26ad7d8SSatish Balay ierr = MatGetRow_MPIDense(C,row,&ncols,&cols,0); CHKERRQ(ierr); 478*e26ad7d8SSatish Balay for ( k=0; k<ncols; k++ ) { 479*e26ad7d8SSatish Balay if (cmap_i[cols[k]]) { lens_i[j]++;} 480*e26ad7d8SSatish Balay } 481*e26ad7d8SSatish Balay ierr = MatRestoreRow_MPIDense(C,row,&ncols,&cols,0); CHKERRQ(ierr); 482*e26ad7d8SSatish Balay } 483*e26ad7d8SSatish Balay } 484*e26ad7d8SSatish Balay } 485*e26ad7d8SSatish Balay 486*e26ad7d8SSatish Balay /* Create row map*/ 487*e26ad7d8SSatish Balay len = (1+ismax)*sizeof(int *) + ismax*c->M*sizeof(int); 488*e26ad7d8SSatish Balay rmap = (int **)PetscMalloc(len); CHKPTRQ(rmap); 489*e26ad7d8SSatish Balay rmap[0] = (int *)(rmap + ismax); 490*e26ad7d8SSatish Balay PetscMemzero(rmap[0],ismax*c->M*sizeof(int)); 491*e26ad7d8SSatish Balay for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->M;} 492*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 493*e26ad7d8SSatish Balay rmap_i = rmap[i]; 494*e26ad7d8SSatish Balay irow_i = irow[i]; 495*e26ad7d8SSatish Balay jmax = nrow[i]; 496*e26ad7d8SSatish Balay for ( j=0; j<jmax; j++ ) { 497*e26ad7d8SSatish Balay rmap_i[irow_i[j]] = j; 498*e26ad7d8SSatish Balay } 499*e26ad7d8SSatish Balay } 500*e26ad7d8SSatish Balay 501*e26ad7d8SSatish Balay /* Update lens from offproc data */ 502*e26ad7d8SSatish Balay { 503*e26ad7d8SSatish Balay int *rbuf2_i, *rbuf3_i, *sbuf1_i; 504*e26ad7d8SSatish Balay 505*e26ad7d8SSatish Balay for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 506*e26ad7d8SSatish Balay ierr = MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2);CHKERRQ(ierr); 507*e26ad7d8SSatish Balay index = pa[i]; 508*e26ad7d8SSatish Balay sbuf1_i = sbuf1[index]; 509*e26ad7d8SSatish Balay jmax = sbuf1_i[0]; 510*e26ad7d8SSatish Balay ct1 = 2*jmax+1; 511*e26ad7d8SSatish Balay ct2 = 0; 512*e26ad7d8SSatish Balay rbuf2_i = rbuf2[i]; 513*e26ad7d8SSatish Balay rbuf3_i = rbuf3[i]; 514*e26ad7d8SSatish Balay for ( j=1; j<=jmax; j++ ) { 515*e26ad7d8SSatish Balay is_no = sbuf1_i[2*j-1]; 516*e26ad7d8SSatish Balay max1 = sbuf1_i[2*j]; 517*e26ad7d8SSatish Balay lens_i = lens[is_no]; 518*e26ad7d8SSatish Balay cmap_i = cmap[is_no]; 519*e26ad7d8SSatish Balay rmap_i = rmap[is_no]; 520*e26ad7d8SSatish Balay for ( k=0; k<max1; k++,ct1++ ) { 521*e26ad7d8SSatish Balay row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 522*e26ad7d8SSatish Balay max2 = rbuf2_i[ct1]; 523*e26ad7d8SSatish Balay for ( l=0; l<max2; l++,ct2++ ) { 524*e26ad7d8SSatish Balay if (cmap_i[rbuf3_i[ct2]]) { 525*e26ad7d8SSatish Balay lens_i[row]++; 526*e26ad7d8SSatish Balay } 527*e26ad7d8SSatish Balay } 528*e26ad7d8SSatish Balay } 529*e26ad7d8SSatish Balay } 530*e26ad7d8SSatish Balay } 531*e26ad7d8SSatish Balay } 532*e26ad7d8SSatish Balay PetscFree(r_status3); PetscFree(r_waits3); 533*e26ad7d8SSatish Balay ierr = MPI_Waitall(nrqr,s_waits3,s_status3); CHKERRQ(ierr); 534*e26ad7d8SSatish Balay PetscFree(s_status3); PetscFree(s_waits3); 535*e26ad7d8SSatish Balay 536*e26ad7d8SSatish Balay /* Create the submatrices */ 537*e26ad7d8SSatish Balay if (scall == MAT_REUSE_MATRIX) { 538*e26ad7d8SSatish Balay /* 539*e26ad7d8SSatish Balay Assumes new rows are same length as the old rows, hence bug! 540*e26ad7d8SSatish Balay */ 541*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 542*e26ad7d8SSatish Balay mat = (Mat_SeqDense *)(submats[i]->data); 543*e26ad7d8SSatish Balay if ((mat->m != nrow[i]) || (mat->n != ncol[i])) { 544*e26ad7d8SSatish Balay SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong size"); 545*e26ad7d8SSatish Balay } 546*e26ad7d8SSatish Balay if (PetscMemcmp(mat->ilen,lens[i], mat->m *sizeof(int))) { 547*e26ad7d8SSatish Balay SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros"); 548*e26ad7d8SSatish Balay } 549*e26ad7d8SSatish Balay /* Initial matrix as if empty */ 550*e26ad7d8SSatish Balay PetscMemzero(mat->ilen,mat->m*sizeof(int)); 551*e26ad7d8SSatish Balay submats[i]->factor = C->factor; 552*e26ad7d8SSatish Balay } 553*e26ad7d8SSatish Balay } else { 554*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 555*e26ad7d8SSatish Balay ierr = MatCreateSeqDense(PETSC_COMM_SELF,nrow[i],ncol[i],0,lens[i],submats+i);CHKERRQ(ierr); 556*e26ad7d8SSatish Balay } 557*e26ad7d8SSatish Balay } 558*e26ad7d8SSatish Balay 559*e26ad7d8SSatish Balay /* Assemble the matrices */ 560*e26ad7d8SSatish Balay /* First assemble the local rows */ 561*e26ad7d8SSatish Balay { 562*e26ad7d8SSatish Balay int ilen_row,*imat_ilen, *imat_j, *imat_i,old_row; 563*e26ad7d8SSatish Balay Scalar *imat_a; 564*e26ad7d8SSatish Balay 565*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 566*e26ad7d8SSatish Balay mat = (Mat_SeqDense *) submats[i]->data; 567*e26ad7d8SSatish Balay imat_ilen = mat->ilen; 568*e26ad7d8SSatish Balay imat_j = mat->j; 569*e26ad7d8SSatish Balay imat_i = mat->i; 570*e26ad7d8SSatish Balay imat_a = mat->a; 571*e26ad7d8SSatish Balay cmap_i = cmap[i]; 572*e26ad7d8SSatish Balay rmap_i = rmap[i]; 573*e26ad7d8SSatish Balay irow_i = irow[i]; 574*e26ad7d8SSatish Balay jmax = nrow[i]; 575*e26ad7d8SSatish Balay for ( j=0; j<jmax; j++ ) { 576*e26ad7d8SSatish Balay row = irow_i[j]; 577*e26ad7d8SSatish Balay proc = rtable[row]; 578*e26ad7d8SSatish Balay if (proc == rank) { 579*e26ad7d8SSatish Balay old_row = row; 580*e26ad7d8SSatish Balay row = rmap_i[row]; 581*e26ad7d8SSatish Balay ilen_row = imat_ilen[row]; 582*e26ad7d8SSatish Balay ierr = MatGetRow_MPIDense(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 583*e26ad7d8SSatish Balay mat_i = imat_i[row]; 584*e26ad7d8SSatish Balay mat_a = imat_a + mat_i; 585*e26ad7d8SSatish Balay mat_j = imat_j + mat_i; 586*e26ad7d8SSatish Balay for ( k=0; k<ncols; k++ ) { 587*e26ad7d8SSatish Balay if ((tcol = cmap_i[cols[k]])) { 588*e26ad7d8SSatish Balay *mat_j++ = tcol - -1; 589*e26ad7d8SSatish Balay *mat_a++ = vals[k]; 590*e26ad7d8SSatish Balay ilen_row++; 591*e26ad7d8SSatish Balay } 592*e26ad7d8SSatish Balay } 593*e26ad7d8SSatish Balay ierr = MatRestoreRow_MPIDense(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 594*e26ad7d8SSatish Balay imat_ilen[row] = ilen_row; 595*e26ad7d8SSatish Balay } 596*e26ad7d8SSatish Balay } 597*e26ad7d8SSatish Balay } 598*e26ad7d8SSatish Balay } 599*e26ad7d8SSatish Balay 600*e26ad7d8SSatish Balay /* Now assemble the off proc rows*/ 601*e26ad7d8SSatish Balay { 602*e26ad7d8SSatish Balay int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 603*e26ad7d8SSatish Balay int *imat_j,*imat_i; 604*e26ad7d8SSatish Balay Scalar *imat_a,*rbuf4_i; 605*e26ad7d8SSatish Balay 606*e26ad7d8SSatish Balay for ( tmp2=0; tmp2<nrqs; tmp2++ ) { 607*e26ad7d8SSatish Balay ierr = MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2);CHKERRQ(ierr); 608*e26ad7d8SSatish Balay index = pa[i]; 609*e26ad7d8SSatish Balay sbuf1_i = sbuf1[index]; 610*e26ad7d8SSatish Balay jmax = sbuf1_i[0]; 611*e26ad7d8SSatish Balay ct1 = 2*jmax + 1; 612*e26ad7d8SSatish Balay ct2 = 0; 613*e26ad7d8SSatish Balay rbuf2_i = rbuf2[i]; 614*e26ad7d8SSatish Balay rbuf3_i = rbuf3[i]; 615*e26ad7d8SSatish Balay rbuf4_i = rbuf4[i]; 616*e26ad7d8SSatish Balay for ( j=1; j<=jmax; j++ ) { 617*e26ad7d8SSatish Balay is_no = sbuf1_i[2*j-1]; 618*e26ad7d8SSatish Balay rmap_i = rmap[is_no]; 619*e26ad7d8SSatish Balay cmap_i = cmap[is_no]; 620*e26ad7d8SSatish Balay mat = (Mat_SeqDense *) submats[is_no]->data; 621*e26ad7d8SSatish Balay imat_ilen = mat->ilen; 622*e26ad7d8SSatish Balay imat_j = mat->j; 623*e26ad7d8SSatish Balay imat_i = mat->i; 624*e26ad7d8SSatish Balay imat_a = mat->a; 625*e26ad7d8SSatish Balay max1 = sbuf1_i[2*j]; 626*e26ad7d8SSatish Balay for ( k=0; k<max1; k++, ct1++ ) { 627*e26ad7d8SSatish Balay row = sbuf1_i[ct1]; 628*e26ad7d8SSatish Balay row = rmap_i[row]; 629*e26ad7d8SSatish Balay ilen = imat_ilen[row]; 630*e26ad7d8SSatish Balay mat_i = imat_i[row]; 631*e26ad7d8SSatish Balay mat_a = imat_a + mat_i; 632*e26ad7d8SSatish Balay mat_j = imat_j + mat_i; 633*e26ad7d8SSatish Balay max2 = rbuf2_i[ct1]; 634*e26ad7d8SSatish Balay for ( l=0; l<max2; l++,ct2++ ) { 635*e26ad7d8SSatish Balay if ((tcol = cmap_i[rbuf3_i[ct2]])) { 636*e26ad7d8SSatish Balay *mat_j++ = tcol - 1; 637*e26ad7d8SSatish Balay *mat_a++ = rbuf4_i[ct2]; 638*e26ad7d8SSatish Balay ilen++; 639*e26ad7d8SSatish Balay } 640*e26ad7d8SSatish Balay } 641*e26ad7d8SSatish Balay imat_ilen[row] = ilen; 642*e26ad7d8SSatish Balay } 643*e26ad7d8SSatish Balay } 644*e26ad7d8SSatish Balay } 645*e26ad7d8SSatish Balay } 646*e26ad7d8SSatish Balay PetscFree(r_status4); PetscFree(r_waits4); 647*e26ad7d8SSatish Balay ierr = MPI_Waitall(nrqr,s_waits4,s_status4); CHKERRQ(ierr); 648*e26ad7d8SSatish Balay PetscFree(s_waits4); PetscFree(s_status4); 649*e26ad7d8SSatish Balay 650*e26ad7d8SSatish Balay /* Restore the indices */ 651*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 652*e26ad7d8SSatish Balay ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 653*e26ad7d8SSatish Balay ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 654*e26ad7d8SSatish Balay } 655*e26ad7d8SSatish Balay 656*e26ad7d8SSatish Balay /* Destroy allocated memory */ 657*e26ad7d8SSatish Balay PetscFree(irow); 658*e26ad7d8SSatish Balay PetscFree(w1); 659*e26ad7d8SSatish Balay PetscFree(pa); 660*e26ad7d8SSatish Balay 661*e26ad7d8SSatish Balay PetscFree(sbuf1); 662*e26ad7d8SSatish Balay PetscFree(rbuf2); 663*e26ad7d8SSatish Balay for ( i=0; i<nrqr; ++i ) { 664*e26ad7d8SSatish Balay PetscFree(sbuf2[i]); 665*e26ad7d8SSatish Balay } 666*e26ad7d8SSatish Balay for ( i=0; i<nrqs; ++i ) { 667*e26ad7d8SSatish Balay PetscFree(rbuf3[i]); 668*e26ad7d8SSatish Balay PetscFree(rbuf4[i]); 669*e26ad7d8SSatish Balay } 670*e26ad7d8SSatish Balay 671*e26ad7d8SSatish Balay PetscFree(sbuf2); 672*e26ad7d8SSatish Balay PetscFree(rbuf3); 673*e26ad7d8SSatish Balay PetscFree(rbuf4 ); 674*e26ad7d8SSatish Balay PetscFree(sbuf_aj[0]); 675*e26ad7d8SSatish Balay PetscFree(sbuf_aj); 676*e26ad7d8SSatish Balay PetscFree(sbuf_aa[0]); 677*e26ad7d8SSatish Balay PetscFree(sbuf_aa); 678*e26ad7d8SSatish Balay 679*e26ad7d8SSatish Balay PetscFree(cmap); 680*e26ad7d8SSatish Balay PetscFree(rmap); 681*e26ad7d8SSatish Balay PetscFree(lens); 682*e26ad7d8SSatish Balay 683*e26ad7d8SSatish Balay ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3); CHKERRQ(ierr); 684*e26ad7d8SSatish Balay ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); 685*e26ad7d8SSatish Balay ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); 686*e26ad7d8SSatish Balay 687*e26ad7d8SSatish Balay for ( i=0; i<ismax; i++ ) { 688*e26ad7d8SSatish Balay ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 689*e26ad7d8SSatish Balay ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); 690*e26ad7d8SSatish Balay } 691*e26ad7d8SSatish Balay 692*e26ad7d8SSatish Balay PetscFunctionReturn(0); 693*e26ad7d8SSatish Balay } 694*e26ad7d8SSatish Balay 695*e26ad7d8SSatish Balay #endif 696