xref: /petsc/src/mat/impls/dense/mpi/mmdense.c (revision e26ad7d8ee88123bbdfe3b070a3e2d22b606b0ee)
1a5eb4965SSatish Balay #ifdef PETSC_RCS_HEADER
2*e26ad7d8SSatish Balay static char vcid[] = "$Id: mmdense.c,v 1.13 1997/10/19 03:25:11 bsmith Exp balay $";
30500aeceSBarry Smith #endif
40500aeceSBarry Smith 
50500aeceSBarry Smith /*
60500aeceSBarry Smith    Support for the parallel dense matrix vector multiply
70500aeceSBarry Smith */
870f55243SBarry Smith #include "src/mat/impls/dense/mpi/mpidense.h"
9f5eb4b81SSatish Balay #include "src/vec/vecimpl.h"
100500aeceSBarry Smith 
115615d1e5SSatish Balay #undef __FUNC__
125615d1e5SSatish Balay #define __FUNC__ "MatSetUpMultiply_MPIDense"
130500aeceSBarry Smith int MatSetUpMultiply_MPIDense(Mat mat)
140500aeceSBarry Smith {
150500aeceSBarry Smith   Mat_MPIDense *mdn = (Mat_MPIDense *) mat->data;
16ed3cc1f0SBarry Smith   int          ierr,n;
17d9f96c7cSLois Curfman McInnes   IS           tofrom;
180500aeceSBarry Smith   Vec          gvec;
190500aeceSBarry Smith 
203a40ed3dSBarry Smith   PetscFunctionBegin;
210500aeceSBarry Smith   /* Create local vector that is used to scatter into */
22029af93fSBarry Smith   ierr = VecCreateSeq(PETSC_COMM_SELF,mdn->N,&mdn->lvec); CHKERRQ(ierr);
230500aeceSBarry Smith 
240500aeceSBarry Smith   /* Create temporary index set for building scatter gather */
25029af93fSBarry Smith   ierr = ISCreateStride(PETSC_COMM_SELF,mdn->N,0,1,&tofrom); CHKERRQ(ierr);
260500aeceSBarry Smith 
270500aeceSBarry Smith   /* Create temporary global vector to generate scatter context */
28ed3cc1f0SBarry Smith   n    = mdn->cowners[mdn->rank+1] - mdn->cowners[mdn->rank];
29ed3cc1f0SBarry Smith   ierr = VecCreateMPI(mat->comm,n,mdn->N,&gvec); CHKERRQ(ierr);
30e19c7942SLois Curfman McInnes 
310500aeceSBarry Smith   /* Generate the scatter context */
32d9f96c7cSLois Curfman McInnes   ierr = VecScatterCreate(gvec,tofrom,mdn->lvec,tofrom,&mdn->Mvctx); CHKERRQ(ierr);
330500aeceSBarry Smith   PLogObjectParent(mat,mdn->Mvctx);
340500aeceSBarry Smith   PLogObjectParent(mat,mdn->lvec);
35d9f96c7cSLois Curfman McInnes   PLogObjectParent(mat,tofrom);
36d9f96c7cSLois Curfman McInnes   PLogObjectParent(mat,gvec);
37e19c7942SLois Curfman McInnes 
38d9f96c7cSLois Curfman McInnes   ierr = ISDestroy(tofrom); CHKERRQ(ierr);
39d9f96c7cSLois Curfman McInnes   ierr = VecDestroy(gvec); CHKERRQ(ierr);
403a40ed3dSBarry Smith   PetscFunctionReturn(0);
410500aeceSBarry Smith }
420500aeceSBarry Smith 
43*e26ad7d8SSatish Balay #if defined (__JUNK__)
44*e26ad7d8SSatish Balay 
45*e26ad7d8SSatish Balay int MatGetSubMatrices_MPIDense_Local(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat*);
46*e26ad7d8SSatish Balay #undef __FUNC__
47*e26ad7d8SSatish Balay #define __FUNC__ "MatGetSubMatrices_MPIDense"
48*e26ad7d8SSatish Balay int MatGetSubMatrices_MPIDense(Mat C,int ismax,IS *isrow,IS *iscol,
49*e26ad7d8SSatish Balay                              MatGetSubMatrixCall scall,Mat **submat)
50*e26ad7d8SSatish Balay {
51*e26ad7d8SSatish Balay   Mat_MPIDense  *c = (Mat_MPIDense *) C->data;
52*e26ad7d8SSatish Balay   int         nmax,nstages_local,nstages,i,pos,max_no,ierr;
53*e26ad7d8SSatish Balay 
54*e26ad7d8SSatish Balay   PetscFunctionBegin;
55*e26ad7d8SSatish Balay   /* Allocate memory to hold all the submatrices */
56*e26ad7d8SSatish Balay   if (scall != MAT_REUSE_MATRIX) {
57*e26ad7d8SSatish Balay     *submat = (Mat *)PetscMalloc((ismax+1)*sizeof(Mat));CHKPTRQ(*submat);
58*e26ad7d8SSatish Balay   }
59*e26ad7d8SSatish Balay   /* Determine the number of stages through which submatrices are done */
60*e26ad7d8SSatish Balay   nmax          = 20*1000000 / (c->N * sizeof(int));
61*e26ad7d8SSatish Balay   if (nmax == 0) nmax = 1;
62*e26ad7d8SSatish Balay   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
63*e26ad7d8SSatish Balay 
64*e26ad7d8SSatish Balay   /* Make sure every processor loops through the nstages */
65*e26ad7d8SSatish Balay   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
660500aeceSBarry Smith 
670500aeceSBarry Smith 
68*e26ad7d8SSatish Balay   for ( i=0,pos=0; i<nstages; i++ ) {
69*e26ad7d8SSatish Balay     if (pos+nmax <= ismax) max_no = nmax;
70*e26ad7d8SSatish Balay     else if (pos == ismax) max_no = 0;
71*e26ad7d8SSatish Balay     else                   max_no = ismax-pos;
72*e26ad7d8SSatish Balay     ierr = MatGetSubMatrices_MPIDense_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
73*e26ad7d8SSatish Balay     pos += max_no;
74*e26ad7d8SSatish Balay   }
75*e26ad7d8SSatish Balay   PetscFunctionReturn(0);
76*e26ad7d8SSatish Balay }
77*e26ad7d8SSatish Balay /* -------------------------------------------------------------------------*/
78*e26ad7d8SSatish Balay #undef __FUNC__
79*e26ad7d8SSatish Balay #define __FUNC__ "MatGetSubMatrices_MPIDense_Local"
80*e26ad7d8SSatish Balay int MatGetSubMatrices_MPIDense_Local(Mat C,int ismax,IS *isrow,IS *iscol,
81*e26ad7d8SSatish Balay                              MatGetSubMatrixCall scall,Mat *submats)
82*e26ad7d8SSatish Balay {
83*e26ad7d8SSatish Balay   Mat_MPIDense  *c = (Mat_MPIDense *) C->data;
84*e26ad7d8SSatish Balay   Mat         A = c->A;
85*e26ad7d8SSatish Balay   Mat_SeqDense  *a = (Mat_SeqDense*)A->data, *mat;
86*e26ad7d8SSatish Balay   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
87*e26ad7d8SSatish Balay   int         **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc;
88*e26ad7d8SSatish Balay   int         nrqs, msz, **ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr;
89*e26ad7d8SSatish Balay   int         **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap;
90*e26ad7d8SSatish Balay   int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
91*e26ad7d8SSatish Balay   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
92*e26ad7d8SSatish Balay   int         *rmap_i,tag0,tag1,tag2,tag3;
93*e26ad7d8SSatish Balay   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
94*e26ad7d8SSatish Balay   MPI_Request *r_waits4,*s_waits3,*s_waits4;
95*e26ad7d8SSatish Balay   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
96*e26ad7d8SSatish Balay   MPI_Status  *r_status3,*r_status4,*s_status4;
97*e26ad7d8SSatish Balay   MPI_Comm    comm;
98*e26ad7d8SSatish Balay   Scalar      **rbuf4, **sbuf_aa, *vals, *mat_a, *sbuf_aa_i;
99*e26ad7d8SSatish Balay 
100*e26ad7d8SSatish Balay   PetscFunctionBegin;
101*e26ad7d8SSatish Balay   comm   = C->comm;
102*e26ad7d8SSatish Balay   tag0   = C->tag;
103*e26ad7d8SSatish Balay   size   = c->size;
104*e26ad7d8SSatish Balay   rank   = c->rank;
105*e26ad7d8SSatish Balay   m      = c->M;
106*e26ad7d8SSatish Balay 
107*e26ad7d8SSatish Balay   /* Get some new tags to keep the communication clean */
108*e26ad7d8SSatish Balay   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr);
109*e26ad7d8SSatish Balay   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr);
110*e26ad7d8SSatish Balay   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3); CHKERRQ(ierr);
111*e26ad7d8SSatish Balay 
112*e26ad7d8SSatish Balay     /* Check if the col indices are sorted */
113*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
114*e26ad7d8SSatish Balay     ierr = ISSorted(isrow[i],(PetscTruth*)&j);
115*e26ad7d8SSatish Balay     if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"ISrow is not sorted");
116*e26ad7d8SSatish Balay     ierr = ISSorted(iscol[i],(PetscTruth*)&j);
117*e26ad7d8SSatish Balay     if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IScol is not sorted");
118*e26ad7d8SSatish Balay   }
119*e26ad7d8SSatish Balay 
120*e26ad7d8SSatish Balay   len    = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (m+1)*sizeof(int);
121*e26ad7d8SSatish Balay   irow   = (int **)PetscMalloc(len); CHKPTRQ(irow);
122*e26ad7d8SSatish Balay   icol   = irow + ismax;
123*e26ad7d8SSatish Balay   nrow   = (int *) (icol + ismax);
124*e26ad7d8SSatish Balay   ncol   = nrow + ismax;
125*e26ad7d8SSatish Balay   rtable = ncol + ismax;
126*e26ad7d8SSatish Balay 
127*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
128*e26ad7d8SSatish Balay     ierr = ISGetIndices(isrow[i],&irow[i]);  CHKERRQ(ierr);
129*e26ad7d8SSatish Balay     ierr = ISGetIndices(iscol[i],&icol[i]);  CHKERRQ(ierr);
130*e26ad7d8SSatish Balay     ierr = ISGetSize(isrow[i],&nrow[i]);  CHKERRQ(ierr);
131*e26ad7d8SSatish Balay     ierr = ISGetSize(iscol[i],&ncol[i]);  CHKERRQ(ierr);
132*e26ad7d8SSatish Balay   }
133*e26ad7d8SSatish Balay 
134*e26ad7d8SSatish Balay   /* Create hash table for the mapping :row -> proc*/
135*e26ad7d8SSatish Balay   for ( i=0,j=0; i<size; i++ ) {
136*e26ad7d8SSatish Balay     jmax = c->rowners[i+1];
137*e26ad7d8SSatish Balay     for ( ; j<jmax; j++ ) {
138*e26ad7d8SSatish Balay       rtable[j] = i;
139*e26ad7d8SSatish Balay     }
140*e26ad7d8SSatish Balay   }
141*e26ad7d8SSatish Balay 
142*e26ad7d8SSatish Balay   /* evaluate communication - mesg to who, length of mesg, and buffer space
143*e26ad7d8SSatish Balay      required. Based on this, buffers are allocated, and data copied into them*/
144*e26ad7d8SSatish Balay   w1     = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */
145*e26ad7d8SSatish Balay   w2     = w1 + size;      /* if w2[i] marked, then a message to proc i*/
146*e26ad7d8SSatish Balay   w3     = w2 + size;      /* no of IS that needs to be sent to proc i */
147*e26ad7d8SSatish Balay   w4     = w3 + size;      /* temp work space used in determining w1, w2, w3 */
148*e26ad7d8SSatish Balay   PetscMemzero(w1,size*3*sizeof(int)); /* initialize work vector*/
149*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
150*e26ad7d8SSatish Balay     PetscMemzero(w4,size*sizeof(int)); /* initialize work vector*/
151*e26ad7d8SSatish Balay     jmax   = nrow[i];
152*e26ad7d8SSatish Balay     irow_i = irow[i];
153*e26ad7d8SSatish Balay     for ( j=0; j<jmax; j++ ) {
154*e26ad7d8SSatish Balay       row  = irow_i[j];
155*e26ad7d8SSatish Balay       proc = rtable[row];
156*e26ad7d8SSatish Balay       w4[proc]++;
157*e26ad7d8SSatish Balay     }
158*e26ad7d8SSatish Balay     for ( j=0; j<size; j++ ) {
159*e26ad7d8SSatish Balay       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
160*e26ad7d8SSatish Balay     }
161*e26ad7d8SSatish Balay   }
162*e26ad7d8SSatish Balay 
163*e26ad7d8SSatish Balay   nrqs     = 0;              /* no of outgoing messages */
164*e26ad7d8SSatish Balay   msz      = 0;              /* total mesg length (for all procs) */
165*e26ad7d8SSatish Balay   w1[rank] = 0;              /* no mesg sent to self */
166*e26ad7d8SSatish Balay   w3[rank] = 0;
167*e26ad7d8SSatish Balay   for ( i=0; i<size; i++ ) {
168*e26ad7d8SSatish Balay     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
169*e26ad7d8SSatish Balay   }
170*e26ad7d8SSatish Balay   pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/
171*e26ad7d8SSatish Balay   for ( i=0, j=0; i<size; i++ ) {
172*e26ad7d8SSatish Balay     if (w1[i]) { pa[j] = i; j++; }
173*e26ad7d8SSatish Balay   }
174*e26ad7d8SSatish Balay 
175*e26ad7d8SSatish Balay   /* Each message would have a header = 1 + 2*(no of IS) + data */
176*e26ad7d8SSatish Balay   for ( i=0; i<nrqs; i++ ) {
177*e26ad7d8SSatish Balay     j     = pa[i];
178*e26ad7d8SSatish Balay     w1[j] += w2[j] + 2* w3[j];
179*e26ad7d8SSatish Balay     msz   += w1[j];
180*e26ad7d8SSatish Balay   }
181*e26ad7d8SSatish Balay   /* Do a global reduction to determine how many messages to expect*/
182*e26ad7d8SSatish Balay   {
183*e26ad7d8SSatish Balay     int *rw1, *rw2;
184*e26ad7d8SSatish Balay     rw1   = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
185*e26ad7d8SSatish Balay     rw2   = rw1+size;
186*e26ad7d8SSatish Balay     ierr  = MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
187*e26ad7d8SSatish Balay     bsz   = rw1[rank];
188*e26ad7d8SSatish Balay     ierr  = MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);CHKERRQ(ierr);
189*e26ad7d8SSatish Balay     nrqr  = rw2[rank];
190*e26ad7d8SSatish Balay     PetscFree(rw1);
191*e26ad7d8SSatish Balay   }
192*e26ad7d8SSatish Balay 
193*e26ad7d8SSatish Balay   /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */
194*e26ad7d8SSatish Balay   len      = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
195*e26ad7d8SSatish Balay   rbuf1    = (int**) PetscMalloc(len);  CHKPTRQ(rbuf1);
196*e26ad7d8SSatish Balay   rbuf1[0] = (int *) (rbuf1 + nrqr);
197*e26ad7d8SSatish Balay   for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz;
198*e26ad7d8SSatish Balay 
199*e26ad7d8SSatish Balay   /* Post the receives */
200*e26ad7d8SSatish Balay   r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1);
201*e26ad7d8SSatish Balay   for ( i=0; i<nrqr; ++i ) {
202*e26ad7d8SSatish Balay     ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr);
203*e26ad7d8SSatish Balay   }
204*e26ad7d8SSatish Balay 
205*e26ad7d8SSatish Balay   /* Allocate Memory for outgoing messages */
206*e26ad7d8SSatish Balay   len      = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
207*e26ad7d8SSatish Balay   sbuf1    = (int **)PetscMalloc(len); CHKPTRQ(sbuf1);
208*e26ad7d8SSatish Balay   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
209*e26ad7d8SSatish Balay   PetscMemzero(sbuf1,2*size*sizeof(int*));
210*e26ad7d8SSatish Balay   /* allocate memory for outgoing data + buf to receive the first reply */
211*e26ad7d8SSatish Balay   tmp      = (int *) (ptr + size);
212*e26ad7d8SSatish Balay   ctr      = tmp + 2*msz;
213*e26ad7d8SSatish Balay 
214*e26ad7d8SSatish Balay   {
215*e26ad7d8SSatish Balay     int *iptr = tmp,ict = 0;
216*e26ad7d8SSatish Balay     for ( i=0; i<nrqs; i++ ) {
217*e26ad7d8SSatish Balay       j         = pa[i];
218*e26ad7d8SSatish Balay       iptr     += ict;
219*e26ad7d8SSatish Balay       sbuf1[j]  = iptr;
220*e26ad7d8SSatish Balay       ict       = w1[j];
221*e26ad7d8SSatish Balay     }
222*e26ad7d8SSatish Balay   }
223*e26ad7d8SSatish Balay 
224*e26ad7d8SSatish Balay   /* Form the outgoing messages */
225*e26ad7d8SSatish Balay   /* Initialize the header space */
226*e26ad7d8SSatish Balay   for ( i=0; i<nrqs; i++ ) {
227*e26ad7d8SSatish Balay     j           = pa[i];
228*e26ad7d8SSatish Balay     sbuf1[j][0] = 0;
229*e26ad7d8SSatish Balay     PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int));
230*e26ad7d8SSatish Balay     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
231*e26ad7d8SSatish Balay   }
232*e26ad7d8SSatish Balay 
233*e26ad7d8SSatish Balay   /* Parse the isrow and copy data into outbuf */
234*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
235*e26ad7d8SSatish Balay     PetscMemzero(ctr,size*sizeof(int));
236*e26ad7d8SSatish Balay     irow_i = irow[i];
237*e26ad7d8SSatish Balay     jmax   = nrow[i];
238*e26ad7d8SSatish Balay     for ( j=0; j<jmax; j++ ) {  /* parse the indices of each IS */
239*e26ad7d8SSatish Balay       row  = irow_i[j];
240*e26ad7d8SSatish Balay       proc = rtable[row];
241*e26ad7d8SSatish Balay       if (proc != rank) { /* copy to the outgoing buf*/
242*e26ad7d8SSatish Balay         ctr[proc]++;
243*e26ad7d8SSatish Balay         *ptr[proc] = row;
244*e26ad7d8SSatish Balay         ptr[proc]++;
245*e26ad7d8SSatish Balay       }
246*e26ad7d8SSatish Balay     }
247*e26ad7d8SSatish Balay     /* Update the headers for the current IS */
248*e26ad7d8SSatish Balay     for ( j=0; j<size; j++ ) { /* Can Optimise this loop too */
249*e26ad7d8SSatish Balay       if ((ctr_j = ctr[j])) {
250*e26ad7d8SSatish Balay         sbuf1_j        = sbuf1[j];
251*e26ad7d8SSatish Balay         k              = ++sbuf1_j[0];
252*e26ad7d8SSatish Balay         sbuf1_j[2*k]   = ctr_j;
253*e26ad7d8SSatish Balay         sbuf1_j[2*k-1] = i;
254*e26ad7d8SSatish Balay       }
255*e26ad7d8SSatish Balay     }
256*e26ad7d8SSatish Balay   }
257*e26ad7d8SSatish Balay 
258*e26ad7d8SSatish Balay   /*  Now  post the sends */
259*e26ad7d8SSatish Balay   s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1);
260*e26ad7d8SSatish Balay   for ( i=0; i<nrqs; ++i ) {
261*e26ad7d8SSatish Balay     j = pa[i];
262*e26ad7d8SSatish Balay     /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */
263*e26ad7d8SSatish Balay     ierr = MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag0, comm, s_waits1+i);CHKERRQ(ierr);
264*e26ad7d8SSatish Balay   }
265*e26ad7d8SSatish Balay 
266*e26ad7d8SSatish Balay   /* Post Receives to capture the buffer size */
267*e26ad7d8SSatish Balay   r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2);
268*e26ad7d8SSatish Balay   rbuf2    = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2);
269*e26ad7d8SSatish Balay   rbuf2[0] = tmp + msz;
270*e26ad7d8SSatish Balay   for ( i=1; i<nrqs; ++i ) {
271*e26ad7d8SSatish Balay     j        = pa[i];
272*e26ad7d8SSatish Balay     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
273*e26ad7d8SSatish Balay   }
274*e26ad7d8SSatish Balay   for ( i=0; i<nrqs; ++i ) {
275*e26ad7d8SSatish Balay     j    = pa[i];
276*e26ad7d8SSatish Balay     ierr = MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag1, comm, r_waits2+i);CHKERRQ(ierr);
277*e26ad7d8SSatish Balay   }
278*e26ad7d8SSatish Balay 
279*e26ad7d8SSatish Balay   /* Send to other procs the buf size they should allocate */
280*e26ad7d8SSatish Balay 
281*e26ad7d8SSatish Balay 
282*e26ad7d8SSatish Balay   /* Receive messages*/
283*e26ad7d8SSatish Balay   s_waits2  = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2);
284*e26ad7d8SSatish Balay   r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(r_status1);
285*e26ad7d8SSatish Balay   len         = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
286*e26ad7d8SSatish Balay   sbuf2       = (int**) PetscMalloc(len);CHKPTRQ(sbuf2);
287*e26ad7d8SSatish Balay   req_size    = (int *) (sbuf2 + nrqr);
288*e26ad7d8SSatish Balay   req_source  = req_size + nrqr;
289*e26ad7d8SSatish Balay 
290*e26ad7d8SSatish Balay   {
291*e26ad7d8SSatish Balay     int        *sbuf2_i;
292*e26ad7d8SSatish Balay 
293*e26ad7d8SSatish Balay     for ( i=0; i<nrqr; ++i ) {
294*e26ad7d8SSatish Balay       ierr = MPI_Waitany(nrqr, r_waits1, &index, r_status1+i);CHKERRQ(ierr);
295*e26ad7d8SSatish Balay       req_size[index] = 0;
296*e26ad7d8SSatish Balay       rbuf1_i         = rbuf1[index];
297*e26ad7d8SSatish Balay       start           = 2*rbuf1_i[0] + 1;
298*e26ad7d8SSatish Balay       MPI_Get_count(r_status1+i,MPI_INT, &end);
299*e26ad7d8SSatish Balay       sbuf2[index] = (int *)PetscMalloc((end+1)*sizeof(int));CHKPTRQ(sbuf2[index]);
300*e26ad7d8SSatish Balay       sbuf2_i      = sbuf2[index];
301*e26ad7d8SSatish Balay       for ( j=start; j<end; j++ ) {
302*e26ad7d8SSatish Balay         /** ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; */
303*e26ad7d8SSatish Balay         /** ncols is now the whole row in the dense case */
304*e26ad7d8SSatish Balay         /** Can get rid of this unnecessary communication */
305*e26ad7d8SSatish Balay         ncols            = c->N;
306*e26ad7d8SSatish Balay         sbuf2_i[j]       = ncols;
307*e26ad7d8SSatish Balay         req_size[index] += ncols;
308*e26ad7d8SSatish Balay       }
309*e26ad7d8SSatish Balay       req_source[index] = r_status1[i].MPI_SOURCE;
310*e26ad7d8SSatish Balay       /* form the header */
311*e26ad7d8SSatish Balay       sbuf2_i[0]   = req_size[index];
312*e26ad7d8SSatish Balay       for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; }
313*e26ad7d8SSatish Balay       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i); CHKERRQ(ierr);
314*e26ad7d8SSatish Balay     }
315*e26ad7d8SSatish Balay   }
316*e26ad7d8SSatish Balay   PetscFree(r_status1); PetscFree(r_waits1);
317*e26ad7d8SSatish Balay 
318*e26ad7d8SSatish Balay   /*  recv buffer sizes */
319*e26ad7d8SSatish Balay   /* Receive messages*/
320*e26ad7d8SSatish Balay 
321*e26ad7d8SSatish Balay   rbuf3     = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3);
322*e26ad7d8SSatish Balay   rbuf4     = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4);
323*e26ad7d8SSatish Balay   r_waits3  = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits3);
324*e26ad7d8SSatish Balay   r_waits4  = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits4);
325*e26ad7d8SSatish Balay   r_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status2);
326*e26ad7d8SSatish Balay 
327*e26ad7d8SSatish Balay   for ( i=0; i<nrqs; ++i ) {
328*e26ad7d8SSatish Balay     ierr = MPI_Waitany(nrqs, r_waits2, &index, r_status2+i);CHKERRQ(ierr);
329*e26ad7d8SSatish Balay     rbuf3[index] = (int *)PetscMalloc((rbuf2[index][0]+1)*sizeof(int));CHKPTRQ(rbuf3[index]);
330*e26ad7d8SSatish Balay     rbuf4[index] = (Scalar *)PetscMalloc((rbuf2[index][0]+1)*sizeof(Scalar));CHKPTRQ(rbuf4[index]);
331*e26ad7d8SSatish Balay     ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT,
332*e26ad7d8SSatish Balay               r_status2[i].MPI_SOURCE, tag2, comm, r_waits3+index); CHKERRQ(ierr);
333*e26ad7d8SSatish Balay     ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0], MPIU_SCALAR,
334*e26ad7d8SSatish Balay               r_status2[i].MPI_SOURCE, tag3, comm, r_waits4+index); CHKERRQ(ierr);
335*e26ad7d8SSatish Balay   }
336*e26ad7d8SSatish Balay   PetscFree(r_status2); PetscFree(r_waits2);
337*e26ad7d8SSatish Balay 
338*e26ad7d8SSatish Balay   /* Wait on sends1 and sends2 */
339*e26ad7d8SSatish Balay   s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status1);
340*e26ad7d8SSatish Balay   s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status2);
341*e26ad7d8SSatish Balay 
342*e26ad7d8SSatish Balay   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
343*e26ad7d8SSatish Balay   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
344*e26ad7d8SSatish Balay   PetscFree(s_status1); PetscFree(s_status2);
345*e26ad7d8SSatish Balay   PetscFree(s_waits1); PetscFree(s_waits2);
346*e26ad7d8SSatish Balay 
347*e26ad7d8SSatish Balay   /* Now allocate buffers for a->j, and send them off */
348*e26ad7d8SSatish Balay   /** No space required for a->j... as the complete row is packed and sent */
349*e26ad7d8SSatish Balay   /** sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); */
350*e26ad7d8SSatish Balay   /** for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; */
351*e26ad7d8SSatish Balay   /** sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); */
352*e26ad7d8SSatish Balay   /** for ( i=1; i<nrqr; i++ )  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; */
353*e26ad7d8SSatish Balay 
354*e26ad7d8SSatish Balay   s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits3);
355*e26ad7d8SSatish Balay   {
356*e26ad7d8SSatish Balay     int nzA, nzB, *a_i = a->i, *b_i = b->i, imark;
357*e26ad7d8SSatish Balay     int *cworkA, *cworkB, cstart = c->cstart, rstart = c->rstart, *bmap = c->garray;
358*e26ad7d8SSatish Balay     int *a_j = a->j, *b_j = b->j, ctmp, *t_cols;
359*e26ad7d8SSatish Balay 
360*e26ad7d8SSatish Balay     for ( i=0; i<nrqr; i++ ) {
361*e26ad7d8SSatish Balay       rbuf1_i   = rbuf1[i];
362*e26ad7d8SSatish Balay       sbuf_aj_i = sbuf_aj[i];
363*e26ad7d8SSatish Balay       ct1       = 2*rbuf1_i[0] + 1;
364*e26ad7d8SSatish Balay       ct2       = 0;
365*e26ad7d8SSatish Balay       for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) {
366*e26ad7d8SSatish Balay         kmax = rbuf1[i][2*j];
367*e26ad7d8SSatish Balay         for ( k=0; k<kmax; k++,ct1++ ) {
368*e26ad7d8SSatish Balay           row    = rbuf1_i[ct1] - rstart;
369*e26ad7d8SSatish Balay           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
370*e26ad7d8SSatish Balay           ncols  = nzA + nzB;
371*e26ad7d8SSatish Balay           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
372*e26ad7d8SSatish Balay 
373*e26ad7d8SSatish Balay           /* load the column indices for this row into cols*/
374*e26ad7d8SSatish Balay           cols  = sbuf_aj_i + ct2;
375*e26ad7d8SSatish Balay           for ( l=0; l<nzB; l++ ) {
376*e26ad7d8SSatish Balay             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
377*e26ad7d8SSatish Balay             else break;
378*e26ad7d8SSatish Balay           }
379*e26ad7d8SSatish Balay           imark = l;
380*e26ad7d8SSatish Balay           for ( l=0; l<nzA; l++ )   cols[imark+l] = cstart + cworkA[l];
381*e26ad7d8SSatish Balay           for ( l=imark; l<nzB; l++ ) cols[nzA+l] = bmap[cworkB[l]];
382*e26ad7d8SSatish Balay 
383*e26ad7d8SSatish Balay           ct2 += ncols;
384*e26ad7d8SSatish Balay         }
385*e26ad7d8SSatish Balay       }
386*e26ad7d8SSatish Balay       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
387*e26ad7d8SSatish Balay     }
388*e26ad7d8SSatish Balay   }
389*e26ad7d8SSatish Balay   r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status3);
390*e26ad7d8SSatish Balay   s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status3);
391*e26ad7d8SSatish Balay 
392*e26ad7d8SSatish Balay   /* Allocate buffers for a->a, and send them off */
393*e26ad7d8SSatish Balay   sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa);
394*e26ad7d8SSatish Balay   for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i];
395*e26ad7d8SSatish Balay   sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]);
396*e26ad7d8SSatish Balay   for ( i=1; i<nrqr; i++ )  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
397*e26ad7d8SSatish Balay 
398*e26ad7d8SSatish Balay   s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits4);
399*e26ad7d8SSatish Balay   {
400*e26ad7d8SSatish Balay     int    nzA, nzB, *a_i = a->i, *b_i = b->i,  *cworkB, imark;
401*e26ad7d8SSatish Balay     int    cstart = c->cstart, rstart = c->rstart, *bmap = c->garray;
402*e26ad7d8SSatish Balay     int    *b_j = b->j;
403*e26ad7d8SSatish Balay     Scalar *vworkA, *vworkB, *a_a = a->a, *b_a = b->a,*t_vals;
404*e26ad7d8SSatish Balay 
405*e26ad7d8SSatish Balay     for ( i=0; i<nrqr; i++ ) {
406*e26ad7d8SSatish Balay       rbuf1_i   = rbuf1[i];
407*e26ad7d8SSatish Balay       sbuf_aa_i = sbuf_aa[i];
408*e26ad7d8SSatish Balay       ct1       = 2*rbuf1_i[0]+1;
409*e26ad7d8SSatish Balay       ct2       = 0;
410*e26ad7d8SSatish Balay       for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) {
411*e26ad7d8SSatish Balay         kmax = rbuf1_i[2*j];
412*e26ad7d8SSatish Balay         for ( k=0; k<kmax; k++,ct1++ ) {
413*e26ad7d8SSatish Balay           row    = rbuf1_i[ct1] - rstart;
414*e26ad7d8SSatish Balay           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
415*e26ad7d8SSatish Balay           ncols  = nzA + nzB;
416*e26ad7d8SSatish Balay           cworkB = b_j + b_i[row];
417*e26ad7d8SSatish Balay           vworkA = a_a + a_i[row];
418*e26ad7d8SSatish Balay           vworkB = b_a + b_i[row];
419*e26ad7d8SSatish Balay 
420*e26ad7d8SSatish Balay           /* load the column values for this row into vals*/
421*e26ad7d8SSatish Balay           vals  = sbuf_aa_i+ct2;
422*e26ad7d8SSatish Balay           for ( l=0; l<nzB; l++ ) {
423*e26ad7d8SSatish Balay             if ((bmap[cworkB[l]]) < cstart)  vals[l] = vworkB[l];
424*e26ad7d8SSatish Balay             else break;
425*e26ad7d8SSatish Balay           }
426*e26ad7d8SSatish Balay           imark = l;
427*e26ad7d8SSatish Balay           for ( l=0; l<nzA; l++ )   vals[imark+l] = vworkA[l];
428*e26ad7d8SSatish Balay           for ( l=imark; l<nzB; l++ ) vals[nzA+l] = vworkB[l];
429*e26ad7d8SSatish Balay           ct2 += ncols;
430*e26ad7d8SSatish Balay         }
431*e26ad7d8SSatish Balay       }
432*e26ad7d8SSatish Balay       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
433*e26ad7d8SSatish Balay     }
434*e26ad7d8SSatish Balay   }
435*e26ad7d8SSatish Balay   r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status4);
436*e26ad7d8SSatish Balay   s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status4);
437*e26ad7d8SSatish Balay   PetscFree(rbuf1);
438*e26ad7d8SSatish Balay 
439*e26ad7d8SSatish Balay   /* Form the matrix */
440*e26ad7d8SSatish Balay   /* create col map */
441*e26ad7d8SSatish Balay   {
442*e26ad7d8SSatish Balay     int *icol_i;
443*e26ad7d8SSatish Balay 
444*e26ad7d8SSatish Balay     len     = (1+ismax)*sizeof(int *) + ismax*c->N*sizeof(int);
445*e26ad7d8SSatish Balay     cmap    = (int **)PetscMalloc(len); CHKPTRQ(cmap);
446*e26ad7d8SSatish Balay     cmap[0] = (int *)(cmap + ismax);
447*e26ad7d8SSatish Balay     PetscMemzero(cmap[0],(1+ismax*c->N)*sizeof(int));
448*e26ad7d8SSatish Balay     for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->N; }
449*e26ad7d8SSatish Balay     for ( i=0; i<ismax; i++ ) {
450*e26ad7d8SSatish Balay       jmax   = ncol[i];
451*e26ad7d8SSatish Balay       icol_i = icol[i];
452*e26ad7d8SSatish Balay       cmap_i = cmap[i];
453*e26ad7d8SSatish Balay       for ( j=0; j<jmax; j++ ) {
454*e26ad7d8SSatish Balay         cmap_i[icol_i[j]] = j+1;
455*e26ad7d8SSatish Balay       }
456*e26ad7d8SSatish Balay     }
457*e26ad7d8SSatish Balay   }
458*e26ad7d8SSatish Balay 
459*e26ad7d8SSatish Balay   /* Create lens which is required for MatCreate... */
460*e26ad7d8SSatish Balay   for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; }
461*e26ad7d8SSatish Balay   len     = (1+ismax)*sizeof(int *) + j*sizeof(int);
462*e26ad7d8SSatish Balay   lens    = (int **)PetscMalloc(len); CHKPTRQ(lens);
463*e26ad7d8SSatish Balay   lens[0] = (int *)(lens + ismax);
464*e26ad7d8SSatish Balay   PetscMemzero(lens[0], j*sizeof(int));
465*e26ad7d8SSatish Balay   for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; }
466*e26ad7d8SSatish Balay 
467*e26ad7d8SSatish Balay   /* Update lens from local data */
468*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
469*e26ad7d8SSatish Balay     jmax   = nrow[i];
470*e26ad7d8SSatish Balay     cmap_i = cmap[i];
471*e26ad7d8SSatish Balay     irow_i = irow[i];
472*e26ad7d8SSatish Balay     lens_i = lens[i];
473*e26ad7d8SSatish Balay     for ( j=0; j<jmax; j++ ) {
474*e26ad7d8SSatish Balay       row  = irow_i[j];
475*e26ad7d8SSatish Balay       proc = rtable[row];
476*e26ad7d8SSatish Balay       if (proc == rank) {
477*e26ad7d8SSatish Balay         ierr = MatGetRow_MPIDense(C,row,&ncols,&cols,0); CHKERRQ(ierr);
478*e26ad7d8SSatish Balay         for ( k=0; k<ncols; k++ ) {
479*e26ad7d8SSatish Balay           if (cmap_i[cols[k]]) { lens_i[j]++;}
480*e26ad7d8SSatish Balay         }
481*e26ad7d8SSatish Balay         ierr = MatRestoreRow_MPIDense(C,row,&ncols,&cols,0); CHKERRQ(ierr);
482*e26ad7d8SSatish Balay       }
483*e26ad7d8SSatish Balay     }
484*e26ad7d8SSatish Balay   }
485*e26ad7d8SSatish Balay 
486*e26ad7d8SSatish Balay   /* Create row map*/
487*e26ad7d8SSatish Balay   len     = (1+ismax)*sizeof(int *) + ismax*c->M*sizeof(int);
488*e26ad7d8SSatish Balay   rmap    = (int **)PetscMalloc(len); CHKPTRQ(rmap);
489*e26ad7d8SSatish Balay   rmap[0] = (int *)(rmap + ismax);
490*e26ad7d8SSatish Balay   PetscMemzero(rmap[0],ismax*c->M*sizeof(int));
491*e26ad7d8SSatish Balay   for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->M;}
492*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
493*e26ad7d8SSatish Balay     rmap_i = rmap[i];
494*e26ad7d8SSatish Balay     irow_i = irow[i];
495*e26ad7d8SSatish Balay     jmax   = nrow[i];
496*e26ad7d8SSatish Balay     for ( j=0; j<jmax; j++ ) {
497*e26ad7d8SSatish Balay       rmap_i[irow_i[j]] = j;
498*e26ad7d8SSatish Balay     }
499*e26ad7d8SSatish Balay   }
500*e26ad7d8SSatish Balay 
501*e26ad7d8SSatish Balay   /* Update lens from offproc data */
502*e26ad7d8SSatish Balay   {
503*e26ad7d8SSatish Balay     int *rbuf2_i, *rbuf3_i, *sbuf1_i;
504*e26ad7d8SSatish Balay 
505*e26ad7d8SSatish Balay     for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
506*e26ad7d8SSatish Balay       ierr = MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2);CHKERRQ(ierr);
507*e26ad7d8SSatish Balay       index   = pa[i];
508*e26ad7d8SSatish Balay       sbuf1_i = sbuf1[index];
509*e26ad7d8SSatish Balay       jmax    = sbuf1_i[0];
510*e26ad7d8SSatish Balay       ct1     = 2*jmax+1;
511*e26ad7d8SSatish Balay       ct2     = 0;
512*e26ad7d8SSatish Balay       rbuf2_i = rbuf2[i];
513*e26ad7d8SSatish Balay       rbuf3_i = rbuf3[i];
514*e26ad7d8SSatish Balay       for ( j=1; j<=jmax; j++ ) {
515*e26ad7d8SSatish Balay         is_no   = sbuf1_i[2*j-1];
516*e26ad7d8SSatish Balay         max1    = sbuf1_i[2*j];
517*e26ad7d8SSatish Balay         lens_i  = lens[is_no];
518*e26ad7d8SSatish Balay         cmap_i  = cmap[is_no];
519*e26ad7d8SSatish Balay         rmap_i  = rmap[is_no];
520*e26ad7d8SSatish Balay         for ( k=0; k<max1; k++,ct1++ ) {
521*e26ad7d8SSatish Balay           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
522*e26ad7d8SSatish Balay           max2 = rbuf2_i[ct1];
523*e26ad7d8SSatish Balay           for ( l=0; l<max2; l++,ct2++ ) {
524*e26ad7d8SSatish Balay             if (cmap_i[rbuf3_i[ct2]]) {
525*e26ad7d8SSatish Balay               lens_i[row]++;
526*e26ad7d8SSatish Balay             }
527*e26ad7d8SSatish Balay           }
528*e26ad7d8SSatish Balay         }
529*e26ad7d8SSatish Balay       }
530*e26ad7d8SSatish Balay     }
531*e26ad7d8SSatish Balay   }
532*e26ad7d8SSatish Balay   PetscFree(r_status3); PetscFree(r_waits3);
533*e26ad7d8SSatish Balay   ierr = MPI_Waitall(nrqr,s_waits3,s_status3); CHKERRQ(ierr);
534*e26ad7d8SSatish Balay   PetscFree(s_status3); PetscFree(s_waits3);
535*e26ad7d8SSatish Balay 
536*e26ad7d8SSatish Balay   /* Create the submatrices */
537*e26ad7d8SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
538*e26ad7d8SSatish Balay     /*
539*e26ad7d8SSatish Balay         Assumes new rows are same length as the old rows, hence bug!
540*e26ad7d8SSatish Balay     */
541*e26ad7d8SSatish Balay     for ( i=0; i<ismax; i++ ) {
542*e26ad7d8SSatish Balay       mat = (Mat_SeqDense *)(submats[i]->data);
543*e26ad7d8SSatish Balay       if ((mat->m != nrow[i]) || (mat->n != ncol[i])) {
544*e26ad7d8SSatish Balay         SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong size");
545*e26ad7d8SSatish Balay       }
546*e26ad7d8SSatish Balay       if (PetscMemcmp(mat->ilen,lens[i], mat->m *sizeof(int))) {
547*e26ad7d8SSatish Balay         SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
548*e26ad7d8SSatish Balay       }
549*e26ad7d8SSatish Balay       /* Initial matrix as if empty */
550*e26ad7d8SSatish Balay       PetscMemzero(mat->ilen,mat->m*sizeof(int));
551*e26ad7d8SSatish Balay       submats[i]->factor = C->factor;
552*e26ad7d8SSatish Balay     }
553*e26ad7d8SSatish Balay   } else {
554*e26ad7d8SSatish Balay     for ( i=0; i<ismax; i++ ) {
555*e26ad7d8SSatish Balay       ierr = MatCreateSeqDense(PETSC_COMM_SELF,nrow[i],ncol[i],0,lens[i],submats+i);CHKERRQ(ierr);
556*e26ad7d8SSatish Balay     }
557*e26ad7d8SSatish Balay   }
558*e26ad7d8SSatish Balay 
559*e26ad7d8SSatish Balay   /* Assemble the matrices */
560*e26ad7d8SSatish Balay   /* First assemble the local rows */
561*e26ad7d8SSatish Balay   {
562*e26ad7d8SSatish Balay     int    ilen_row,*imat_ilen, *imat_j, *imat_i,old_row;
563*e26ad7d8SSatish Balay     Scalar *imat_a;
564*e26ad7d8SSatish Balay 
565*e26ad7d8SSatish Balay     for ( i=0; i<ismax; i++ ) {
566*e26ad7d8SSatish Balay       mat       = (Mat_SeqDense *) submats[i]->data;
567*e26ad7d8SSatish Balay       imat_ilen = mat->ilen;
568*e26ad7d8SSatish Balay       imat_j    = mat->j;
569*e26ad7d8SSatish Balay       imat_i    = mat->i;
570*e26ad7d8SSatish Balay       imat_a    = mat->a;
571*e26ad7d8SSatish Balay       cmap_i    = cmap[i];
572*e26ad7d8SSatish Balay       rmap_i    = rmap[i];
573*e26ad7d8SSatish Balay       irow_i    = irow[i];
574*e26ad7d8SSatish Balay       jmax      = nrow[i];
575*e26ad7d8SSatish Balay       for ( j=0; j<jmax; j++ ) {
576*e26ad7d8SSatish Balay         row      = irow_i[j];
577*e26ad7d8SSatish Balay         proc     = rtable[row];
578*e26ad7d8SSatish Balay         if (proc == rank) {
579*e26ad7d8SSatish Balay           old_row  = row;
580*e26ad7d8SSatish Balay           row      = rmap_i[row];
581*e26ad7d8SSatish Balay           ilen_row = imat_ilen[row];
582*e26ad7d8SSatish Balay           ierr     = MatGetRow_MPIDense(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
583*e26ad7d8SSatish Balay           mat_i    = imat_i[row];
584*e26ad7d8SSatish Balay           mat_a    = imat_a + mat_i;
585*e26ad7d8SSatish Balay           mat_j    = imat_j + mat_i;
586*e26ad7d8SSatish Balay           for ( k=0; k<ncols; k++ ) {
587*e26ad7d8SSatish Balay             if ((tcol = cmap_i[cols[k]])) {
588*e26ad7d8SSatish Balay               *mat_j++ = tcol - -1;
589*e26ad7d8SSatish Balay               *mat_a++ = vals[k];
590*e26ad7d8SSatish Balay               ilen_row++;
591*e26ad7d8SSatish Balay             }
592*e26ad7d8SSatish Balay           }
593*e26ad7d8SSatish Balay           ierr = MatRestoreRow_MPIDense(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
594*e26ad7d8SSatish Balay           imat_ilen[row] = ilen_row;
595*e26ad7d8SSatish Balay         }
596*e26ad7d8SSatish Balay       }
597*e26ad7d8SSatish Balay     }
598*e26ad7d8SSatish Balay   }
599*e26ad7d8SSatish Balay 
600*e26ad7d8SSatish Balay   /*   Now assemble the off proc rows*/
601*e26ad7d8SSatish Balay   {
602*e26ad7d8SSatish Balay     int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
603*e26ad7d8SSatish Balay     int    *imat_j,*imat_i;
604*e26ad7d8SSatish Balay     Scalar *imat_a,*rbuf4_i;
605*e26ad7d8SSatish Balay 
606*e26ad7d8SSatish Balay     for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
607*e26ad7d8SSatish Balay       ierr = MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2);CHKERRQ(ierr);
608*e26ad7d8SSatish Balay       index   = pa[i];
609*e26ad7d8SSatish Balay       sbuf1_i = sbuf1[index];
610*e26ad7d8SSatish Balay       jmax    = sbuf1_i[0];
611*e26ad7d8SSatish Balay       ct1     = 2*jmax + 1;
612*e26ad7d8SSatish Balay       ct2     = 0;
613*e26ad7d8SSatish Balay       rbuf2_i = rbuf2[i];
614*e26ad7d8SSatish Balay       rbuf3_i = rbuf3[i];
615*e26ad7d8SSatish Balay       rbuf4_i = rbuf4[i];
616*e26ad7d8SSatish Balay       for ( j=1; j<=jmax; j++ ) {
617*e26ad7d8SSatish Balay         is_no     = sbuf1_i[2*j-1];
618*e26ad7d8SSatish Balay         rmap_i    = rmap[is_no];
619*e26ad7d8SSatish Balay         cmap_i    = cmap[is_no];
620*e26ad7d8SSatish Balay         mat       = (Mat_SeqDense *) submats[is_no]->data;
621*e26ad7d8SSatish Balay         imat_ilen = mat->ilen;
622*e26ad7d8SSatish Balay         imat_j    = mat->j;
623*e26ad7d8SSatish Balay         imat_i    = mat->i;
624*e26ad7d8SSatish Balay         imat_a    = mat->a;
625*e26ad7d8SSatish Balay         max1      = sbuf1_i[2*j];
626*e26ad7d8SSatish Balay         for ( k=0; k<max1; k++, ct1++ ) {
627*e26ad7d8SSatish Balay           row   = sbuf1_i[ct1];
628*e26ad7d8SSatish Balay           row   = rmap_i[row];
629*e26ad7d8SSatish Balay           ilen  = imat_ilen[row];
630*e26ad7d8SSatish Balay           mat_i = imat_i[row];
631*e26ad7d8SSatish Balay           mat_a = imat_a + mat_i;
632*e26ad7d8SSatish Balay           mat_j = imat_j + mat_i;
633*e26ad7d8SSatish Balay           max2 = rbuf2_i[ct1];
634*e26ad7d8SSatish Balay           for ( l=0; l<max2; l++,ct2++ ) {
635*e26ad7d8SSatish Balay             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
636*e26ad7d8SSatish Balay               *mat_j++ = tcol - 1;
637*e26ad7d8SSatish Balay               *mat_a++ = rbuf4_i[ct2];
638*e26ad7d8SSatish Balay               ilen++;
639*e26ad7d8SSatish Balay             }
640*e26ad7d8SSatish Balay           }
641*e26ad7d8SSatish Balay           imat_ilen[row] = ilen;
642*e26ad7d8SSatish Balay         }
643*e26ad7d8SSatish Balay       }
644*e26ad7d8SSatish Balay     }
645*e26ad7d8SSatish Balay   }
646*e26ad7d8SSatish Balay   PetscFree(r_status4); PetscFree(r_waits4);
647*e26ad7d8SSatish Balay   ierr = MPI_Waitall(nrqr,s_waits4,s_status4); CHKERRQ(ierr);
648*e26ad7d8SSatish Balay   PetscFree(s_waits4); PetscFree(s_status4);
649*e26ad7d8SSatish Balay 
650*e26ad7d8SSatish Balay   /* Restore the indices */
651*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
652*e26ad7d8SSatish Balay     ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr);
653*e26ad7d8SSatish Balay     ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr);
654*e26ad7d8SSatish Balay   }
655*e26ad7d8SSatish Balay 
656*e26ad7d8SSatish Balay   /* Destroy allocated memory */
657*e26ad7d8SSatish Balay   PetscFree(irow);
658*e26ad7d8SSatish Balay   PetscFree(w1);
659*e26ad7d8SSatish Balay   PetscFree(pa);
660*e26ad7d8SSatish Balay 
661*e26ad7d8SSatish Balay   PetscFree(sbuf1);
662*e26ad7d8SSatish Balay   PetscFree(rbuf2);
663*e26ad7d8SSatish Balay   for ( i=0; i<nrqr; ++i ) {
664*e26ad7d8SSatish Balay     PetscFree(sbuf2[i]);
665*e26ad7d8SSatish Balay   }
666*e26ad7d8SSatish Balay   for ( i=0; i<nrqs; ++i ) {
667*e26ad7d8SSatish Balay     PetscFree(rbuf3[i]);
668*e26ad7d8SSatish Balay     PetscFree(rbuf4[i]);
669*e26ad7d8SSatish Balay   }
670*e26ad7d8SSatish Balay 
671*e26ad7d8SSatish Balay   PetscFree(sbuf2);
672*e26ad7d8SSatish Balay   PetscFree(rbuf3);
673*e26ad7d8SSatish Balay   PetscFree(rbuf4 );
674*e26ad7d8SSatish Balay   PetscFree(sbuf_aj[0]);
675*e26ad7d8SSatish Balay   PetscFree(sbuf_aj);
676*e26ad7d8SSatish Balay   PetscFree(sbuf_aa[0]);
677*e26ad7d8SSatish Balay   PetscFree(sbuf_aa);
678*e26ad7d8SSatish Balay 
679*e26ad7d8SSatish Balay   PetscFree(cmap);
680*e26ad7d8SSatish Balay   PetscFree(rmap);
681*e26ad7d8SSatish Balay   PetscFree(lens);
682*e26ad7d8SSatish Balay 
683*e26ad7d8SSatish Balay   ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3); CHKERRQ(ierr);
684*e26ad7d8SSatish Balay   ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2); CHKERRQ(ierr);
685*e26ad7d8SSatish Balay   ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1); CHKERRQ(ierr);
686*e26ad7d8SSatish Balay 
687*e26ad7d8SSatish Balay   for ( i=0; i<ismax; i++ ) {
688*e26ad7d8SSatish Balay     ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
689*e26ad7d8SSatish Balay     ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
690*e26ad7d8SSatish Balay   }
691*e26ad7d8SSatish Balay 
692*e26ad7d8SSatish Balay   PetscFunctionReturn(0);
693*e26ad7d8SSatish Balay }
694*e26ad7d8SSatish Balay 
695*e26ad7d8SSatish Balay #endif
696