xref: /petsc/src/mat/impls/dense/mpi/mmdense.c (revision 28b400f66ebc7ae0049166a2294dfcd3df27e64b)
1be1d678aSKris Buschelman 
20500aeceSBarry Smith /*
30500aeceSBarry Smith    Support for the parallel dense matrix vector multiply
40500aeceSBarry Smith */
5c6db04a5SJed Brown #include <../src/mat/impls/dense/mpi/mpidense.h>
6c6db04a5SJed Brown #include <petscblaslapack.h>
70500aeceSBarry Smith 
8dfbe8321SBarry Smith PetscErrorCode MatSetUpMultiply_MPIDense(Mat mat)
90500aeceSBarry Smith {
100500aeceSBarry Smith   Mat_MPIDense   *mdn = (Mat_MPIDense*)mat->data;
110500aeceSBarry Smith 
123a40ed3dSBarry Smith   PetscFunctionBegin;
130500aeceSBarry Smith   /* Create local vector that is used to scatter into */
145f80ce2aSJacob Faibussowitsch   CHKERRQ(VecDestroy(&mdn->lvec));
15637a0070SStefano Zampini   if (mdn->A) {
165f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateVecs(mdn->A,&mdn->lvec,NULL));
175f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogObjectParent((PetscObject)mat,(PetscObject)mdn->lvec));
18637a0070SStefano Zampini   }
19637a0070SStefano Zampini   if (!mdn->Mvctx) {
205f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutSetUp(mat->cmap));
215f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFCreate(PetscObjectComm((PetscObject)mat),&mdn->Mvctx));
225f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFSetGraphWithPattern(mdn->Mvctx,mat->cmap,PETSCSF_PATTERN_ALLGATHER));
235f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogObjectParent((PetscObject)mat,(PetscObject)mdn->Mvctx));
24637a0070SStefano Zampini   }
253a40ed3dSBarry Smith   PetscFunctionReturn(0);
260500aeceSBarry Smith }
270500aeceSBarry Smith 
28637a0070SStefano Zampini static PetscErrorCode MatCreateSubMatrices_MPIDense_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
29637a0070SStefano Zampini 
307dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_MPIDense(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
31e26ad7d8SSatish Balay {
3213f74950SBarry Smith   PetscInt       nmax,nstages_local,nstages,i,pos,max_no;
33e26ad7d8SSatish Balay 
34e26ad7d8SSatish Balay   PetscFunctionBegin;
35e26ad7d8SSatish Balay   /* Allocate memory to hold all the submatrices */
36e26ad7d8SSatish Balay   if (scall != MAT_REUSE_MATRIX) {
375f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscCalloc1(ismax+1,submat));
38e26ad7d8SSatish Balay   }
39e26ad7d8SSatish Balay   /* Determine the number of stages through which submatrices are done */
40d0f46423SBarry Smith   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
41329f5518SBarry Smith   if (!nmax) nmax = 1;
42e26ad7d8SSatish Balay   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
43e26ad7d8SSatish Balay 
44e26ad7d8SSatish Balay   /* Make sure every processor loops through the nstages */
455f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
460500aeceSBarry Smith 
47e26ad7d8SSatish Balay   for (i=0,pos=0; i<nstages; i++) {
48e26ad7d8SSatish Balay     if (pos+nmax <= ismax) max_no = nmax;
49e26ad7d8SSatish Balay     else if (pos == ismax) max_no = 0;
50e26ad7d8SSatish Balay     else                   max_no = ismax-pos;
515f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSubMatrices_MPIDense_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
52e26ad7d8SSatish Balay     pos += max_no;
53e26ad7d8SSatish Balay   }
54e26ad7d8SSatish Balay   PetscFunctionReturn(0);
55e26ad7d8SSatish Balay }
56e26ad7d8SSatish Balay /* -------------------------------------------------------------------------*/
577dae84e0SHong Zhang PetscErrorCode MatCreateSubMatrices_MPIDense_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
58e26ad7d8SSatish Balay {
59e26ad7d8SSatish Balay   Mat_MPIDense   *c = (Mat_MPIDense*)C->data;
60e26ad7d8SSatish Balay   Mat            A  = c->A;
61e26ad7d8SSatish Balay   Mat_SeqDense   *a = (Mat_SeqDense*)A->data,*mat;
6213f74950SBarry Smith   PetscMPIInt    rank,size,tag0,tag1,idex,end,i;
63d0f46423SBarry Smith   PetscInt       N = C->cmap->N,rstart = C->rmap->rstart,count;
645d0c19d7SBarry Smith   const PetscInt **irow,**icol,*irow_i;
655d0c19d7SBarry Smith   PetscInt       *nrow,*ncol,*w1,*w3,*w4,*rtable,start;
6613f74950SBarry Smith   PetscInt       **sbuf1,m,j,k,l,ct1,**rbuf1,row,proc;
6713f74950SBarry Smith   PetscInt       nrqs,msz,**ptr,*ctr,*pa,*tmp,bsz,nrqr;
685d0c19d7SBarry Smith   PetscInt       is_no,jmax,**rmap,*rmap_i;
69052f0c41SBarry Smith   PetscInt       ctr_j,*sbuf1_j,*rbuf1_i;
704b3ec233SSatish Balay   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
714b3ec233SSatish Balay   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status2;
72e26ad7d8SSatish Balay   MPI_Comm       comm;
7387828ca2SBarry Smith   PetscScalar    **rbuf2,**sbuf2;
74ace3abfcSBarry Smith   PetscBool      sorted;
75e26ad7d8SSatish Balay 
76e26ad7d8SSatish Balay   PetscFunctionBegin;
775f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm));
787adad957SLisandro Dalcin   tag0 = ((PetscObject)C)->tag;
795f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_rank(comm,&rank));
805f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(comm,&size));
81d0f46423SBarry Smith   m    = C->rmap->N;
82e26ad7d8SSatish Balay 
83e26ad7d8SSatish Balay   /* Get some new tags to keep the communication clean */
845f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag1));
85e26ad7d8SSatish Balay 
86e26ad7d8SSatish Balay   /* Check if the col indices are sorted */
87e26ad7d8SSatish Balay   for (i=0; i<ismax; i++) {
885f80ce2aSJacob Faibussowitsch     CHKERRQ(ISSorted(isrow[i],&sorted));
89*28b400f6SJacob Faibussowitsch     PetscCheck(sorted,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
905f80ce2aSJacob Faibussowitsch     CHKERRQ(ISSorted(iscol[i],&sorted));
91*28b400f6SJacob Faibussowitsch     PetscCheck(sorted,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted");
92e26ad7d8SSatish Balay   }
93e26ad7d8SSatish Balay 
945f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,m,&rtable));
95e26ad7d8SSatish Balay   for (i=0; i<ismax; i++) {
965f80ce2aSJacob Faibussowitsch     CHKERRQ(ISGetIndices(isrow[i],&irow[i]));
975f80ce2aSJacob Faibussowitsch     CHKERRQ(ISGetIndices(iscol[i],&icol[i]));
985f80ce2aSJacob Faibussowitsch     CHKERRQ(ISGetLocalSize(isrow[i],&nrow[i]));
995f80ce2aSJacob Faibussowitsch     CHKERRQ(ISGetLocalSize(iscol[i],&ncol[i]));
100e26ad7d8SSatish Balay   }
101e26ad7d8SSatish Balay 
102e26ad7d8SSatish Balay   /* Create hash table for the mapping :row -> proc*/
103e26ad7d8SSatish Balay   for (i=0,j=0; i<size; i++) {
104d0f46423SBarry Smith     jmax = C->rmap->range[i+1];
1052205254eSKarl Rupp     for (; j<jmax; j++) rtable[j] = i;
106e26ad7d8SSatish Balay   }
107e26ad7d8SSatish Balay 
108e26ad7d8SSatish Balay   /* evaluate communication - mesg to who,length of mesg, and buffer space
109e26ad7d8SSatish Balay      required. Based on this, buffers are allocated, and data copied into them*/
1105f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc3(2*size,&w1,size,&w3,size,&w4));
1115f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(w1,size*2)); /* initialize work vector*/
1125f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(w3,size)); /* initialize work vector*/
113e26ad7d8SSatish Balay   for (i=0; i<ismax; i++) {
1145f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscArrayzero(w4,size)); /* initialize work vector*/
115e26ad7d8SSatish Balay     jmax   = nrow[i];
116e26ad7d8SSatish Balay     irow_i = irow[i];
117e26ad7d8SSatish Balay     for (j=0; j<jmax; j++) {
118e26ad7d8SSatish Balay       row  = irow_i[j];
119e26ad7d8SSatish Balay       proc = rtable[row];
120e26ad7d8SSatish Balay       w4[proc]++;
121e26ad7d8SSatish Balay     }
122e26ad7d8SSatish Balay     for (j=0; j<size; j++) {
123c1dc657dSBarry Smith       if (w4[j]) { w1[2*j] += w4[j];  w3[j]++;}
124e26ad7d8SSatish Balay     }
125e26ad7d8SSatish Balay   }
126e26ad7d8SSatish Balay 
127e26ad7d8SSatish Balay   nrqs       = 0;              /* no of outgoing messages */
128e26ad7d8SSatish Balay   msz        = 0;              /* total mesg length (for all procs) */
129c1dc657dSBarry Smith   w1[2*rank] = 0;              /* no mesg sent to self */
130e26ad7d8SSatish Balay   w3[rank]   = 0;
131e26ad7d8SSatish Balay   for (i=0; i<size; i++) {
132c1dc657dSBarry Smith     if (w1[2*i])  { w1[2*i+1] = 1; nrqs++;} /* there exists a message to proc i */
133e26ad7d8SSatish Balay   }
1345f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqs+1,&pa)); /*(proc -array)*/
135e26ad7d8SSatish Balay   for (i=0,j=0; i<size; i++) {
136c1dc657dSBarry Smith     if (w1[2*i]) { pa[j] = i; j++; }
137e26ad7d8SSatish Balay   }
138e26ad7d8SSatish Balay 
139e26ad7d8SSatish Balay   /* Each message would have a header = 1 + 2*(no of IS) + data */
140e26ad7d8SSatish Balay   for (i=0; i<nrqs; i++) {
141e26ad7d8SSatish Balay     j        = pa[i];
142c1dc657dSBarry Smith     w1[2*j] += w1[2*j+1] + 2* w3[j];
143c1dc657dSBarry Smith     msz     += w1[2*j];
144e26ad7d8SSatish Balay   }
145e26ad7d8SSatish Balay   /* Do a global reduction to determine how many messages to expect*/
1465f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMaxSum(comm,w1,&bsz,&nrqr));
147e26ad7d8SSatish Balay 
14874ed9c26SBarry Smith   /* Allocate memory for recv buffers . Make sure rbuf1[0] exists by adding 1 to the buffer length */
1495f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqr+1,&rbuf1));
1505f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqr*bsz,&rbuf1[0]));
151e26ad7d8SSatish Balay   for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;
152e26ad7d8SSatish Balay 
153e26ad7d8SSatish Balay   /* Post the receives */
1545f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqr+1,&r_waits1));
155e26ad7d8SSatish Balay   for (i=0; i<nrqr; ++i) {
1565f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Irecv(rbuf1[i],bsz,MPIU_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i));
157e26ad7d8SSatish Balay   }
158e26ad7d8SSatish Balay 
159e26ad7d8SSatish Balay   /* Allocate Memory for outgoing messages */
1605f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
1615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(sbuf1,size));
1625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(ptr,size));
163e26ad7d8SSatish Balay   {
16413f74950SBarry Smith     PetscInt *iptr = tmp,ict = 0;
165e26ad7d8SSatish Balay     for (i=0; i<nrqs; i++) {
166e26ad7d8SSatish Balay       j        = pa[i];
167e26ad7d8SSatish Balay       iptr    += ict;
168e26ad7d8SSatish Balay       sbuf1[j] = iptr;
169c1dc657dSBarry Smith       ict      = w1[2*j];
170e26ad7d8SSatish Balay     }
171e26ad7d8SSatish Balay   }
172e26ad7d8SSatish Balay 
173e26ad7d8SSatish Balay   /* Form the outgoing messages */
174e26ad7d8SSatish Balay   /* Initialize the header space */
175e26ad7d8SSatish Balay   for (i=0; i<nrqs; i++) {
176e26ad7d8SSatish Balay     j           = pa[i];
177e26ad7d8SSatish Balay     sbuf1[j][0] = 0;
1785f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
179e26ad7d8SSatish Balay     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
180e26ad7d8SSatish Balay   }
181e26ad7d8SSatish Balay 
182e26ad7d8SSatish Balay   /* Parse the isrow and copy data into outbuf */
183e26ad7d8SSatish Balay   for (i=0; i<ismax; i++) {
1845f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscArrayzero(ctr,size));
185e26ad7d8SSatish Balay     irow_i = irow[i];
186e26ad7d8SSatish Balay     jmax   = nrow[i];
187e26ad7d8SSatish Balay     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
188e26ad7d8SSatish Balay       row  = irow_i[j];
189e26ad7d8SSatish Balay       proc = rtable[row];
190e26ad7d8SSatish Balay       if (proc != rank) { /* copy to the outgoing buf*/
191e26ad7d8SSatish Balay         ctr[proc]++;
192e26ad7d8SSatish Balay         *ptr[proc] = row;
193e26ad7d8SSatish Balay         ptr[proc]++;
194e26ad7d8SSatish Balay       }
195e26ad7d8SSatish Balay     }
196e26ad7d8SSatish Balay     /* Update the headers for the current IS */
197e26ad7d8SSatish Balay     for (j=0; j<size; j++) { /* Can Optimise this loop too */
19806ef35abSSatish Balay       if ((ctr_j = ctr[j])) {
199e26ad7d8SSatish Balay         sbuf1_j        = sbuf1[j];
200e26ad7d8SSatish Balay         k              = ++sbuf1_j[0];
201e26ad7d8SSatish Balay         sbuf1_j[2*k]   = ctr_j;
202e26ad7d8SSatish Balay         sbuf1_j[2*k-1] = i;
203e26ad7d8SSatish Balay       }
204e26ad7d8SSatish Balay     }
205e26ad7d8SSatish Balay   }
206e26ad7d8SSatish Balay 
207e26ad7d8SSatish Balay   /*  Now  post the sends */
2085f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqs+1,&s_waits1));
209e26ad7d8SSatish Balay   for (i=0; i<nrqs; ++i) {
210e26ad7d8SSatish Balay     j    = pa[i];
2115f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Isend(sbuf1[j],w1[2*j],MPIU_INT,j,tag0,comm,s_waits1+i));
212e26ad7d8SSatish Balay   }
213e26ad7d8SSatish Balay 
2142d4ee042Sprj-   /* Post receives to capture the row_data from other procs */
2155f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqs+1,&r_waits2));
2165f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqs+1,&rbuf2));
2174b3ec233SSatish Balay   for (i=0; i<nrqs; i++) {
218e26ad7d8SSatish Balay     j     = pa[i];
219c1dc657dSBarry Smith     count = (w1[2*j] - (2*sbuf1[j][0] + 1))*N;
2205f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(count+1,&rbuf2[i]));
2215f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Irecv(rbuf2[i],count,MPIU_SCALAR,j,tag1,comm,r_waits2+i));
222e26ad7d8SSatish Balay   }
223e26ad7d8SSatish Balay 
2244b3ec233SSatish Balay   /* Receive messages(row_nos) and then, pack and send off the rowvalues
2254b3ec233SSatish Balay      to the correct processors */
226e26ad7d8SSatish Balay 
2275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqr+1,&s_waits2));
2285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqr+1,&r_status1));
2295f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqr+1,&sbuf2));
230e26ad7d8SSatish Balay 
231e26ad7d8SSatish Balay   {
23287828ca2SBarry Smith     PetscScalar *sbuf2_i,*v_start;
23313f74950SBarry Smith     PetscInt    s_proc;
234e26ad7d8SSatish Balay     for (i=0; i<nrqr; ++i) {
2355f80ce2aSJacob Faibussowitsch       CHKERRMPI(MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i));
2364b3ec233SSatish Balay       s_proc  = r_status1[i].MPI_SOURCE;         /* send processor */
237999d9058SBarry Smith       rbuf1_i = rbuf1[idex];         /* Actual message from s_proc */
238999d9058SBarry Smith       /* no of rows = end - start; since start is array idex[], 0idex, whel end
239999d9058SBarry Smith          is length of the buffer - which is 1idex */
240e26ad7d8SSatish Balay       start = 2*rbuf1_i[0] + 1;
2415f80ce2aSJacob Faibussowitsch       CHKERRMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end));
2424b3ec233SSatish Balay       /* allocate memory sufficinet to hold all the row values */
2435f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscMalloc1((end-start)*N,&sbuf2[idex]));
244999d9058SBarry Smith       sbuf2_i = sbuf2[idex];
2454b3ec233SSatish Balay       /* Now pack the data */
246e26ad7d8SSatish Balay       for (j=start; j<end; j++) {
2474b3ec233SSatish Balay         row     = rbuf1_i[j] - rstart;
2484b3ec233SSatish Balay         v_start = a->v + row;
2494b3ec233SSatish Balay         for (k=0; k<N; k++) {
2504b3ec233SSatish Balay           sbuf2_i[0] = v_start[0];
2512205254eSKarl Rupp           sbuf2_i++;
252bdc8096bSPierre Jolivet           v_start += a->lda;
253e26ad7d8SSatish Balay         }
254e26ad7d8SSatish Balay       }
2554b3ec233SSatish Balay       /* Now send off the data */
2565f80ce2aSJacob Faibussowitsch       CHKERRMPI(MPI_Isend(sbuf2[idex],(end-start)*N,MPIU_SCALAR,s_proc,tag1,comm,s_waits2+i));
2574b3ec233SSatish Balay     }
2584b3ec233SSatish Balay   }
2594b3ec233SSatish Balay   /* End Send-Recv of IS + row_numbers */
2605f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(r_status1));
2615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(r_waits1));
2625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqs+1,&s_status1));
2635f80ce2aSJacob Faibussowitsch   if (nrqs) CHKERRMPI(MPI_Waitall(nrqs,s_waits1,s_status1));
2645f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(s_status1));
2655f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(s_waits1));
266e26ad7d8SSatish Balay 
2674b3ec233SSatish Balay   /* Create the submatrices */
2684b3ec233SSatish Balay   if (scall == MAT_REUSE_MATRIX) {
269e26ad7d8SSatish Balay     for (i=0; i<ismax; i++) {
2704b3ec233SSatish Balay       mat = (Mat_SeqDense*)(submats[i]->data);
2712c71b3e2SJacob Faibussowitsch       PetscCheckFalse((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2725f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscArrayzero(mat->v,submats[i]->rmap->n*submats[i]->cmap->n));
2732205254eSKarl Rupp 
274d5f3da31SBarry Smith       submats[i]->factortype = C->factortype;
275e26ad7d8SSatish Balay     }
2764b3ec233SSatish Balay   } else {
277e26ad7d8SSatish Balay     for (i=0; i<ismax; i++) {
2785f80ce2aSJacob Faibussowitsch       CHKERRQ(MatCreate(PETSC_COMM_SELF,submats+i));
2795f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]));
2805f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetType(submats[i],((PetscObject)A)->type_name));
2815f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqDenseSetPreallocation(submats[i],NULL));
2824b3ec233SSatish Balay     }
2834b3ec233SSatish Balay   }
2844b3ec233SSatish Balay 
2854b3ec233SSatish Balay   /* Assemble the matrices */
2864b3ec233SSatish Balay   {
28713f74950SBarry Smith     PetscInt    col;
28887828ca2SBarry Smith     PetscScalar *imat_v,*mat_v,*imat_vi,*mat_vi;
2894b3ec233SSatish Balay 
2904b3ec233SSatish Balay     for (i=0; i<ismax; i++) {
2914b3ec233SSatish Balay       mat    = (Mat_SeqDense*)submats[i]->data;
2924b3ec233SSatish Balay       mat_v  = a->v;
2934b3ec233SSatish Balay       imat_v = mat->v;
294e26ad7d8SSatish Balay       irow_i = irow[i];
2954b3ec233SSatish Balay       m      = nrow[i];
2964b3ec233SSatish Balay       for (j=0; j<m; j++) {
297e26ad7d8SSatish Balay         row  = irow_i[j];
298e26ad7d8SSatish Balay         proc = rtable[row];
299e26ad7d8SSatish Balay         if (proc == rank) {
3004b3ec233SSatish Balay           row     = row - rstart;
3014b3ec233SSatish Balay           mat_vi  = mat_v + row;
3024b3ec233SSatish Balay           imat_vi = imat_v + j;
3034b3ec233SSatish Balay           for (k=0; k<ncol[i]; k++) {
3044b3ec233SSatish Balay             col          = icol[i][k];
305bdc8096bSPierre Jolivet             imat_vi[k*m] = mat_vi[col*a->lda];
306e26ad7d8SSatish Balay           }
3074b3ec233SSatish Balay         }
308e26ad7d8SSatish Balay       }
309e26ad7d8SSatish Balay     }
310e26ad7d8SSatish Balay   }
311e26ad7d8SSatish Balay 
312d0f46423SBarry Smith   /* Create row map-> This maps c->row to submat->row for each submat*/
3134b3ec233SSatish Balay   /* this is a very expensive operation wrt memory usage */
3145f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(ismax,&rmap));
3155f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc1(ismax*C->rmap->N,&rmap[0]));
3162205254eSKarl Rupp   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
317e26ad7d8SSatish Balay   for (i=0; i<ismax; i++) {
318e26ad7d8SSatish Balay     rmap_i = rmap[i];
319e26ad7d8SSatish Balay     irow_i = irow[i];
320e26ad7d8SSatish Balay     jmax   = nrow[i];
321e26ad7d8SSatish Balay     for (j=0; j<jmax; j++) {
322e26ad7d8SSatish Balay       rmap_i[irow_i[j]] = j;
323e26ad7d8SSatish Balay     }
324e26ad7d8SSatish Balay   }
325e26ad7d8SSatish Balay 
3264b3ec233SSatish Balay   /* Now Receive the row_values and assemble the rest of the matrix */
3275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqs+1,&r_status2));
3284b3ec233SSatish Balay   {
32913f74950SBarry Smith     PetscInt    is_max,tmp1,col,*sbuf1_i,is_sz;
33087828ca2SBarry Smith     PetscScalar *rbuf2_i,*imat_v,*imat_vi;
3314b3ec233SSatish Balay 
3324b3ec233SSatish Balay     for (tmp1=0; tmp1<nrqs; tmp1++) { /* For each message */
3335f80ce2aSJacob Faibussowitsch       CHKERRMPI(MPI_Waitany(nrqs,r_waits2,&i,r_status2+tmp1));
3344b3ec233SSatish Balay       /* Now dig out the corresponding sbuf1, which contains the IS data_structure */
3354b3ec233SSatish Balay       sbuf1_i = sbuf1[pa[i]];
3364b3ec233SSatish Balay       is_max  = sbuf1_i[0];
3374b3ec233SSatish Balay       ct1     = 2*is_max+1;
338e26ad7d8SSatish Balay       rbuf2_i = rbuf2[i];
3394b3ec233SSatish Balay       for (j=1; j<=is_max; j++) { /* For each IS belonging to the message */
340e26ad7d8SSatish Balay         is_no  = sbuf1_i[2*j-1];
3414b3ec233SSatish Balay         is_sz  = sbuf1_i[2*j];
342e26ad7d8SSatish Balay         mat    = (Mat_SeqDense*)submats[is_no]->data;
3434b3ec233SSatish Balay         imat_v = mat->v;
3444b3ec233SSatish Balay         rmap_i = rmap[is_no];
3454b3ec233SSatish Balay         m      = nrow[is_no];
3464b3ec233SSatish Balay         for (k=0; k<is_sz; k++,rbuf2_i+=N) {  /* For each row */
3474b3ec233SSatish Balay           row     = sbuf1_i[ct1]; ct1++;
348e26ad7d8SSatish Balay           row     = rmap_i[row];
3494b3ec233SSatish Balay           imat_vi = imat_v + row;
3504b3ec233SSatish Balay           for (l=0; l<ncol[is_no]; l++) { /* For each col */
3514b3ec233SSatish Balay             col          = icol[is_no][l];
3524b3ec233SSatish Balay             imat_vi[l*m] = rbuf2_i[col];
353e26ad7d8SSatish Balay           }
354e26ad7d8SSatish Balay         }
355e26ad7d8SSatish Balay       }
356e26ad7d8SSatish Balay     }
3574b3ec233SSatish Balay   }
3584b3ec233SSatish Balay   /* End Send-Recv of row_values */
3595f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(r_status2));
3605f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(r_waits2));
3615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nrqr+1,&s_status2));
3625f80ce2aSJacob Faibussowitsch   if (nrqr) CHKERRMPI(MPI_Waitall(nrqr,s_waits2,s_status2));
3635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(s_status2));
3645f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(s_waits2));
365e26ad7d8SSatish Balay 
366e26ad7d8SSatish Balay   /* Restore the indices */
367e26ad7d8SSatish Balay   for (i=0; i<ismax; i++) {
3685f80ce2aSJacob Faibussowitsch     CHKERRQ(ISRestoreIndices(isrow[i],irow+i));
3695f80ce2aSJacob Faibussowitsch     CHKERRQ(ISRestoreIndices(iscol[i],icol+i));
370e26ad7d8SSatish Balay   }
371e26ad7d8SSatish Balay 
3725f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,rtable));
3735f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree3(w1,w3,w4));
3745f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(pa));
375e26ad7d8SSatish Balay 
3764b3ec233SSatish Balay   for (i=0; i<nrqs; ++i) {
3775f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(rbuf2[i]));
3784b3ec233SSatish Balay   }
3795f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rbuf2));
3805f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree4(sbuf1,ptr,tmp,ctr));
3815f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rbuf1[0]));
3825f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rbuf1));
3834b3ec233SSatish Balay 
384e26ad7d8SSatish Balay   for (i=0; i<nrqr; ++i) {
3855f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(sbuf2[i]));
386e26ad7d8SSatish Balay   }
387e26ad7d8SSatish Balay 
3885f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(sbuf2));
3895f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rmap[0]));
3905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rmap));
391e26ad7d8SSatish Balay 
392e26ad7d8SSatish Balay   for (i=0; i<ismax; i++) {
3935f80ce2aSJacob Faibussowitsch     CHKERRQ(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
3945f80ce2aSJacob Faibussowitsch     CHKERRQ(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
395e26ad7d8SSatish Balay   }
396e26ad7d8SSatish Balay   PetscFunctionReturn(0);
397e26ad7d8SSatish Balay }
398e26ad7d8SSatish Balay 
39952c5f739Sprj- PETSC_INTERN PetscErrorCode MatScale_MPIDense(Mat inA,PetscScalar alpha)
400d84338a6SBarry Smith {
401d84338a6SBarry Smith   Mat_MPIDense   *A = (Mat_MPIDense*)inA->data;
402d84338a6SBarry Smith 
403d84338a6SBarry Smith   PetscFunctionBegin;
4045f80ce2aSJacob Faibussowitsch   CHKERRQ(MatScale(A->A,alpha));
405d84338a6SBarry Smith   PetscFunctionReturn(0);
406d84338a6SBarry Smith }
407