xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 1eb62cbb64d64b61ce4c1a3ec8cb7777b3ff11c2)
18a729477SBarry Smith 
2*1eb62cbbSBarry Smith #include "mpiaij.h"
38a729477SBarry Smith #include "vec/vecimpl.h"
48a729477SBarry Smith 
58a729477SBarry Smith 
6*1eb62cbbSBarry Smith #define CHUNCKSIZE   100
7*1eb62cbbSBarry Smith /*
8*1eb62cbbSBarry Smith    This is a simple minded stash. Do a linear search to determine if
9*1eb62cbbSBarry Smith  in stash, if not add to end.
10*1eb62cbbSBarry Smith */
11*1eb62cbbSBarry Smith static int StashValues(Stash *stash,int row,int n, int *idxn,
12*1eb62cbbSBarry Smith                        Scalar *values,InsertMode addv)
138a729477SBarry Smith {
14*1eb62cbbSBarry Smith   int    i,j,N = stash->n,found,*n_idx, *n_idy;
15*1eb62cbbSBarry Smith   Scalar val,*n_array;
168a729477SBarry Smith 
17*1eb62cbbSBarry Smith   for ( i=0; i<n; i++ ) {
18*1eb62cbbSBarry Smith     found = 0;
19*1eb62cbbSBarry Smith     val = *values++;
208a729477SBarry Smith     for ( j=0; j<N; j++ ) {
21*1eb62cbbSBarry Smith       if ( stash->idx[j] == row && stash->idy[j] == idxn[i]) {
22*1eb62cbbSBarry Smith         /* found a match */
23*1eb62cbbSBarry Smith         if (addv == AddValues) stash->array[j] += val;
24*1eb62cbbSBarry Smith         else stash->array[j] = val;
25*1eb62cbbSBarry Smith         found = 1;
268a729477SBarry Smith         break;
278a729477SBarry Smith       }
288a729477SBarry Smith     }
29*1eb62cbbSBarry Smith     if (!found) { /* not found so add to end */
30*1eb62cbbSBarry Smith       if ( stash->n == stash->nmax ) {
31*1eb62cbbSBarry Smith         /* allocate a larger stash */
32*1eb62cbbSBarry Smith         n_array = (Scalar *) MALLOC( (stash->nmax + CHUNCKSIZE)*(
33*1eb62cbbSBarry Smith                                      2*sizeof(int) + sizeof(Scalar)));
34*1eb62cbbSBarry Smith         CHKPTR(n_array);
35*1eb62cbbSBarry Smith         n_idx = (int *) (n_array + stash->nmax + CHUNCKSIZE);
36*1eb62cbbSBarry Smith         n_idy = (int *) (n_idx + stash->nmax + CHUNCKSIZE);
37*1eb62cbbSBarry Smith         MEMCPY(n_array,stash->array,stash->nmax*sizeof(Scalar));
38*1eb62cbbSBarry Smith         MEMCPY(n_idx,stash->idx,stash->nmax*sizeof(int));
39*1eb62cbbSBarry Smith         MEMCPY(n_idy,stash->idy,stash->nmax*sizeof(int));
40*1eb62cbbSBarry Smith         if (stash->array) FREE(stash->array);
41*1eb62cbbSBarry Smith         stash->array = n_array; stash->idx = n_idx; stash->idy = n_idy;
42*1eb62cbbSBarry Smith         stash->nmax += CHUNCKSIZE;
43*1eb62cbbSBarry Smith       }
44*1eb62cbbSBarry Smith       stash->array[stash->n]   = val;
45*1eb62cbbSBarry Smith       stash->idx[stash->n]     = row;
46*1eb62cbbSBarry Smith       stash->idy[stash->n++]   = idxn[i];
47*1eb62cbbSBarry Smith     }
488a729477SBarry Smith   }
498a729477SBarry Smith   return 0;
508a729477SBarry Smith }
518a729477SBarry Smith 
52*1eb62cbbSBarry Smith static int MatiAIJInsertValues(Mat mat,int m,int *idxm,int n,
53*1eb62cbbSBarry Smith                             int *idxn,Scalar *v,InsertMode addv)
548a729477SBarry Smith {
55*1eb62cbbSBarry Smith   Matimpiaij *aij = (Matimpiaij *) mat->data;
56*1eb62cbbSBarry Smith   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
57*1eb62cbbSBarry Smith   int        cstart = aij->cstart, cend = aij->cend,row,col;
588a729477SBarry Smith 
59*1eb62cbbSBarry Smith   if (aij->insertmode != NotSetValues && aij->insertmode != addv) {
60*1eb62cbbSBarry Smith     SETERR(1,"You cannot mix inserts and adds");
618a729477SBarry Smith   }
62*1eb62cbbSBarry Smith   aij->insertmode = addv;
638a729477SBarry Smith   for ( i=0; i<m; i++ ) {
64*1eb62cbbSBarry Smith     if (idxm[i] >= rstart && idxm[i] < rend) {
65*1eb62cbbSBarry Smith       row = idxm[i] - rstart;
66*1eb62cbbSBarry Smith       for ( j=0; j<n; j++ ) {
67*1eb62cbbSBarry Smith         if (idxn[j] >= cstart && idxn[j] < cend){
68*1eb62cbbSBarry Smith           col = idxn[j] - cstart;
69*1eb62cbbSBarry Smith           ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
70*1eb62cbbSBarry Smith         }
71*1eb62cbbSBarry Smith         else {
72*1eb62cbbSBarry Smith           col = idxn[j];
73*1eb62cbbSBarry Smith           ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
74*1eb62cbbSBarry Smith         }
75*1eb62cbbSBarry Smith       }
76*1eb62cbbSBarry Smith     }
77*1eb62cbbSBarry Smith     else {
78*1eb62cbbSBarry Smith       ierr = StashValues(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERR(ierr);
79*1eb62cbbSBarry Smith     }
808a729477SBarry Smith   }
818a729477SBarry Smith   return 0;
828a729477SBarry Smith }
838a729477SBarry Smith 
848a729477SBarry Smith /*
85*1eb62cbbSBarry Smith     the assembly code is alot like the code for vectors, we should
86*1eb62cbbSBarry Smith     sometime derive a single assembly code that can be used for
87*1eb62cbbSBarry Smith     either case.
888a729477SBarry Smith */
898a729477SBarry Smith 
90*1eb62cbbSBarry Smith static int MatiAIJBeginAssemble(Mat mat)
918a729477SBarry Smith {
92*1eb62cbbSBarry Smith   Matimpiaij  *aij = (Matimpiaij *) mat->data;
93*1eb62cbbSBarry Smith   MPI_Comm    comm = aij->comm;
94*1eb62cbbSBarry Smith   int         ierr, numtids = aij->numtids, *owners = aij->rowners;
95*1eb62cbbSBarry Smith   int         mytid = aij->mytid;
96*1eb62cbbSBarry Smith   MPI_Request *send_waits,*recv_waits;
97*1eb62cbbSBarry Smith   int         *nprocs,i,j,n,idx,*procs,nsends,nreceives,nmax,*work;
98*1eb62cbbSBarry Smith   int         tag = 50, *owner,*starts,count;
99*1eb62cbbSBarry Smith   InsertMode  addv;
100*1eb62cbbSBarry Smith   Scalar      *rvalues,*svalues;
101*1eb62cbbSBarry Smith 
102*1eb62cbbSBarry Smith   /* make sure all processors are either in INSERTMODE or ADDMODE */
103*1eb62cbbSBarry Smith   MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,numtids,MPI_INT,
104*1eb62cbbSBarry Smith                 MPI_BOR,comm);
105*1eb62cbbSBarry Smith   if (addv == (AddValues|InsertValues)) {
106*1eb62cbbSBarry Smith     SETERR(1,"Some processors have inserted while others have added");
107*1eb62cbbSBarry Smith   }
108*1eb62cbbSBarry Smith   aij->insertmode = addv; /* in case this processor had no cache */
109*1eb62cbbSBarry Smith 
110*1eb62cbbSBarry Smith   /*  first count number of contributors to each processor */
111*1eb62cbbSBarry Smith   nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
112*1eb62cbbSBarry Smith   MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
113*1eb62cbbSBarry Smith   owner = (int *) MALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTR(owner);
114*1eb62cbbSBarry Smith   for ( i=0; i<aij->stash.n; i++ ) {
115*1eb62cbbSBarry Smith     idx = aij->stash.idx[i];
116*1eb62cbbSBarry Smith     for ( j=0; j<numtids; j++ ) {
117*1eb62cbbSBarry Smith       if (idx >= owners[j] && idx < owners[j+1]) {
118*1eb62cbbSBarry Smith         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
1198a729477SBarry Smith       }
1208a729477SBarry Smith     }
1218a729477SBarry Smith   }
122*1eb62cbbSBarry Smith   nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}
123*1eb62cbbSBarry Smith 
124*1eb62cbbSBarry Smith   /* inform other processors of number of messages and max length*/
125*1eb62cbbSBarry Smith   work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
126*1eb62cbbSBarry Smith   MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
127*1eb62cbbSBarry Smith   nreceives = work[mytid];
128*1eb62cbbSBarry Smith   MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
129*1eb62cbbSBarry Smith   nmax = work[mytid];
130*1eb62cbbSBarry Smith   FREE(work);
131*1eb62cbbSBarry Smith 
132*1eb62cbbSBarry Smith   /* post receives:
133*1eb62cbbSBarry Smith        1) each message will consist of ordered pairs
134*1eb62cbbSBarry Smith      (global index,value) we store the global index as a double
135*1eb62cbbSBarry Smith      to simply the message passing.
136*1eb62cbbSBarry Smith        2) since we don't know how long each individual message is we
137*1eb62cbbSBarry Smith      allocate the largest needed buffer for each receive. Potentially
138*1eb62cbbSBarry Smith      this is a lot of wasted space.
139*1eb62cbbSBarry Smith 
140*1eb62cbbSBarry Smith 
141*1eb62cbbSBarry Smith        This could be done better.
142*1eb62cbbSBarry Smith   */
143*1eb62cbbSBarry Smith   rvalues = (Scalar *) MALLOC(3*(nreceives+1)*nmax*sizeof(Scalar));
144*1eb62cbbSBarry Smith   CHKPTR(rvalues);
145*1eb62cbbSBarry Smith   recv_waits = (MPI_Request *) MALLOC((nreceives+1)*sizeof(MPI_Request));
146*1eb62cbbSBarry Smith   CHKPTR(recv_waits);
147*1eb62cbbSBarry Smith   for ( i=0; i<nreceives; i++ ) {
148*1eb62cbbSBarry Smith     MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPI_SCALAR,MPI_ANY_SOURCE,tag,
149*1eb62cbbSBarry Smith               comm,recv_waits+i);
150*1eb62cbbSBarry Smith   }
151*1eb62cbbSBarry Smith 
152*1eb62cbbSBarry Smith   /* do sends:
153*1eb62cbbSBarry Smith       1) starts[i] gives the starting index in svalues for stuff going to
154*1eb62cbbSBarry Smith          the ith processor
155*1eb62cbbSBarry Smith   */
156*1eb62cbbSBarry Smith   svalues = (Scalar *) MALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) );
157*1eb62cbbSBarry Smith   CHKPTR(svalues);
158*1eb62cbbSBarry Smith   send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
159*1eb62cbbSBarry Smith   CHKPTR(send_waits);
160*1eb62cbbSBarry Smith   starts = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(starts);
161*1eb62cbbSBarry Smith   starts[0] = 0;
162*1eb62cbbSBarry Smith   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
163*1eb62cbbSBarry Smith   for ( i=0; i<aij->stash.n; i++ ) {
164*1eb62cbbSBarry Smith     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
165*1eb62cbbSBarry Smith     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
166*1eb62cbbSBarry Smith     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
167*1eb62cbbSBarry Smith   }
168*1eb62cbbSBarry Smith   FREE(owner);
169*1eb62cbbSBarry Smith   starts[0] = 0;
170*1eb62cbbSBarry Smith   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
171*1eb62cbbSBarry Smith   count = 0;
172*1eb62cbbSBarry Smith   for ( i=0; i<numtids; i++ ) {
173*1eb62cbbSBarry Smith     if (procs[i]) {
174*1eb62cbbSBarry Smith       MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPI_SCALAR,i,tag,
175*1eb62cbbSBarry Smith                 comm,send_waits+count++);
176*1eb62cbbSBarry Smith     }
177*1eb62cbbSBarry Smith   }
178*1eb62cbbSBarry Smith   FREE(starts); FREE(nprocs);
179*1eb62cbbSBarry Smith 
180*1eb62cbbSBarry Smith   /* Free cache space */
181*1eb62cbbSBarry Smith   aij->stash.nmax = aij->stash.n = 0;
182*1eb62cbbSBarry Smith   if (aij->stash.array){ FREE(aij->stash.array); aij->stash.array = 0;}
183*1eb62cbbSBarry Smith 
184*1eb62cbbSBarry Smith   aij->svalues    = svalues;       aij->rvalues = rvalues;
185*1eb62cbbSBarry Smith   aij->nsends     = nsends;         aij->nrecvs = nreceives;
186*1eb62cbbSBarry Smith   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
187*1eb62cbbSBarry Smith   aij->rmax       = nmax;
188*1eb62cbbSBarry Smith 
189*1eb62cbbSBarry Smith   return 0;
190*1eb62cbbSBarry Smith }
191*1eb62cbbSBarry Smith extern int MPIAIJSetUpMultiply(Mat);
192*1eb62cbbSBarry Smith 
193*1eb62cbbSBarry Smith static int MatiAIJEndAssemble(Mat mat)
194*1eb62cbbSBarry Smith {
195*1eb62cbbSBarry Smith   int        ierr;
196*1eb62cbbSBarry Smith   Matimpiaij *aij = (Matimpiaij *) mat->data;
197*1eb62cbbSBarry Smith 
198*1eb62cbbSBarry Smith   MPI_Status  *send_status,recv_status;
199*1eb62cbbSBarry Smith   int         index,idx,nrecvs = aij->nrecvs, count = nrecvs, i, n;
200*1eb62cbbSBarry Smith   int         row,col;
201*1eb62cbbSBarry Smith   Scalar      *values,val;
202*1eb62cbbSBarry Smith   InsertMode  addv = aij->insertmode;
203*1eb62cbbSBarry Smith 
204*1eb62cbbSBarry Smith   /*  wait on receives */
205*1eb62cbbSBarry Smith   while (count) {
206*1eb62cbbSBarry Smith     MPI_Waitany(nrecvs,aij->recv_waits,&index,&recv_status);
207*1eb62cbbSBarry Smith     /* unpack receives into our local space */
208*1eb62cbbSBarry Smith     values = aij->rvalues + 3*index*aij->rmax;
209*1eb62cbbSBarry Smith     MPI_Get_count(&recv_status,MPI_SCALAR,&n);
210*1eb62cbbSBarry Smith     n = n/3;
211*1eb62cbbSBarry Smith     for ( i=0; i<n; i++ ) {
212*1eb62cbbSBarry Smith       row = (int) PETSCREAL(values[3*i]) - aij->rstart;
213*1eb62cbbSBarry Smith       col = (int) PETSCREAL(values[3*i+1]);
214*1eb62cbbSBarry Smith       val = values[3*i+2];
215*1eb62cbbSBarry Smith       if (col >= aij->cstart && col < aij->cend) {
216*1eb62cbbSBarry Smith           col -= aij->cstart;
217*1eb62cbbSBarry Smith         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
218*1eb62cbbSBarry Smith       }
219*1eb62cbbSBarry Smith       else {
220*1eb62cbbSBarry Smith         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
221*1eb62cbbSBarry Smith       }
222*1eb62cbbSBarry Smith     }
223*1eb62cbbSBarry Smith     count--;
224*1eb62cbbSBarry Smith   }
225*1eb62cbbSBarry Smith   FREE(aij->recv_waits); FREE(aij->rvalues);
226*1eb62cbbSBarry Smith 
227*1eb62cbbSBarry Smith   /* wait on sends */
228*1eb62cbbSBarry Smith   if (aij->nsends) {
229*1eb62cbbSBarry Smith     send_status = (MPI_Status *) MALLOC( aij->nsends*sizeof(MPI_Status) );
230*1eb62cbbSBarry Smith     CHKPTR(send_status);
231*1eb62cbbSBarry Smith     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
232*1eb62cbbSBarry Smith     FREE(send_status);
233*1eb62cbbSBarry Smith   }
234*1eb62cbbSBarry Smith   FREE(aij->send_waits); FREE(aij->svalues);
235*1eb62cbbSBarry Smith 
236*1eb62cbbSBarry Smith   aij->insertmode = NotSetValues;
237*1eb62cbbSBarry Smith   ierr = MatBeginAssembly(aij->A); CHKERR(ierr);
238*1eb62cbbSBarry Smith   ierr = MatEndAssembly(aij->A); CHKERR(ierr);
239*1eb62cbbSBarry Smith 
240*1eb62cbbSBarry Smith   ierr = MPIAIJSetUpMultiply(mat); CHKERR(ierr);
241*1eb62cbbSBarry Smith   ierr = MatBeginAssembly(aij->B); CHKERR(ierr);
242*1eb62cbbSBarry Smith   ierr = MatEndAssembly(aij->B); CHKERR(ierr);
2438a729477SBarry Smith   return 0;
2448a729477SBarry Smith }
2458a729477SBarry Smith 
246*1eb62cbbSBarry Smith static int MatiZero(Mat A)
247*1eb62cbbSBarry Smith {
248*1eb62cbbSBarry Smith   Matimpiaij *l = (Matimpiaij *) A->data;
249*1eb62cbbSBarry Smith 
250*1eb62cbbSBarry Smith   MatZeroEntries(l->A); MatZeroEntries(l->B);
251*1eb62cbbSBarry Smith   return 0;
252*1eb62cbbSBarry Smith }
253*1eb62cbbSBarry Smith 
254*1eb62cbbSBarry Smith /* again this uses the same basic stratagy as in the assembly and
255*1eb62cbbSBarry Smith    scatter create routines, we should try to do it systemamatically
256*1eb62cbbSBarry Smith    if we can figure out the proper level of generality. */
257*1eb62cbbSBarry Smith 
258*1eb62cbbSBarry Smith /* the code does not do the diagonal entries correctly unless the
259*1eb62cbbSBarry Smith    matrix is square and the column and row owerships are identical.
260*1eb62cbbSBarry Smith    This is a BUG. The only way to fix it seems to be to access
261*1eb62cbbSBarry Smith    aij->A and aij->B directly and not through the MatZeroRows()
262*1eb62cbbSBarry Smith    routine.
263*1eb62cbbSBarry Smith */
264*1eb62cbbSBarry Smith static int MatiZerorows(Mat A,IS is,Scalar *diag)
265*1eb62cbbSBarry Smith {
266*1eb62cbbSBarry Smith   Matimpiaij     *l = (Matimpiaij *) A->data;
267*1eb62cbbSBarry Smith   int            i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids;
268*1eb62cbbSBarry Smith   int            *localrows,*procs,*nprocs,j,found,idx,nsends,*work;
269*1eb62cbbSBarry Smith   int            nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid;
270*1eb62cbbSBarry Smith   int            *rvalues,tag = 67,count,base,slen,n,len,*source;
271*1eb62cbbSBarry Smith   int            *lens,index,*lrows,*values;
272*1eb62cbbSBarry Smith   MPI_Comm       comm = l->comm;
273*1eb62cbbSBarry Smith   MPI_Request    *send_waits,*recv_waits;
274*1eb62cbbSBarry Smith   MPI_Status     recv_status,*send_status;
275*1eb62cbbSBarry Smith   IS             istmp;
276*1eb62cbbSBarry Smith 
277*1eb62cbbSBarry Smith   ierr = ISGetLocalSize(is,&N); CHKERR(ierr);
278*1eb62cbbSBarry Smith   ierr = ISGetIndices(is,&rows); CHKERR(ierr);
279*1eb62cbbSBarry Smith 
280*1eb62cbbSBarry Smith   /*  first count number of contributors to each processor */
281*1eb62cbbSBarry Smith   nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
282*1eb62cbbSBarry Smith   MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
283*1eb62cbbSBarry Smith   owner = (int *) MALLOC((N+1)*sizeof(int)); CHKPTR(owner); /* see note*/
284*1eb62cbbSBarry Smith   for ( i=0; i<N; i++ ) {
285*1eb62cbbSBarry Smith     idx = rows[i];
286*1eb62cbbSBarry Smith     found = 0;
287*1eb62cbbSBarry Smith     for ( j=0; j<numtids; j++ ) {
288*1eb62cbbSBarry Smith       if (idx >= owners[j] && idx < owners[j+1]) {
289*1eb62cbbSBarry Smith         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
290*1eb62cbbSBarry Smith       }
291*1eb62cbbSBarry Smith     }
292*1eb62cbbSBarry Smith     if (!found) SETERR(1,"Index out of range");
293*1eb62cbbSBarry Smith   }
294*1eb62cbbSBarry Smith   nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}
295*1eb62cbbSBarry Smith 
296*1eb62cbbSBarry Smith   /* inform other processors of number of messages and max length*/
297*1eb62cbbSBarry Smith   work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
298*1eb62cbbSBarry Smith   MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
299*1eb62cbbSBarry Smith   nrecvs = work[mytid];
300*1eb62cbbSBarry Smith   MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
301*1eb62cbbSBarry Smith   nmax = work[mytid];
302*1eb62cbbSBarry Smith   FREE(work);
303*1eb62cbbSBarry Smith 
304*1eb62cbbSBarry Smith   /* post receives:   */
305*1eb62cbbSBarry Smith   rvalues = (int *) MALLOC((nrecvs+1)*nmax*sizeof(int)); /*see note */
306*1eb62cbbSBarry Smith   CHKPTR(rvalues);
307*1eb62cbbSBarry Smith   recv_waits = (MPI_Request *) MALLOC((nrecvs+1)*sizeof(MPI_Request));
308*1eb62cbbSBarry Smith   CHKPTR(recv_waits);
309*1eb62cbbSBarry Smith   for ( i=0; i<nrecvs; i++ ) {
310*1eb62cbbSBarry Smith     MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,
311*1eb62cbbSBarry Smith               comm,recv_waits+i);
312*1eb62cbbSBarry Smith   }
313*1eb62cbbSBarry Smith 
314*1eb62cbbSBarry Smith   /* do sends:
315*1eb62cbbSBarry Smith       1) starts[i] gives the starting index in svalues for stuff going to
316*1eb62cbbSBarry Smith          the ith processor
317*1eb62cbbSBarry Smith   */
318*1eb62cbbSBarry Smith   svalues = (int *) MALLOC( (N+1)*sizeof(int) ); CHKPTR(svalues);
319*1eb62cbbSBarry Smith   send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
320*1eb62cbbSBarry Smith   CHKPTR(send_waits);
321*1eb62cbbSBarry Smith   starts = (int *) MALLOC( (numtids+1)*sizeof(int) ); CHKPTR(starts);
322*1eb62cbbSBarry Smith   starts[0] = 0;
323*1eb62cbbSBarry Smith   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
324*1eb62cbbSBarry Smith   for ( i=0; i<N; i++ ) {
325*1eb62cbbSBarry Smith     svalues[starts[owner[i]]++] = rows[i];
326*1eb62cbbSBarry Smith   }
327*1eb62cbbSBarry Smith   ISRestoreIndices(is,&rows);
328*1eb62cbbSBarry Smith 
329*1eb62cbbSBarry Smith   starts[0] = 0;
330*1eb62cbbSBarry Smith   for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
331*1eb62cbbSBarry Smith   count = 0;
332*1eb62cbbSBarry Smith   for ( i=0; i<numtids; i++ ) {
333*1eb62cbbSBarry Smith     if (procs[i]) {
334*1eb62cbbSBarry Smith       MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag,
335*1eb62cbbSBarry Smith                 comm,send_waits+count++);
336*1eb62cbbSBarry Smith     }
337*1eb62cbbSBarry Smith   }
338*1eb62cbbSBarry Smith   FREE(starts);
339*1eb62cbbSBarry Smith 
340*1eb62cbbSBarry Smith   base = owners[mytid];
341*1eb62cbbSBarry Smith 
342*1eb62cbbSBarry Smith   /*  wait on receives */
343*1eb62cbbSBarry Smith   lens = (int *) MALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTR(lens);
344*1eb62cbbSBarry Smith   source = lens + nrecvs;
345*1eb62cbbSBarry Smith   count = nrecvs; slen = 0;
346*1eb62cbbSBarry Smith   while (count) {
347*1eb62cbbSBarry Smith     MPI_Waitany(nrecvs,recv_waits,&index,&recv_status);
348*1eb62cbbSBarry Smith     /* unpack receives into our local space */
349*1eb62cbbSBarry Smith     MPI_Get_count(&recv_status,MPI_INT,&n);
350*1eb62cbbSBarry Smith     source[index]  = recv_status.MPI_SOURCE;
351*1eb62cbbSBarry Smith     lens[index]  = n;
352*1eb62cbbSBarry Smith     slen += n;
353*1eb62cbbSBarry Smith     count--;
354*1eb62cbbSBarry Smith   }
355*1eb62cbbSBarry Smith   FREE(recv_waits);
356*1eb62cbbSBarry Smith 
357*1eb62cbbSBarry Smith   /* move the data into the send scatter */
358*1eb62cbbSBarry Smith   lrows = (int *) MALLOC( slen*sizeof(int) ); CHKPTR(lrows);
359*1eb62cbbSBarry Smith   count = 0;
360*1eb62cbbSBarry Smith   for ( i=0; i<nrecvs; i++ ) {
361*1eb62cbbSBarry Smith     values = rvalues + i*nmax;
362*1eb62cbbSBarry Smith     for ( j=0; j<lens[i]; j++ ) {
363*1eb62cbbSBarry Smith       lrows[count++] = values[j] - base;
364*1eb62cbbSBarry Smith     }
365*1eb62cbbSBarry Smith   }
366*1eb62cbbSBarry Smith   FREE(rvalues); FREE(lens);
367*1eb62cbbSBarry Smith   FREE(owner); FREE(nprocs);
368*1eb62cbbSBarry Smith 
369*1eb62cbbSBarry Smith   /* actually zap the local rows */
370*1eb62cbbSBarry Smith   ierr = ISCreateSequential(slen,lrows,&istmp); CHKERR(ierr);  FREE(lrows);
371*1eb62cbbSBarry Smith   ierr = MatZeroRows(l->A,istmp,diag); CHKERR(ierr);
372*1eb62cbbSBarry Smith   ierr = MatZeroRows(l->B,istmp,0); CHKERR(ierr);
373*1eb62cbbSBarry Smith   ierr = ISDestroy(istmp); CHKERR(ierr);
374*1eb62cbbSBarry Smith 
375*1eb62cbbSBarry Smith   /* wait on sends */
376*1eb62cbbSBarry Smith   if (nsends) {
377*1eb62cbbSBarry Smith     send_status = (MPI_Status *) MALLOC( nsends*sizeof(MPI_Status) );
378*1eb62cbbSBarry Smith     CHKPTR(send_status);
379*1eb62cbbSBarry Smith     MPI_Waitall(nsends,send_waits,send_status);
380*1eb62cbbSBarry Smith     FREE(send_status);
381*1eb62cbbSBarry Smith   }
382*1eb62cbbSBarry Smith   FREE(send_waits); FREE(svalues);
383*1eb62cbbSBarry Smith 
384*1eb62cbbSBarry Smith 
385*1eb62cbbSBarry Smith   return 0;
386*1eb62cbbSBarry Smith }
387*1eb62cbbSBarry Smith 
388*1eb62cbbSBarry Smith static int MatiAIJMult(Mat aijin,Vec xx,Vec yy)
389*1eb62cbbSBarry Smith {
390*1eb62cbbSBarry Smith   Matimpiaij *aij = (Matimpiaij *) aijin->data;
391*1eb62cbbSBarry Smith   int        ierr;
392*1eb62cbbSBarry Smith 
393*1eb62cbbSBarry Smith   ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx);
394*1eb62cbbSBarry Smith   CHKERR(ierr);
395*1eb62cbbSBarry Smith   ierr = MatMult(aij->A,xx,yy); CHKERR(ierr);
396*1eb62cbbSBarry Smith   ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx); CHKERR(ierr);
397*1eb62cbbSBarry Smith   ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERR(ierr);
398*1eb62cbbSBarry Smith   return 0;
399*1eb62cbbSBarry Smith }
400*1eb62cbbSBarry Smith 
401*1eb62cbbSBarry Smith /*
402*1eb62cbbSBarry Smith   This only works correctly for square matrices where the subblock A->A is the
403*1eb62cbbSBarry Smith    diagonal block
404*1eb62cbbSBarry Smith */
405*1eb62cbbSBarry Smith static int MatiAIJgetdiag(Mat Ain,Vec v)
406*1eb62cbbSBarry Smith {
407*1eb62cbbSBarry Smith   Matimpiaij *A = (Matimpiaij *) Ain->data;
408*1eb62cbbSBarry Smith   return MatGetDiagonal(A->A,v);
409*1eb62cbbSBarry Smith }
410*1eb62cbbSBarry Smith 
411*1eb62cbbSBarry Smith static int MatiAIJdestroy(PetscObject obj)
412*1eb62cbbSBarry Smith {
413*1eb62cbbSBarry Smith   Mat        mat = (Mat) obj;
414*1eb62cbbSBarry Smith   Matimpiaij *aij = (Matimpiaij *) mat->data;
415*1eb62cbbSBarry Smith   int        ierr;
416*1eb62cbbSBarry Smith   FREE(aij->rowners);
417*1eb62cbbSBarry Smith   ierr = MatDestroy(aij->A); CHKERR(ierr);
418*1eb62cbbSBarry Smith   ierr = MatDestroy(aij->B); CHKERR(ierr);
419*1eb62cbbSBarry Smith   FREE(aij); FREE(mat);
420*1eb62cbbSBarry Smith   if (aij->lvec) VecDestroy(aij->lvec);
421*1eb62cbbSBarry Smith   if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx);
422*1eb62cbbSBarry Smith   return 0;
423*1eb62cbbSBarry Smith }
424*1eb62cbbSBarry Smith 
425*1eb62cbbSBarry Smith static int MatiView(PetscObject obj,Viewer viewer)
426*1eb62cbbSBarry Smith {
427*1eb62cbbSBarry Smith   Mat        mat = (Mat) obj;
428*1eb62cbbSBarry Smith   Matimpiaij *aij = (Matimpiaij *) mat->data;
429*1eb62cbbSBarry Smith   int        ierr;
430*1eb62cbbSBarry Smith 
431*1eb62cbbSBarry Smith   MPE_Seq_begin(aij->comm,1);
432*1eb62cbbSBarry Smith   printf("[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
433*1eb62cbbSBarry Smith           aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
434*1eb62cbbSBarry Smith           aij->cend);
435*1eb62cbbSBarry Smith   ierr = MatView(aij->A,viewer); CHKERR(ierr);
436*1eb62cbbSBarry Smith   ierr = MatView(aij->B,viewer); CHKERR(ierr);
437*1eb62cbbSBarry Smith   MPE_Seq_end(aij->comm,1);
438*1eb62cbbSBarry Smith   return 0;
439*1eb62cbbSBarry Smith }
440*1eb62cbbSBarry Smith 
441*1eb62cbbSBarry Smith /*
442*1eb62cbbSBarry Smith     This has to provide several versions.
443*1eb62cbbSBarry Smith 
444*1eb62cbbSBarry Smith      1) per sequential
445*1eb62cbbSBarry Smith      2) a) use only local smoothing updating outer values only once.
446*1eb62cbbSBarry Smith         b) local smoothing updating outer values each inner iteration
447*1eb62cbbSBarry Smith      3) color updating out values betwen colors. (this imples an
448*1eb62cbbSBarry Smith         ordering that is sort of related to the IS argument, it
449*1eb62cbbSBarry Smith         is not clear a IS argument makes the most sense perhaps it
450*1eb62cbbSBarry Smith         should be dropped.
451*1eb62cbbSBarry Smith */
4528a729477SBarry Smith static int MatiAIJrelax(Mat matin,Vec bb,double omega,int flag,IS is,
4538a729477SBarry Smith                         int its,Vec xx)
4548a729477SBarry Smith {
455*1eb62cbbSBarry Smith   Matimpiaij *mat = (Matimpiaij *) matin->data;
456*1eb62cbbSBarry Smith   Scalar     zero = 0.0;
4578a729477SBarry Smith   int        ierr,one = 1, tmp, *idx, *diag;
4588a729477SBarry Smith   int        n = mat->n, m = mat->m, i, j;
4598a729477SBarry Smith 
4608a729477SBarry Smith   if (is) SETERR(1,"No support for ordering in relaxation");
4618a729477SBarry Smith   if (flag & SOR_ZERO_INITIAL_GUESS) {
4628a729477SBarry Smith     if (ierr = VecSet(&zero,xx)) return ierr;
4638a729477SBarry Smith   }
464*1eb62cbbSBarry Smith 
465*1eb62cbbSBarry Smith   /* update outer values from other processors*/
466*1eb62cbbSBarry Smith 
467*1eb62cbbSBarry Smith   /* smooth locally */
4688a729477SBarry Smith   return 0;
4698a729477SBarry Smith }
4708a729477SBarry Smith /* -------------------------------------------------------------------*/
471*1eb62cbbSBarry Smith static struct _MatOps MatOps = {MatiAIJInsertValues,
4728a729477SBarry Smith        0, 0,
473*1eb62cbbSBarry Smith        MatiAIJMult,0,0,0,
474*1eb62cbbSBarry Smith        0,0,0,0,
4758a729477SBarry Smith        0,0,
4768a729477SBarry Smith        MatiAIJrelax,
4778a729477SBarry Smith        0,
478*1eb62cbbSBarry Smith        0,0,0,
4798a729477SBarry Smith        0,
4808a729477SBarry Smith        MatiAIJgetdiag,0,0,
481*1eb62cbbSBarry Smith        MatiAIJBeginAssemble,MatiAIJEndAssemble,
482*1eb62cbbSBarry Smith        0,
483*1eb62cbbSBarry Smith        0,MatiZero,MatiZerorows,0,
484*1eb62cbbSBarry Smith        0,0,0,0 };
4858a729477SBarry Smith 
4868a729477SBarry Smith 
4878a729477SBarry Smith 
4888a729477SBarry Smith /*@
4898a729477SBarry Smith 
490*1eb62cbbSBarry Smith       MatCreateMPIAIJ - Creates a sparse parallel matrix
491*1eb62cbbSBarry Smith                                  in AIJ format.
4928a729477SBarry Smith 
4938a729477SBarry Smith   Input Parameters:
494*1eb62cbbSBarry Smith .   comm - MPI communicator
495*1eb62cbbSBarry Smith .   m,n - number of local rows and columns (or -1 to have calculated)
496*1eb62cbbSBarry Smith .   M,N - global rows and columns (or -1 to have calculated)
497*1eb62cbbSBarry Smith .   d_nz - total number nonzeros in diagonal portion of matrix
498*1eb62cbbSBarry Smith .   d_nzz - number of nonzeros per row in diagonal portion of matrix or null
4998a729477SBarry Smith .           You must leave room for the diagonal entry even if it is zero.
500*1eb62cbbSBarry Smith .   o_nz - total number nonzeros in off-diagonal portion of matrix
501*1eb62cbbSBarry Smith .   o_nzz - number of nonzeros per row in off-diagonal portion of matrix
502*1eb62cbbSBarry Smith .           or null. You must have at least one nonzero per row.
5038a729477SBarry Smith 
5048a729477SBarry Smith   Output parameters:
5058a729477SBarry Smith .  newmat - the matrix
5068a729477SBarry Smith 
507*1eb62cbbSBarry Smith   Keywords: matrix, aij, compressed row, sparse, parallel
5088a729477SBarry Smith @*/
509*1eb62cbbSBarry Smith int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
510*1eb62cbbSBarry Smith                  int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat)
5118a729477SBarry Smith {
5128a729477SBarry Smith   Mat          mat;
513*1eb62cbbSBarry Smith   Matimpiaij   *aij;
514*1eb62cbbSBarry Smith   int          ierr, i,rl,len,sum[2],work[2];
5158a729477SBarry Smith   *newmat         = 0;
5168a729477SBarry Smith   CREATEHEADER(mat,_Mat);
517*1eb62cbbSBarry Smith   mat->data       = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij);
5188a729477SBarry Smith   mat->cookie     = MAT_COOKIE;
5198a729477SBarry Smith   mat->ops        = &MatOps;
5208a729477SBarry Smith   mat->destroy    = MatiAIJdestroy;
521*1eb62cbbSBarry Smith   mat->view       = MatiView;
522*1eb62cbbSBarry Smith   mat->type       = MATAIJMPI;
5238a729477SBarry Smith   mat->factor     = 0;
5248a729477SBarry Smith   mat->row        = 0;
5258a729477SBarry Smith   mat->col        = 0;
526*1eb62cbbSBarry Smith   aij->comm       = comm;
527*1eb62cbbSBarry Smith   aij->insertmode = NotSetValues;
528*1eb62cbbSBarry Smith   MPI_Comm_rank(comm,&aij->mytid);
529*1eb62cbbSBarry Smith   MPI_Comm_size(comm,&aij->numtids);
530*1eb62cbbSBarry Smith 
531*1eb62cbbSBarry Smith   if (M == -1 || N == -1) {
532*1eb62cbbSBarry Smith     work[0] = m; work[1] = n;
533*1eb62cbbSBarry Smith     MPI_Allreduce((void *) work,(void *) sum,1,MPI_INT,MPI_SUM,comm );
534*1eb62cbbSBarry Smith     if (M == -1) M = sum[0];
535*1eb62cbbSBarry Smith     if (N == -1) N = sum[1];
536*1eb62cbbSBarry Smith   }
537*1eb62cbbSBarry Smith   if (m == -1) {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);}
538*1eb62cbbSBarry Smith   if (n == -1) {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);}
5398a729477SBarry Smith   aij->m       = m;
5408a729477SBarry Smith   aij->n       = n;
541*1eb62cbbSBarry Smith   aij->N       = N;
542*1eb62cbbSBarry Smith   aij->M       = M;
543*1eb62cbbSBarry Smith 
544*1eb62cbbSBarry Smith   /* build local table of row and column ownerships */
545*1eb62cbbSBarry Smith   aij->rowners = (int *) MALLOC(2*(aij->numtids+2)*sizeof(int));
546*1eb62cbbSBarry Smith   CHKPTR(aij->rowners);
547*1eb62cbbSBarry Smith   aij->cowners = aij->rowners + aij->numtids + 1;
548*1eb62cbbSBarry Smith   MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm);
549*1eb62cbbSBarry Smith   aij->rowners[0] = 0;
550*1eb62cbbSBarry Smith   for ( i=2; i<=aij->numtids; i++ ) {
551*1eb62cbbSBarry Smith     aij->rowners[i] += aij->rowners[i-1];
5528a729477SBarry Smith   }
553*1eb62cbbSBarry Smith   aij->rstart = aij->rowners[aij->mytid];
554*1eb62cbbSBarry Smith   aij->rend   = aij->rowners[aij->mytid+1];
555*1eb62cbbSBarry Smith   MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm);
556*1eb62cbbSBarry Smith   aij->cowners[0] = 0;
557*1eb62cbbSBarry Smith   for ( i=2; i<=aij->numtids; i++ ) {
558*1eb62cbbSBarry Smith     aij->cowners[i] += aij->cowners[i-1];
5598a729477SBarry Smith   }
560*1eb62cbbSBarry Smith   aij->cstart = aij->cowners[aij->mytid];
561*1eb62cbbSBarry Smith   aij->cend   = aij->cowners[aij->mytid+1];
5628a729477SBarry Smith 
5638a729477SBarry Smith 
564*1eb62cbbSBarry Smith   ierr = MatCreateSequentialAIJ(m,n,d_nz,d_nnz,&aij->A); CHKERR(ierr);
565*1eb62cbbSBarry Smith   ierr = MatCreateSequentialAIJ(m,N,o_nz,o_nnz,&aij->B); CHKERR(ierr);
5668a729477SBarry Smith 
567*1eb62cbbSBarry Smith   /* build cache for off array entries formed */
568*1eb62cbbSBarry Smith   aij->stash.nmax = CHUNCKSIZE; /* completely arbratray number */
569*1eb62cbbSBarry Smith   aij->stash.n    = 0;
570*1eb62cbbSBarry Smith   aij->stash.array = (Scalar *) MALLOC( aij->stash.nmax*(2*sizeof(int) +
571*1eb62cbbSBarry Smith                             sizeof(Scalar))); CHKPTR(aij->stash.array);
572*1eb62cbbSBarry Smith   aij->stash.idx = (int *) (aij->stash.array + aij->stash.nmax);
573*1eb62cbbSBarry Smith   aij->stash.idy = (int *) (aij->stash.idx + aij->stash.nmax);
5748a729477SBarry Smith 
575*1eb62cbbSBarry Smith   /* stuff used for matrix vector multiply */
576*1eb62cbbSBarry Smith   aij->lvec      = 0;
577*1eb62cbbSBarry Smith   aij->Mvctx     = 0;
5788a729477SBarry Smith 
5798a729477SBarry Smith   *newmat = mat;
5808a729477SBarry Smith   return 0;
5818a729477SBarry Smith }
582