xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision d6dfbf8f6aa26f18ad3ebf3c96fa582aa9714225)
1 
2 #include "mpiaij.h"
3 #include "vec/vecimpl.h"
4 #include "inline/spops.h"
5 
6 #define CHUNCKSIZE   100
/*
   StashValues - Caches matrix entries destined for rows this processor
   does not own, until assembly time.

   This is a simple minded stash. Do a linear search to determine if
   in stash, if not add to end.

   Input:
     stash  - the stash to update
     row    - global row number of all n values
     n      - number of column entries
     idxn   - global column numbers
     values - the n values for this row
     addv   - AddValues to accumulate into an existing entry,
              otherwise the new value overwrites it

   NOTE: array, idx and idy live in ONE malloc'ed region (the Scalars
   first, then the two int index arrays), so only stash->array is ever
   passed to FREE.

   NOTE(review): the duplicate search scans only the N entries present at
   entry to this call, so duplicate columns within a single call are
   appended twice — presumably callers never pass duplicates; verify.
*/
static int StashValues(Stash *stash,int row,int n, int *idxn,
                       Scalar *values,InsertMode addv)
{
  int    i,j,N = stash->n,found,*n_idx, *n_idy;
  Scalar val,*n_array;

  for ( i=0; i<n; i++ ) {
    found = 0;
    val = *values++;
    /* linear search of the existing entries for this (row,column) */
    for ( j=0; j<N; j++ ) {
      if ( stash->idx[j] == row && stash->idy[j] == idxn[i]) {
        /* found a match */
        if (addv == AddValues) stash->array[j] += val;
        else stash->array[j] = val;
        found = 1;
        break;
      }
    }
    if (!found) { /* not found so add to end */
      if ( stash->n == stash->nmax ) {
        /* allocate a larger stash: one block holding nmax+CHUNCKSIZE
           Scalars followed by the two index arrays */
        n_array = (Scalar *) MALLOC( (stash->nmax + CHUNCKSIZE)*(
                                     2*sizeof(int) + sizeof(Scalar)));
        CHKPTR(n_array);
        n_idx = (int *) (n_array + stash->nmax + CHUNCKSIZE);
        n_idy = (int *) (n_idx + stash->nmax + CHUNCKSIZE);
        MEMCPY(n_array,stash->array,stash->nmax*sizeof(Scalar));
        MEMCPY(n_idx,stash->idx,stash->nmax*sizeof(int));
        MEMCPY(n_idy,stash->idy,stash->nmax*sizeof(int));
        /* one FREE releases values and both index arrays */
        if (stash->array) FREE(stash->array);
        stash->array = n_array; stash->idx = n_idx; stash->idy = n_idy;
        stash->nmax += CHUNCKSIZE;
      }
      stash->array[stash->n]   = val;
      stash->idx[stash->n]     = row;
      stash->idy[stash->n++]   = idxn[i];
    }
  }
  return 0;
}
51 
52 static int MatiAIJInsertValues(Mat mat,int m,int *idxm,int n,
53                             int *idxn,Scalar *v,InsertMode addv)
54 {
55   Matimpiaij *aij = (Matimpiaij *) mat->data;
56   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
57   int        cstart = aij->cstart, cend = aij->cend,row,col;
58 
59   if (aij->insertmode != NotSetValues && aij->insertmode != addv) {
60     SETERR(1,"You cannot mix inserts and adds");
61   }
62   aij->insertmode = addv;
63   for ( i=0; i<m; i++ ) {
64     if (idxm[i] < 0) continue;
65     if (idxm[i] >= rstart && idxm[i] < rend) {
66       row = idxm[i] - rstart;
67       for ( j=0; j<n; j++ ) {
68         if (idxn[j] < 0) continue;
69         if (idxn[j] >= cstart && idxn[j] < cend){
70           col = idxn[j] - cstart;
71           ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
72         }
73         else {
74           if (aij->assembled) {
75             SETERR(1,"Cannot insert off diagonal block in already\
76                      assembled matrix. Contact petsc-maint@mcs.anl.gov\
77                      if your need this feature");
78           }
79           col = idxn[j];
80           ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr);
81         }
82       }
83     }
84     else {
85       ierr = StashValues(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERR(ierr);
86     }
87   }
88   return 0;
89 }
90 
/*
    The assembly code is a lot like the code for vectors; at some point
    we should derive a single assembly routine that can be used for
    either case.
*/
96 
97 static int MatiAIJBeginAssemble(Mat mat)
98 {
99   Matimpiaij  *aij = (Matimpiaij *) mat->data;
100   MPI_Comm    comm = mat->comm;
101   int         ierr, numtids = aij->numtids, *owners = aij->rowners;
102   int         mytid = aij->mytid;
103   MPI_Request *send_waits,*recv_waits;
104   int         *nprocs,i,j,n,idx,*procs,nsends,nreceives,nmax,*work;
105   int         tag = 50, *owner,*starts,count;
106   InsertMode  addv;
107   Scalar      *rvalues,*svalues;
108 
109   /* make sure all processors are either in INSERTMODE or ADDMODE */
110   MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,numtids,MPI_INT,
111                 MPI_BOR,comm);
112   if (addv == (AddValues|InsertValues)) {
113     SETERR(1,"Some processors have inserted while others have added");
114   }
115   aij->insertmode = addv; /* in case this processor had no cache */
116 
117   /*  first count number of contributors to each processor */
118   nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
119   MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
120   owner = (int *) MALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTR(owner);
121   for ( i=0; i<aij->stash.n; i++ ) {
122     idx = aij->stash.idx[i];
123     for ( j=0; j<numtids; j++ ) {
124       if (idx >= owners[j] && idx < owners[j+1]) {
125         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
126       }
127     }
128   }
129   nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}
130 
131   /* inform other processors of number of messages and max length*/
132   work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
133   MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
134   nreceives = work[mytid];
135   MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
136   nmax = work[mytid];
137   FREE(work);
138 
139   /* post receives:
140        1) each message will consist of ordered pairs
141      (global index,value) we store the global index as a double
142      to simplify the message passing.
143        2) since we don't know how long each individual message is we
144      allocate the largest needed buffer for each receive. Potentially
145      this is a lot of wasted space.
146 
147 
148        This could be done better.
149   */
150   rvalues = (Scalar *) MALLOC(3*(nreceives+1)*nmax*sizeof(Scalar));
151   CHKPTR(rvalues);
152   recv_waits = (MPI_Request *) MALLOC((nreceives+1)*sizeof(MPI_Request));
153   CHKPTR(recv_waits);
154   for ( i=0; i<nreceives; i++ ) {
155     MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPI_SCALAR,MPI_ANY_SOURCE,tag,
156               comm,recv_waits+i);
157   }
158 
159   /* do sends:
160       1) starts[i] gives the starting index in svalues for stuff going to
161          the ith processor
162   */
163   svalues = (Scalar *) MALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) );
164   CHKPTR(svalues);
165   send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
166   CHKPTR(send_waits);
167   starts = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(starts);
168   starts[0] = 0;
169   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
170   for ( i=0; i<aij->stash.n; i++ ) {
171     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
172     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
173     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
174   }
175   FREE(owner);
176   starts[0] = 0;
177   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
178   count = 0;
179   for ( i=0; i<numtids; i++ ) {
180     if (procs[i]) {
181       MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPI_SCALAR,i,tag,
182                 comm,send_waits+count++);
183     }
184   }
185   FREE(starts); FREE(nprocs);
186 
187   /* Free cache space */
188   aij->stash.nmax = aij->stash.n = 0;
189   if (aij->stash.array){ FREE(aij->stash.array); aij->stash.array = 0;}
190 
191   aij->svalues    = svalues;       aij->rvalues = rvalues;
192   aij->nsends     = nsends;         aij->nrecvs = nreceives;
193   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
194   aij->rmax       = nmax;
195 
196   return 0;
197 }
198 extern int MPIAIJSetUpMultiply(Mat);
199 
200 static int MatiAIJEndAssemble(Mat mat)
201 {
202   int        ierr;
203   Matimpiaij *aij = (Matimpiaij *) mat->data;
204 
205   MPI_Status  *send_status,recv_status;
206   int         imdex,idx,nrecvs = aij->nrecvs, count = nrecvs, i, n;
207   int         row,col;
208   Scalar      *values,val;
209   InsertMode  addv = aij->insertmode;
210 
211   /*  wait on receives */
212   while (count) {
213     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
214     /* unpack receives into our local space */
215     values = aij->rvalues + 3*imdex*aij->rmax;
216     MPI_Get_count(&recv_status,MPI_SCALAR,&n);
217     n = n/3;
218     for ( i=0; i<n; i++ ) {
219       row = (int) PETSCREAL(values[3*i]) - aij->rstart;
220       col = (int) PETSCREAL(values[3*i+1]);
221       val = values[3*i+2];
222       if (col >= aij->cstart && col < aij->cend) {
223           col -= aij->cstart;
224         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
225       }
226       else {
227         if (aij->assembled) {
228           SETERR(1,"Cannot insert off diagonal block in already\
229                    assembled matrix. Contact petsc-maint@mcs.anl.gov\
230                    if your need this feature");
231         }
232         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
233       }
234     }
235     count--;
236   }
237   FREE(aij->recv_waits); FREE(aij->rvalues);
238 
239   /* wait on sends */
240   if (aij->nsends) {
241     send_status = (MPI_Status *) MALLOC( aij->nsends*sizeof(MPI_Status) );
242     CHKPTR(send_status);
243     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
244     FREE(send_status);
245   }
246   FREE(aij->send_waits); FREE(aij->svalues);
247 
248   aij->insertmode = NotSetValues;
249   ierr = MatBeginAssembly(aij->A); CHKERR(ierr);
250   ierr = MatEndAssembly(aij->A); CHKERR(ierr);
251 
252 
253   if (aij->assembled) {
254     /* this is because the present implimentation doesn't support
255      inserting values off the diagonal block.*/
256     return 0;
257   }
258 
259   ierr = MPIAIJSetUpMultiply(mat); CHKERR(ierr);
260   ierr = MatBeginAssembly(aij->B); CHKERR(ierr);
261   ierr = MatEndAssembly(aij->B); CHKERR(ierr);
262 
263   aij->assembled = 1;
264   return 0;
265 }
266 
267 static int MatiZero(Mat A)
268 {
269   Matimpiaij *l = (Matimpiaij *) A->data;
270 
271   MatZeroEntries(l->A); MatZeroEntries(l->B);
272   return 0;
273 }
274 
/* Again this uses the same basic strategy as in the assembly and
   scatter create routines; we should try to do it systematically
   if we can figure out the proper level of generality. */

/* The code does not do the diagonal entries correctly unless the
   matrix is square and the column and row ownerships are identical.
   This is a BUG. The only way to fix it seems to be to access
   aij->A and aij->B directly and not through the MatZeroRows()
   routine.
*/
285 static int MatiZerorows(Mat A,IS is,Scalar *diag)
286 {
287   Matimpiaij     *l = (Matimpiaij *) A->data;
288   int            i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids;
289   int            *localrows,*procs,*nprocs,j,found,idx,nsends,*work;
290   int            nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid;
291   int            *rvalues,tag = 67,count,base,slen,n,len,*source;
292   int            *lens,imdex,*lrows,*values;
293   MPI_Comm       comm = A->comm;
294   MPI_Request    *send_waits,*recv_waits;
295   MPI_Status     recv_status,*send_status;
296   IS             istmp;
297 
298   ierr = ISGetLocalSize(is,&N); CHKERR(ierr);
299   ierr = ISGetIndices(is,&rows); CHKERR(ierr);
300 
301   /*  first count number of contributors to each processor */
302   nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
303   MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
304   owner = (int *) MALLOC((N+1)*sizeof(int)); CHKPTR(owner); /* see note*/
305   for ( i=0; i<N; i++ ) {
306     idx = rows[i];
307     found = 0;
308     for ( j=0; j<numtids; j++ ) {
309       if (idx >= owners[j] && idx < owners[j+1]) {
310         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
311       }
312     }
313     if (!found) SETERR(1,"Imdex out of range");
314   }
315   nsends = 0;  for ( i=0; i<numtids; i++ ) { nsends += procs[i];}
316 
317   /* inform other processors of number of messages and max length*/
318   work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
319   MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
320   nrecvs = work[mytid];
321   MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
322   nmax = work[mytid];
323   FREE(work);
324 
325   /* post receives:   */
326   rvalues = (int *) MALLOC((nrecvs+1)*nmax*sizeof(int)); /*see note */
327   CHKPTR(rvalues);
328   recv_waits = (MPI_Request *) MALLOC((nrecvs+1)*sizeof(MPI_Request));
329   CHKPTR(recv_waits);
330   for ( i=0; i<nrecvs; i++ ) {
331     MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,
332               comm,recv_waits+i);
333   }
334 
335   /* do sends:
336       1) starts[i] gives the starting index in svalues for stuff going to
337          the ith processor
338   */
339   svalues = (int *) MALLOC( (N+1)*sizeof(int) ); CHKPTR(svalues);
340   send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
341   CHKPTR(send_waits);
342   starts = (int *) MALLOC( (numtids+1)*sizeof(int) ); CHKPTR(starts);
343   starts[0] = 0;
344   for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
345   for ( i=0; i<N; i++ ) {
346     svalues[starts[owner[i]]++] = rows[i];
347   }
348   ISRestoreIndices(is,&rows);
349 
350   starts[0] = 0;
351   for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
352   count = 0;
353   for ( i=0; i<numtids; i++ ) {
354     if (procs[i]) {
355       MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag,
356                 comm,send_waits+count++);
357     }
358   }
359   FREE(starts);
360 
361   base = owners[mytid];
362 
363   /*  wait on receives */
364   lens = (int *) MALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTR(lens);
365   source = lens + nrecvs;
366   count = nrecvs; slen = 0;
367   while (count) {
368     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
369     /* unpack receives into our local space */
370     MPI_Get_count(&recv_status,MPI_INT,&n);
371     source[imdex]  = recv_status.MPI_SOURCE;
372     lens[imdex]  = n;
373     slen += n;
374     count--;
375   }
376   FREE(recv_waits);
377 
378   /* move the data into the send scatter */
379   lrows = (int *) MALLOC( slen*sizeof(int) ); CHKPTR(lrows);
380   count = 0;
381   for ( i=0; i<nrecvs; i++ ) {
382     values = rvalues + i*nmax;
383     for ( j=0; j<lens[i]; j++ ) {
384       lrows[count++] = values[j] - base;
385     }
386   }
387   FREE(rvalues); FREE(lens);
388   FREE(owner); FREE(nprocs);
389 
390   /* actually zap the local rows */
391   ierr = ISCreateSequential(slen,lrows,&istmp); CHKERR(ierr);  FREE(lrows);
392   ierr = MatZeroRows(l->A,istmp,diag); CHKERR(ierr);
393   ierr = MatZeroRows(l->B,istmp,0); CHKERR(ierr);
394   ierr = ISDestroy(istmp); CHKERR(ierr);
395 
396   /* wait on sends */
397   if (nsends) {
398     send_status = (MPI_Status *) MALLOC( nsends*sizeof(MPI_Status) );
399     CHKPTR(send_status);
400     MPI_Waitall(nsends,send_waits,send_status);
401     FREE(send_status);
402   }
403   FREE(send_waits); FREE(svalues);
404 
405 
406   return 0;
407 }
408 
409 static int MatiAIJMult(Mat aijin,Vec xx,Vec yy)
410 {
411   Matimpiaij *aij = (Matimpiaij *) aijin->data;
412   int        ierr;
413 
414   ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,ScatterAll,aij->Mvctx);
415   CHKERR(ierr);
416   ierr = MatMult(aij->A,xx,yy); CHKERR(ierr);
417   ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,ScatterAll,aij->Mvctx);
418   CHKERR(ierr);
419   ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERR(ierr);
420   return 0;
421 }
422 
423 /*
424   This only works correctly for square matrices where the subblock A->A is the
425    diagonal block
426 */
427 static int MatiAIJgetdiag(Mat Ain,Vec v)
428 {
429   Matimpiaij *A = (Matimpiaij *) Ain->data;
430   return MatGetDiagonal(A->A,v);
431 }
432 
433 static int MatiAIJdestroy(PetscObject obj)
434 {
435   Mat        mat = (Mat) obj;
436   Matimpiaij *aij = (Matimpiaij *) mat->data;
437   int        ierr;
438   FREE(aij->rowners);
439   ierr = MatDestroy(aij->A); CHKERR(ierr);
440   ierr = MatDestroy(aij->B); CHKERR(ierr);
441   FREE(aij); FREE(mat);
442   if (aij->lvec) VecDestroy(aij->lvec);
443   if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx);
444   return 0;
445 }
446 
447 static int MatiView(PetscObject obj,Viewer viewer)
448 {
449   Mat        mat = (Mat) obj;
450   Matimpiaij *aij = (Matimpiaij *) mat->data;
451   int        ierr;
452 
453   MPE_Seq_begin(mat->comm,1);
454   printf("[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
455           aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
456           aij->cend);
457   ierr = MatView(aij->A,viewer); CHKERR(ierr);
458   ierr = MatView(aij->B,viewer); CHKERR(ierr);
459   MPE_Seq_end(mat->comm,1);
460   return 0;
461 }
462 
463 extern int MatiAIJmarkdiag(Matiaij  *);
/*
    This has to provide several versions:

     1) as per the sequential case
     2) a) use only local smoothing, updating outer values only once
        b) local smoothing, updating outer values each inner iteration
     3) coloring, updating outer values between colors
*/
/*
   MatiAIJrelax - SOR (successive over-relaxation) for the parallel AIJ
   format.

   Input:
     matin - the matrix
     bb    - right hand side
     omega - relaxation factor
     flag  - SOR_* bits selecting sweep direction and locality
     shift - added to each diagonal entry during the sweep
     its   - number of iterations
   Output:
     xx    - the iterate (also the initial guess unless
             SOR_ZERO_INITIAL_GUESS is set)

   The true parallel sweeps pipeline the update across processors with
   VecPipelineBegin/End; the SOR_LOCAL_* variants scatter the
   off-processor values once per iteration (or once overall) and then
   smooth purely locally.

   NOTE: the sequential blocks A and B store CSR data with 1-based
   (Fortran-style) offsets — hence the repeated "- 1" in the index
   arithmetic and the shifted base pointers xs and ls.
*/
static int MatiAIJrelax(Mat matin,Vec bb,double omega,int flag,double shift,
                        int its,Vec xx)
{
  Matimpiaij *mat = (Matimpiaij *) matin->data;
  Mat        AA = mat->A, BB = mat->B;
  Matiaij    *A = (Matiaij *) AA->data, *B = (Matiaij *)BB->data;
  Scalar     zero = 0.0,*b,*x,*w,*xs,*ls,d,*v,sum;
  int        ierr,one = 1, tmp, *idx, *diag,mytid;
  int        n = mat->n, m = mat->m, i, j;

  if (flag & SOR_ZERO_INITIAL_GUESS) {
    /* could be more efficient by incorporating with below */
    if (ierr = VecSet(&zero,xx)) return ierr;
  }

  VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
  xs = x - 1; /* shift by one for index start of 1 */
  ls--;;
  /* make sure the location of each diagonal entry in A is known */
  if (!A->diag) {if (ierr = MatiAIJmarkdiag(A)) return ierr;}
  diag = A->diag;

  if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
    /* parallel symmetric sweep: forward then backward, pipelined */
    ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
    CHKERR(ierr);
    ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
    CHKERR(ierr);
    while (its--) {
      /* go down through the rows */
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=0; i<m; i++ ) {
        /* dot of row i of A with the local iterate ... */
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        /* ... plus row i of B with the scattered outer values */
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                            mat->Mvctx); CHKERR(ierr);
      /* come up through the rows */
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                            mat->Mvctx); CHKERR(ierr);
    }
  }
  else if (flag & SOR_FORWARD_SWEEP){
    /* parallel forward sweep */
    while (its--) {
      ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
      CHKERR(ierr);
      ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
      CHKERR(ierr);
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=0; i<m; i++ ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                            mat->Mvctx); CHKERR(ierr);
    }
  }
  else if (flag & SOR_BACKWARD_SWEEP){
    /* parallel backward sweep */
    while (its--) {
      ierr = VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterDown,
                            mat->Mvctx); CHKERR(ierr);
      ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterDown,
                            mat->Mvctx); CHKERR(ierr);
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                            mat->Mvctx); CHKERR(ierr);
    }
  }
  else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
    /* local symmetric sweep: outer values updated only once */
    ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    while (its--) {
      /* go down through the rows */
      for ( i=0; i<m; i++ ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      /* come up through the rows */
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  else if (flag & SOR_LOCAL_FORWARD_SWEEP){
    /* local forward sweep */
    ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    while (its--) {
      for ( i=0; i<m; i++ ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
    /* local backward sweep */
    ierr = VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterAll,
                            mat->Mvctx); CHKERR(ierr);
    ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterAll,
                            mat->Mvctx); CHKERR(ierr);
    while (its--) {
      for ( i=m-1; i>-1; i-- ) {
        n    = A->i[i+1] - A->i[i];
        idx  = A->j + A->i[i] - 1;
        v    = A->a + A->i[i] - 1;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = shift + A->a[diag[i]-1];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] - 1;
        v    = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  return 0;
}
667 static int MatiAIJinsopt(Mat aijin,int op)
668 {
669   Matimpiaij *aij = (Matimpiaij *) aijin->data;
670 
671   if      (op == NO_NEW_NONZERO_LOCATIONS)  {
672     MatSetOption(aij->A,op);
673     MatSetOption(aij->B,op);
674   }
675   else if (op == YES_NEW_NONZERO_LOCATIONS) {
676     MatSetOption(aij->A,op);
677     MatSetOption(aij->B,op);
678   }
679   else if (op == COLUMN_ORIENTED) SETERR(1,"Column oriented not supported");
680   return 0;
681 }
682 
683 static int MatiAIJsize(Mat matin,int *m,int *n)
684 {
685   Matimpiaij *mat = (Matimpiaij *) matin->data;
686   *m = mat->M; *n = mat->N;
687   return 0;
688 }
689 
690 static int MatiAIJlocalsize(Mat matin,int *m,int *n)
691 {
692   Matimpiaij *mat = (Matimpiaij *) matin->data;
693   *m = mat->m; *n = mat->n;
694   return 0;
695 }
696 
697 static int MatiAIJrange(Mat matin,int *m,int *n)
698 {
699   Matimpiaij *mat = (Matimpiaij *) matin->data;
700   *m = mat->rstart; *n = mat->rend;
701   return 0;
702 }
703 
static int MatiCopy(Mat,Mat *);

/* -------------------------------------------------------------------*/
/* Operation table for the parallel (MPI) AIJ format.  Zero slots are
   operations this format does not implement; the named entries are the
   routines defined above in this file. */
static struct _MatOps MatOps = {MatiAIJInsertValues,
       0, 0,
       MatiAIJMult,0,0,0,
       0,0,0,0,
       0,0,
       MatiAIJrelax,
       0,
       0,0,0,
       MatiCopy,
       MatiAIJgetdiag,0,0,
       MatiAIJBeginAssemble,MatiAIJEndAssemble,
       0,
       MatiAIJinsopt,MatiZero,MatiZerorows,0,
       0,0,0,0,
       MatiAIJsize,MatiAIJlocalsize,MatiAIJrange };
722 
723 
724 
/*@
      MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format.

  Input Parameters:
.   comm - MPI communicator
.   m,n - number of local rows and columns (or -1 to have them calculated)
.   M,N - global rows and columns (or -1 to have them calculated)
.   d_nz - total number of nonzeros in diagonal portion of matrix
.   d_nnz - number of nonzeros per row in diagonal portion of matrix,
.           or null. You must leave room for the diagonal entry even if
.           it is zero.
.   o_nz - total number of nonzeros in off-diagonal portion of matrix
.   o_nnz - number of nonzeros per row in off-diagonal portion of matrix,
.           or null. You must have at least one nonzero per row.

  Output Parameter:
.  newmat - the matrix

  Keywords: matrix, aij, compressed row, sparse, parallel
@*/
746 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
747                  int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat)
748 {
749   Mat          mat;
750   Matimpiaij   *aij;
751   int          ierr, i,rl,len,sum[2],work[2];
752   *newmat         = 0;
753   CREATEHEADER(mat,_Mat);
754   mat->data       = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij);
755   mat->cookie     = MAT_COOKIE;
756   mat->ops        = &MatOps;
757   mat->destroy    = MatiAIJdestroy;
758   mat->view       = MatiView;
759   mat->type       = MATAIJMPI;
760   mat->factor     = 0;
761   mat->row        = 0;
762   mat->col        = 0;
763   mat->outofrange = 0;
764   mat->Mlow       = 0;
765   mat->Mhigh      = M;
766   mat->Nlow       = 0;
767   mat->Nhigh      = N;
768 
769 
770   mat->comm       = comm;
771   aij->insertmode = NotSetValues;
772   MPI_Comm_rank(comm,&aij->mytid);
773   MPI_Comm_size(comm,&aij->numtids);
774 
775   if (M == -1 || N == -1) {
776     work[0] = m; work[1] = n;
777     MPI_Allreduce((void *) work,(void *) sum,2,MPI_INT,MPI_SUM,comm );
778     if (M == -1) M = sum[0];
779     if (N == -1) N = sum[1];
780   }
781   if (m == -1) {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);}
782   if (n == -1) {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);}
783   aij->m       = m;
784   aij->n       = n;
785   aij->N       = N;
786   aij->M       = M;
787 
788   /* build local table of row and column ownerships */
789   aij->rowners = (int *) MALLOC(2*(aij->numtids+2)*sizeof(int));
790   CHKPTR(aij->rowners);
791   aij->cowners = aij->rowners + aij->numtids + 1;
792   MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm);
793   aij->rowners[0] = 0;
794   for ( i=2; i<=aij->numtids; i++ ) {
795     aij->rowners[i] += aij->rowners[i-1];
796   }
797   aij->rstart = aij->rowners[aij->mytid];
798   aij->rend   = aij->rowners[aij->mytid+1];
799   MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm);
800   aij->cowners[0] = 0;
801   for ( i=2; i<=aij->numtids; i++ ) {
802     aij->cowners[i] += aij->cowners[i-1];
803   }
804   aij->cstart = aij->cowners[aij->mytid];
805   aij->cend   = aij->cowners[aij->mytid+1];
806 
807 
808   ierr = MatCreateSequentialAIJ(m,n,d_nz,d_nnz,&aij->A); CHKERR(ierr);
809   ierr = MatCreateSequentialAIJ(m,N,o_nz,o_nnz,&aij->B); CHKERR(ierr);
810 
811   /* build cache for off array entries formed */
812   aij->stash.nmax = CHUNCKSIZE; /* completely arbratray number */
813   aij->stash.n    = 0;
814   aij->stash.array = (Scalar *) MALLOC( aij->stash.nmax*(2*sizeof(int) +
815                             sizeof(Scalar))); CHKPTR(aij->stash.array);
816   aij->stash.idx = (int *) (aij->stash.array + aij->stash.nmax);
817   aij->stash.idy = (int *) (aij->stash.idx + aij->stash.nmax);
818 
819   /* stuff used for matrix vector multiply */
820   aij->lvec      = 0;
821   aij->Mvctx     = 0;
822   aij->assembled = 0;
823 
824   *newmat = mat;
825   return 0;
826 }
827 
828 static int MatiCopy(Mat matin,Mat *newmat)
829 {
830   Mat        mat;
831   Matimpiaij *aij,*oldmat = (Matimpiaij *) matin->data;
832   int        i,rl,len, m = oldmat->m, n = oldmat->n,ierr;
833   *newmat      = 0;
834 
835   if (!oldmat->assembled) SETERR(1,"Cannot copy unassembled matrix");
836   CREATEHEADER(mat,_Mat);
837   mat->data       = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij);
838   mat->cookie     = MAT_COOKIE;
839   mat->ops        = &MatOps;
840   mat->destroy    = MatiAIJdestroy;
841   mat->view       = MatiView;
842   mat->type       = MATAIJSEQ;
843   mat->factor     = matin->factor;
844   mat->row        = 0;
845   mat->col        = 0;
846   mat->outofrange = matin->outofrange;
847   mat->Mlow       = matin->Mlow;
848   mat->Mhigh      = matin->Mhigh;
849   mat->Nlow       = matin->Nlow;
850   mat->Nhigh      = matin->Nhigh;
851 
852   aij->m          = oldmat->m;
853   aij->n          = oldmat->n;
854   aij->M          = oldmat->M;
855   aij->N          = oldmat->N;
856 
857   aij->assembled  = 1;
858   aij->rstart     = oldmat->rstart;
859   aij->rend       = oldmat->rend;
860   aij->cstart     = oldmat->cstart;
861   aij->cend       = oldmat->cend;
862   aij->numtids    = oldmat->numtids;
863   aij->mytid      = oldmat->mytid;
864   aij->insertmode = NotSetValues;
865 
866   aij->rowners    = (int *) MALLOC( (aij->numtids+1)*sizeof(int) );
867   CHKPTR(aij->rowners);
868   MEMCPY(aij->rowners,oldmat->rowners,(aij->numtids+1)*sizeof(int));
869   aij->stash.nmax = 0;
870   aij->stash.n    = 0;
871   aij->stash.array= 0;
872   mat->comm       = matin->comm;
873 
874   ierr =  VecCreate(oldmat->lvec,&aij->lvec); CHKERR(ierr);
875   ierr =  VecScatterCtxCopy(oldmat->Mvctx,&aij->Mvctx); CHKERR(ierr);
876   ierr =  MatCopy(oldmat->A,&aij->A); CHKERR(ierr);
877   ierr =  MatCopy(oldmat->B,&aij->B); CHKERR(ierr);
878   *newmat = mat;
879   return 0;
880 }
881