xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 4b2e5ea65180f80d7b8092696cf316c30453dc0d)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.126 1996/02/09 15:10:11 curfman Exp curfman $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "vec/vecimpl.h"
7 #include "inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way since the
12 length of colmap equals the global number of columns.
13 */
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
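
/*
   For illustration with hypothetical numbers: if this process owns three
   off-diagonal columns whose global indices are garray[] = {3,7,9} and the
   matrix uses 1-based internal indexing (shift = -1), the loop above sets
   colmap[3] = 1, colmap[7] = 2 and colmap[9] = 3, while all other entries
   remain 0 from the PetscMemzero().  MatSetValues_MPIAIJ() later recovers a
   local column as colmap[global] + shift and treats a negative result as
   "this column is not yet present in B".
*/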
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int        ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
39 static int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
42   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
43   Scalar     value;
44   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
45   int        cstart = aij->cstart, cend = aij->cend,row,col;
46   int        shift = C->indexshift,roworiented = aij->roworiented;
47 
48   if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
49     SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
50   }
51   aij->insertmode = addv;
52   for ( i=0; i<m; i++ ) {
53     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
54     if (im[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
55     if (im[i] >= rstart && im[i] < rend) {
56       row = im[i] - rstart;
57       for ( j=0; j<n; j++ ) {
58         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
59         if (in[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
60         if (in[j] >= cstart && in[j] < cend){
61           col = in[j] - cstart;
62           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
63           ierr = MatSetValues(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
64         }
65         else {
66           if (mat->was_assembled) {
67             if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
68             col = aij->colmap[in[j]] + shift;
69             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
70               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
71               col =  in[j];
72             }
73           }
74           else col = in[j];
75           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
76           ierr = MatSetValues(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
77         }
78       }
79     }
80     else {
81       if (roworiented) {
82         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
83       }
84       else {
85         row = im[i];
86         for ( j=0; j<n; j++ ) {
87           ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
88         }
89       }
90     }
91   }
92   return 0;
93 }
94 
95 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
96 {
97   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
98   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
99   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
100   int        cstart = aij->cstart, cend = aij->cend,row,col;
101   int        shift = C->indexshift;
102 
103   for ( i=0; i<m; i++ ) {
104     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
105     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
106     if (idxm[i] >= rstart && idxm[i] < rend) {
107       row = idxm[i] - rstart;
108       for ( j=0; j<n; j++ ) {
109         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
110         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
111         if (idxn[j] >= cstart && idxn[j] < cend){
112           col = idxn[j] - cstart;
113           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
114         }
115         else {
116           col = aij->colmap[idxn[j]] + shift;
117           ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
118         }
119       }
120     }
121     else {
122       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
123     }
124   }
125   return 0;
126 }
127 
128 static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
129 {
130   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
131   MPI_Comm    comm = mat->comm;
132   int         size = aij->size, *owners = aij->rowners;
133   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
134   MPI_Request *send_waits,*recv_waits;
135   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
136   InsertMode  addv;
137   Scalar      *rvalues,*svalues;
138 
139   /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
140   MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
141   if (addv == (ADD_VALUES|INSERT_VALUES)) {
142     SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
143   }
144   aij->insertmode = addv; /* in case this processor had no cache */
145 
146   /*  first count number of contributors to each processor */
147   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
148   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
149   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
150   for ( i=0; i<aij->stash.n; i++ ) {
151     idx = aij->stash.idx[i];
152     for ( j=0; j<size; j++ ) {
153       if (idx >= owners[j] && idx < owners[j+1]) {
154         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
155       }
156     }
157   }
158   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
159 
160   /* inform other processors of number of messages and max length*/
161   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
162   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
163   nreceives = work[rank];
164   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
165   nmax = work[rank];
166   PetscFree(work);
167 
168   /* post receives:
169        1) each message will consist of ordered triples
170      (global row index, global column index, value); we store the indices
171      as Scalars to simplify the message passing.
172        2) since we don't know how long each individual message is, we
173      allocate the largest needed buffer for each receive. Potentially
174      this is a lot of wasted space.
175 
176 
177        This could be done better.
178   */
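  /* Illustration with made-up numbers: a stashed entry bound for processor p,
     say global row 12, global column 7, value 3.5, is packed below as three
     consecutive Scalars {12.0, 7.0, 3.5} starting at svalues[3*starts[p]];
     MatAssemblyEnd_MPIAIJ() recovers the two indices with (int) PetscReal()
     casts. */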
179   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
180   CHKPTRQ(rvalues);
181   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
182   CHKPTRQ(recv_waits);
183   for ( i=0; i<nreceives; i++ ) {
184     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
185               comm,recv_waits+i);
186   }
187 
188   /* do sends:
189       1) starts[i] gives the starting index in svalues for stuff going to
190          the ith processor
191   */
192   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
193   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
194   CHKPTRQ(send_waits);
195   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
196   starts[0] = 0;
197   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
198   for ( i=0; i<aij->stash.n; i++ ) {
199     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
200     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
201     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
202   }
203   PetscFree(owner);
204   starts[0] = 0;
205   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
206   count = 0;
207   for ( i=0; i<size; i++ ) {
208     if (procs[i]) {
209       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
210                 comm,send_waits+count++);
211     }
212   }
213   PetscFree(starts); PetscFree(nprocs);
214 
215   /* Free cache space */
216   PLogInfo(0,"[%d]MatAssemblyBegin_MPIAIJ:Number of off processor values %d\n",rank,aij->stash.n);
217   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
218 
219   aij->svalues    = svalues;    aij->rvalues    = rvalues;
220   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
221   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
222   aij->rmax       = nmax;
223 
224   return 0;
225 }
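
/*
   In outline (a summary, not additional code): MatAssemblyBegin_MPIAIJ()
   counts the stashed off-processor entries per destination, agrees on message
   counts and maximum lengths with two MPI_Allreduce() calls, posts the
   receives, and starts the sends.  MatAssemblyEnd_MPIAIJ() below waits on the
   receives, feeds each incoming (row,col,value) triple back through
   MatSetValues() on the local blocks, and finally waits on the sends.
*/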
226 extern int MatSetUpMultiply_MPIAIJ(Mat);
227 
228 static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
229 {
230   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
231   Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
232   MPI_Status  *send_status,recv_status;
233   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
234   int         row,col,other_disassembled,shift = C->indexshift;
235   Scalar      *values,val;
236   InsertMode  addv = aij->insertmode;
237 
238   /*  wait on receives */
239   while (count) {
240     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
241     /* unpack receives into our local space */
242     values = aij->rvalues + 3*imdex*aij->rmax;
243     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
244     n = n/3;
245     for ( i=0; i<n; i++ ) {
246       row = (int) PetscReal(values[3*i]) - aij->rstart;
247       col = (int) PetscReal(values[3*i+1]);
248       val = values[3*i+2];
249       if (col >= aij->cstart && col < aij->cend) {
250         col -= aij->cstart;
251         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
252       }
253       else {
254         if (mat->was_assembled) {
255           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
256           col = aij->colmap[col] + shift;
257           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
258             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
259             col = (int) PetscReal(values[3*i+1]);
260           }
261         }
262         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
263       }
264     }
265     count--;
266   }
267   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
268 
269   /* wait on sends */
270   if (aij->nsends) {
271     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
272     CHKPTRQ(send_status);
273     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
274     PetscFree(send_status);
275   }
276   PetscFree(aij->send_waits); PetscFree(aij->svalues);
277 
278   aij->insertmode = NOT_SET_VALUES;
279   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
280   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
281 
282   /* determine if any processor has disassembled; if so, we must
283      also disassemble ourselves so that we may reassemble. */
284   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
285   if (mat->was_assembled && !other_disassembled) {
286     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
287   }
288 
289   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
290     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
291   }
292   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
293   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
294 
295   return 0;
296 }
297 
298 static int MatZeroEntries_MPIAIJ(Mat A)
299 {
300   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
301   int        ierr;
302   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
303   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
304   return 0;
305 }
306 
307 /* the code does not do the diagonal entries correctly unless the
308    matrix is square and the column and row ownerships are identical.
309    This is a BUG. The only way to fix it seems to be to access
310    aij->A and aij->B directly and not through the MatZeroRows()
311    routine.
312 */
313 static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
314 {
315   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
316   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
317   int            *procs,*nprocs,j,found,idx,nsends,*work;
318   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
319   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
320   int            *lens,imdex,*lrows,*values;
321   MPI_Comm       comm = A->comm;
322   MPI_Request    *send_waits,*recv_waits;
323   MPI_Status     recv_status,*send_status;
324   IS             istmp;
325 
326   ierr = ISGetLocalSize(is,&N); CHKERRQ(ierr);
327   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
328 
329   /*  first count number of contributors to each processor */
330   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
331   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
332   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
333   for ( i=0; i<N; i++ ) {
334     idx = rows[i];
335     found = 0;
336     for ( j=0; j<size; j++ ) {
337       if (idx >= owners[j] && idx < owners[j+1]) {
338         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
339       }
340     }
341     if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
342   }
343   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
344 
345   /* inform other processors of number of messages and max length*/
346   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
347   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
348   nrecvs = work[rank];
349   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
350   nmax = work[rank];
351   PetscFree(work);
352 
353   /* post receives:   */
354   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
355   CHKPTRQ(rvalues);
356   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
357   CHKPTRQ(recv_waits);
358   for ( i=0; i<nrecvs; i++ ) {
359     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
360   }
361 
362   /* do sends:
363       1) starts[i] gives the starting index in svalues for stuff going to
364          the ith processor
365   */
366   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
367   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
368   CHKPTRQ(send_waits);
369   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
370   starts[0] = 0;
371   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
372   for ( i=0; i<N; i++ ) {
373     svalues[starts[owner[i]]++] = rows[i];
374   }
375   ISRestoreIndices(is,&rows);
376 
377   starts[0] = 0;
378   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
379   count = 0;
380   for ( i=0; i<size; i++ ) {
381     if (procs[i]) {
382       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
383     }
384   }
385   PetscFree(starts);
386 
387   base = owners[rank];
388 
389   /*  wait on receives */
390   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
391   source = lens + nrecvs;
392   count  = nrecvs; slen = 0;
393   while (count) {
394     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
395     /* unpack receives into our local space */
396     MPI_Get_count(&recv_status,MPI_INT,&n);
397     source[imdex]  = recv_status.MPI_SOURCE;
398     lens[imdex]  = n;
399     slen += n;
400     count--;
401   }
402   PetscFree(recv_waits);
403 
404   /* move the data into the send scatter */
405   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
406   count = 0;
407   for ( i=0; i<nrecvs; i++ ) {
408     values = rvalues + i*nmax;
409     for ( j=0; j<lens[i]; j++ ) {
410       lrows[count++] = values[j] - base;
411     }
412   }
413   PetscFree(rvalues); PetscFree(lens);
414   PetscFree(owner); PetscFree(nprocs);
415 
416   /* actually zap the local rows */
417   ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
418   PLogObjectParent(A,istmp);
419   PetscFree(lrows);
420   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
421   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
422   ierr = ISDestroy(istmp); CHKERRQ(ierr);
423 
424   /* wait on sends */
425   if (nsends) {
426     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
427     CHKPTRQ(send_status);
428     MPI_Waitall(nsends,send_waits,send_status);
429     PetscFree(send_status);
430   }
431   PetscFree(send_waits); PetscFree(svalues);
432 
433   return 0;
434 }
435 
436 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
437 {
438   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
439   int        ierr;
440 
441   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
442   ierr = MatMult(a->A,xx,yy); CHKERRQ(ierr);
443   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
444   ierr = MatMultAdd(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
445   return 0;
446 }
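
/*
   A sketch of the data flow above: with x_d the locally owned part of xx and
   x_o the ghost values gathered into a->lvec by the scatter, each process
   computes its rows of the product as

        y_local = A*x_d + B*x_o

   where A is the diagonal block and B the off-diagonal block of the local
   rows.
*/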
447 
448 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
449 {
450   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
451   int        ierr;
452   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
453   ierr = MatMultAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
454   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
455   ierr = MatMultAdd(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
456   return 0;
457 }
458 
459 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
460 {
461   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
462   int        ierr;
463 
464   /* do nondiagonal part */
465   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
466   /* send it on its way */
467   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
468                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
469   /* do local part */
470   ierr = MatMultTrans(a->A,xx,yy); CHKERRQ(ierr);
471   /* receive remote parts: note this assumes the values are not actually */
472   /* inserted in yy until the next line, which is true for my implementation */
473   /* but may not always be true. */
474   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
475                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
476   return 0;
477 }
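
/*
   A sketch of the transpose product above: each process forms B^T x_local in
   a->lvec and A^T x_local in its own part of yy; the reverse scatter with
   ADD_VALUES then accumulates the entries of a->lvec into the owning
   processes' portions of yy.
*/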
478 
479 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
480 {
481   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
482   int        ierr;
483 
484   /* do nondiagonal part */
485   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
486   /* send it on its way */
487   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
488                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
489   /* do local part */
490   ierr = MatMultTransAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
491   /* receive remote parts: note this assumes the values are not actually */
492   /* inserted in zz until the next line, which is true for my implementation */
493   /* but may not always be true. */
494   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
495                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
496   return 0;
497 }
498 
499 /*
500   This only works correctly for square matrices where the subblock A->A is the
501    diagonal block
502 */
503 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
504 {
505   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
506   return MatGetDiagonal(a->A,v);
507 }
508 
509 static int MatScale_MPIAIJ(Scalar *aa,Mat A)
510 {
511   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
512   int        ierr;
513   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
514   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
515   return 0;
516 }
517 
518 static int MatDestroy_MPIAIJ(PetscObject obj)
519 {
520   Mat        mat = (Mat) obj;
521   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
522   int        ierr;
523 #if defined(PETSC_LOG)
524   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
525 #endif
526   PetscFree(aij->rowners);
527   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
528   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
529   if (aij->colmap) PetscFree(aij->colmap);
530   if (aij->garray) PetscFree(aij->garray);
531   if (aij->lvec)   VecDestroy(aij->lvec);
532   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
533   PetscFree(aij);
534   PLogObjectDestroy(mat);
535   PetscHeaderDestroy(mat);
536   return 0;
537 }
538 #include "draw.h"
539 #include "pinclude/pviewer.h"
540 
541 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
542 {
543   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
544   int         ierr;
545 
546   if (aij->size == 1) {
547     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
548   }
549   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
550   return 0;
551 }
552 
553 static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
554 {
555   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
556   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
557   int         ierr, format,shift = C->indexshift,rank;
558   PetscObject vobj = (PetscObject) viewer;
559   FILE        *fd;
560 
561   if (vobj->type == ASCII_FILE_VIEWER || vobj->type == ASCII_FILES_VIEWER) {
562     ierr = ViewerFileGetFormat_Private(viewer,&format);
563     if (format == FILE_FORMAT_INFO_DETAILED) {
564       int nz, nzalloc, mem, flg;
565       MPI_Comm_rank(mat->comm,&rank);
566       ierr = ViewerFileGetPointer(viewer,&fd); CHKERRQ(ierr);
567       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
568       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
569       MPIU_Seq_begin(mat->comm,1);
570       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
571          rank,aij->m,nz,nzalloc,mem);
572       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
573          rank,aij->m,nz,nzalloc,mem);
574       ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
575       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz);
576       ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
577       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz);
578       fflush(fd);
579       MPIU_Seq_end(mat->comm,1);
580       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
581       return 0;
582     }
583     else if (format == FILE_FORMAT_INFO) {
584       return 0;
585     }
586   }
587 
588   if (vobj->type == ASCII_FILE_VIEWER) {
589     ierr = ViewerFileGetPointer(viewer,&fd); CHKERRQ(ierr);
590     MPIU_Seq_begin(mat->comm,1);
591     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
592            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
593            aij->cend);
594     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
595     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
596     fflush(fd);
597     MPIU_Seq_end(mat->comm,1);
598   }
599   else {
600     int size = aij->size;
601     rank = aij->rank;
602     if (size == 1) {
603       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
604     }
605     else {
606       /* assemble the entire matrix onto first processor. */
607       Mat         A;
608       Mat_SeqAIJ *Aloc;
609       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
610       Scalar      *a;
611 
612       if (!rank) {
613         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
614                CHKERRQ(ierr);
615       }
616       else {
617         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
618                CHKERRQ(ierr);
619       }
620       PLogObjectParent(mat,A);
621 
622       /* copy over the A part */
623       Aloc = (Mat_SeqAIJ*) aij->A->data;
624       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
625       row = aij->rstart;
626       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
627       for ( i=0; i<m; i++ ) {
628         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
629         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
630       }
631       aj = Aloc->j;
632       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
633 
634       /* copy over the B part */
635       Aloc = (Mat_SeqAIJ*) aij->B->data;
636       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
637       row = aij->rstart;
638       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
639       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
640       for ( i=0; i<m; i++ ) {
641         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
642         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
643       }
644       PetscFree(ct);
645       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
646       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
647       if (!rank) {
648         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
649       }
650       ierr = MatDestroy(A); CHKERRQ(ierr);
651     }
652   }
653   return 0;
654 }
655 
656 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
657 {
658   Mat         mat = (Mat) obj;
659   int         ierr;
660   PetscObject vobj = (PetscObject) viewer;
661 
662   if (!viewer) {
663     viewer = STDOUT_VIEWER_SELF; vobj = (PetscObject) viewer;
664   }
665   if (vobj->cookie == DRAW_COOKIE && vobj->type == NULLWINDOW) return 0;
666   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILE_VIEWER) {
667     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
668   }
669   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILES_VIEWER) {
670     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
671   }
672   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == MATLAB_VIEWER) {
673     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
674   }
675   else if (vobj->cookie == DRAW_COOKIE) {
676     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
677   }
678   else if (vobj->type == BINARY_FILE_VIEWER) {
679     return MatView_MPIAIJ_Binary(mat,viewer);
680   }
681   return 0;
682 }
683 
684 extern int MatMarkDiag_SeqAIJ(Mat);
685 /*
686     This has to provide several versions.
687 
688      1) as in the sequential case
689      2) a) use only local smoothing, updating outer values only once.
690         b) local smoothing, updating outer values each inner iteration.
691      3) coloring, updating outer values between colors.
692 */
693 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
694                            double fshift,int its,Vec xx)
695 {
696   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
697   Mat        AA = mat->A, BB = mat->B;
698   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
699   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
700   int        ierr,*idx, *diag;
701   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
702   Vec        tt;
703 
704   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
705   xs = x + shift; /* shift by one for index start of 1 */
706   ls = ls + shift;
707   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
708   diag = A->diag;
709   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
710     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
711   }
712   if (flag & SOR_EISENSTAT) {
713     /* Let  A = L + D + U, where L is strictly lower triangular,
714     U is strictly upper triangular, and D is diagonal.  This routine applies
715 
716             (L + E)^{-1} A (U + E)^{-1}
717 
718     to a vector efficiently using Eisenstat's trick. This is for
719     the case of SSOR preconditioner, so E is D/omega where omega
720     is the relaxation factor.
721     */
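    /* A sketch of the identity the three steps below rely on (derived from
       the splitting above, with E = D/omega):

           (L+E)^{-1} A (U+E)^{-1}
             = (L+E)^{-1} [ (L+E) + (U+E) + (D-2E) ] (U+E)^{-1}
             = (U+E)^{-1} + (L+E)^{-1} [ I - (2E-D)(U+E)^{-1} ]

       so with x = (E+U)^{-1} b and t = (E+L)^{-1} (b - (2E-D)x) the result is
       simply x + t, and A itself is never applied. */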
722     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
723     PLogObjectParent(matin,tt);
724     VecGetArray(tt,&t);
725     scale = (2.0/omega) - 1.0;
726     /*  x = (E + U)^{-1} b */
727     VecSet(&zero,mat->lvec);
728     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
729                               mat->Mvctx); CHKERRQ(ierr);
730     for ( i=m-1; i>-1; i-- ) {
731       n    = A->i[i+1] - diag[i] - 1;
732       idx  = A->j + diag[i] + !shift;
733       v    = A->a + diag[i] + !shift;
734       sum  = b[i];
735       SPARSEDENSEMDOT(sum,xs,v,idx,n);
736       d    = fshift + A->a[diag[i]+shift];
737       n    = B->i[i+1] - B->i[i];
738       idx  = B->j + B->i[i] + shift;
739       v    = B->a + B->i[i] + shift;
740       SPARSEDENSEMDOT(sum,ls,v,idx,n);
741       x[i] = omega*(sum/d);
742     }
743     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
744                             mat->Mvctx); CHKERRQ(ierr);
745 
746     /*  t = b - (2*E - D)x */
747     v = A->a;
748     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
749 
750     /*  t = (E + L)^{-1}t */
751     ts = t + shift; /* shifted by one for index start of a or mat->j*/
752     diag = A->diag;
753     VecSet(&zero,mat->lvec);
754     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
755                                                  mat->Mvctx); CHKERRQ(ierr);
756     for ( i=0; i<m; i++ ) {
757       n    = diag[i] - A->i[i];
758       idx  = A->j + A->i[i] + shift;
759       v    = A->a + A->i[i] + shift;
760       sum  = t[i];
761       SPARSEDENSEMDOT(sum,ts,v,idx,n);
762       d    = fshift + A->a[diag[i]+shift];
763       n    = B->i[i+1] - B->i[i];
764       idx  = B->j + B->i[i] + shift;
765       v    = B->a + B->i[i] + shift;
766       SPARSEDENSEMDOT(sum,ls,v,idx,n);
767       t[i] = omega*(sum/d);
768     }
769     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
770                                                     mat->Mvctx); CHKERRQ(ierr);
771     /*  x = x + t */
772     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
773     VecDestroy(tt);
774     return 0;
775   }
776 
777 
778   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
779     if (flag & SOR_ZERO_INITIAL_GUESS) {
780       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
781     }
782     else {
783       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
784       CHKERRQ(ierr);
785       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
786       CHKERRQ(ierr);
787     }
788     while (its--) {
789       /* go down through the rows */
790       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
791                               mat->Mvctx); CHKERRQ(ierr);
792       for ( i=0; i<m; i++ ) {
793         n    = A->i[i+1] - A->i[i];
794         idx  = A->j + A->i[i] + shift;
795         v    = A->a + A->i[i] + shift;
796         sum  = b[i];
797         SPARSEDENSEMDOT(sum,xs,v,idx,n);
798         d    = fshift + A->a[diag[i]+shift];
799         n    = B->i[i+1] - B->i[i];
800         idx  = B->j + B->i[i] + shift;
801         v    = B->a + B->i[i] + shift;
802         SPARSEDENSEMDOT(sum,ls,v,idx,n);
803         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
804       }
805       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
806                             mat->Mvctx); CHKERRQ(ierr);
807       /* come up through the rows */
808       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
809                               mat->Mvctx); CHKERRQ(ierr);
810       for ( i=m-1; i>-1; i-- ) {
811         n    = A->i[i+1] - A->i[i];
812         idx  = A->j + A->i[i] + shift;
813         v    = A->a + A->i[i] + shift;
814         sum  = b[i];
815         SPARSEDENSEMDOT(sum,xs,v,idx,n);
816         d    = fshift + A->a[diag[i]+shift];
817         n    = B->i[i+1] - B->i[i];
818         idx  = B->j + B->i[i] + shift;
819         v    = B->a + B->i[i] + shift;
820         SPARSEDENSEMDOT(sum,ls,v,idx,n);
821         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
822       }
823       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
824                             mat->Mvctx); CHKERRQ(ierr);
825     }
826   }
827   else if (flag & SOR_FORWARD_SWEEP){
828     if (flag & SOR_ZERO_INITIAL_GUESS) {
829       VecSet(&zero,mat->lvec);
830       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
831                               mat->Mvctx); CHKERRQ(ierr);
832       for ( i=0; i<m; i++ ) {
833         n    = diag[i] - A->i[i];
834         idx  = A->j + A->i[i] + shift;
835         v    = A->a + A->i[i] + shift;
836         sum  = b[i];
837         SPARSEDENSEMDOT(sum,xs,v,idx,n);
838         d    = fshift + A->a[diag[i]+shift];
839         n    = B->i[i+1] - B->i[i];
840         idx  = B->j + B->i[i] + shift;
841         v    = B->a + B->i[i] + shift;
842         SPARSEDENSEMDOT(sum,ls,v,idx,n);
843         x[i] = omega*(sum/d);
844       }
845       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
846                             mat->Mvctx); CHKERRQ(ierr);
847       its--;
848     }
849     while (its--) {
850       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
851       CHKERRQ(ierr);
852       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
853       CHKERRQ(ierr);
854       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
855                               mat->Mvctx); CHKERRQ(ierr);
856       for ( i=0; i<m; i++ ) {
857         n    = A->i[i+1] - A->i[i];
858         idx  = A->j + A->i[i] + shift;
859         v    = A->a + A->i[i] + shift;
860         sum  = b[i];
861         SPARSEDENSEMDOT(sum,xs,v,idx,n);
862         d    = fshift + A->a[diag[i]+shift];
863         n    = B->i[i+1] - B->i[i];
864         idx  = B->j + B->i[i] + shift;
865         v    = B->a + B->i[i] + shift;
866         SPARSEDENSEMDOT(sum,ls,v,idx,n);
867         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
868       }
869       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
870                             mat->Mvctx); CHKERRQ(ierr);
871     }
872   }
873   else if (flag & SOR_BACKWARD_SWEEP){
874     if (flag & SOR_ZERO_INITIAL_GUESS) {
875       VecSet(&zero,mat->lvec);
876       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
877                               mat->Mvctx); CHKERRQ(ierr);
878       for ( i=m-1; i>-1; i-- ) {
879         n    = A->i[i+1] - diag[i] - 1;
880         idx  = A->j + diag[i] + !shift;
881         v    = A->a + diag[i] + !shift;
882         sum  = b[i];
883         SPARSEDENSEMDOT(sum,xs,v,idx,n);
884         d    = fshift + A->a[diag[i]+shift];
885         n    = B->i[i+1] - B->i[i];
886         idx  = B->j + B->i[i] + shift;
887         v    = B->a + B->i[i] + shift;
888         SPARSEDENSEMDOT(sum,ls,v,idx,n);
889         x[i] = omega*(sum/d);
890       }
891       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
892                             mat->Mvctx); CHKERRQ(ierr);
893       its--;
894     }
895     while (its--) {
896       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
897                             mat->Mvctx); CHKERRQ(ierr);
898       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
899                             mat->Mvctx); CHKERRQ(ierr);
900       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
901                               mat->Mvctx); CHKERRQ(ierr);
902       for ( i=m-1; i>-1; i-- ) {
903         n    = A->i[i+1] - A->i[i];
904         idx  = A->j + A->i[i] + shift;
905         v    = A->a + A->i[i] + shift;
906         sum  = b[i];
907         SPARSEDENSEMDOT(sum,xs,v,idx,n);
908         d    = fshift + A->a[diag[i]+shift];
909         n    = B->i[i+1] - B->i[i];
910         idx  = B->j + B->i[i] + shift;
911         v    = B->a + B->i[i] + shift;
912         SPARSEDENSEMDOT(sum,ls,v,idx,n);
913         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
914       }
915       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
916                             mat->Mvctx); CHKERRQ(ierr);
917     }
918   }
919   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
920     if (flag & SOR_ZERO_INITIAL_GUESS) {
921       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
922     }
923     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
924     CHKERRQ(ierr);
925     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
926     CHKERRQ(ierr);
927     while (its--) {
928       /* go down through the rows */
929       for ( i=0; i<m; i++ ) {
930         n    = A->i[i+1] - A->i[i];
931         idx  = A->j + A->i[i] + shift;
932         v    = A->a + A->i[i] + shift;
933         sum  = b[i];
934         SPARSEDENSEMDOT(sum,xs,v,idx,n);
935         d    = fshift + A->a[diag[i]+shift];
936         n    = B->i[i+1] - B->i[i];
937         idx  = B->j + B->i[i] + shift;
938         v    = B->a + B->i[i] + shift;
939         SPARSEDENSEMDOT(sum,ls,v,idx,n);
940         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
941       }
942       /* come up through the rows */
943       for ( i=m-1; i>-1; i-- ) {
944         n    = A->i[i+1] - A->i[i];
945         idx  = A->j + A->i[i] + shift;
946         v    = A->a + A->i[i] + shift;
947         sum  = b[i];
948         SPARSEDENSEMDOT(sum,xs,v,idx,n);
949         d    = fshift + A->a[diag[i]+shift];
950         n    = B->i[i+1] - B->i[i];
951         idx  = B->j + B->i[i] + shift;
952         v    = B->a + B->i[i] + shift;
953         SPARSEDENSEMDOT(sum,ls,v,idx,n);
954         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
955       }
956     }
957   }
958   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
959     if (flag & SOR_ZERO_INITIAL_GUESS) {
960       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
961     }
962     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
963     CHKERRQ(ierr);
964     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
965     CHKERRQ(ierr);
966     while (its--) {
967       for ( i=0; i<m; i++ ) {
968         n    = A->i[i+1] - A->i[i];
969         idx  = A->j + A->i[i] + shift;
970         v    = A->a + A->i[i] + shift;
971         sum  = b[i];
972         SPARSEDENSEMDOT(sum,xs,v,idx,n);
973         d    = fshift + A->a[diag[i]+shift];
974         n    = B->i[i+1] - B->i[i];
975         idx  = B->j + B->i[i] + shift;
976         v    = B->a + B->i[i] + shift;
977         SPARSEDENSEMDOT(sum,ls,v,idx,n);
978         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
979       }
980     }
981   }
982   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
983     if (flag & SOR_ZERO_INITIAL_GUESS) {
984       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
985     }
986     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
987                             mat->Mvctx); CHKERRQ(ierr);
988     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
989                             mat->Mvctx); CHKERRQ(ierr);
990     while (its--) {
991       for ( i=m-1; i>-1; i-- ) {
992         n    = A->i[i+1] - A->i[i];
993         idx  = A->j + A->i[i] + shift;
994         v    = A->a + A->i[i] + shift;
995         sum  = b[i];
996         SPARSEDENSEMDOT(sum,xs,v,idx,n);
997         d    = fshift + A->a[diag[i]+shift];
998         n    = B->i[i+1] - B->i[i];
999         idx  = B->j + B->i[i] + shift;
1000         v    = B->a + B->i[i] + shift;
1001         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1002         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
1003       }
1004     }
1005   }
1006   return 0;
1007 }
1008 
1009 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
1010                              int *nzalloc,int *mem)
1011 {
1012   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1013   Mat        A = mat->A, B = mat->B;
1014   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1015 
1016   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1017   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1018   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1019   if (flag == MAT_LOCAL) {
1020     *nz = isend[0]; *nzalloc = isend[1]; *mem = isend[2];
1021   } else if (flag == MAT_GLOBAL_MAX) {
1022     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1023     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1024   } else if (flag == MAT_GLOBAL_SUM) {
1025     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1026     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1027   }
1028   return 0;
1029 }
1030 
1031 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1032 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1033 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1034 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1035 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1036 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1037 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1038 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1039 
1040 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1041 {
1042   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1043 
1044   if (op == NO_NEW_NONZERO_LOCATIONS ||
1045       op == YES_NEW_NONZERO_LOCATIONS ||
1046       op == COLUMNS_SORTED ||
1047       op == ROW_ORIENTED) {
1048         MatSetOption(a->A,op);
1049         MatSetOption(a->B,op);
1050   }
1051   else if (op == ROWS_SORTED ||
1052            op == SYMMETRIC_MATRIX ||
1053            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1054            op == YES_NEW_DIAGONALS)
1055     PLogInfo((PetscObject)A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1056   else if (op == COLUMN_ORIENTED) {
1057     a->roworiented = 0;
1058     MatSetOption(a->A,op);
1059     MatSetOption(a->B,op);
1060   }
1061   else if (op == NO_NEW_DIAGONALS)
1062     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1063   else
1064     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1065   return 0;
1066 }
1067 
1068 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1069 {
1070   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1071   *m = mat->M; *n = mat->N;
1072   return 0;
1073 }
1074 
1075 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1076 {
1077   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1078   *m = mat->m; *n = mat->N;
1079   return 0;
1080 }
1081 
1082 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1083 {
1084   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1085   *m = mat->rstart; *n = mat->rend;
1086   return 0;
1087 }
1088 
1089 static int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1090 {
1091   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1092   Scalar     *vworkA, *vworkB, **pvA, **pvB;
1093   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1094   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1095 
1096   if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows")
1097   lrow = row - rstart;
1098 
1099   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1100   if (!v)   {pvA = 0; pvB = 0;}
1101   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1102   ierr = MatGetRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1103   ierr = MatGetRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1104   nztot = nzA + nzB;
1105 
1106   if (v  || idx) {
1107     if (nztot) {
1108       /* Sort by increasing column numbers, assuming A and B already sorted */
1109       int imark;
1110       for (i=0; i<nzB; i++) cworkB[i] = mat->garray[cworkB[i]];
1111       if (v) {
1112         *v = (Scalar *) PetscMalloc( (nztot)*sizeof(Scalar) ); CHKPTRQ(*v);
1113         for ( i=0; i<nzB; i++ ) {
1114           if (cworkB[i] < cstart)   (*v)[i] = vworkB[i];
1115           else break;
1116         }
1117         imark = i;
1118         for ( i=0; i<nzA; i++ )     (*v)[imark+i] = vworkA[i];
1119         for ( i=imark; i<nzB; i++ ) (*v)[nzA+i] = vworkB[i];
1120       }
1121       if (idx) {
1122         *idx = (int *) PetscMalloc( (nztot)*sizeof(int) ); CHKPTRQ(*idx);
1123         for (i=0; i<nzA; i++) cworkA[i] += cstart;
1124         for ( i=0; i<nzB; i++ ) {
1125           if (cworkB[i] < cstart)   (*idx)[i] = cworkB[i];
1126           else break;
1127         }
1128         imark = i;
1129         for ( i=0; i<nzA; i++ )     (*idx)[imark+i] = cworkA[i];
1130         for ( i=imark; i<nzB; i++ ) (*idx)[nzA+i] = cworkB[i];
1131       }
1132     }
1133     else {*idx = 0; *v=0;}
1134   }
1135   *nz = nztot;
1136   ierr = MatRestoreRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1137   ierr = MatRestoreRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1138   return 0;
1139 }
1140 
1141 static int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1142 {
1143   if (idx) PetscFree(*idx);
1144   if (v) PetscFree(*v);
1145   return 0;
1146 }
1147 
1148 static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1149 {
1150   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1151   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1152   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1153   double     sum = 0.0;
1154   Scalar     *v;
1155 
1156   if (aij->size == 1) {
1157     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1158   } else {
1159     if (type == NORM_FROBENIUS) {
1160       v = amat->a;
1161       for (i=0; i<amat->nz; i++ ) {
1162 #if defined(PETSC_COMPLEX)
1163         sum += real(conj(*v)*(*v)); v++;
1164 #else
1165         sum += (*v)*(*v); v++;
1166 #endif
1167       }
1168       v = bmat->a;
1169       for (i=0; i<bmat->nz; i++ ) {
1170 #if defined(PETSC_COMPLEX)
1171         sum += real(conj(*v)*(*v)); v++;
1172 #else
1173         sum += (*v)*(*v); v++;
1174 #endif
1175       }
1176       MPI_Allreduce((void*)&sum,(void*)norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1177       *norm = sqrt(*norm);
1178     }
1179     else if (type == NORM_1) { /* max column norm */
1180       double *tmp, *tmp2;
1181       int    *jj, *garray = aij->garray;
1182       tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1183       tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1184       PetscMemzero(tmp,aij->N*sizeof(double));
1185       *norm = 0.0;
1186       v = amat->a; jj = amat->j;
1187       for ( j=0; j<amat->nz; j++ ) {
1188         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1189       }
1190       v = bmat->a; jj = bmat->j;
1191       for ( j=0; j<bmat->nz; j++ ) {
1192         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1193       }
1194       MPI_Allreduce((void*)tmp,(void*)tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1195       for ( j=0; j<aij->N; j++ ) {
1196         if (tmp2[j] > *norm) *norm = tmp2[j];
1197       }
1198       PetscFree(tmp); PetscFree(tmp2);
1199     }
1200     else if (type == NORM_INFINITY) { /* max row norm */
1201       double ntemp = 0.0;
1202       for ( j=0; j<amat->m; j++ ) {
1203         v = amat->a + amat->i[j] + shift;
1204         sum = 0.0;
1205         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1206           sum += PetscAbsScalar(*v); v++;
1207         }
1208         v = bmat->a + bmat->i[j] + shift;
1209         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1210           sum += PetscAbsScalar(*v); v++;
1211         }
1212         if (sum > ntemp) ntemp = sum;
1213       }
1214       MPI_Allreduce((void*)&ntemp,(void*)norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1215     }
1216     else {
1217       SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
1218     }
1219   }
1220   return 0;
1221 }
1222 
1223 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1224 {
1225   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1226   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1227   int        ierr,shift = Aloc->indexshift;
1228   Mat        B;
1229   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1230   Scalar     *array;
1231 
1232   if (matout == PETSC_NULL && M != N)
1233     SETERRQ(1,"MatTranspose_MPIAIJ:Square matrix only for in-place");
1234   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,
1235          PETSC_NULL,&B); CHKERRQ(ierr);
1236 
1237   /* copy over the A part */
1238   Aloc = (Mat_SeqAIJ*) a->A->data;
1239   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1240   row = a->rstart;
1241   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1242   for ( i=0; i<m; i++ ) {
1243     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1244     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1245   }
1246   aj = Aloc->j;
1247   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1248 
1249   /* copy over the B part */
1250   Aloc = (Mat_SeqAIJ*) a->B->data;
1251   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1252   row = a->rstart;
1253   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1254   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1255   for ( i=0; i<m; i++ ) {
1256     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1257     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1258   }
1259   PetscFree(ct);
1260   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1261   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1262   if (matout != PETSC_NULL) {
1263     *matout = B;
1264   } else {
1265     /* This isn't really an in-place transpose, but we free the data structures of a */
1266     PetscFree(a->rowners);
1267     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1268     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1269     if (a->colmap) PetscFree(a->colmap);
1270     if (a->garray) PetscFree(a->garray);
1271     if (a->lvec) VecDestroy(a->lvec);
1272     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1273     PetscFree(a);
1274     PetscMemcpy(A,B,sizeof(struct _Mat));
1275     PetscHeaderDestroy(B);
1276   }
1277   return 0;
1278 }
1279 
1280 extern int MatPrintHelp_SeqAIJ(Mat);
1281 static int MatPrintHelp_MPIAIJ(Mat A)
1282 {
1283   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1284 
1285   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1286   else return 0;
1287 }
1288 
1289 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1290 static int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1291 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1292 int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1293 /* -------------------------------------------------------------------*/
1294 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1295        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1296        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1297        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1298        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1299        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1300        MatLUFactor_MPIAIJ,0,
1301        MatRelax_MPIAIJ,
1302        MatTranspose_MPIAIJ,
1303        MatGetInfo_MPIAIJ,0,
1304        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1305        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1306        0,
1307        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1308        MatGetReordering_MPIAIJ,
1309        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1310        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1311        MatILUFactorSymbolic_MPIAIJ,0,
1312        0,0,MatConvert_MPIAIJ,0,0,MatConvertSameType_MPIAIJ,0,0,
1313        0,0,0,
1314        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1315        MatPrintHelp_MPIAIJ,
1316        MatScale_MPIAIJ};
1317 
1318 /*@C
1319    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1320    (the default parallel PETSc format).  For good matrix assembly performance
1321    the user should preallocate the matrix storage by setting the parameters
1322    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1323    performance can be increased by more than a factor of 50.
1324 
1325    Input Parameters:
1326 .  comm - MPI communicator
1327 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1328 .  n - number of local columns (or PETSC_DECIDE to have calculated
1329            if N is given)
1330 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1331 .  N - number of global columns (or PETSC_DECIDE to have calculated
1332            if n is given)
1333 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1334            (same for all local rows)
1335 .  d_nnz - number of nonzeros per row in diagonal portion of local submatrix
1336            or null (possibly different for each row).  You must leave room
1337            for the diagonal entry even if it is zero.
1338 .  o_nz - number of nonzeros per row in off-diagonal portion of local
1339            submatrix (same for all local rows).
1340 .  o_nnz - number of nonzeros per row in off-diagonal portion of local
1341            submatrix or null (possibly different for each row).
1342 
1343    Output Parameter:
1344 .  newmat - the matrix
1345 
1346    Notes:
1347    The AIJ format (also called the Yale sparse matrix format or
1348    compressed row storage), is fully compatible with standard Fortran 77
1349    storage.  That is, the stored row and column indices can begin at
1350    either one (as in Fortran) or zero.  See the users manual for details.
1351 
1352    The user MUST specify either the local or global matrix dimensions
1353    (possibly both).
1354 
1355    By default, this format uses inodes (identical nodes) when possible.
1356    We search for consecutive rows with the same nonzero structure, thereby
1357    reusing matrix information to achieve increased efficiency.
1358 
1359    Options Database Keys:
1360 $    -mat_aij_no_inode  - Do not use inodes
1361 $    -mat_aij_inode_limit <limit> - Set inode limit.
1362 $        (max limit=5)
1363 $    -mat_aij_oneindex - Internally use indexing starting at 1
1364 $        rather than 0.  Note: When calling MatSetValues(),
1365 $        the user still MUST index entries starting at 0!
1366 
1367    Storage Information:
1368    For a square global matrix we define each processor's diagonal portion
1369    to be its local rows and the corresponding columns (a square submatrix);
1370    each processor's off-diagonal portion encompasses the remainder of the
1371    local matrix (a rectangular submatrix).
1372 
1373    The user can specify preallocated storage for the diagonal part of
1374    the local submatrix with either d_nz or d_nnz (not both).  Set
1375    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1376    memory allocation.  Likewise, specify preallocated storage for the
1377    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1378 
1379    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1380    the figure below we depict these three local rows and all columns (0-11).
1381 
1382 $          0 1 2 3 4 5 6 7 8 9 10 11
1383 $         -------------------
1384 $  row 3  |  o o o d d d o o o o o o
1385 $  row 4  |  o o o d d d o o o o o o
1386 $  row 5  |  o o o d d d o o o o o o
1387 $         -------------------
1388 $
1389 
1390    Thus, any entries in the d locations are stored in the d (diagonal)
1391    submatrix, and any entries in the o locations are stored in the
1392    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1393    stored simply in the MATSEQAIJ format for compressed row storage.
1394 
1395    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1396    and o_nz should indicate the number of nonzeros per row in the o matrix.
1397    In general, for PDE problems in which most nonzeros are near the diagonal,
1398    one expects d_nz >> o_nz.   For additional details, see the users manual
1399    chapter on matrices and the file $(PETSC_DIR)/Performance.
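
   Example:
   As a rough sketch only (the counts below correspond to the three fully
   dense rows pictured above, assuming a 12x12 global matrix split across
   4 processors; Mat A and MPI_Comm comm are assumed to be declared
   elsewhere), a processor could preallocate with either a single count
   per row or with per-row arrays:

$      /* 3 nonzeros per row in the d part, 9 per row in the o part */
$      ierr = MatCreateMPIAIJ(comm,3,3,PETSC_DECIDE,PETSC_DECIDE,
$                             3,PETSC_NULL,9,PETSC_NULL,&A);
$
$      /* or per-row counts; the arrays have length m = 3 (the local rows) */
$      int d_nnz[3] = {3,3,3}, o_nnz[3] = {9,9,9};
$      ierr = MatCreateMPIAIJ(comm,3,3,PETSC_DECIDE,PETSC_DECIDE,
$                             0,d_nnz,0,o_nnz,&A);

   Since the local sizes are specified here, the global sizes may be left
   as PETSC_DECIDE.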
1400 
1401 .keywords: matrix, aij, compressed row, sparse, parallel
1402 
1403 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1404 @*/
1405 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1406                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *newmat)
1407 {
1408   Mat          mat;
1409   Mat_MPIAIJ   *a;
1410   int          ierr, i,sum[2],work[2];
1411 
1412   *newmat         = 0;
1413   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1414   PLogObjectCreate(mat);
1415   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1416   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1417   mat->destroy    = MatDestroy_MPIAIJ;
1418   mat->view       = MatView_MPIAIJ;
1419   mat->factor     = 0;
1420   mat->assembled  = PETSC_FALSE;
1421 
1422   a->insertmode = NOT_SET_VALUES;
1423   MPI_Comm_rank(comm,&a->rank);
1424   MPI_Comm_size(comm,&a->size);
1425 
1426   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1427     SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSc decide rows but set d_nnz or o_nnz");
1428 
1429   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1430     work[0] = m; work[1] = n;
1431     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1432     if (M == PETSC_DECIDE) M = sum[0];
1433     if (N == PETSC_DECIDE) N = sum[1];
1434   }
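  /* default local sizes: split the global rows (columns) as evenly as possible,
     with the first M % size (N % size) processors receiving one extra */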
1435   if (m == PETSC_DECIDE) {m = M/a->size + ((M % a->size) > a->rank);}
1436   if (n == PETSC_DECIDE) {n = N/a->size + ((N % a->size) > a->rank);}
1437   a->m = m;
1438   a->n = n;
1439   a->N = N;
1440   a->M = M;
1441 
1442   /* build local table of row and column ownerships */
1443   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1444   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1445   a->cowners = a->rowners + a->size + 1;
1446   MPI_Allgather(&m,1,MPI_INT,a->rowners+1,1,MPI_INT,comm);
1447   a->rowners[0] = 0;
1448   for ( i=2; i<=a->size; i++ ) {
1449     a->rowners[i] += a->rowners[i-1];
1450   }
1451   a->rstart = a->rowners[a->rank];
1452   a->rend   = a->rowners[a->rank+1];
1453   MPI_Allgather(&n,1,MPI_INT,a->cowners+1,1,MPI_INT,comm);
1454   a->cowners[0] = 0;
1455   for ( i=2; i<=a->size; i++ ) {
1456     a->cowners[i] += a->cowners[i-1];
1457   }
1458   a->cstart = a->cowners[a->rank];
1459   a->cend   = a->cowners[a->rank+1];
1460 
1461   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1462   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&a->A); CHKERRQ(ierr);
1463   PLogObjectParent(mat,a->A);
1464   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1465   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&a->B); CHKERRQ(ierr);
1466   PLogObjectParent(mat,a->B);
1467 
1468   /* build cache (stash) for matrix entries destined for rows owned by other processors */
1469   ierr = StashBuild_Private(&a->stash); CHKERRQ(ierr);
1470   a->colmap      = 0;
1471   a->garray      = 0;
1472   a->roworiented = 1;
1473 
1474   /* stuff used for matrix vector multiply */
1475   a->lvec      = 0;
1476   a->Mvctx     = 0;
1477 
1478   *newmat = mat;
1479   return 0;
1480 }
1481 
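/*
   Duplicates a MATMPIAIJ matrix: copies the parallel layout (sizes, ownership
   ranges, colmap and garray), duplicates the local work vector and scatter
   context, and converts the local diagonal (A) and off-diagonal (B) blocks
   of the original.
*/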
1482 static int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1483 {
1484   Mat        mat;
1485   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1486   int        ierr, len,flg;
1487 
1488   *newmat       = 0;
1489   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
1490   PLogObjectCreate(mat);
1491   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1492   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1493   mat->destroy    = MatDestroy_MPIAIJ;
1494   mat->view       = MatView_MPIAIJ;
1495   mat->factor     = matin->factor;
1496   mat->assembled  = PETSC_TRUE;
1497 
1498   a->m          = oldmat->m;
1499   a->n          = oldmat->n;
1500   a->M          = oldmat->M;
1501   a->N          = oldmat->N;
1502 
1503   a->rstart     = oldmat->rstart;
1504   a->rend       = oldmat->rend;
1505   a->cstart     = oldmat->cstart;
1506   a->cend       = oldmat->cend;
1507   a->size       = oldmat->size;
1508   a->rank       = oldmat->rank;
1509   a->insertmode = NOT_SET_VALUES;
1510 
1511   a->rowners = (int *) PetscMalloc((a->size+1)*sizeof(int)); CHKPTRQ(a->rowners);
1512   PLogObjectMemory(mat,(a->size+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1513   PetscMemcpy(a->rowners,oldmat->rowners,(a->size+1)*sizeof(int));
1514   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1515   if (oldmat->colmap) {
1516     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1517     PLogObjectMemory(mat,(a->N)*sizeof(int));
1518     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1519   } else a->colmap = 0;
1520   if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
1521     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1522     PLogObjectMemory(mat,len*sizeof(int));
1523     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1524   } else a->garray = 0;
1525 
1526   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1527   PLogObjectParent(mat,a->lvec);
1528   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1529   PLogObjectParent(mat,a->Mvctx);
1530   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1531   PLogObjectParent(mat,a->A);
1532   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1533   PLogObjectParent(mat,a->B);
1534   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1535   if (flg) {
1536     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1537   }
1538   *newmat = mat;
1539   return 0;
1540 }
1541 
1542 #include "sysio.h"
1543 
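/*
   Loads a parallel AIJ matrix from a binary file: process 0 reads the header,
   row lengths, column indices, and numerical values, shipping each processor
   the data for its rows; every processor then inserts its local rows with
   MatSetValues() and the matrix is assembled.
*/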
1544 int MatLoad_MPIAIJ(Viewer bview,MatType type,Mat *newmat)
1545 {
1546   Mat          A;
1547   int          i, nz, ierr, j,rstart, rend, fd;
1548   Scalar       *vals,*svals;
1549   PetscObject  vobj = (PetscObject) bview;
1550   MPI_Comm     comm = vobj->comm;
1551   MPI_Status   status;
1552   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1553   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1554   int          tag = ((PetscObject)bview)->tag;
1555 
1556   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1557   if (!rank) {
1558     ierr = ViewerFileGetDescriptor_Private(bview,&fd); CHKERRQ(ierr);
1559     ierr = SYRead(fd,(char *)header,4,SYINT); CHKERRQ(ierr);
1560     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJ:not matrix object");
1561   }
1562 
1563   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1564   M = header[1]; N = header[2];
1565   /* determine ownership of all rows */
1566   m = M/size + ((M % size) > rank);
1567   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1568   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1569   rowners[0] = 0;
1570   for ( i=2; i<=size; i++ ) {
1571     rowners[i] += rowners[i-1];
1572   }
1573   rstart = rowners[rank];
1574   rend   = rowners[rank+1];
1575 
1576   /* distribute row lengths to all processors */
1577   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1578   offlens = ourlens + (rend-rstart);
1579   if (!rank) {
1580     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1581     ierr = SYRead(fd,rowlengths,M,SYINT); CHKERRQ(ierr);
1582     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1583     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1584     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1585     PetscFree(sndcounts);
1586   }
1587   else {
1588     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1589   }
1590 
1591   if (!rank) {
1592     /* calculate the number of nonzeros on each processor */
1593     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1594     PetscMemzero(procsnz,size*sizeof(int));
1595     for ( i=0; i<size; i++ ) {
1596       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1597         procsnz[i] += rowlengths[j];
1598       }
1599     }
1600     PetscFree(rowlengths);
1601 
1602     /* determine max buffer needed and allocate it */
1603     maxnz = 0;
1604     for ( i=0; i<size; i++ ) {
1605       maxnz = PetscMax(maxnz,procsnz[i]);
1606     }
1607     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1608 
1609     /* read in my part of the matrix column indices  */
1610     nz = procsnz[0];
1611     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1612     ierr = SYRead(fd,mycols,nz,SYINT); CHKERRQ(ierr);
1613 
1614     /* read in everyone else's column indices and ship them off */
1615     for ( i=1; i<size; i++ ) {
1616       nz = procsnz[i];
1617       ierr = SYRead(fd,cols,nz,SYINT); CHKERRQ(ierr);
1618       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1619     }
1620     PetscFree(cols);
1621   }
1622   else {
1623     /* determine buffer space needed for message */
1624     nz = 0;
1625     for ( i=0; i<m; i++ ) {
1626       nz += ourlens[i];
1627     }
1628     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1629 
1630     /* receive message of column indices */
1631     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1632     MPI_Get_count(&status,MPI_INT,&maxnz);
1633     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1634   }
1635 
1636   /* loop over local rows, determining number of off-diagonal entries */
1637   PetscMemzero(offlens,m*sizeof(int));
1638   jj = 0;
1639   for ( i=0; i<m; i++ ) {
1640     for ( j=0; j<ourlens[i]; j++ ) {
1641       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1642       jj++;
1643     }
1644   }
1645 
1646   /* create our matrix; first reduce ourlens to the diagonal-block lengths for preallocation */
1647   for ( i=0; i<m; i++ ) {
1648     ourlens[i] -= offlens[i];
1649   }
1650   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1651   A = *newmat;
1652   MatSetOption(A,COLUMNS_SORTED);
1653   for ( i=0; i<m; i++ ) {
1654     ourlens[i] += offlens[i];
1655   }
1656 
1657   if (!rank) {
1658     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1659 
1660     /* read in my part of the matrix numerical values  */
1661     nz = procsnz[0];
1662     ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1663 
1664     /* insert into matrix */
1665     jj      = rstart;
1666     smycols = mycols;
1667     svals   = vals;
1668     for ( i=0; i<m; i++ ) {
1669       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1670       smycols += ourlens[i];
1671       svals   += ourlens[i];
1672       jj++;
1673     }
1674     /* read in the other processors' values and ship them out */
1675     /* read in other processors and ship out */
1676     for ( i=1; i<size; i++ ) {
1677       nz = procsnz[i];
1678       ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1679       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1680     }
1681     PetscFree(procsnz);
1682   }
1683   else {
1684     /* receive numeric values */
1685     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1686 
1687     /* receive message of values */
1688     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1689     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1690     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1691 
1692     /* insert into matrix */
1693     jj      = rstart;
1694     smycols = mycols;
1695     svals   = vals;
1696     for ( i=0; i<m; i++ ) {
1697       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1698       smycols += ourlens[i];
1699       svals   += ourlens[i];
1700       jj++;
1701     }
1702   }
1703   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1704 
1705   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1706   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1707   return 0;
1708 }
1709