xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 4b0e389b2accb070e9f94a7c5aef9e462c8b7c96)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.103 1995/12/23 04:54:07 bsmith Exp bsmith $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "vec/vecimpl.h"
7 #include "inline/spops.h"
8 
/* local utility routine that creates a mapping from the global column
   number to the local number in the off-diagonal part of the local
   storage of the matrix.  This is done in a non-scalable way since the
   length of colmap equals the global matrix length.
*/
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int        ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
static int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  /* Inserts or adds the m-by-n dense block v at global rows im[] and global
     columns in[].  Values in locally owned rows are routed to the diagonal
     (aij->A) or off-diagonal (aij->B) sequential matrices; values for rows
     owned by other processors are stashed and communicated during
     MatAssemblyBegin/End.  v is read row-oriented unless aij->roworiented
     is zero, in which case it is read column-oriented. */
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
  Scalar     value;
  int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
  int        cstart = aij->cstart, cend = aij->cend,row,col;
  int        shift = C->indexshift,roworiented = aij->roworiented;

  /* all calls between two assemblies must use the same insert mode */
  if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
    SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
  }
  aij->insertmode = addv;
  for ( i=0; i<m; i++ ) {
    if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
    if (im[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
    if (im[i] >= rstart && im[i] < rend) {
      /* row owned by this processor */
      row = im[i] - rstart;
      for ( j=0; j<n; j++ ) {
        if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
        if (in[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
        if (in[j] >= cstart && in[j] < cend){
          /* column in the diagonal block: use the local column number */
          col = in[j] - cstart;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          ierr = MatSetValues(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
        }
        else {
          /* column in the off-diagonal block */
          if (aij->assembled) {
            /* once assembled, B uses compressed local column numbers:
               translate through colmap, building it on first use */
            if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
            col = aij->colmap[in[j]] + shift;
            if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
              /* a column not currently in B: revert to global column
                 numbering by disassembling, then insert with the global
                 index */
              ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
              col =  in[j];
            }
          }
          else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          ierr = MatSetValues(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
        }
      }
    }
    else {
      /* row owned elsewhere: cache the values until assembly */
      if (roworiented) {
        ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      }
      else {
        /* column-oriented input: stash one value at a time */
        row = im[i];
        for ( j=0; j<n; j++ ) {
          ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
        }
      }
    }
  }
  return 0;
}
94 
95 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
96 {
97   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
98   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
99   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
100   int        cstart = aij->cstart, cend = aij->cend,row,col;
101   int        shift = C->indexshift;
102 
103   if (!aij->assembled) SETERRQ(1,"MatGetValues_MPIAIJ:Not for unassembled matrix");
104   for ( i=0; i<m; i++ ) {
105     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
106     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
107     if (idxm[i] >= rstart && idxm[i] < rend) {
108       row = idxm[i] - rstart;
109       for ( j=0; j<n; j++ ) {
110         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
111         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
112         if (idxn[j] >= cstart && idxn[j] < cend){
113           col = idxn[j] - cstart;
114           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
115         }
116         else {
117           col = aij->colmap[idxn[j]] + shift;
118           ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
119         }
120       }
121     }
122     else {
123       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
124     }
125   }
126   return 0;
127 }
128 
static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  /* Begins assembly: ships every stashed (off-processor) entry to the
     processor that owns its row.  Each entry travels as a
     (row,column,value) triple of Scalars.  The receives and sends are
     posted here; MatAssemblyEnd_MPIAIJ() completes them. */
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = aij->size, *owners = aij->rowners;
  int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  /* make sure all processors are either in INSERTMODE or ADDMODE */
  MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
  }
  aij->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<aij->stash.n; i++ ) {
    idx = aij->stash.idx[i];
    /* find the owner of global row idx from the ownership ranges */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
  nreceives = work[rank];
  MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
  nmax = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
  CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack each stashed entry as a (row,column,value) triple; the
     post-increment advances the per-destination cursor */
  for ( i=0; i<aij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
  }
  PetscFree(owner);
  /* rebuild starts[] (the packing loop above consumed it) */
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
                comm,send_waits+count++);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);

  /* stash the communication state for MatAssemblyEnd_MPIAIJ() */
  aij->svalues    = svalues;    aij->rvalues    = rvalues;
  aij->nsends     = nsends;     aij->nrecvs     = nreceives;
  aij->send_waits = send_waits; aij->recv_waits = recv_waits;
  aij->rmax       = nmax;

  return 0;
}
226 extern int MatSetUpMultiply_MPIAIJ(Mat);
227 
static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  /* Completes assembly: receives the (row,column,value) triples posted in
     MatAssemblyBegin_MPIAIJ() and inserts them locally, waits on the
     outstanding sends, assembles both sequential sub-matrices, and on the
     first FINAL_ASSEMBLY builds the scatter context used by MatMult. */
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
  int         row,col,other_disassembled,shift = C->indexshift;
  Scalar      *values,val;
  InsertMode  addv = aij->insertmode;

  /*  wait on receives */
  while (count) {
    MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    values = aij->rvalues + 3*imdex*aij->rmax;
    MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
    n = n/3;  /* each entry is a (row,column,value) triple of Scalars */
    for ( i=0; i<n; i++ ) {
      row = (int) PETSCREAL(values[3*i]) - aij->rstart;
      col = (int) PETSCREAL(values[3*i+1]);
      val = values[3*i+2];
      if (col >= aij->cstart && col < aij->cend) {
        /* column in the diagonal block */
        col -= aij->cstart;
        MatSetValues(aij->A,1,&row,1,&col,&val,addv);
      }
      else {
        /* off-diagonal block; same colmap/disassemble logic as in
           MatSetValues_MPIAIJ() */
        if (aij->assembled) {
          if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
          col = aij->colmap[col] + shift;
          if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
            ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
            col = (int) PETSCREAL(values[3*i+1]);
          }
        }
        MatSetValues(aij->B,1,&row,1,&col,&val,addv);
      }
    }
    count--;
  }
  PetscFree(aij->recv_waits); PetscFree(aij->rvalues);

  /* wait on sends */
  if (aij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
    CHKPTRQ(send_status);
    MPI_Waitall(aij->nsends,aij->send_waits,send_status);
    PetscFree(send_status);
  }
  PetscFree(aij->send_waits); PetscFree(aij->svalues);

  aij->insertmode = NOT_SET_VALUES;
  ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselves, in order that we may reassemble. */
  MPI_Allreduce(&aij->assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
  if (aij->assembled && !other_disassembled) {
    ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
  }

  if (!aij->assembled && mode == FINAL_ASSEMBLY) {
    /* first final assembly: create the scatter needed for MatMult etc. */
    ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
    aij->assembled = 1;
  }
  ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);

  return 0;
}
298 
299 static int MatZeroEntries_MPIAIJ(Mat A)
300 {
301   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
302   int        ierr;
303   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
304   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
305   return 0;
306 }
307 
/* the code does not do the diagonal entries correctly unless the
   matrix is square and the column and row ownerships are identical.
   This is a BUG. The only way to fix it seems to be to access
   aij->A and aij->B directly and not through the MatZeroRows()
   routine.
*/
static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
{
  /* Zeros the globally numbered rows listed in is.  When diag is nonzero,
     *diag is placed on the diagonal of each zeroed row (via the call on
     l->A only; see the BUG note above this routine).  The index set may
     name rows owned by any processor, so the lists are first exchanged so
     that each processor zeros exactly its own rows. */
  Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,found,idx,nsends,*work;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  if (!l->assembled) SETERRQ(1,"MatZeroRows_MPIAIJ:Must assemble matrix");
  ierr = ISGetLocalSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    /* locate the owner of global row idx from the ownership ranges */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
  nrecvs = work[rank];
  MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
  nmax = work[rank];
  PetscFree(work);

  /* post receives:   */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
  CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack the global row numbers, grouped by destination processor */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  /* rebuild starts[] (the packing loop consumed it) */
  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
    }
  }
  PetscFree(starts);

  base = owners[rank];

  /*  wait on receives */
  lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PetscFree(recv_waits);

  /* move the data into the send scatter */
  lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      /* convert the received global row numbers to local numbering */
      lrows[count++] = values[j] - base;
    }
  }
  PetscFree(rvalues); PetscFree(lens);
  PetscFree(owner); PetscFree(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PLogObjectParent(A,istmp);
  PetscFree(lrows);
  ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  ierr = ISDestroy(istmp); CHKERRQ(ierr);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
    CHKPTRQ(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  return 0;
}
437 
438 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
439 {
440   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
441   int        ierr;
442 
443   if (!a->assembled) SETERRQ(1,"MatMult_MPIAIJ:must assemble matrix");
444   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
445   ierr = MatMult(a->A,xx,yy); CHKERRQ(ierr);
446   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
447   ierr = MatMultAdd(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
448   return 0;
449 }
450 
451 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
452 {
453   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
454   int        ierr;
455   if (!a->assembled) SETERRQ(1,"MatMult_MPIAIJ:must assemble matrix");
456   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
457   ierr = MatMultAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
458   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
459   ierr = MatMultAdd(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
460   return 0;
461 }
462 
463 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
464 {
465   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
466   int        ierr;
467 
468   if (!a->assembled) SETERRQ(1,"MatMulTrans_MPIAIJ:must assemble matrix");
469   /* do nondiagonal part */
470   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
471   /* send it on its way */
472   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
473                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
474   /* do local part */
475   ierr = MatMultTrans(a->A,xx,yy); CHKERRQ(ierr);
476   /* receive remote parts: note this assumes the values are not actually */
477   /* inserted in yy until the next line, which is true for my implementation*/
478   /* but is not perhaps always true. */
479   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
480                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
481   return 0;
482 }
483 
484 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
485 {
486   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
487   int        ierr;
488 
489   if (!a->assembled) SETERRQ(1,"MatMulTransAdd_MPIAIJ:must assemble matrix");
490   /* do nondiagonal part */
491   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
492   /* send it on its way */
493   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
494                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
495   /* do local part */
496   ierr = MatMultTransAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
497   /* receive remote parts: note this assumes the values are not actually */
498   /* inserted in yy until the next line, which is true for my implementation*/
499   /* but is not perhaps always true. */
500   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
501                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
502   return 0;
503 }
504 
505 /*
506   This only works correctly for square matrices where the subblock A->A is the
507    diagonal block
508 */
509 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
510 {
511   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
512   if (!a->assembled) SETERRQ(1,"MatGetDiag_MPIAIJ:must assemble matrix");
513   return MatGetDiagonal(a->A,v);
514 }
515 
516 static int MatDestroy_MPIAIJ(PetscObject obj)
517 {
518   Mat        mat = (Mat) obj;
519   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
520   int        ierr;
521 #if defined(PETSC_LOG)
522   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
523 #endif
524   PetscFree(aij->rowners);
525   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
526   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
527   if (aij->colmap) PetscFree(aij->colmap);
528   if (aij->garray) PetscFree(aij->garray);
529   if (aij->lvec)   VecDestroy(aij->lvec);
530   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
531   PetscFree(aij);
532   PLogObjectDestroy(mat);
533   PetscHeaderDestroy(mat);
534   return 0;
535 }
536 #include "draw.h"
537 #include "pinclude/pviewer.h"
538 
539 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
540 {
541   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
542   int         ierr;
543 
544   if (aij->size == 1) {
545     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
546   }
547   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
548   return 0;
549 }
550 
static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
{
  /* Displays the matrix on an ASCII, draw, or Matlab viewer.  ASCII info
     formats print summary statistics; the ASCII_FILE format prints each
     processor's local blocks in sequence; otherwise (multi-processor) the
     entire matrix is gathered onto processor 0 and viewed there. */
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
  Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
  int         ierr, format,shift = C->indexshift,rank;
  PetscObject vobj = (PetscObject) viewer;
  FILE        *fd;

  if (vobj->type == ASCII_FILE_VIEWER || vobj->type == ASCII_FILES_VIEWER) {
    ierr = ViewerFileGetFormat_Private(viewer,&format);
    if (format == FILE_FORMAT_INFO_DETAILED) {
      /* per-processor nonzero/memory statistics, printed in rank order */
      int nz,nzalloc,mem;
      MPI_Comm_rank(mat->comm,&rank);
      ierr = ViewerFileGetPointer_Private(viewer,&fd); CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
      MPIU_Seq_begin(mat->comm,1);
      fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d \n",rank,aij->m,nz,
                nzalloc,mem);
      ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
      fprintf(fd,"[%d] on diagonal nz %d \n",rank,nz);
      ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
      fprintf(fd,"[%d] off diagonal nz %d \n",rank,nz);
      fflush(fd);
      MPIU_Seq_end(mat->comm,1);
      ierr = VecScatterView(aij->Mvctx,viewer);
      return 0;
    }
    else if (format == FILE_FORMAT_INFO) {
      /* nothing to print for the plain info format */
      return 0;
    }
  }

  if (vobj->type == ASCII_FILE_VIEWER) {
    /* each processor prints its ownership ranges and local blocks in turn */
    ierr = ViewerFileGetPointer_Private(viewer,&fd); CHKERRQ(ierr);
    MPIU_Seq_begin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
           aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
           aij->cend);
    ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    MPIU_Seq_end(mat->comm,1);
  }
  else {
    int size = aij->size;
    rank = aij->rank;
    if (size == 1) {
      ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
    }
    else {
      /* assemble the entire matrix onto first processor. */
      Mat         A;
      Mat_SeqAIJ *Aloc;
      int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
      Scalar      *a;

      /* processor 0 holds all M rows of the gathered copy; the others none */
      if (!rank) {
        ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
               CHKERRQ(ierr);
      }
      else {
        ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
               CHKERRQ(ierr);
      }
      PLogObjectParent(mat,A);

      /* copy over the A part */
      Aloc = (Mat_SeqAIJ*) aij->A->data;
      m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = aij->rstart;
      /* temporarily convert the local column numbers to global ones */
      for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
      for ( i=0; i<m; i++ ) {
        ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
        row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
      }
      /* restore the original local column numbering */
      aj = Aloc->j;
      for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}

      /* copy over the B part */
      Aloc = (Mat_SeqAIJ*) aij->B->data;
      m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
      row = aij->rstart;
      /* translate B's compressed columns to global numbers via garray */
      ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
      for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
      for ( i=0; i<m; i++ ) {
        ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
        row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
      }
      PetscFree(ct);
      ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      /* only processor 0 actually views the gathered sequential matrix */
      if (!rank) {
        ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  return 0;
}
650 
651 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
652 {
653   Mat         mat = (Mat) obj;
654   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
655   int         ierr;
656   PetscObject vobj = (PetscObject) viewer;
657 
658   if (!aij->assembled) SETERRQ(1,"MatView_MPIAIJ:must assemble matrix");
659   if (!viewer) {
660     viewer = STDOUT_VIEWER_SELF; vobj = (PetscObject) viewer;
661   }
662   if (vobj->cookie == DRAW_COOKIE && vobj->type == NULLWINDOW) return 0;
663   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILE_VIEWER) {
664     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
665   }
666   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILES_VIEWER) {
667     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
668   }
669   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == MATLAB_VIEWER) {
670     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
671   }
672   else if (vobj->cookie == DRAW_COOKIE) {
673     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
674   }
675   else if (vobj->type == BINARY_FILE_VIEWER) {
676     return MatView_MPIAIJ_Binary(mat,viewer);
677   }
678   return 0;
679 }
680 
681 extern int MatMarkDiag_SeqAIJ(Mat);
/*
    This has to provide several versions.

     1) per sequential
     2) a) use only local smoothing, updating outer values only once.
        b) local smoothing, updating outer values each inner iteration
     3) color updating outer values between colors.
*/
690 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
691                            double fshift,int its,Vec xx)
692 {
693   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
694   Mat        AA = mat->A, BB = mat->B;
695   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
696   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
697   int        ierr,*idx, *diag;
698   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
699   Vec        tt;
700 
701   if (!mat->assembled) SETERRQ(1,"MatRelax_MPIAIJ:must assemble matrix");
702 
703   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
704   xs = x + shift; /* shift by one for index start of 1 */
705   ls = ls + shift;
706   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
707   diag = A->diag;
708   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
709     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
710   }
711   if (flag & SOR_EISENSTAT) {
712     /* Let  A = L + U + D; where L is lower trianglar,
713     U is upper triangular, E is diagonal; This routine applies
714 
715             (L + E)^{-1} A (U + E)^{-1}
716 
717     to a vector efficiently using Eisenstat's trick. This is for
718     the case of SSOR preconditioner, so E is D/omega where omega
719     is the relaxation factor.
720     */
721     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
722     PLogObjectParent(matin,tt);
723     VecGetArray(tt,&t);
724     scale = (2.0/omega) - 1.0;
725     /*  x = (E + U)^{-1} b */
726     VecSet(&zero,mat->lvec);
727     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
728                               mat->Mvctx); CHKERRQ(ierr);
729     for ( i=m-1; i>-1; i-- ) {
730       n    = A->i[i+1] - diag[i] - 1;
731       idx  = A->j + diag[i] + !shift;
732       v    = A->a + diag[i] + !shift;
733       sum  = b[i];
734       SPARSEDENSEMDOT(sum,xs,v,idx,n);
735       d    = fshift + A->a[diag[i]+shift];
736       n    = B->i[i+1] - B->i[i];
737       idx  = B->j + B->i[i] + shift;
738       v    = B->a + B->i[i] + shift;
739       SPARSEDENSEMDOT(sum,ls,v,idx,n);
740       x[i] = omega*(sum/d);
741     }
742     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
743                             mat->Mvctx); CHKERRQ(ierr);
744 
745     /*  t = b - (2*E - D)x */
746     v = A->a;
747     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
748 
749     /*  t = (E + L)^{-1}t */
750     ts = t + shift; /* shifted by one for index start of a or mat->j*/
751     diag = A->diag;
752     VecSet(&zero,mat->lvec);
753     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
754                                                  mat->Mvctx); CHKERRQ(ierr);
755     for ( i=0; i<m; i++ ) {
756       n    = diag[i] - A->i[i];
757       idx  = A->j + A->i[i] + shift;
758       v    = A->a + A->i[i] + shift;
759       sum  = t[i];
760       SPARSEDENSEMDOT(sum,ts,v,idx,n);
761       d    = fshift + A->a[diag[i]+shift];
762       n    = B->i[i+1] - B->i[i];
763       idx  = B->j + B->i[i] + shift;
764       v    = B->a + B->i[i] + shift;
765       SPARSEDENSEMDOT(sum,ls,v,idx,n);
766       t[i] = omega*(sum/d);
767     }
768     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
769                                                     mat->Mvctx); CHKERRQ(ierr);
770     /*  x = x + t */
771     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
772     VecDestroy(tt);
773     return 0;
774   }
775 
776 
777   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
778     if (flag & SOR_ZERO_INITIAL_GUESS) {
779       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
780     }
781     else {
782       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
783       CHKERRQ(ierr);
784       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
785       CHKERRQ(ierr);
786     }
787     while (its--) {
788       /* go down through the rows */
789       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
790                               mat->Mvctx); CHKERRQ(ierr);
791       for ( i=0; i<m; i++ ) {
792         n    = A->i[i+1] - A->i[i];
793         idx  = A->j + A->i[i] + shift;
794         v    = A->a + A->i[i] + shift;
795         sum  = b[i];
796         SPARSEDENSEMDOT(sum,xs,v,idx,n);
797         d    = fshift + A->a[diag[i]+shift];
798         n    = B->i[i+1] - B->i[i];
799         idx  = B->j + B->i[i] + shift;
800         v    = B->a + B->i[i] + shift;
801         SPARSEDENSEMDOT(sum,ls,v,idx,n);
802         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
803       }
804       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
805                             mat->Mvctx); CHKERRQ(ierr);
806       /* come up through the rows */
807       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
808                               mat->Mvctx); CHKERRQ(ierr);
809       for ( i=m-1; i>-1; i-- ) {
810         n    = A->i[i+1] - A->i[i];
811         idx  = A->j + A->i[i] + shift;
812         v    = A->a + A->i[i] + shift;
813         sum  = b[i];
814         SPARSEDENSEMDOT(sum,xs,v,idx,n);
815         d    = fshift + A->a[diag[i]+shift];
816         n    = B->i[i+1] - B->i[i];
817         idx  = B->j + B->i[i] + shift;
818         v    = B->a + B->i[i] + shift;
819         SPARSEDENSEMDOT(sum,ls,v,idx,n);
820         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
821       }
822       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
823                             mat->Mvctx); CHKERRQ(ierr);
824     }
825   }
826   else if (flag & SOR_FORWARD_SWEEP){
827     if (flag & SOR_ZERO_INITIAL_GUESS) {
828       VecSet(&zero,mat->lvec);
829       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
830                               mat->Mvctx); CHKERRQ(ierr);
831       for ( i=0; i<m; i++ ) {
832         n    = diag[i] - A->i[i];
833         idx  = A->j + A->i[i] + shift;
834         v    = A->a + A->i[i] + shift;
835         sum  = b[i];
836         SPARSEDENSEMDOT(sum,xs,v,idx,n);
837         d    = fshift + A->a[diag[i]+shift];
838         n    = B->i[i+1] - B->i[i];
839         idx  = B->j + B->i[i] + shift;
840         v    = B->a + B->i[i] + shift;
841         SPARSEDENSEMDOT(sum,ls,v,idx,n);
842         x[i] = omega*(sum/d);
843       }
844       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
845                             mat->Mvctx); CHKERRQ(ierr);
846       its--;
847     }
848     while (its--) {
849       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
850       CHKERRQ(ierr);
851       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
852       CHKERRQ(ierr);
853       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
854                               mat->Mvctx); CHKERRQ(ierr);
855       for ( i=0; i<m; i++ ) {
856         n    = A->i[i+1] - A->i[i];
857         idx  = A->j + A->i[i] + shift;
858         v    = A->a + A->i[i] + shift;
859         sum  = b[i];
860         SPARSEDENSEMDOT(sum,xs,v,idx,n);
861         d    = fshift + A->a[diag[i]+shift];
862         n    = B->i[i+1] - B->i[i];
863         idx  = B->j + B->i[i] + shift;
864         v    = B->a + B->i[i] + shift;
865         SPARSEDENSEMDOT(sum,ls,v,idx,n);
866         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
867       }
868       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
869                             mat->Mvctx); CHKERRQ(ierr);
870     }
871   }
872   else if (flag & SOR_BACKWARD_SWEEP){
873     if (flag & SOR_ZERO_INITIAL_GUESS) {
874       VecSet(&zero,mat->lvec);
875       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
876                               mat->Mvctx); CHKERRQ(ierr);
877       for ( i=m-1; i>-1; i-- ) {
878         n    = A->i[i+1] - diag[i] - 1;
879         idx  = A->j + diag[i] + !shift;
880         v    = A->a + diag[i] + !shift;
881         sum  = b[i];
882         SPARSEDENSEMDOT(sum,xs,v,idx,n);
883         d    = fshift + A->a[diag[i]+shift];
884         n    = B->i[i+1] - B->i[i];
885         idx  = B->j + B->i[i] + shift;
886         v    = B->a + B->i[i] + shift;
887         SPARSEDENSEMDOT(sum,ls,v,idx,n);
888         x[i] = omega*(sum/d);
889       }
890       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
891                             mat->Mvctx); CHKERRQ(ierr);
892       its--;
893     }
894     while (its--) {
895       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
896                             mat->Mvctx); CHKERRQ(ierr);
897       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
898                             mat->Mvctx); CHKERRQ(ierr);
899       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
900                               mat->Mvctx); CHKERRQ(ierr);
901       for ( i=m-1; i>-1; i-- ) {
902         n    = A->i[i+1] - A->i[i];
903         idx  = A->j + A->i[i] + shift;
904         v    = A->a + A->i[i] + shift;
905         sum  = b[i];
906         SPARSEDENSEMDOT(sum,xs,v,idx,n);
907         d    = fshift + A->a[diag[i]+shift];
908         n    = B->i[i+1] - B->i[i];
909         idx  = B->j + B->i[i] + shift;
910         v    = B->a + B->i[i] + shift;
911         SPARSEDENSEMDOT(sum,ls,v,idx,n);
912         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
913       }
914       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
915                             mat->Mvctx); CHKERRQ(ierr);
916     }
917   }
918   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
919     if (flag & SOR_ZERO_INITIAL_GUESS) {
920       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
921     }
922     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
923     CHKERRQ(ierr);
924     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
925     CHKERRQ(ierr);
926     while (its--) {
927       /* go down through the rows */
928       for ( i=0; i<m; i++ ) {
929         n    = A->i[i+1] - A->i[i];
930         idx  = A->j + A->i[i] + shift;
931         v    = A->a + A->i[i] + shift;
932         sum  = b[i];
933         SPARSEDENSEMDOT(sum,xs,v,idx,n);
934         d    = fshift + A->a[diag[i]+shift];
935         n    = B->i[i+1] - B->i[i];
936         idx  = B->j + B->i[i] + shift;
937         v    = B->a + B->i[i] + shift;
938         SPARSEDENSEMDOT(sum,ls,v,idx,n);
939         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
940       }
941       /* come up through the rows */
942       for ( i=m-1; i>-1; i-- ) {
943         n    = A->i[i+1] - A->i[i];
944         idx  = A->j + A->i[i] + shift;
945         v    = A->a + A->i[i] + shift;
946         sum  = b[i];
947         SPARSEDENSEMDOT(sum,xs,v,idx,n);
948         d    = fshift + A->a[diag[i]+shift];
949         n    = B->i[i+1] - B->i[i];
950         idx  = B->j + B->i[i] + shift;
951         v    = B->a + B->i[i] + shift;
952         SPARSEDENSEMDOT(sum,ls,v,idx,n);
953         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
954       }
955     }
956   }
957   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
958     if (flag & SOR_ZERO_INITIAL_GUESS) {
959       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
960     }
961     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
962     CHKERRQ(ierr);
963     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
964     CHKERRQ(ierr);
965     while (its--) {
966       for ( i=0; i<m; i++ ) {
967         n    = A->i[i+1] - A->i[i];
968         idx  = A->j + A->i[i] + shift;
969         v    = A->a + A->i[i] + shift;
970         sum  = b[i];
971         SPARSEDENSEMDOT(sum,xs,v,idx,n);
972         d    = fshift + A->a[diag[i]+shift];
973         n    = B->i[i+1] - B->i[i];
974         idx  = B->j + B->i[i] + shift;
975         v    = B->a + B->i[i] + shift;
976         SPARSEDENSEMDOT(sum,ls,v,idx,n);
977         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
978       }
979     }
980   }
981   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
982     if (flag & SOR_ZERO_INITIAL_GUESS) {
983       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
984     }
985     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
986                             mat->Mvctx); CHKERRQ(ierr);
987     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
988                             mat->Mvctx); CHKERRQ(ierr);
989     while (its--) {
990       for ( i=m-1; i>-1; i-- ) {
991         n    = A->i[i+1] - A->i[i];
992         idx  = A->j + A->i[i] + shift;
993         v    = A->a + A->i[i] + shift;
994         sum  = b[i];
995         SPARSEDENSEMDOT(sum,xs,v,idx,n);
996         d    = fshift + A->a[diag[i]+shift];
997         n    = B->i[i+1] - B->i[i];
998         idx  = B->j + B->i[i] + shift;
999         v    = B->a + B->i[i] + shift;
1000         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1001         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
1002       }
1003     }
1004   }
1005   return 0;
1006 }
1007 
1008 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
1009                              int *nzalloc,int *mem)
1010 {
1011   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1012   Mat        A = mat->A, B = mat->B;
1013   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1014 
1015   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1016   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1017   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1018   if (flag == MAT_LOCAL) {
1019     *nz = isend[0]; *nzalloc = isend[1]; *mem = isend[2];
1020   } else if (flag == MAT_GLOBAL_MAX) {
1021     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1022     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1023   } else if (flag == MAT_GLOBAL_SUM) {
1024     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1025     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1026   }
1027   return 0;
1028 }
1029 
1030 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1031 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1032 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1033 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1034 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1035 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1036 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1037 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1038 
1039 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1040 {
1041   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1042 
1043   if (op == NO_NEW_NONZERO_LOCATIONS ||
1044       op == YES_NEW_NONZERO_LOCATIONS ||
1045       op == COLUMNS_SORTED ||
1046       op == ROW_ORIENTED) {
1047         MatSetOption(a->A,op);
1048         MatSetOption(a->B,op);
1049   }
1050   else if (op == ROWS_SORTED ||
1051            op == SYMMETRIC_MATRIX ||
1052            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1053            op == YES_NEW_DIAGONALS)
1054     PLogInfo((PetscObject)A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1055   else if (op == COLUMN_ORIENTED) {
1056     a->roworiented = 0;
1057     MatSetOption(a->A,op);
1058     MatSetOption(a->B,op);
1059   }
1060   else if (op == NO_NEW_DIAGONALS)
1061     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1062   else
1063     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1064   return 0;
1065 }
1066 
1067 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1068 {
1069   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1070   *m = mat->M; *n = mat->N;
1071   return 0;
1072 }
1073 
1074 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1075 {
1076   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1077   *m = mat->m; *n = mat->N;
1078   return 0;
1079 }
1080 
1081 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1082 {
1083   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1084   *m = mat->rstart; *n = mat->rend;
1085   return 0;
1086 }
1087 
/*
   MatGetRow_MPIAIJ - Returns one locally owned row of the parallel matrix
   as a single sorted sequence, merging the row of the diagonal block (A)
   with the row of the off-diagonal block (B).

   Input:  matin - the matrix; row - GLOBAL row number (must lie in
           [rstart,rend)); idx/v may be null if the caller does not want
           column indices / values.
   Output: nz - number of nonzeros in the row; *idx / *v - freshly
           allocated arrays, released by MatRestoreRow_MPIAIJ.
*/
static int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB;
  int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
  int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;

  if (!mat->assembled) SETERRQ(1,"MatGetRow_MPIAIJ:Must assemble matrix");
  if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows")
  lrow = row - rstart;

  /* only request from A/B the arrays the caller actually asked for */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = MatGetRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = MatGetRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      int imark;
      if (mat->assembled) {
        /* translate B's local column numbers to global via garray */
        for (i=0; i<nzB; i++) cworkB[i] = mat->garray[cworkB[i]];
      }
      if (v) {
        *v = (Scalar *) PetscMalloc( (nztot)*sizeof(Scalar) ); CHKPTRQ(*v);
        /* B entries with global column < cstart come before all A entries */
        for ( i=0; i<nzB; i++ ) {
          if (cworkB[i] < cstart)   (*v)[i] = vworkB[i];
          else break;
        }
        imark = i;
        /* then the A (diagonal-block) entries, then the remaining B entries */
        for ( i=0; i<nzA; i++ )     (*v)[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) (*v)[nzA+i] = vworkB[i];
      }
      if (idx) {
        *idx = (int *) PetscMalloc( (nztot)*sizeof(int) ); CHKPTRQ(*idx);
        /* shift A's local column numbers to global */
        for (i=0; i<nzA; i++) cworkA[i] += cstart;
        for ( i=0; i<nzB; i++ ) {
          if (cworkB[i] < cstart)   (*idx)[i] = cworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     (*idx)[imark+i] = cworkA[i];
        for ( i=imark; i<nzB; i++ ) (*idx)[nzA+i] = cworkB[i];
      }
    }
    else {*idx = 0; *v=0;}  /* empty row: hand back null pointers */
  }
  *nz = nztot;
  ierr = MatRestoreRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = MatRestoreRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  return 0;
}
1142 
1143 static int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1144 {
1145   if (idx) PetscFree(*idx);
1146   if (v) PetscFree(*v);
1147   return 0;
1148 }
1149 
/*
   MatNorm_MPIAIJ - Computes a matrix norm of a parallel AIJ matrix.

   Supported types: NORM_FROBENIUS, NORM_1 (max column sum) and
   NORM_INFINITY (max row sum); the two norm is not supported.
   On a single process the computation is delegated to the sequential
   block; otherwise local contributions from the diagonal (amat) and
   off-diagonal (bmat) blocks are combined with MPI_Allreduce.
*/
static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
  Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
  int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
  double     sum = 0.0;
  Scalar     *v;

  if (!aij->assembled) SETERRQ(1,"MatNorm_MPIAIJ:Must assemble matrix");
  if (aij->size == 1) {
    /* uniprocessor case: the sequential norm is the answer */
    ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
  } else {
    if (type == NORM_FROBENIUS) {
      /* sum of squares over both local blocks, reduced then rooted */
      v = amat->a;
      for (i=0; i<amat->nz; i++ ) {
#if defined(PETSC_COMPLEX)
        sum += real(conj(*v)*(*v)); v++;
#else
        sum += (*v)*(*v); v++;
#endif
      }
      v = bmat->a;
      for (i=0; i<bmat->nz; i++ ) {
#if defined(PETSC_COMPLEX)
        sum += real(conj(*v)*(*v)); v++;
#else
        sum += (*v)*(*v); v++;
#endif
      }
      MPI_Allreduce((void*)&sum,(void*)norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
      *norm = sqrt(*norm);
    }
    else if (type == NORM_1) { /* max column norm */
      /* accumulate |a_ij| per GLOBAL column (length-N work arrays: not
         scalable, mirrors the colmap approach used elsewhere in this file) */
      double *tmp, *tmp2;
      int    *jj, *garray = aij->garray;
      tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
      tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
      PetscMemzero(tmp,aij->N*sizeof(double));
      *norm = 0.0;
      /* diagonal block: local column + cstart gives the global column */
      v = amat->a; jj = amat->j;
      for ( j=0; j<amat->nz; j++ ) {
        tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
      }
      /* off-diagonal block: garray maps local to global column */
      v = bmat->a; jj = bmat->j;
      for ( j=0; j<bmat->nz; j++ ) {
        tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
      }
      MPI_Allreduce((void*)tmp,(void*)tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
      for ( j=0; j<aij->N; j++ ) {
        if (tmp2[j] > *norm) *norm = tmp2[j];
      }
      PetscFree(tmp); PetscFree(tmp2);
    }
    else if (type == NORM_INFINITY) { /* max row norm */
      /* rows are wholly local, so only a final MPI_MAX is needed */
      double ntemp = 0.0;
      for ( j=0; j<amat->m; j++ ) {
        v = amat->a + amat->i[j] + shift;
        sum = 0.0;
        for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
          sum += PetscAbsScalar(*v); v++;
        }
        v = bmat->a + bmat->i[j] + shift;
        for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
          sum += PetscAbsScalar(*v); v++;
        }
        if (sum > ntemp) ntemp = sum;
      }
      MPI_Allreduce((void*)&ntemp,(void*)norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
    }
    else {
      SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
    }
  }
  return 0;
}
1225 
1226 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1227 {
1228   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1229   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1230   int        ierr,shift = Aloc->indexshift;
1231   Mat        B;
1232   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1233   Scalar     *array;
1234 
1235   if (!matout && M != N) SETERRQ(1,"MatTranspose_MPIAIJ:Square only in-place");
1236   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,
1237          PETSC_NULL,&B); CHKERRQ(ierr);
1238 
1239   /* copy over the A part */
1240   Aloc = (Mat_SeqAIJ*) a->A->data;
1241   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1242   row = a->rstart;
1243   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1244   for ( i=0; i<m; i++ ) {
1245     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1246     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1247   }
1248   aj = Aloc->j;
1249   for ( i=0; i<ai[m]|+shift; i++ ) {aj[i] -= a->cstart + shift;}
1250 
1251   /* copy over the B part */
1252   Aloc = (Mat_SeqAIJ*) a->B->data;
1253   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1254   row = a->rstart;
1255   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1256   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1257   for ( i=0; i<m; i++ ) {
1258     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1259     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1260   }
1261   PetscFree(ct);
1262   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1263   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1264   if (matout) {
1265     *matout = B;
1266   } else {
1267     /* This isn't really an in-place transpose .... but free data structures from a */
1268     PetscFree(a->rowners);
1269     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1270     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1271     if (a->colmap) PetscFree(a->colmap);
1272     if (a->garray) PetscFree(a->garray);
1273     if (a->lvec) VecDestroy(a->lvec);
1274     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1275     PetscFree(a);
1276     PetscMemcpy(A,B,sizeof(struct _Mat));
1277     PetscHeaderDestroy(B);
1278   }
1279   return 0;
1280 }
1281 
1282 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1283 static int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1284 
1285 /* -------------------------------------------------------------------*/
/* Function table for the MATMPIAIJ type: entries are positional per
   struct _MatOps; a 0 marks an operation this type does not support. */
static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
       MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
       MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
       MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
       MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
       MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
       MatLUFactor_MPIAIJ,0,
       MatRelax_MPIAIJ,
       MatTranspose_MPIAIJ,
       MatGetInfo_MPIAIJ,0,
       MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
       MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
       0,
       MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
       MatGetReordering_MPIAIJ,
       MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
       MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
       MatILUFactorSymbolic_MPIAIJ,0,
       0,0,MatConvert_MPIAIJ,0,0,MatConvertSameType_MPIAIJ,0,0,
       0,0,0,
       0,0,MatGetValues_MPIAIJ};
1307 
1308 /*@C
1309    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1310    (the default parallel PETSc format).
1311 
1312    Input Parameters:
1313 .  comm - MPI communicator
1314 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1315 .  n - number of local columns (or PETSC_DECIDE to have calculated
1316            if N is given)
1317 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1318 .  N - number of global columns (or PETSC_DECIDE to have calculated
1319            if n is given)
1320 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1321            (same for all local rows)
.  d_nnz - number of nonzeros per row in diagonal portion of local submatrix
           or null (possibly different for each row).  You must leave room
           for the diagonal entry even if it is zero.
1325 .  o_nz - number of nonzeros per row in off-diagonal portion of local
1326            submatrix (same for all local rows).
.  o_nnz - number of nonzeros per row in off-diagonal portion of local
           submatrix or null (possibly different for each row).
1329 
1330    Output Parameter:
1331 .  newmat - the matrix
1332 
1333    Notes:
1334    The AIJ format (also called the Yale sparse matrix format or
1335    compressed row storage), is fully compatible with standard Fortran 77
1336    storage.  That is, the stored row and column indices can begin at
1337    either one (as in Fortran) or zero.  See the users manual for details.
1338 
1339    The user MUST specify either the local or global matrix dimensions
1340    (possibly both).
1341 
1342    Storage Information:
1343    For a square global matrix we define each processor's diagonal portion
1344    to be its local rows and the corresponding columns (a square submatrix);
1345    each processor's off-diagonal portion encompasses the remainder of the
1346    local matrix (a rectangular submatrix).
1347 
1348    The user can specify preallocated storage for the diagonal part of
1349    the local submatrix with either d_nz or d_nnz (not both).  Set d_nz=0
1350    and d_nnz=PETSC_NULL for PETSc to control dynamic memory allocation.
1351    Likewise, specify preallocated storage for the off-diagonal part of
1352    the local submatrix with o_nz or o_nnz (not both).
1353 
1354    By default, this format uses inodes (identical nodes) when possible.
1355    We search for consecutive rows with the same nonzero structure, thereby
1356    reusing matrix information to achieve increased efficiency.
1357 
1358    Options Database Keys:
1359 $    -mat_aij_no_inode  - Do not use inodes
1360 $    -mat_aij_inode_limit <limit> - Set inode limit (max limit=5)
1361 
1362 .keywords: matrix, aij, compressed row, sparse, parallel
1363 
1364 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1365 @*/
int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *newmat)
{
  Mat          mat;
  Mat_MPIAIJ   *a;
  int          ierr, i,sum[2],work[2];

  *newmat         = 0;
  /* allocate the PETSc object header and install the MPIAIJ operations */
  PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
  PLogObjectCreate(mat);
  mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
  PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
  mat->destroy    = MatDestroy_MPIAIJ;
  mat->view       = MatView_MPIAIJ;
  mat->factor     = 0;

  a->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&a->rank);
  MPI_Comm_size(comm,&a->size);

  /* per-row preallocation arrays make no sense without a known local size */
  if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
    SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSc decide rows but set d_nnz or o_nnz");

  /* fill in whichever of the local/global dimensions were left to PETSc:
     global sizes are the sum of the local ones; local sizes split the
     global ones as evenly as possible (first M%size ranks get one extra) */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
    if (M == PETSC_DECIDE) M = sum[0];
    if (N == PETSC_DECIDE) N = sum[1];
  }
  if (m == PETSC_DECIDE) {m = M/a->size + ((M % a->size) > a->rank);}
  if (n == PETSC_DECIDE) {n = N/a->size + ((N % a->size) > a->rank);}
  a->m = m;
  a->n = n;
  a->N = N;
  a->M = M;

  /* build local table of row and column ownerships */
  a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
  PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
  a->cowners = a->rowners + a->size + 1;
  /* gather every rank's m into rowners[1..size], then prefix-sum so that
     rowners[r] is the first global row owned by rank r */
  MPI_Allgather(&m,1,MPI_INT,a->rowners+1,1,MPI_INT,comm);
  a->rowners[0] = 0;
  for ( i=2; i<=a->size; i++ ) {
    a->rowners[i] += a->rowners[i-1];
  }
  a->rstart = a->rowners[a->rank];
  a->rend   = a->rowners[a->rank+1];
  /* same construction for the column ownership */
  MPI_Allgather(&n,1,MPI_INT,a->cowners+1,1,MPI_INT,comm);
  a->cowners[0] = 0;
  for ( i=2; i<=a->size; i++ ) {
    a->cowners[i] += a->cowners[i-1];
  }
  a->cstart = a->cowners[a->rank];
  a->cend   = a->cowners[a->rank+1];

  /* local storage: A is the m x n diagonal block, B holds the off-diagonal
     entries (allocated m x N; compacted at assembly time) */
  ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&a->A); CHKERRQ(ierr);
  PLogObjectParent(mat,a->A);
  ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&a->B); CHKERRQ(ierr);
  PLogObjectParent(mat,a->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&a->stash); CHKERRQ(ierr);
  a->colmap      = 0;
  a->garray      = 0;
  a->roworiented = 1;

  /* stuff used for matrix vector multiply; created at assembly */
  a->lvec      = 0;
  a->Mvctx     = 0;
  a->assembled = 0;

  *newmat = mat;
  return 0;
}
1440 
/*
   MatConvertSameType_MPIAIJ - Duplicates an assembled MPIAIJ matrix,
   deep-copying the ownership tables, colmap/garray, the local scatter
   vector/context, and both sequential blocks.

   Note: the cpvalues argument is currently unused here; the blocks are
   copied via MatConvert(...,MATSAME,...) regardless -- TODO confirm
   intended semantics.
*/
static int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
{
  Mat        mat;
  Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
  int        ierr, len;

  if (!oldmat->assembled) SETERRQ(1,"MatConvertSameType_MPIAIJ:Must assemble matrix");
  *newmat       = 0;
  /* create a fresh header with the same type and communicator */
  PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
  PLogObjectCreate(mat);
  mat->data     = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
  PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
  mat->destroy  = MatDestroy_MPIAIJ;
  mat->view     = MatView_MPIAIJ;
  mat->factor   = matin->factor;

  /* copy the scalar layout fields */
  a->m          = oldmat->m;
  a->n          = oldmat->n;
  a->M          = oldmat->M;
  a->N          = oldmat->N;

  a->assembled  = 1;
  a->rstart     = oldmat->rstart;
  a->rend       = oldmat->rend;
  a->cstart     = oldmat->cstart;
  a->cend       = oldmat->cend;
  a->size       = oldmat->size;
  a->rank       = oldmat->rank;
  a->insertmode = NOT_SET_VALUES;

  /* deep copy of the row-ownership table */
  a->rowners = (int *) PetscMalloc((a->size+1)*sizeof(int)); CHKPTRQ(a->rowners);
  PLogObjectMemory(mat,(a->size+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
  PetscMemcpy(a->rowners,oldmat->rowners,(a->size+1)*sizeof(int));
  ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
  /* colmap (global->local, length N) and garray (local->global) are only
     present after assembly created them in the source matrix */
  if (oldmat->colmap) {
    a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
    PLogObjectMemory(mat,(a->N)*sizeof(int));
    PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
  } else a->colmap = 0;
  if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
    a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
    PLogObjectMemory(mat,len*sizeof(int));
    PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
  } else a->garray = 0;

  /* duplicate the ghost-value vector and its scatter context */
  ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
  PLogObjectParent(mat,a->lvec);
  ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
  PLogObjectParent(mat,a->Mvctx);
  /* deep copy both sequential blocks */
  ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
  PLogObjectParent(mat,a->A);
  ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
  PLogObjectParent(mat,a->B);
  *newmat = mat;
  return 0;
}
1497 
1498 #include "sysio.h"
1499 
1500 int MatLoad_MPIAIJorMPIRow(Viewer bview,MatType type,Mat *newmat)
1501 {
1502   Mat          A;
1503   int          i, nz, ierr, j,rstart, rend, fd;
1504   Scalar       *vals,*svals;
1505   PetscObject  vobj = (PetscObject) bview;
1506   MPI_Comm     comm = vobj->comm;
1507   MPI_Status   status;
1508   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1509   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1510   int          tag = ((PetscObject)bview)->tag;
1511 
1512   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1513   if (!rank) {
1514     ierr = ViewerFileGetDescriptor_Private(bview,&fd); CHKERRQ(ierr);
1515     ierr = SYRead(fd,(char *)header,4,SYINT); CHKERRQ(ierr);
1516     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJorMPIRow:not matrix object");
1517   }
1518 
1519   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1520   M = header[1]; N = header[2];
1521   /* determine ownership of all rows */
1522   m = M/size + ((M % size) > rank);
1523   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1524   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1525   rowners[0] = 0;
1526   for ( i=2; i<=size; i++ ) {
1527     rowners[i] += rowners[i-1];
1528   }
1529   rstart = rowners[rank];
1530   rend   = rowners[rank+1];
1531 
1532   /* distribute row lengths to all processors */
1533   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1534   offlens = ourlens + (rend-rstart);
1535   if (!rank) {
1536     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1537     ierr = SYRead(fd,rowlengths,M,SYINT); CHKERRQ(ierr);
1538     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1539     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1540     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1541     PetscFree(sndcounts);
1542   }
1543   else {
1544     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1545   }
1546 
1547   if (!rank) {
1548     /* calculate the number of nonzeros on each processor */
1549     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1550     PetscMemzero(procsnz,size*sizeof(int));
1551     for ( i=0; i<size; i++ ) {
1552       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1553         procsnz[i] += rowlengths[j];
1554       }
1555     }
1556     PetscFree(rowlengths);
1557 
1558     /* determine max buffer needed and allocate it */
1559     maxnz = 0;
1560     for ( i=0; i<size; i++ ) {
1561       maxnz = PetscMax(maxnz,procsnz[i]);
1562     }
1563     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1564 
1565     /* read in my part of the matrix column indices  */
1566     nz = procsnz[0];
1567     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1568     ierr = SYRead(fd,mycols,nz,SYINT); CHKERRQ(ierr);
1569 
1570     /* read in every one elses and ship off */
1571     for ( i=1; i<size; i++ ) {
1572       nz = procsnz[i];
1573       ierr = SYRead(fd,cols,nz,SYINT); CHKERRQ(ierr);
1574       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1575     }
1576     PetscFree(cols);
1577   }
1578   else {
1579     /* determine buffer space needed for message */
1580     nz = 0;
1581     for ( i=0; i<m; i++ ) {
1582       nz += ourlens[i];
1583     }
1584     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1585 
1586     /* receive message of column indices*/
1587     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1588     MPI_Get_count(&status,MPI_INT,&maxnz);
1589     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJorMPIRow:something is wrong with file");
1590   }
1591 
1592   /* loop over local rows, determining number of off diagonal entries */
1593   PetscMemzero(offlens,m*sizeof(int));
1594   jj = 0;
1595   for ( i=0; i<m; i++ ) {
1596     for ( j=0; j<ourlens[i]; j++ ) {
1597       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1598       jj++;
1599     }
1600   }
1601 
1602   /* create our matrix */
1603   for ( i=0; i<m; i++ ) {
1604     ourlens[i] -= offlens[i];
1605   }
1606   if (type == MATMPIAIJ) {
1607     ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1608   }
1609   else if (type == MATMPIROW) {
1610     ierr = MatCreateMPIRow(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1611   }
1612   A = *newmat;
1613   MatSetOption(A,COLUMNS_SORTED);
1614   for ( i=0; i<m; i++ ) {
1615     ourlens[i] += offlens[i];
1616   }
1617 
1618   if (!rank) {
1619     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1620 
1621     /* read in my part of the matrix numerical values  */
1622     nz = procsnz[0];
1623     ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1624 
1625     /* insert into matrix */
1626     jj      = rstart;
1627     smycols = mycols;
1628     svals   = vals;
1629     for ( i=0; i<m; i++ ) {
1630       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1631       smycols += ourlens[i];
1632       svals   += ourlens[i];
1633       jj++;
1634     }
1635 
1636     /* read in other processors and ship out */
1637     for ( i=1; i<size; i++ ) {
1638       nz = procsnz[i];
1639       ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1640       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1641     }
1642     PetscFree(procsnz);
1643   }
1644   else {
1645     /* receive numeric values */
1646     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1647 
1648     /* receive message of values*/
1649     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1650     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1651     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJorMPIRow:something is wrong with file");
1652 
1653     /* insert into matrix */
1654     jj      = rstart;
1655     smycols = mycols;
1656     svals   = vals;
1657     for ( i=0; i<m; i++ ) {
1658       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1659       smycols += ourlens[i];
1660       svals   += ourlens[i];
1661       jj++;
1662     }
1663   }
1664   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1665 
1666   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1667   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1668   return 0;
1669 }
1670