xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision b0a32e0c6855ee6a6cd3495fa7da12ea9885bc5d)
1 /*$Id: mpiaij.c,v 1.323 2000/11/28 17:29:00 bsmith Exp bsmith $*/
2 
3 #include "src/mat/impls/aij/mpi/mpiaij.h"
4 #include "src/vec/vecimpl.h"
5 #include "src/inline/spops.h"
6 
7 EXTERN int MatSetUpMultiply_MPIAIJ(Mat);
8 EXTERN int DisAssemble_MPIAIJ(Mat);
9 EXTERN int MatSetValues_SeqAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
10 EXTERN int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
11 EXTERN int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
12 EXTERN int MatPrintHelp_SeqAIJ(Mat);
13 
14 /*
15   Local utility routine that creates a mapping from the global column
16 number to the local number in the off-diagonal part of the local
17 storage of the matrix.  When PETSC_USE_CTABLE is used this is scalable at
18 a slightly higher hash-table cost; without it, it is not scalable (each processor
19 holds an order-N integer array, but lookups are fast).
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIAIJ_Private"
23 int CreateColmap_MPIAIJ_Private(Mat mat)
24 {
25   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
26   int        n = aij->B->n,i,ierr;
27 
28   PetscFunctionBegin;
29 #if defined (PETSC_USE_CTABLE)
30   ierr = PetscTableCreate(n,&aij->colmap);CHKERRQ(ierr);
31   for (i=0; i<n; i++){
32     ierr = PetscTableAdd(aij->colmap,aij->garray[i]+1,i+1);CHKERRQ(ierr);
33   }
34 #else
35   ierr = PetscMalloc((mat->N+1)*sizeof(int),&aij->colmap);CHKERRQ(ierr);
36   PetscLogObjectMemory(mat,mat->N*sizeof(int));
37   ierr = PetscMemzero(aij->colmap,mat->N*sizeof(int));CHKERRQ(ierr);
38   for (i=0; i<n; i++) aij->colmap[aij->garray[i]] = i+1;
39 #endif
40   PetscFunctionReturn(0);
41 }
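/*
   For illustration, the lookup that consumes this map (as done in
   MatSetValues_MPIAIJ() below) looks like the following sketch; gcol is a
   global column number and lcol becomes the local column number in the
   off-diagonal block, or -1 if gcol is not present:

     #if defined (PETSC_USE_CTABLE)
       ierr = PetscTableFind(aij->colmap,gcol+1,&lcol);CHKERRQ(ierr);
       lcol--;                        (the table stores lcol+1 so that 0 can mean "absent")
     #else
       lcol = aij->colmap[gcol] - 1;  (the array stores lcol+1; 0 means "absent")
     #endif
*/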
42 
43 #define CHUNKSIZE   15   /* extra nonzero slots added whenever a row must be enlarged */
44 #define MatSetValues_SeqAIJ_A_Private(row,col,value,addv) \
45 { \
46  \
47     rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
48     rmax = aimax[row]; nrow = ailen[row];  \
49     col1 = col - shift; \
50      \
51     low = 0; high = nrow; \
52     while (high-low > 5) { \
53       t = (low+high)/2; \
54       if (rp[t] > col1) high = t; /* compare against col1, matching the scan below */ \
55       else              low  = t; \
56     } \
57       for (_i=low; _i<high; _i++) { \
58         if (rp[_i] > col1) break; \
59         if (rp[_i] == col1) { \
60           if (addv == ADD_VALUES) ap[_i] += value;   \
61           else                  ap[_i] = value; \
62           goto a_noinsert; \
63         } \
64       }  \
65       if (nonew == 1) goto a_noinsert; \
66       else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
67       if (nrow >= rmax) { \
68         /* there is no extra room in row, therefore enlarge */ \
69         int    new_nz = ai[am] + CHUNKSIZE,len,*new_i,*new_j; \
70         Scalar *new_a; \
71  \
72         if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
73  \
74         /* malloc new storage space */ \
75         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(am+1)*sizeof(int); \
76         ierr    = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
77         new_j   = (int*)(new_a + new_nz); \
78         new_i   = new_j + new_nz; \
79  \
80         /* copy over old data into new slots */ \
81         for (ii=0; ii<row+1; ii++) {new_i[ii] = ai[ii];} \
82         for (ii=row+1; ii<am+1; ii++) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
83         ierr = PetscMemcpy(new_j,aj,(ai[row]+nrow+shift)*sizeof(int));CHKERRQ(ierr); \
84         len = (new_nz - CHUNKSIZE - ai[row] - nrow - shift); \
85         ierr = PetscMemcpy(new_j+ai[row]+shift+nrow+CHUNKSIZE,aj+ai[row]+shift+nrow, \
86                                                            len*sizeof(int));CHKERRQ(ierr); \
87         ierr = PetscMemcpy(new_a,aa,(ai[row]+nrow+shift)*sizeof(Scalar));CHKERRQ(ierr); \
88         ierr = PetscMemcpy(new_a+ai[row]+shift+nrow+CHUNKSIZE,aa+ai[row]+shift+nrow, \
89                                                            len*sizeof(Scalar));CHKERRQ(ierr);  \
90         /* free up old matrix storage */ \
91  \
92         ierr = PetscFree(a->a);CHKERRQ(ierr);  \
93         if (!a->singlemalloc) { \
94            ierr = PetscFree(a->i);CHKERRQ(ierr); \
95            ierr = PetscFree(a->j);CHKERRQ(ierr); \
96         } \
97         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
98         a->singlemalloc = PETSC_TRUE; \
99  \
100         rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
101         rmax = aimax[row] = aimax[row] + CHUNKSIZE; \
102         PetscLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
103         a->maxnz += CHUNKSIZE; \
104         a->reallocs++; \
105       } \
106       N = nrow++ - 1; a->nz++; /* N indexes the last existing entry; the row grows by one */ \
107       /* shift up all the later entries in this row */ \
108       for (ii=N; ii>=_i; ii--) { \
109         rp[ii+1] = rp[ii]; \
110         ap[ii+1] = ap[ii]; \
111       } \
112       rp[_i] = col1;  \
113       ap[_i] = value;  \
114       a_noinsert: ; \
115       ailen[row] = nrow; \
116 }
117 
118 #define MatSetValues_SeqAIJ_B_Private(row,col,value,addv) \
119 { \
120  \
121     rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
122     rmax = bimax[row]; nrow = bilen[row];  \
123     col1 = col - shift; \
124      \
125     low = 0; high = nrow; \
126     while (high-low > 5) { \
127       t = (low+high)/2; \
128       if (rp[t] > col1) high = t; /* compare against col1, matching the scan below */ \
129       else              low  = t; \
130     } \
131       for (_i=low; _i<high; _i++) { \
132         if (rp[_i] > col1) break; \
133         if (rp[_i] == col1) { \
134           if (addv == ADD_VALUES) ap[_i] += value;   \
135           else                  ap[_i] = value; \
136           goto b_noinsert; \
137         } \
138       }  \
139       if (nonew == 1) goto b_noinsert; \
140       else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
141       if (nrow >= rmax) { \
142         /* there is no extra room in row, therefore enlarge */ \
143         int    new_nz = bi[bm] + CHUNKSIZE,len,*new_i,*new_j; \
144         Scalar *new_a; \
145  \
146         if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
147  \
148         /* malloc new storage space */ \
149         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(bm+1)*sizeof(int); \
150         ierr    = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
151         new_j   = (int*)(new_a + new_nz); \
152         new_i   = new_j + new_nz; \
153  \
154         /* copy over old data into new slots */ \
155         for (ii=0; ii<row+1; ii++) {new_i[ii] = bi[ii];} \
156         for (ii=row+1; ii<bm+1; ii++) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
157         ierr = PetscMemcpy(new_j,bj,(bi[row]+nrow+shift)*sizeof(int));CHKERRQ(ierr); \
158         len = (new_nz - CHUNKSIZE - bi[row] - nrow - shift); \
159         ierr = PetscMemcpy(new_j+bi[row]+shift+nrow+CHUNKSIZE,bj+bi[row]+shift+nrow, \
160                                                            len*sizeof(int));CHKERRQ(ierr); \
161         ierr = PetscMemcpy(new_a,ba,(bi[row]+nrow+shift)*sizeof(Scalar));CHKERRQ(ierr); \
162         ierr = PetscMemcpy(new_a+bi[row]+shift+nrow+CHUNKSIZE,ba+bi[row]+shift+nrow, \
163                                                            len*sizeof(Scalar));CHKERRQ(ierr);  \
164         /* free up old matrix storage */ \
165  \
166         ierr = PetscFree(b->a);CHKERRQ(ierr);  \
167         if (!b->singlemalloc) { \
168           ierr = PetscFree(b->i);CHKERRQ(ierr); \
169           ierr = PetscFree(b->j);CHKERRQ(ierr); \
170         } \
171         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
172         b->singlemalloc = PETSC_TRUE; \
173  \
174         rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
175         rmax = bimax[row] = bimax[row] + CHUNKSIZE; \
176         PetscLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
177         b->maxnz += CHUNKSIZE; \
178         b->reallocs++; \
179       } \
180       N = nrow++ - 1; b->nz++; /* N indexes the last existing entry; the row grows by one */ \
181       /* shift up all the later entries in this row */ \
182       for (ii=N; ii>=_i; ii--) { \
183         rp[ii+1] = rp[ii]; \
184         ap[ii+1] = ap[ii]; \
185       } \
186       rp[_i] = col1;  \
187       ap[_i] = value;  \
188       b_noinsert: ; \
189       bilen[row] = nrow; \
190 }
191 
192 #undef __FUNC__
193 #define __FUNC__ "MatSetValues_MPIAIJ"
194 int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
195 {
196   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
197   Scalar     value;
198   int        ierr,i,j,rstart = aij->rstart,rend = aij->rend;
199   int        cstart = aij->cstart,cend = aij->cend,row,col;
200   PetscTruth roworiented = aij->roworiented;
201 
202   /* Some variables required by the MatSetValues_SeqAIJ_{A,B}_Private() macros above */
203   Mat        A = aij->A;
204   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
205   int        *aimax = a->imax,*ai = a->i,*ailen = a->ilen,*aj = a->j;
206   Scalar     *aa = a->a;
207   PetscTruth ignorezeroentries = (((a->ignorezeroentries)&&(addv==ADD_VALUES))?PETSC_TRUE:PETSC_FALSE);
208   Mat        B = aij->B;
209   Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
210   int        *bimax = b->imax,*bi = b->i,*bilen = b->ilen,*bj = b->j,bm = aij->B->m,am = aij->A->m;
211   Scalar     *ba = b->a;
212 
213   int        *rp,ii,nrow,_i,rmax,N,col1,low,high,t;
214   int        nonew = a->nonew,shift = a->indexshift;
215   Scalar     *ap;
216 
217   PetscFunctionBegin;
218   for (i=0; i<m; i++) {
219     if (im[i] < 0) continue;
220 #if defined(PETSC_USE_BOPT_g)
221     if (im[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
222 #endif
223     if (im[i] >= rstart && im[i] < rend) {
224       row = im[i] - rstart;
225       for (j=0; j<n; j++) {
226         if (in[j] >= cstart && in[j] < cend){
227           col = in[j] - cstart;
228           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
229           if (ignorezeroentries && value == 0.0) continue;
230           MatSetValues_SeqAIJ_A_Private(row,col,value,addv);
231           /* ierr = MatSetValues_SeqAIJ(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
232         } else if (in[j] < 0) continue;
233 #if defined(PETSC_USE_BOPT_g)
234         else if (in[j] >= mat->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");}
235 #endif
236         else {
237           if (mat->was_assembled) {
238             if (!aij->colmap) {
239               ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
240             }
241 #if defined (PETSC_USE_CTABLE)
242             ierr = PetscTableFind(aij->colmap,in[j]+1,&col);CHKERRQ(ierr);
243             col--;
244 #else
245             col = aij->colmap[in[j]] - 1;
246 #endif
247             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
248               ierr = DisAssemble_MPIAIJ(mat);CHKERRQ(ierr);
249               col =  in[j];
250               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
251               B = aij->B;
252               b = (Mat_SeqAIJ*)B->data;
253               bimax = b->imax; bi = b->i; bilen = b->ilen; bj = b->j;
254               ba = b->a;
255             }
256           } else col = in[j];
257           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
258           if (ignorezeroentries && value == 0.0) continue;
259           MatSetValues_SeqAIJ_B_Private(row,col,value,addv);
260           /* ierr = MatSetValues_SeqAIJ(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
261         }
262       }
263     } else {
264       if (!aij->donotstash) {
265         if (roworiented) {
266           if (ignorezeroentries && v[i*n] == 0.0) continue;
267           ierr = MatStashValuesRow_Private(&mat->stash,im[i],n,in,v+i*n);CHKERRQ(ierr);
268         } else {
269           if (ignorezeroentries && v[i] == 0.0) continue;
270           ierr = MatStashValuesCol_Private(&mat->stash,im[i],n,in,v+i,m);CHKERRQ(ierr);
271         }
272       }
273     }
274   }
275   PetscFunctionReturn(0);
276 }
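/*
   A minimal calling sketch (mat, row, col, and value are assumed to be
   set up by the caller); entries destined for rows owned by other
   processors are stashed here and communicated during assembly:

     ierr = MatSetValues(mat,1,&row,1,&col,&value,ADD_VALUES);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/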
277 
278 #undef __FUNC__
279 #define __FUNC__ "MatGetValues_MPIAIJ"
280 int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
281 {
282   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
283   int        ierr,i,j,rstart = aij->rstart,rend = aij->rend;
284   int        cstart = aij->cstart,cend = aij->cend,row,col;
285 
286   PetscFunctionBegin;
287   for (i=0; i<m; i++) {
288     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative row");
289     if (idxm[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
290     if (idxm[i] >= rstart && idxm[i] < rend) {
291       row = idxm[i] - rstart;
292       for (j=0; j<n; j++) {
293         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative column");
294         if (idxn[j] >= mat->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");
295         if (idxn[j] >= cstart && idxn[j] < cend){
296           col = idxn[j] - cstart;
297           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
298         } else {
299           if (!aij->colmap) {
300             ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
301           }
302 #if defined (PETSC_USE_CTABLE)
303           ierr = PetscTableFind(aij->colmap,idxn[j]+1,&col);CHKERRQ(ierr);
304           col--;
305 #else
306           col = aij->colmap[idxn[j]] - 1;
307 #endif
308           if ((col < 0) || (aij->garray[col] != idxn[j])) *(v+i*n+j) = 0.0;
309           else {
310             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
311           }
312         }
313       }
314     } else {
315       SETERRQ(PETSC_ERR_SUP,"Only local values currently supported");
316     }
317   }
318   PetscFunctionReturn(0);
319 }
320 
321 #undef __FUNC__
322 #define __FUNC__ "MatAssemblyBegin_MPIAIJ"
323 int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
324 {
325   Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)mat->data;
326   int         ierr,nstash,reallocs;
327   InsertMode  addv;
328 
329   PetscFunctionBegin;
330   if (aij->donotstash) {
331     PetscFunctionReturn(0);
332   }
333 
334   /* make sure all processors are in either INSERT_VALUES or ADD_VALUES mode */
335   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,mat->comm);CHKERRQ(ierr);
336   if (addv == (ADD_VALUES|INSERT_VALUES)) {
337     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
338   }
339   mat->insertmode = addv; /* in case this processor had no cache */
340 
341   ierr = MatStashScatterBegin_Private(&mat->stash,aij->rowners);CHKERRQ(ierr);
342   ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
343   PetscLogInfo(aij->A,"MatAssemblyBegin_MPIAIJ:Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
344   PetscFunctionReturn(0);
345 }
346 
347 
348 #undef __FUNC__
349 #define __FUNC__ "MatAssemblyEnd_MPIAIJ"
350 int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
351 {
352   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
353   int         i,j,rstart,ncols,n,ierr,flg;
354   int         *row,*col,other_disassembled;
355   Scalar      *val;
356   InsertMode  addv = mat->insertmode;
357 
358   PetscFunctionBegin;
359   if (!aij->donotstash) {
360     while (1) {
361       ierr = MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
362       if (!flg) break;
363 
364       for (i=0; i<n;) {
365         /* Now identify the consecutive vals belonging to the same row */
366         for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
367         if (j < n) ncols = j-i;
368         else       ncols = n-i;
369         /* Now assemble all these values with a single function call */
370         ierr = MatSetValues_MPIAIJ(mat,1,row+i,ncols,col+i,val+i,addv);CHKERRQ(ierr);
371         i = j;
372       }
373     }
374     ierr = MatStashScatterEnd_Private(&mat->stash);CHKERRQ(ierr);
375   }
376 
377   ierr = MatAssemblyBegin(aij->A,mode);CHKERRQ(ierr);
378   ierr = MatAssemblyEnd(aij->A,mode);CHKERRQ(ierr);
379 
380   /* determine if any processor has disassembled; if so, we must
381      also disassemble ourselves, in order that we may reassemble. */
382   /*
383      if the nonzero structure of the submatrix B cannot change, then we know that
384      no processor disassembled, so we can skip this step
385   */
386   if (!((Mat_SeqAIJ*)aij->B->data)->nonew)  {
387     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
388     if (mat->was_assembled && !other_disassembled) {
389       ierr = DisAssemble_MPIAIJ(mat);CHKERRQ(ierr);
390     }
391   }
392 
393   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
394     ierr = MatSetUpMultiply_MPIAIJ(mat);CHKERRQ(ierr);
395   }
396   ierr = MatAssemblyBegin(aij->B,mode);CHKERRQ(ierr);
397   ierr = MatAssemblyEnd(aij->B,mode);CHKERRQ(ierr);
398 
399   if (aij->rowvalues) {
400     ierr = PetscFree(aij->rowvalues);CHKERRQ(ierr);
401     aij->rowvalues = 0;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNC__
407 #define __FUNC__ "MatZeroEntries_MPIAIJ"
408 int MatZeroEntries_MPIAIJ(Mat A)
409 {
410   Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
411   int        ierr;
412 
413   PetscFunctionBegin;
414   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
415   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 
419 #undef __FUNC__
420 #define __FUNC__ "MatZeroRows_MPIAIJ"
421 int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
422 {
423   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
424   int            i,ierr,N,*rows,*owners = l->rowners,size = l->size;
425   int            *procs,*nprocs,j,idx,nsends,*work,row;
426   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
427   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
428   int            *lens,imdex,*lrows,*values,rstart=l->rstart;
429   MPI_Comm       comm = A->comm;
430   MPI_Request    *send_waits,*recv_waits;
431   MPI_Status     recv_status,*send_status;
432   IS             istmp;
433   PetscTruth     found;
434 
435   PetscFunctionBegin;
436   ierr = ISGetLocalSize(is,&N);CHKERRQ(ierr);
437   ierr = ISGetIndices(is,&rows);CHKERRQ(ierr);
438 
439   /*  first count number of contributors to each processor */
440   ierr = PetscMalloc(2*size*sizeof(int),&nprocs);CHKERRQ(ierr);
441   ierr   = PetscMemzero(nprocs,2*size*sizeof(int));CHKERRQ(ierr);
442   procs  = nprocs + size;
443   ierr = PetscMalloc((N+1)*sizeof(int),&owner);CHKERRQ(ierr); /* see note*/
444   for (i=0; i<N; i++) {
445     idx = rows[i];
446     found = PETSC_FALSE;
447     for (j=0; j<size; j++) {
448       if (idx >= owners[j] && idx < owners[j+1]) {
449         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
450       }
451     }
452     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index out of range");
453   }
454   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}
455 
456   /* inform other processors of number of messages and max length*/
457   ierr = PetscMalloc(2*size*sizeof(int),&work);CHKERRQ(ierr);
458   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
459   nrecvs = work[size+rank];
460   nmax   = work[rank];
461   ierr   = PetscFree(work);CHKERRQ(ierr);
462 
463   /* post receives:   */
464   ierr = PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);CHKERRQ(ierr);
465   ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
466   for (i=0; i<nrecvs; i++) {
467     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
468   }
469 
470   /* do sends:
471       starts[i] gives the starting index in svalues for the rows going to
472       the ith processor
473   */
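  /*
     For example (illustrative values, not taken from the code): with size=3
     and nprocs = {2,0,3}, the prefix sums below give starts = {0,2,2}, so the
     rows bound for processor 0 occupy svalues[0..1] and the rows for
     processor 2 occupy svalues[2..4].
  */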
474   ierr = PetscMalloc((N+1)*sizeof(int),&svalues);CHKERRQ(ierr);
475   ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
476   ierr = PetscMalloc((size+1)*sizeof(int),&starts);CHKERRQ(ierr);
477   starts[0] = 0;
478   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
479   for (i=0; i<N; i++) {
480     svalues[starts[owner[i]]++] = rows[i];
481   }
482   ierr = ISRestoreIndices(is,&rows);CHKERRQ(ierr);
483 
484   starts[0] = 0;
485   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
486   count = 0;
487   for (i=0; i<size; i++) {
488     if (procs[i]) {
489       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
490     }
491   }
492   ierr = PetscFree(starts);CHKERRQ(ierr);
493 
494   base = owners[rank];
495 
496   /*  wait on receives */
497   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);CHKERRQ(ierr);
498   source = lens + nrecvs;
499   count  = nrecvs; slen = 0;
500   while (count) {
501     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
502     /* unpack receives into our local space */
503     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
504     source[imdex]  = recv_status.MPI_SOURCE;
505     lens[imdex]    = n;
506     slen          += n;
507     count--;
508   }
509   ierr = PetscFree(recv_waits);CHKERRQ(ierr);
510 
511   /* move the data into the send scatter */
512   ierr = PetscMalloc((slen+1)*sizeof(int),&lrows);CHKERRQ(ierr);
513   count = 0;
514   for (i=0; i<nrecvs; i++) {
515     values = rvalues + i*nmax;
516     for (j=0; j<lens[i]; j++) {
517       lrows[count++] = values[j] - base;
518     }
519   }
520   ierr = PetscFree(rvalues);CHKERRQ(ierr);
521   ierr = PetscFree(lens);CHKERRQ(ierr);
522   ierr = PetscFree(owner);CHKERRQ(ierr);
523   ierr = PetscFree(nprocs);CHKERRQ(ierr);
524 
525   /* actually zap the local rows */
526   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
527   PetscLogObjectParent(A,istmp);
528 
529   /*
530         Zero the required rows. If the "diagonal block" of the matrix
531      is square and the user wishes to set the diagonal, we use separate
532      code so that MatSetValues() is not called for each diagonal entry,
533      which would allocate new memory, triggering many mallocs and slowing things down.
534 
535        Contributed by: Matthew Knepley
536   */
537   /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
538   ierr = MatZeroRows(l->B,istmp,0);CHKERRQ(ierr);
539   if (diag && (l->A->M == l->A->N)) {
540     ierr      = MatZeroRows(l->A,istmp,diag);CHKERRQ(ierr);
541   } else if (diag) {
542     ierr = MatZeroRows(l->A,istmp,0);CHKERRQ(ierr);
543     if (((Mat_SeqAIJ*)l->A->data)->nonew) {
544       SETERRQ(PETSC_ERR_SUP,"MatZeroRows() on rectangular matrices cannot be used with the Mat options\n\
545 MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
546     }
547     for (i = 0; i < slen; i++) {
548       row  = lrows[i] + rstart;
549       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
550     }
551     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
552     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
553   } else {
554     ierr = MatZeroRows(l->A,istmp,0);CHKERRQ(ierr);
555   }
556   ierr = ISDestroy(istmp);CHKERRQ(ierr);
557   ierr = PetscFree(lrows);CHKERRQ(ierr);
558 
559   /* wait on sends */
560   if (nsends) {
561     ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
562     ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
563     ierr = PetscFree(send_status);CHKERRQ(ierr);
564   }
565   ierr = PetscFree(send_waits);CHKERRQ(ierr);
566   ierr = PetscFree(svalues);CHKERRQ(ierr);
567 
568   PetscFunctionReturn(0);
569 }
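/*
   A minimal usage sketch (nrows and rows are assumed caller-supplied;
   rows holds global row numbers and may include rows owned by other
   processors, which are forwarded by the code above):

     IS     is;
     Scalar diag = 1.0;
     ierr = ISCreateGeneral(PETSC_COMM_SELF,nrows,rows,&is);CHKERRQ(ierr);
     ierr = MatZeroRows(A,is,&diag);CHKERRQ(ierr);
     ierr = ISDestroy(is);CHKERRQ(ierr);
*/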
570 
571 #undef __FUNC__
572 #define __FUNC__ "MatMult_MPIAIJ"
573 int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
574 {
575   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
576   int        ierr,nt;
577 
578   PetscFunctionBegin;
579   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
580   if (nt != A->n) {
581     SETERRQ2(PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%d) and xx (%d)",A->n,nt);
582   }
583   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
584   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
585   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
586   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
587   PetscFunctionReturn(0);
588 }
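/*
   The multiply above overlaps communication with computation: with x_loc
   the locally owned part of xx and x_ghost the ghost values gathered by
   the scatter, each processor computes

       yy = A x_loc + B x_ghost

   where A is the diagonal block and B the off-diagonal block of this
   processor's rows.
*/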
589 
590 #undef __FUNC__
591 #define __FUNC__ "MatMultAdd_MPIAIJ"
592 int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
593 {
594   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
595   int        ierr;
596 
597   PetscFunctionBegin;
598   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
599   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
600   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
601   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
602   PetscFunctionReturn(0);
603 }
604 
605 #undef __FUNC__
606 #define __FUNC__ "MatMultTranspose_MPIAIJ"
607 int MatMultTranspose_MPIAIJ(Mat A,Vec xx,Vec yy)
608 {
609   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
610   int        ierr;
611 
612   PetscFunctionBegin;
613   /* do nondiagonal part */
614   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
615   /* send it on its way */
616   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
617   /* do local part */
618   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
619   /* receive remote parts: note this assumes the values are not actually */
620   /* inserted in yy until the next line, which is true for this implementation */
621   /* but is perhaps not always true. */
622   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
623   PetscFunctionReturn(0);
624 }
625 
626 #undef __FUNC__
627 #define __FUNC__ "MatMultTransposeAdd_MPIAIJ"
628 int MatMultTransposeAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
629 {
630   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
631   int        ierr;
632 
633   PetscFunctionBegin;
634   /* do nondiagonal part */
635   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
636   /* send it on its way */
637   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
638   /* do local part */
639   ierr = (*a->A->ops->multtransposeadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
640   /* receive remote parts: note this assumes the values are not actually */
641   /* inserted in zz until the next line, which is true for this implementation */
642   /* but is perhaps not always true. */
643   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
644   PetscFunctionReturn(0);
645 }
646 
647 /*
648   This only works correctly for square matrices where the subblock A->A is the
649    diagonal block
650 */
651 #undef __FUNC__
652 #define __FUNC__ "MatGetDiagonal_MPIAIJ"
653 int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
654 {
655   int        ierr;
656   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
657 
658   PetscFunctionBegin;
659   if (A->M != A->N) SETERRQ(PETSC_ERR_SUP,"Supports only square matrix where A->A is diag block");
660   if (a->rstart != a->cstart || a->rend != a->cend) {
661     SETERRQ(PETSC_ERR_ARG_SIZ,"row partition must equal col partition");
662   }
663   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
667 #undef __FUNC__
668 #define __FUNC__ "MatScale_MPIAIJ"
669 int MatScale_MPIAIJ(Scalar *aa,Mat A)
670 {
671   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
672   int        ierr;
673 
674   PetscFunctionBegin;
675   ierr = MatScale(aa,a->A);CHKERRQ(ierr);
676   ierr = MatScale(aa,a->B);CHKERRQ(ierr);
677   PetscFunctionReturn(0);
678 }
679 
680 #undef __FUNC__
681 #define __FUNC__ "MatDestroy_MPIAIJ"
682 int MatDestroy_MPIAIJ(Mat mat)
683 {
684   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
685   int        ierr;
686 
687   PetscFunctionBegin;
688 #if defined(PETSC_USE_LOG)
689   PetscLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",mat->M,mat->N);
690 #endif
691   ierr = MatStashDestroy_Private(&mat->stash);CHKERRQ(ierr);
692   ierr = PetscFree(aij->rowners);CHKERRQ(ierr);
693   ierr = MatDestroy(aij->A);CHKERRQ(ierr);
694   ierr = MatDestroy(aij->B);CHKERRQ(ierr);
695 #if defined (PETSC_USE_CTABLE)
696   if (aij->colmap) {ierr = PetscTableDelete(aij->colmap);CHKERRQ(ierr);}
697 #else
698   if (aij->colmap) {ierr = PetscFree(aij->colmap);CHKERRQ(ierr);}
699 #endif
700   if (aij->garray) {ierr = PetscFree(aij->garray);CHKERRQ(ierr);}
701   if (aij->lvec)   {ierr = VecDestroy(aij->lvec);CHKERRQ(ierr);}
702   if (aij->Mvctx)  {ierr = VecScatterDestroy(aij->Mvctx);CHKERRQ(ierr);}
703   if (aij->rowvalues) {ierr = PetscFree(aij->rowvalues);CHKERRQ(ierr);}
704   ierr = PetscFree(aij);CHKERRQ(ierr);
705   PetscFunctionReturn(0);
706 }
707 
708 #undef __FUNC__
709 #define __FUNC__ "MatView_MPIAIJ_ASCIIorDraworSocket"
710 int MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat,PetscViewer viewer)
711 {
712   Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)mat->data;
713   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
714   int         ierr,format,shift = C->indexshift,rank = aij->rank,size = aij->size;
715   PetscTruth  isdraw,isascii,flg;
716   PetscViewer sviewer;
717 
718   PetscFunctionBegin;
719   ierr  = PetscTypeCompare((PetscObject)viewer,PETSC_DRAW_VIEWER,&isdraw);CHKERRQ(ierr);
720   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
721   if (isascii) {
722     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
723     if (format == PETSC_VIEWER_FORMAT_ASCII_INFO_LONG) {
724       MatInfo info;
725       ierr = MPI_Comm_rank(mat->comm,&rank);CHKERRQ(ierr);
726       ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
727       ierr = PetscOptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg);CHKERRQ(ierr);
728       if (flg) {
729         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
730 					      rank,mat->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);CHKERRQ(ierr);
731       } else {
732         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
733 		    rank,mat->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);CHKERRQ(ierr);
734       }
735       ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
736       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used);CHKERRQ(ierr);
737       ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
738       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used);CHKERRQ(ierr);
739       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
740       ierr = VecScatterView(aij->Mvctx,viewer);CHKERRQ(ierr);
741       PetscFunctionReturn(0);
742     } else if (format == PETSC_VIEWER_FORMAT_ASCII_INFO) {
743       PetscFunctionReturn(0);
744     }
745   } else if (isdraw) {
746     PetscDraw       draw;
747     PetscTruth isnull;
748     ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
749     ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
750   }
751 
752   if (size == 1) {
753     ierr = MatView(aij->A,viewer);CHKERRQ(ierr);
754   } else {
755     /* assemble the entire matrix onto first processor. */
756     Mat         A;
757     Mat_SeqAIJ *Aloc;
758     int         M = mat->M,N = mat->N,m,*ai,*aj,row,*cols,i,*ct;
759     Scalar      *a;
760 
761     if (!rank) {
762       ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
763     } else {
764       ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
765     }
766     PetscLogObjectParent(mat,A);
767 
768     /* copy over the A part */
769     Aloc = (Mat_SeqAIJ*)aij->A->data;
770     m = aij->A->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
771     row = aij->rstart;
772     for (i=0; i<ai[m]+shift; i++) {aj[i] += aij->cstart + shift;}
773     for (i=0; i<m; i++) {
774       ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
775       row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
776     }
777     aj = Aloc->j;
778     for (i=0; i<ai[m]+shift; i++) {aj[i] -= aij->cstart + shift;}
779 
780     /* copy over the B part */
781     Aloc = (Mat_SeqAIJ*)aij->B->data;
782     m    = aij->B->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
783     row  = aij->rstart;
784     ierr = PetscMalloc((ai[m]+1)*sizeof(int),&cols);CHKERRQ(ierr);
785     ct   = cols;
786     for (i=0; i<ai[m]+shift; i++) {cols[i] = aij->garray[aj[i]+shift];}
787     for (i=0; i<m; i++) {
788       ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
789       row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
790     }
791     ierr = PetscFree(ct);CHKERRQ(ierr);
792     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
793     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
794     /*
795        Everyone has to participate in the viewing call, since the graphics waits
796        are synchronized across all processors that share the PetscDraw object
797     */
798     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
799     if (!rank) {
800       ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,sviewer);CHKERRQ(ierr);
801     }
802     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
803     ierr = MatDestroy(A);CHKERRQ(ierr);
804   }
805   PetscFunctionReturn(0);
806 }
807 
808 #undef __FUNC__
809 #define __FUNC__ "MatView_MPIAIJ"
810 int MatView_MPIAIJ(Mat mat,PetscViewer viewer)
811 {
812   int        ierr;
813   PetscTruth isascii,isdraw,issocket,isbinary;
814 
815   PetscFunctionBegin;
816   ierr  = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
817   ierr  = PetscTypeCompare((PetscObject)viewer,PETSC_DRAW_VIEWER,&isdraw);CHKERRQ(ierr);
818   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_BINARY_VIEWER,&isbinary);CHKERRQ(ierr);
819   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_SOCKET,&issocket);CHKERRQ(ierr);
820   if (isascii || isdraw || isbinary || issocket) {
821     ierr = MatView_MPIAIJ_ASCIIorDraworSocket(mat,viewer);CHKERRQ(ierr);
822   } else {
823     SETERRQ1(1,"Viewer type %s not supported by MPIAIJ matrices",((PetscObject)viewer)->type_name);
824   }
825   PetscFunctionReturn(0);
826 }
827 
828 /*
829     This has to provide several versions.
830 
831      2) a) use only local smoothing, updating outer values only once.
832         b) local smoothing, updating outer values each inner iteration
833      3) color updating of outer values between colors.
834 */
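/*
   A minimal calling sketch for the local cases handled below (matin, bb,
   and xx are assumed set up; one symmetric local sweep starting from a
   zero initial guess):

     ierr = MatRelax(matin,bb,1.0,
                     (MatSORType)(SOR_LOCAL_SYMMETRIC_SWEEP | SOR_ZERO_INITIAL_GUESS),
                     0.0,1,xx);CHKERRQ(ierr);
*/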
835 #undef __FUNC__
836 #define __FUNC__ "MatRelax_MPIAIJ"
837 int MatRelax_MPIAIJ(Mat matin,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,int its,Vec xx)
838 {
839   Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;
840   Mat        AA = mat->A,BB = mat->B;
841   Mat_SeqAIJ *A = (Mat_SeqAIJ*)AA->data,*B = (Mat_SeqAIJ *)BB->data;
842   Scalar     *b,*x,*xs,*ls,d,*v,sum;
843   int        ierr,*idx,*diag;
844   int        n = matin->n,m = matin->m,i,shift = A->indexshift;
845 
846   PetscFunctionBegin;
847   if (!A->diag) {ierr = MatMarkDiagonal_SeqAIJ(AA);CHKERRQ(ierr);}
848   diag = A->diag;
849   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
850     if (flag & SOR_ZERO_INITIAL_GUESS) {
851       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
852       PetscFunctionReturn(0);
853     }
854     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
855     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
856     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
857     if (xx != bb) {
858       ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
859     } else {
860       b = x;
861     }
862     ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
863     xs = x + shift; /* shift by one for index start of 1 */
864     ls = ls + shift;
865     while (its--) {
866       /* go down through the rows */
867       for (i=0; i<m; i++) {
868         n    = A->i[i+1] - A->i[i];
869         PetscLogFlops(4*n+3);
870         idx  = A->j + A->i[i] + shift;
871         v    = A->a + A->i[i] + shift;
872         sum  = b[i];
873         SPARSEDENSEMDOT(sum,xs,v,idx,n);
874         d    = fshift + A->a[diag[i]+shift];
875         n    = B->i[i+1] - B->i[i];
876         idx  = B->j + B->i[i] + shift;
877         v    = B->a + B->i[i] + shift;
878         SPARSEDENSEMDOT(sum,ls,v,idx,n);
879         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
880       }
881       /* come up through the rows */
882       for (i=m-1; i>-1; i--) {
883         n    = A->i[i+1] - A->i[i];
884         PetscLogFlops(4*n+3);
885         idx  = A->j + A->i[i] + shift;
886         v    = A->a + A->i[i] + shift;
887         sum  = b[i];
888         SPARSEDENSEMDOT(sum,xs,v,idx,n);
889         d    = fshift + A->a[diag[i]+shift];
890         n    = B->i[i+1] - B->i[i];
891         idx  = B->j + B->i[i] + shift;
892         v    = B->a + B->i[i] + shift;
893         SPARSEDENSEMDOT(sum,ls,v,idx,n);
894         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
895       }
896     }
897     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
898     if (bb != xx) {ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); }
899     ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
900   } else if (flag & SOR_LOCAL_FORWARD_SWEEP){
901     if (flag & SOR_ZERO_INITIAL_GUESS) {
902       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
903       PetscFunctionReturn(0);
904     }
905     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
906     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
907     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
908     if (xx != bb) {
909       ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
910     } else {
911       b = x;
912     }
913     ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
914     xs = x + shift; /* shift by one for index start of 1 */
915     ls = ls + shift;
916     while (its--) {
917       for (i=0; i<m; i++) {
918         n    = A->i[i+1] - A->i[i];
919         PetscLogFlops(4*n+3);
920         idx  = A->j + A->i[i] + shift;
921         v    = A->a + A->i[i] + shift;
922         sum  = b[i];
923         SPARSEDENSEMDOT(sum,xs,v,idx,n);
924         d    = fshift + A->a[diag[i]+shift];
925         n    = B->i[i+1] - B->i[i];
926         idx  = B->j + B->i[i] + shift;
927         v    = B->a + B->i[i] + shift;
928         SPARSEDENSEMDOT(sum,ls,v,idx,n);
929         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
930       }
931     }
932     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
933     if (bb != xx) {ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); }
934     ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
935   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
936     if (flag & SOR_ZERO_INITIAL_GUESS) {
937       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
938       PetscFunctionReturn(0);
939     }
940     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
941     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
942     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
943     if (xx != bb) {
944       ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
945     } else {
946       b = x;
947     }
948     ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
949     xs = x + shift; /* shift by one for index start of 1 */
950     ls = ls + shift;
951     while (its--) {
952       for (i=m-1; i>-1; i--) {
953         n    = A->i[i+1] - A->i[i];
954         PetscLogFlops(4*n+3);
955         idx  = A->j + A->i[i] + shift;
956         v    = A->a + A->i[i] + shift;
957         sum  = b[i];
958         SPARSEDENSEMDOT(sum,xs,v,idx,n);
959         d    = fshift + A->a[diag[i]+shift];
960         n    = B->i[i+1] - B->i[i];
961         idx  = B->j + B->i[i] + shift;
962         v    = B->a + B->i[i] + shift;
963         SPARSEDENSEMDOT(sum,ls,v,idx,n);
964         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
965       }
966     }
967     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
968     if (bb != xx) {ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); }
969     ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
970   } else {
971     SETERRQ(PETSC_ERR_SUP,"Parallel SOR not supported");
972   }
973   PetscFunctionReturn(0);
974 }
975 
976 #undef __FUNC__
977 #define __FUNC__ "MatGetInfo_MPIAIJ"
978 int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,MatInfo *info)
979 {
980   Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;
981   Mat        A = mat->A,B = mat->B;
982   int        ierr;
983   PetscReal  isend[5],irecv[5];
984 
985   PetscFunctionBegin;
986   info->block_size     = 1.0;
987   ierr = MatGetInfo(A,MAT_LOCAL,info);CHKERRQ(ierr);
988   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
989   isend[3] = info->memory;  isend[4] = info->mallocs;
990   ierr = MatGetInfo(B,MAT_LOCAL,info);CHKERRQ(ierr);
991   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
992   isend[3] += info->memory;  isend[4] += info->mallocs;
993   if (flag == MAT_LOCAL) {
994     info->nz_used      = isend[0];
995     info->nz_allocated = isend[1];
996     info->nz_unneeded  = isend[2];
997     info->memory       = isend[3];
998     info->mallocs      = isend[4];
999   } else if (flag == MAT_GLOBAL_MAX) {
1000     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1001     info->nz_used      = irecv[0];
1002     info->nz_allocated = irecv[1];
1003     info->nz_unneeded  = irecv[2];
1004     info->memory       = irecv[3];
1005     info->mallocs      = irecv[4];
1006   } else if (flag == MAT_GLOBAL_SUM) {
1007     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1008     info->nz_used      = irecv[0];
1009     info->nz_allocated = irecv[1];
1010     info->nz_unneeded  = irecv[2];
1011     info->memory       = irecv[3];
1012     info->mallocs      = irecv[4];
1013   }
1014   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1015   info->fill_ratio_needed = 0;
1016   info->factor_mallocs    = 0;
1017   info->rows_global       = (double)matin->M;
1018   info->columns_global    = (double)matin->N;
1019   info->rows_local        = (double)matin->m;
1020   info->columns_local     = (double)matin->N;
1021 
1022   PetscFunctionReturn(0);
1023 }
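/*
   A minimal usage sketch (assuming the matrix lives on PETSC_COMM_WORLD);
   MAT_GLOBAL_SUM reduces the per-processor counts with MPI_SUM as above:

     MatInfo info;
     ierr = MatGetInfo(mat,MAT_GLOBAL_SUM,&info);CHKERRQ(ierr);
     ierr = PetscPrintf(PETSC_COMM_WORLD,"nz used %d, mallocs %d\n",
                        (int)info.nz_used,(int)info.mallocs);CHKERRQ(ierr);
*/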
1024 
1025 #undef __FUNC__
1026 #define __FUNC__ "MatSetOption_MPIAIJ"
1027 int MatSetOption_MPIAIJ(Mat A,MatOption op)
1028 {
1029   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1030   int        ierr;
1031 
1032   PetscFunctionBegin;
1033   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1034       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1035       op == MAT_COLUMNS_UNSORTED ||
1036       op == MAT_COLUMNS_SORTED ||
1037       op == MAT_NEW_NONZERO_ALLOCATION_ERR ||
1038       op == MAT_KEEP_ZEROED_ROWS ||
1039       op == MAT_NEW_NONZERO_LOCATION_ERR ||
1040       op == MAT_USE_INODES ||
1041       op == MAT_DO_NOT_USE_INODES ||
1042       op == MAT_IGNORE_ZERO_ENTRIES) {
1043         ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1044         ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1045   } else if (op == MAT_ROW_ORIENTED) {
1046     a->roworiented = PETSC_TRUE;
1047     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1048     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1049   } else if (op == MAT_ROWS_SORTED ||
1050              op == MAT_ROWS_UNSORTED ||
1051              op == MAT_SYMMETRIC ||
1052              op == MAT_STRUCTURALLY_SYMMETRIC ||
1053              op == MAT_YES_NEW_DIAGONALS) {
1054     PetscLogInfo(A,"MatSetOption_MPIAIJ:Option ignored\n");
1055   } else if (op == MAT_COLUMN_ORIENTED) {
1056     a->roworiented = PETSC_FALSE;
1057     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1058     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1059   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1060     a->donotstash = PETSC_TRUE;
1061   } else if (op == MAT_NO_NEW_DIAGONALS){
1062     SETERRQ(PETSC_ERR_SUP,"MAT_NO_NEW_DIAGONALS");
1063   } else {
1064     SETERRQ(PETSC_ERR_SUP,"unknown option");
1065   }
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 #undef __FUNC__
1070 #define __FUNC__ "MatGetOwnershipRange_MPIAIJ"
1071 int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1072 {
1073   Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;
1074 
1075   PetscFunctionBegin;
1076   if (m) *m = mat->rstart;
1077   if (n) *n = mat->rend;
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 #undef __FUNC__
1082 #define __FUNC__ "MatGetRow_MPIAIJ"
1083 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1084 {
1085   Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;
1086   Scalar     *vworkA,*vworkB,**pvA,**pvB,*v_p;
1087   int        i,ierr,*cworkA,*cworkB,**pcA,**pcB,cstart = mat->cstart;
1088   int        nztot,nzA,nzB,lrow,rstart = mat->rstart,rend = mat->rend;
1089   int        *cmap,*idx_p;
1090 
1091   PetscFunctionBegin;
1092   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Already active");
1093   mat->getrowactive = PETSC_TRUE;
1094 
1095   if (!mat->rowvalues && (idx || v)) {
1096     /*
1097         allocate enough space to hold information from the longest row.
1098     */
1099     Mat_SeqAIJ *Aa = (Mat_SeqAIJ*)mat->A->data,*Ba = (Mat_SeqAIJ*)mat->B->data;
1100     int     max = 1,tmp;
1101     for (i=0; i<matin->m; i++) {
1102       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1103       if (max < tmp) { max = tmp; }
1104     }
1105     ierr = PetscMalloc(max*(sizeof(int)+sizeof(Scalar)),&mat->rowvalues);CHKERRQ(ierr);
1106     mat->rowindices = (int*)(mat->rowvalues + max);
1107   }
1108 
1109   if (row < rstart || row >= rend) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Only local rows");
1110   lrow = row - rstart;
1111 
1112   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1113   if (!v)   {pvA = 0; pvB = 0;}
1114   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1115   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
1116   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
1117   nztot = nzA + nzB;
1118 
1119   cmap  = mat->garray;
1120   if (v  || idx) {
1121     if (nztot) {
1122       /* Sort by increasing column numbers, assuming A and B already sorted */
1123       int imark = -1;
1124       if (v) {
1125         *v = v_p = mat->rowvalues;
1126         for (i=0; i<nzB; i++) {
1127           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1128           else break;
1129         }
1130         imark = i;
1131         for (i=0; i<nzA; i++)     v_p[imark+i] = vworkA[i];
1132         for (i=imark; i<nzB; i++) v_p[nzA+i]   = vworkB[i];
1133       }
1134       if (idx) {
1135         *idx = idx_p = mat->rowindices;
1136         if (imark > -1) {
1137           for (i=0; i<imark; i++) {
1138             idx_p[i] = cmap[cworkB[i]];
1139           }
1140         } else {
1141           for (i=0; i<nzB; i++) {
1142             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1143             else break;
1144           }
1145           imark = i;
1146         }
1147         for (i=0; i<nzA; i++)     idx_p[imark+i] = cstart + cworkA[i];
1148         for (i=imark; i<nzB; i++) idx_p[nzA+i]   = cmap[cworkB[i]];
1149       }
1150     } else {
1151       if (idx) *idx = 0;
1152       if (v)   *v   = 0;
1153     }
1154   }
1155   *nz = nztot;
1156   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
1157   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
1158   PetscFunctionReturn(0);
1159 }
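/*
   A minimal usage sketch (looping over this processor's rows; rstart and
   rend as returned by MatGetOwnershipRange()):

     int    row,nz,*cols;
     Scalar *vals;
     for (row=rstart; row<rend; row++) {
       ierr = MatGetRow(mat,row,&nz,&cols,&vals);CHKERRQ(ierr);
       (use nz, cols, and vals here)
       ierr = MatRestoreRow(mat,row,&nz,&cols,&vals);CHKERRQ(ierr);
     }
*/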
1160 
1161 #undef __FUNC__
1162 #define __FUNC__ "MatRestoreRow_MPIAIJ"
1163 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1164 {
1165   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
1166 
1167   PetscFunctionBegin;
1168   if (aij->getrowactive == PETSC_FALSE) {
1169     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"MatGetRow not called");
1170   }
1171   aij->getrowactive = PETSC_FALSE;
1172   PetscFunctionReturn(0);
1173 }
1174 
1175 #undef __FUNC__
1176 #define __FUNC__ "MatNorm_MPIAIJ"
1177 int MatNorm_MPIAIJ(Mat mat,NormType type,PetscReal *norm)
1178 {
1179   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
1180   Mat_SeqAIJ *amat = (Mat_SeqAIJ*)aij->A->data,*bmat = (Mat_SeqAIJ*)aij->B->data;
1181   int        ierr,i,j,cstart = aij->cstart,shift = amat->indexshift;
1182   PetscReal  sum = 0.0;
1183   Scalar     *v;
1184 
1185   PetscFunctionBegin;
1186   if (aij->size == 1) {
1187     ierr =  MatNorm(aij->A,type,norm);CHKERRQ(ierr);
1188   } else {
1189     if (type == NORM_FROBENIUS) {
1190       v = amat->a;
1191       for (i=0; i<amat->nz; i++) {
1192 #if defined(PETSC_USE_COMPLEX)
1193         sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1194 #else
1195         sum += (*v)*(*v); v++;
1196 #endif
1197       }
1198       v = bmat->a;
1199       for (i=0; i<bmat->nz; i++) {
1200 #if defined(PETSC_USE_COMPLEX)
1201         sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1202 #else
1203         sum += (*v)*(*v); v++;
1204 #endif
1205       }
1206       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1207       *norm = sqrt(*norm);
1208     } else if (type == NORM_1) { /* max column norm */
1209       PetscReal *tmp,*tmp2;
1210       int    *jj,*garray = aij->garray;
1211       ierr = PetscMalloc((mat->N+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
1212       ierr = PetscMalloc((mat->N+1)*sizeof(PetscReal),&tmp2);CHKERRQ(ierr);
1213       ierr = PetscMemzero(tmp,mat->N*sizeof(PetscReal));CHKERRQ(ierr);
1214       *norm = 0.0;
1215       v = amat->a; jj = amat->j;
1216       for (j=0; j<amat->nz; j++) {
1217         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1218       }
1219       v = bmat->a; jj = bmat->j;
1220       for (j=0; j<bmat->nz; j++) {
1221         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1222       }
1223       ierr = MPI_Allreduce(tmp,tmp2,mat->N,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1224       for (j=0; j<mat->N; j++) {
1225         if (tmp2[j] > *norm) *norm = tmp2[j];
1226       }
1227       ierr = PetscFree(tmp);CHKERRQ(ierr);
1228       ierr = PetscFree(tmp2);CHKERRQ(ierr);
1229     } else if (type == NORM_INFINITY) { /* max row norm */
1230       PetscReal ntemp = 0.0;
1231       for (j=0; j<aij->A->m; j++) {
1232         v = amat->a + amat->i[j] + shift;
1233         sum = 0.0;
1234         for (i=0; i<amat->i[j+1]-amat->i[j]; i++) {
1235           sum += PetscAbsScalar(*v); v++;
1236         }
1237         v = bmat->a + bmat->i[j] + shift;
1238         for (i=0; i<bmat->i[j+1]-bmat->i[j]; i++) {
1239           sum += PetscAbsScalar(*v); v++;
1240         }
1241         if (sum > ntemp) ntemp = sum;
1242       }
1243       ierr = MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);CHKERRQ(ierr);
1244     } else {
1245       SETERRQ(PETSC_ERR_SUP,"No support for two norm");
1246     }
1247   }
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 #undef __FUNC__
1252 #define __FUNC__ "MatTranspose_MPIAIJ"
1253 int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1254 {
1255   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1256   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ*)a->A->data;
1257   int        ierr,shift = Aloc->indexshift;
1258   int        M = A->M,N = A->N,m,*ai,*aj,row,*cols,i,*ct;
1259   Mat        B;
1260   Scalar     *array;
1261 
1262   PetscFunctionBegin;
1263   if (!matout && M != N) {
1264     SETERRQ(PETSC_ERR_ARG_SIZ,"Square matrix only for in-place");
1265   }
1266 
1267   ierr = MatCreateMPIAIJ(A->comm,A->n,A->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);
1268 
1269   /* copy over the A part */
1270   Aloc = (Mat_SeqAIJ*)a->A->data;
1271   m = a->A->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1272   row = a->rstart;
1273   for (i=0; i<ai[m]+shift; i++) {aj[i] += a->cstart + shift;}
1274   for (i=0; i<m; i++) {
1275     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1276     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1277   }
1278   aj = Aloc->j;
1279   for (i=0; i<ai[m]+shift; i++) {aj[i] -= a->cstart + shift;}
1280 
1281   /* copy over the B part */
1282   Aloc = (Mat_SeqAIJ*)a->B->data;
1283   m = a->B->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1284   row  = a->rstart;
1285   ierr = PetscMalloc((1+ai[m]-shift)*sizeof(int),&cols);CHKERRQ(ierr);
1286   ct   = cols;
1287   for (i=0; i<ai[m]+shift; i++) {cols[i] = a->garray[aj[i]+shift];}
1288   for (i=0; i<m; i++) {
1289     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1290     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1291   }
1292   ierr = PetscFree(ct);CHKERRQ(ierr);
1293   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1294   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1295   if (matout) {
1296     *matout = B;
1297   } else {
1298     ierr = MatHeaderCopy(A,B);CHKERRQ(ierr);
1299   }
1300   PetscFunctionReturn(0);
1301 }
1302 
1303 #undef __FUNC__
1304 #define __FUNC__ "MatDiagonalScale_MPIAIJ"
1305 int MatDiagonalScale_MPIAIJ(Mat mat,Vec ll,Vec rr)
1306 {
1307   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
1308   Mat        a = aij->A,b = aij->B;
1309   int        ierr,s1,s2,s3;
1310 
1311   PetscFunctionBegin;
1312   ierr = MatGetLocalSize(mat,&s2,&s3);CHKERRQ(ierr);
1313   if (rr) {
1314     ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
1315     if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,"right vector non-conforming local size");
1316     /* Overlap communication with computation. */
1317     ierr = VecScatterBegin(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx);CHKERRQ(ierr);
1318   }
1319   if (ll) {
1320     ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
1321     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,"left vector non-conforming local size");
1322     ierr = (*b->ops->diagonalscale)(b,ll,0);CHKERRQ(ierr);
1323   }
1324   /* scale  the diagonal block */
1325   ierr = (*a->ops->diagonalscale)(a,ll,rr);CHKERRQ(ierr);
1326 
1327   if (rr) {
1328     /* Do a scatter end and then right scale the off-diagonal block */
1329     ierr = VecScatterEnd(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx);CHKERRQ(ierr);
1330     ierr = (*b->ops->diagonalscale)(b,0,aij->lvec);CHKERRQ(ierr);
1331   }
1332 
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 
1337 #undef __FUNC__
1338 #define __FUNC__ "MatPrintHelp_MPIAIJ"
1339 int MatPrintHelp_MPIAIJ(Mat A)
1340 {
1341   Mat_MPIAIJ *a   = (Mat_MPIAIJ*)A->data;
1342   int        ierr;
1343 
1344   PetscFunctionBegin;
1345   if (!a->rank) {
1346     ierr = MatPrintHelp_SeqAIJ(a->A);CHKERRQ(ierr);
1347   }
1348   PetscFunctionReturn(0);
1349 }
1350 
1351 #undef __FUNC__
1352 #define __FUNC__ "MatGetBlockSize_MPIAIJ"
1353 int MatGetBlockSize_MPIAIJ(Mat A,int *bs)
1354 {
1355   PetscFunctionBegin;
1356   *bs = 1;
1357   PetscFunctionReturn(0);
1358 }
1359 #undef __FUNC__
1360 #define __FUNC__ "MatSetUnfactored_MPIAIJ"
1361 int MatSetUnfactored_MPIAIJ(Mat A)
1362 {
1363   Mat_MPIAIJ *a   = (Mat_MPIAIJ*)A->data;
1364   int        ierr;
1365 
1366   PetscFunctionBegin;
1367   ierr = MatSetUnfactored(a->A);CHKERRQ(ierr);
1368   PetscFunctionReturn(0);
1369 }
1370 
1371 #undef __FUNC__
1372 #define __FUNC__ "MatEqual_MPIAIJ"
1373 int MatEqual_MPIAIJ(Mat A,Mat B,PetscTruth *flag)
1374 {
1375   Mat_MPIAIJ *matB = (Mat_MPIAIJ*)B->data,*matA = (Mat_MPIAIJ*)A->data;
1376   Mat        a,b,c,d;
1377   PetscTruth flg;
1378   int        ierr;
1379 
1380   PetscFunctionBegin;
1381   ierr = PetscTypeCompare((PetscObject)B,MATMPIAIJ,&flg);CHKERRQ(ierr);
1382   if (!flg) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1383   a = matA->A; b = matA->B;
1384   c = matB->A; d = matB->B;
1385 
1386   ierr = MatEqual(a,c,&flg);CHKERRQ(ierr);
1387   if (flg == PETSC_TRUE) {
1388     ierr = MatEqual(b,d,&flg);CHKERRQ(ierr);
1389   }
1390   ierr = MPI_Allreduce(&flg,flag,1,MPI_INT,MPI_LAND,A->comm);CHKERRQ(ierr);
1391   PetscFunctionReturn(0);
1392 }
1393 
1394 #undef __FUNC__
1395 #define __FUNC__ "MatCopy_MPIAIJ"
1396 int MatCopy_MPIAIJ(Mat A,Mat B,MatStructure str)
1397 {
1398   int        ierr;
1399   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1400   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
1401   PetscTruth flg;
1402 
1403   PetscFunctionBegin;
1404   ierr = PetscTypeCompare((PetscObject)B,MATMPIAIJ,&flg);CHKERRQ(ierr);
1405   if (str != SAME_NONZERO_PATTERN || !flg) {
1406     /* because of the column compression in the off-processor part of the matrix a->B,
1407        the number of columns in a->B and b->B may be different, hence we cannot call
1408        MatCopy() directly on the two parts. If need be, we could provide a copy more
1409        efficient than MatCopy_Basic() by first uncompressing the a->B matrices and
1410        then copying the submatrices */
1411     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1412   } else {
1413     ierr = MatCopy(a->A,b->A,str);CHKERRQ(ierr);
1414     ierr = MatCopy(a->B,b->B,str);CHKERRQ(ierr);
1415   }
1416   PetscFunctionReturn(0);
1417 }
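
/*
   A minimal usage sketch for the fast path above (a hedged example, assuming
   A and B are assembled MATMPIAIJ matrices with identical layout and nonzero
   pattern):

      ierr = MatCopy(A,B,SAME_NONZERO_PATTERN);CHKERRQ(ierr);

   With any other MatStructure argument, or a different type for B, the copy
   falls back to the generic MatCopy_Basic().
*/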
1418 
1419 #undef __FUNC__
1420 #define __FUNC__ "MatSetUpPreallocation_MPIAIJ"
1421 int MatSetUpPreallocation_MPIAIJ(Mat A)
1422 {
1423   int        ierr;
1424 
1425   PetscFunctionBegin;
1426   ierr =  MatMPIAIJSetPreallocation(A,PETSC_DEFAULT,0,PETSC_DEFAULT,0);CHKERRQ(ierr);
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 EXTERN int MatDuplicate_MPIAIJ(Mat,MatDuplicateOption,Mat *);
1431 EXTERN int MatIncreaseOverlap_MPIAIJ(Mat,int,IS *,int);
1432 EXTERN int MatFDColoringCreate_MPIAIJ(Mat,ISColoring,MatFDColoring);
1433 EXTERN int MatGetSubMatrices_MPIAIJ (Mat,int,IS *,IS *,MatReuse,Mat **);
1434 EXTERN int MatGetSubMatrix_MPIAIJ (Mat,IS,IS,int,MatReuse,Mat *);
1435 
1436 /* -------------------------------------------------------------------*/
1437 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
1438        MatGetRow_MPIAIJ,
1439        MatRestoreRow_MPIAIJ,
1440        MatMult_MPIAIJ,
1441        MatMultAdd_MPIAIJ,
1442        MatMultTranspose_MPIAIJ,
1443        MatMultTransposeAdd_MPIAIJ,
1444        0,
1445        0,
1446        0,
1447        0,
1448        0,
1449        0,
1450        MatRelax_MPIAIJ,
1451        MatTranspose_MPIAIJ,
1452        MatGetInfo_MPIAIJ,
1453        MatEqual_MPIAIJ,
1454        MatGetDiagonal_MPIAIJ,
1455        MatDiagonalScale_MPIAIJ,
1456        MatNorm_MPIAIJ,
1457        MatAssemblyBegin_MPIAIJ,
1458        MatAssemblyEnd_MPIAIJ,
1459        0,
1460        MatSetOption_MPIAIJ,
1461        MatZeroEntries_MPIAIJ,
1462        MatZeroRows_MPIAIJ,
1463        0,
1464        0,
1465        0,
1466        0,
1467        MatSetUpPreallocation_MPIAIJ,
1468        0,
1469        MatGetOwnershipRange_MPIAIJ,
1470        0,
1471        0,
1472        0,
1473        0,
1474        MatDuplicate_MPIAIJ,
1475        0,
1476        0,
1477        0,
1478        0,
1479        0,
1480        MatGetSubMatrices_MPIAIJ,
1481        MatIncreaseOverlap_MPIAIJ,
1482        MatGetValues_MPIAIJ,
1483        MatCopy_MPIAIJ,
1484        MatPrintHelp_MPIAIJ,
1485        MatScale_MPIAIJ,
1486        0,
1487        0,
1488        0,
1489        MatGetBlockSize_MPIAIJ,
1490        0,
1491        0,
1492        0,
1493        0,
1494        MatFDColoringCreate_MPIAIJ,
1495        0,
1496        MatSetUnfactored_MPIAIJ,
1497        0,
1498        0,
1499        MatGetSubMatrix_MPIAIJ,
1500        MatDestroy_MPIAIJ,
1501        MatView_MPIAIJ,
1502        MatGetMaps_Petsc};
1503 
1504 /* ----------------------------------------------------------------------------------------*/
1505 
1506 EXTERN_C_BEGIN
1507 #undef __FUNC__
1508 #define __FUNC__ "MatStoreValues_MPIAIJ"
1509 int MatStoreValues_MPIAIJ(Mat mat)
1510 {
1511   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1512   int        ierr;
1513 
1514   PetscFunctionBegin;
1515   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
1516   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
1517   PetscFunctionReturn(0);
1518 }
1519 EXTERN_C_END
1520 
1521 EXTERN_C_BEGIN
1522 #undef __FUNC__
1523 #define __FUNC__ "MatRetrieveValues_MPIAIJ"
1524 int MatRetrieveValues_MPIAIJ(Mat mat)
1525 {
1526   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1527   int        ierr;
1528 
1529   PetscFunctionBegin;
1530   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
1531   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
1532   PetscFunctionReturn(0);
1533 }
1534 EXTERN_C_END
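
/*
   A sketch of the intended store/retrieve cycle (illustrative only; the
   nonzero pattern of mat must not change between the two calls):

      ierr = MatStoreValues(mat);CHKERRQ(ierr);
        ... modify numerical values with MatSetValues() and reassemble ...
      ierr = MatRetrieveValues(mat);CHKERRQ(ierr);
*/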
1535 
1536 #include "petscpc.h"
1537 EXTERN_C_BEGIN
1538 EXTERN int MatGetDiagonalBlock_MPIAIJ(Mat,PetscTruth *,MatReuse,Mat *);
1539 EXTERN_C_END
1540 
1541 EXTERN int MatUseXXT_MPIAIJ(Mat);
1542 EXTERN int MatUseXYT_MPIAIJ(Mat);
1543 
1544 EXTERN_C_BEGIN
1545 #undef __FUNC__
1546 #define __FUNC__ "MatCreate_MPIAIJ"
1547 int MatCreate_MPIAIJ(Mat B)
1548 {
1549   Mat_MPIAIJ   *b;
1550   int          ierr,i,size;
1551   PetscTruth   flg;
1552 
1553   PetscFunctionBegin;
1554   ierr = MPI_Comm_size(B->comm,&size);CHKERRQ(ierr);
1555 
1556   ierr            = PetscNew(Mat_MPIAIJ,&b);CHKERRQ(ierr);
1557   B->data         = (void*)b;
1558   ierr            = PetscMemzero(b,sizeof(Mat_MPIAIJ));CHKERRQ(ierr);
1559   ierr            = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1560   B->factor       = 0;
1561   B->assembled    = PETSC_FALSE;
1562   B->mapping      = 0;
1563 
1564   B->insertmode      = NOT_SET_VALUES;
1565   b->size            = size;
1566   ierr = MPI_Comm_rank(B->comm,&b->rank);CHKERRQ(ierr);
1567 
1568   ierr = PetscSplitOwnership(B->comm,&B->m,&B->M);CHKERRQ(ierr);
1569   ierr = PetscSplitOwnership(B->comm,&B->n,&B->N);CHKERRQ(ierr);
1570 
1571   /* the information in the maps duplicates the information computed below;
1572      eventually we should remove the duplicate information that is not contained in the maps */
1573   ierr = MapCreateMPI(B->comm,B->m,B->M,&B->rmap);CHKERRQ(ierr);
1574   ierr = MapCreateMPI(B->comm,B->n,B->N,&B->cmap);CHKERRQ(ierr);
1575 
1576   /* build local table of row and column ownerships */
1577   ierr = PetscMalloc(2*(b->size+2)*sizeof(int),&b->rowners);CHKERRQ(ierr);
1578   PetscLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1579   b->cowners = b->rowners + b->size + 2;
1580   ierr = MPI_Allgather(&B->m,1,MPI_INT,b->rowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1581   b->rowners[0] = 0;
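  /* running sum converts the gathered per-process row counts into ownership offsets */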
1582   for (i=2; i<=b->size; i++) {
1583     b->rowners[i] += b->rowners[i-1];
1584   }
1585   b->rstart = b->rowners[b->rank];
1586   b->rend   = b->rowners[b->rank+1];
1587   ierr = MPI_Allgather(&B->n,1,MPI_INT,b->cowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1588   b->cowners[0] = 0;
1589   for (i=2; i<=b->size; i++) {
1590     b->cowners[i] += b->cowners[i-1];
1591   }
1592   b->cstart = b->cowners[b->rank];
1593   b->cend   = b->cowners[b->rank+1];
1594 
1595   /* build cache (stash) for entries destined for rows owned by other processes */
1596   ierr = MatStashCreate_Private(B->comm,1,&B->stash);CHKERRQ(ierr);
1597   b->donotstash  = PETSC_FALSE;
1598   b->colmap      = 0;
1599   b->garray      = 0;
1600   b->roworiented = PETSC_TRUE;
1601 
1602   /* stuff used for matrix vector multiply */
1603   b->lvec      = PETSC_NULL;
1604   b->Mvctx     = PETSC_NULL;
1605 
1606   /* stuff for MatGetRow() */
1607   b->rowindices   = 0;
1608   b->rowvalues    = 0;
1609   b->getrowactive = PETSC_FALSE;
1610 
1611   ierr = PetscOptionsHasName(PETSC_NULL,"-mat_mpiaij_xxt",&flg);CHKERRQ(ierr);
1612   if (flg) { ierr = MatUseXXT_MPIAIJ(B);CHKERRQ(ierr); }
1613   ierr = PetscOptionsHasName(PETSC_NULL,"-mat_mpiaij_xyt",&flg);CHKERRQ(ierr);
1614   if (flg) { ierr = MatUseXYT_MPIAIJ(B);CHKERRQ(ierr); }
1615 
1616   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatStoreValues_C",
1617                                      "MatStoreValues_MPIAIJ",
1618                                      MatStoreValues_MPIAIJ);CHKERRQ(ierr);
1619   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatRetrieveValues_C",
1620                                      "MatRetrieveValues_MPIAIJ",
1621                                      MatRetrieveValues_MPIAIJ);CHKERRQ(ierr);
1622   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetDiagonalBlock_C",
1623                                      "MatGetDiagonalBlock_MPIAIJ",
1624                                      MatGetDiagonalBlock_MPIAIJ);CHKERRQ(ierr);
1625   PetscFunctionReturn(0);
1626 }
1627 EXTERN_C_END
1628 
1629 #undef __FUNC__
1630 #define __FUNC__ "MatDuplicate_MPIAIJ"
1631 int MatDuplicate_MPIAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1632 {
1633   Mat        mat;
1634   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ*)matin->data;
1635   int        ierr,len = 0;
1636 
1637   PetscFunctionBegin;
1638   *newmat       = 0;
1639   ierr = MatCreate(matin->comm,matin->m,matin->n,matin->M,matin->N,&mat);CHKERRQ(ierr);
1640   ierr = MatSetType(mat,MATMPIAIJ);CHKERRQ(ierr);
1641   a    = (Mat_MPIAIJ*)mat->data;
1642   ierr              = PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1643   mat->factor       = matin->factor;
1644   mat->assembled    = PETSC_TRUE;
1645   mat->insertmode   = NOT_SET_VALUES;
1646   mat->preallocated = PETSC_TRUE;
1647 
1648   a->rstart       = oldmat->rstart;
1649   a->rend         = oldmat->rend;
1650   a->cstart       = oldmat->cstart;
1651   a->cend         = oldmat->cend;
1652   a->size         = oldmat->size;
1653   a->rank         = oldmat->rank;
1654   a->donotstash   = oldmat->donotstash;
1655   a->roworiented  = oldmat->roworiented;
1656   a->rowindices   = 0;
1657   a->rowvalues    = 0;
1658   a->getrowactive = PETSC_FALSE;
1659 
1660   ierr       = PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));CHKERRQ(ierr);
1661   ierr       = MatStashCreate_Private(matin->comm,1,&mat->stash);CHKERRQ(ierr);
1662   if (oldmat->colmap) {
1663 #if defined (PETSC_USE_CTABLE)
1664     ierr = PetscTableCreateCopy(oldmat->colmap,&a->colmap);CHKERRQ(ierr);
1665 #else
1666     ierr = PetscMalloc((mat->N)*sizeof(int),&a->colmap);CHKERRQ(ierr);
1667     PetscLogObjectMemory(mat,(mat->N)*sizeof(int));
1668     ierr      = PetscMemcpy(a->colmap,oldmat->colmap,(mat->N)*sizeof(int));CHKERRQ(ierr);
1669 #endif
1670   } else a->colmap = 0;
1671   if (oldmat->garray) {
1672     len  = oldmat->B->n;
1673     ierr = PetscMalloc((len+1)*sizeof(int),&a->garray);CHKERRQ(ierr);
1674     PetscLogObjectMemory(mat,len*sizeof(int));
1675     if (len) { ierr = PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));CHKERRQ(ierr); }
1676   } else a->garray = 0;
1677 
1678   ierr =  VecDuplicate(oldmat->lvec,&a->lvec);CHKERRQ(ierr);
1679   PetscLogObjectParent(mat,a->lvec);
1680   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx);CHKERRQ(ierr);
1681   PetscLogObjectParent(mat,a->Mvctx);
1682   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A);CHKERRQ(ierr);
1683   PetscLogObjectParent(mat,a->A);
1684   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B);CHKERRQ(ierr);
1685   PetscLogObjectParent(mat,a->B);
1686   ierr = PetscFListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr);
1687   *newmat = mat;
1688   PetscFunctionReturn(0);
1689 }
1690 
1691 #include "petscsys.h"
1692 
1693 EXTERN_C_BEGIN
1694 #undef __FUNC__
1695 #define __FUNC__ "MatLoad_MPIAIJ"
1696 int MatLoad_MPIAIJ(PetscViewer viewer,MatType type,Mat *newmat)
1697 {
1698   Mat          A;
1699   Scalar       *vals,*svals;
1700   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1701   MPI_Status   status;
1702   int          i,nz,ierr,j,rstart,rend,fd;
1703   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1704   int          *ourlens,*sndcounts = 0,*procsnz = 0,*offlens,jj,*mycols,*smycols;
1705   int          tag = ((PetscObject)viewer)->tag,cend,cstart,n;
1706 
1707   PetscFunctionBegin;
1708   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1709   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1710   if (!rank) {
1711     ierr = PetscViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
1712     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT);CHKERRQ(ierr);
1713     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"not a matrix object");
1714     if (header[3] < 0) {
1715       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"Matrix in special format on disk, cannot load as MPIAIJ");
1716     }
1717   }
1718 
1719   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
1720   M = header[1]; N = header[2];
1721   /* determine ownership of all rows */
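  /* each process gets M/size rows; the first M%size processes take one extra row */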
1722   m = M/size + ((M % size) > rank);
1723   ierr = PetscMalloc((size+2)*sizeof(int),&rowners);CHKERRQ(ierr);
1724   ierr = MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1725   rowners[0] = 0;
1726   for (i=2; i<=size; i++) {
1727     rowners[i] += rowners[i-1];
1728   }
1729   rstart = rowners[rank];
1730   rend   = rowners[rank+1];
1731 
1732   /* distribute row lengths to all processors */
1733   ierr    = PetscMalloc(2*(rend-rstart+1)*sizeof(int),&ourlens);CHKERRQ(ierr);
1734   offlens = ourlens + (rend-rstart);
1735   if (!rank) {
1736     ierr = PetscMalloc(M*sizeof(int),&rowlengths);CHKERRQ(ierr);
1737     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT);CHKERRQ(ierr);
1738     ierr = PetscMalloc(size*sizeof(int),&sndcounts);CHKERRQ(ierr);
1739     for (i=0; i<size; i++) sndcounts[i] = rowners[i+1] - rowners[i];
1740     ierr = MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
1741     ierr = PetscFree(sndcounts);CHKERRQ(ierr);
1742   } else {
1743     ierr = MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
1744   }
1745 
1746   if (!rank) {
1747     /* calculate the number of nonzeros on each processor */
1748     ierr = PetscMalloc(size*sizeof(int),&procsnz);CHKERRQ(ierr);
1749     ierr = PetscMemzero(procsnz,size*sizeof(int));CHKERRQ(ierr);
1750     for (i=0; i<size; i++) {
1751       for (j=rowners[i]; j< rowners[i+1]; j++) {
1752         procsnz[i] += rowlengths[j];
1753       }
1754     }
1755     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
1756 
1757     /* determine max buffer needed and allocate it */
1758     maxnz = 0;
1759     for (i=0; i<size; i++) {
1760       maxnz = PetscMax(maxnz,procsnz[i]);
1761     }
1762     ierr = PetscMalloc(maxnz*sizeof(int),&cols);CHKERRQ(ierr);
1763 
1764     /* read in my part of the matrix column indices  */
1765     nz   = procsnz[0];
1766     ierr = PetscMalloc(nz*sizeof(int),&mycols);CHKERRQ(ierr);
1767     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT);CHKERRQ(ierr);
1768 
1769     /* read in everyone else's parts and ship them off */
1770     for (i=1; i<size; i++) {
1771       nz   = procsnz[i];
1772       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
1773       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
1774     }
1775     ierr = PetscFree(cols);CHKERRQ(ierr);
1776   } else {
1777     /* determine buffer space needed for message */
1778     nz = 0;
1779     for (i=0; i<m; i++) {
1780       nz += ourlens[i];
1781     }
1782     ierr = PetscMalloc((nz+1)*sizeof(int),&mycols);CHKERRQ(ierr);
1783 
1784     /* receive message of column indices */
1785     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
1786     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
1787     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"number of column indices received does not match that expected from file");
1788   }
1789 
1790   /* determine column ownership if matrix is not square */
1791   if (N != M) {
1792     n      = N/size + ((N % size) > rank);
1793     ierr   = MPI_Scan(&n,&cend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
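    /* MPI_Scan returns the inclusive prefix sum, so cend is one past the last column owned locally */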
1794     cstart = cend - n;
1795   } else {
1796     cstart = rstart;
1797     cend   = rend;
1798     n      = cend - cstart;
1799   }
1800 
1801   /* loop over local rows, determining number of off diagonal entries */
1802   ierr = PetscMemzero(offlens,m*sizeof(int));CHKERRQ(ierr);
1803   jj = 0;
1804   for (i=0; i<m; i++) {
1805     for (j=0; j<ourlens[i]; j++) {
1806       if (mycols[jj] < cstart || mycols[jj] >= cend) offlens[i]++;
1807       jj++;
1808     }
1809   }
1810 
1811   /* create our matrix */
1812   for (i=0; i<m; i++) {
1813     ourlens[i] -= offlens[i];
1814   }
1815   ierr = MatCreateMPIAIJ(comm,m,n,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1816   A = *newmat;
1817   ierr = MatSetOption(A,MAT_COLUMNS_SORTED);CHKERRQ(ierr);
1818   for (i=0; i<m; i++) {
1819     ourlens[i] += offlens[i];
1820   }
1821 
1822   if (!rank) {
1823     ierr = PetscMalloc(maxnz*sizeof(Scalar),&vals);CHKERRQ(ierr);
1824 
1825     /* read in my part of the matrix numerical values  */
1826     nz   = procsnz[0];
1827     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
1828 
1829     /* insert into matrix */
1830     jj      = rstart;
1831     smycols = mycols;
1832     svals   = vals;
1833     for (i=0; i<m; i++) {
1834       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1835       smycols += ourlens[i];
1836       svals   += ourlens[i];
1837       jj++;
1838     }
1839 
1840     /* read in other processors and ship out */
1841     for (i=1; i<size; i++) {
1842       nz   = procsnz[i];
1843       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
1844       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
1845     }
1846     ierr = PetscFree(procsnz);CHKERRQ(ierr);
1847   } else {
1848     /* receive numeric values */
1849     ierr = PetscMalloc((nz+1)*sizeof(Scalar),&vals);CHKERRQ(ierr);
1850 
1851     /* receive message of values */
1852     ierr = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
1853     ierr = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
1854     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"number of values received does not match that expected from file");
1855 
1856     /* insert into matrix */
1857     jj      = rstart;
1858     smycols = mycols;
1859     svals   = vals;
1860     for (i=0; i<m; i++) {
1861       ierr     = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1862       smycols += ourlens[i];
1863       svals   += ourlens[i];
1864       jj++;
1865     }
1866   }
1867   ierr = PetscFree(ourlens);CHKERRQ(ierr);
1868   ierr = PetscFree(vals);CHKERRQ(ierr);
1869   ierr = PetscFree(mycols);CHKERRQ(ierr);
1870   ierr = PetscFree(rowners);CHKERRQ(ierr);
1871 
1872   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1873   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1874   PetscFunctionReturn(0);
1875 }
1876 EXTERN_C_END
1877 
1878 #undef __FUNC__
1879 #define __FUNC__ "MatGetSubMatrix_MPIAIJ"
1880 /*
1881     Not great since it makes two copies of the submatrix: first a local SeqAIJ
1882   matrix, and then the end result built by concatenating the local matrices.
1883   Writing it directly would look much like MatGetSubMatrices_MPIAIJ()
1884 */
1885 int MatGetSubMatrix_MPIAIJ(Mat mat,IS isrow,IS iscol,int csize,MatReuse call,Mat *newmat)
1886 {
1887   int        ierr,i,m,n,rstart,row,rend,nz,*cwork,size,rank,j;
1888   int        *ii,*jj,nlocal,*dlens,*olens,dlen,olen,jend;
1889   Mat        *local,M,Mreuse;
1890   Scalar     *vwork,*aa;
1891   MPI_Comm   comm = mat->comm;
1892   Mat_SeqAIJ *aij;
1893 
1894 
1895   PetscFunctionBegin;
1896   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1897   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1898 
1899   if (call ==  MAT_REUSE_MATRIX) {
1900     ierr = PetscObjectQuery((PetscObject)*newmat,"SubMatrix",(PetscObject *)&Mreuse);CHKERRQ(ierr);
1901     if (!Mreuse) SETERRQ(1,"Submatrix passed in was not used before, cannot reuse");
1902     local = &Mreuse;
1903     ierr  = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_REUSE_MATRIX,&local);CHKERRQ(ierr);
1904   } else {
1905     ierr = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&local);CHKERRQ(ierr);
1906     Mreuse = *local;
1907     ierr = PetscFree(local);CHKERRQ(ierr);
1908   }
1909 
1910   /*
1911       m - number of local rows
1912       n - number of columns (same on all processors)
1913       rstart - first row in new global matrix generated
1914   */
1915   ierr = MatGetSize(Mreuse,&m,&n);CHKERRQ(ierr);
1916   if (call == MAT_INITIAL_MATRIX) {
1917     aij = (Mat_SeqAIJ*)(Mreuse)->data;
1918     if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,"No support for index shifted matrix");
1919     ii  = aij->i;
1920     jj  = aij->j;
1921 
1922     /*
1923         Determine the number of non-zeros in the diagonal and off-diagonal
1924         portions of the matrix in order to do correct preallocation
1925     */
1926 
1927     /* first get start and end of "diagonal" columns */
1928     if (csize == PETSC_DECIDE) {
1929       nlocal = n/size + ((n % size) > rank);
1930     } else {
1931       nlocal = csize;
1932     }
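    /* inclusive prefix sum of the local column counts; rend is one past this process's last "diagonal" column */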
1933     ierr   = MPI_Scan(&nlocal,&rend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1934     rstart = rend - nlocal;
1935     if (rank == size - 1 && rend != n) {
1936       SETERRQ(1,"Local column sizes do not add up to total number of columns");
1937     }
1938 
1939     /* next, compute all the lengths */
1940     ierr  = PetscMalloc((2*m+1)*sizeof(int),&dlens);CHKERRQ(ierr);
1941     olens = dlens + m;
1942     for (i=0; i<m; i++) {
1943       jend = ii[i+1] - ii[i];
1944       olen = 0;
1945       dlen = 0;
1946       for (j=0; j<jend; j++) {
1947         if (*jj < rstart || *jj >= rend) olen++;
1948         else dlen++;
1949         jj++;
1950       }
1951       olens[i] = olen;
1952       dlens[i] = dlen;
1953     }
1954     ierr = MatCreateMPIAIJ(comm,m,nlocal,PETSC_DECIDE,n,0,dlens,0,olens,&M);CHKERRQ(ierr);
1955     ierr = PetscFree(dlens);CHKERRQ(ierr);
1956   } else {
1957     int ml,nl;
1958 
1959     M = *newmat;
1960     ierr = MatGetLocalSize(M,&ml,&nl);CHKERRQ(ierr);
1961     if (ml != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Previous matrix must be same size/layout as request");
1962     ierr = MatZeroEntries(M);CHKERRQ(ierr);
1963     /*
1964          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
1965        rather than the slower MatSetValues().
1966     */
1967     M->was_assembled = PETSC_TRUE;
1968     M->assembled     = PETSC_FALSE;
1969   }
1970   ierr = MatGetOwnershipRange(M,&rstart,&rend);CHKERRQ(ierr);
1971   aij = (Mat_SeqAIJ*)(Mreuse)->data;
1972   if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,"No support for index shifted matrix");
1973   ii  = aij->i;
1974   jj  = aij->j;
1975   aa  = aij->a;
1976   for (i=0; i<m; i++) {
1977     row   = rstart + i;
1978     nz    = ii[i+1] - ii[i];
1979     cwork = jj;     jj += nz;
1980     vwork = aa;     aa += nz;
1981     ierr = MatSetValues_MPIAIJ(M,1,&row,nz,cwork,vwork,INSERT_VALUES);CHKERRQ(ierr);
1982   }
1983 
1984   ierr = MatAssemblyBegin(M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1985   ierr = MatAssemblyEnd(M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1986   *newmat = M;
1987 
1988   /* save submatrix used in processor for next request */
1989   if (call ==  MAT_INITIAL_MATRIX) {
1990     ierr = PetscObjectCompose((PetscObject)M,"SubMatrix",(PetscObject)Mreuse);CHKERRQ(ierr);
1991     ierr = PetscObjectDereference((PetscObject)Mreuse);CHKERRQ(ierr);
1992   }
1993 
1994   PetscFunctionReturn(0);
1995 }
1996 
1997 #undef __FUNC__
1998 #define __FUNC__ "MatMPIAIJSetPreallocation"
1999 /*@C
2000    MatMPIAIJSetPreallocation - Creates a sparse parallel matrix in AIJ format
2001    (the default parallel PETSc format).  For good matrix assembly performance
2002    the user should preallocate the matrix storage by setting the parameters
2003    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
2004    performance can be increased by more than a factor of 50.
2005 
2006    Collective on MPI_Comm
2007 
2008    Input Parameters:
2009 +  A - the matrix
2010 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
2011            (same value is used for all local rows)
2012 .  d_nnz - array containing the number of nonzeros in the various rows of the
2013            DIAGONAL portion of the local submatrix (possibly different for each row)
2014            or PETSC_NULL, if d_nz is used to specify the nonzero structure.
2015            The size of this array is equal to the number of local rows, i.e., 'm'.
2016            You must leave room for the diagonal entry even if it is zero.
2017 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
2018            submatrix (same value is used for all local rows).
2019 -  o_nnz - array containing the number of nonzeros in the various rows of the
2020            OFF-DIAGONAL portion of the local submatrix (possibly different for
2021            each row) or PETSC_NULL, if o_nz is used to specify the nonzero
2022            structure. The size of this array is equal to the number
2023            of local rows, i.e., 'm'.
2024 
2025    The AIJ format (also called the Yale sparse matrix format or
2026    compressed row storage), is fully compatible with standard Fortran 77
2027    storage.  That is, the stored row and column indices can begin at
2028    either one (as in Fortran) or zero.  See the users manual for details.
2029 
2030    The user MUST specify either the local or global matrix dimensions
2031    (possibly both).
2032 
2033    The parallel matrix is partitioned such that the first m0 rows belong to
2034    process 0, the next m1 rows belong to process 1, the next m2 rows belong
2035    to process 2, etc., where m0,m1,m2,... are the input parameter 'm'.
2036 
2037    The DIAGONAL portion of the local submatrix of a processor can be defined
2038    as the submatrix which is obtained by extracting the part corresponding
2039    to the rows r1-r2 and columns r1-r2 of the global matrix, where r1 is the
2040    first row that belongs to the processor, and r2 is the last row belonging
2041    to this processor. This is a square mxm matrix. The remaining portion
2042    of the local submatrix (mxN) constitutes the OFF-DIAGONAL portion.
2043 
2044    If d_nnz and o_nnz are specified, then d_nz and o_nz are ignored.
2045 
2046    By default, this format uses inodes (identical nodes) when possible.
2047    We search for consecutive rows with the same nonzero structure, thereby
2048    reusing matrix information to achieve increased efficiency.
2049 
2050    Options Database Keys:
2051 +  -mat_aij_no_inode  - Do not use inodes
2052 .  -mat_aij_inode_limit <limit> - Sets inode limit (max limit=5)
2053 -  -mat_aij_oneindex - Internally use indexing starting at 1
2054         rather than 0.  Note that when calling MatSetValues(),
2055         the user still MUST index entries starting at 0!
2056 
2057    Example usage:
2058 
2059    Consider the following 8x8 matrix with 34 nonzero values that is
2060    assembled across 3 processors. Let's assume that proc0 owns 3 rows,
2061    proc1 owns 3 rows, and proc2 owns 2 rows. This division can be shown
2062    as follows:
2063 
2064 .vb
2065             1  2  0  |  0  3  0  |  0  4
2066     Proc0   0  5  6  |  7  0  0  |  8  0
2067             9  0 10  | 11  0  0  | 12  0
2068     -------------------------------------
2069            13  0 14  | 15 16 17  |  0  0
2070     Proc1   0 18  0  | 19 20 21  |  0  0
2071             0  0  0  | 22 23  0  | 24  0
2072     -------------------------------------
2073     Proc2  25 26 27  |  0  0 28  | 29  0
2074            30  0  0  | 31 32 33  |  0 34
2075 .ve
2076 
2077    This can be represented as a collection of submatrices as:
2078 
2079 .vb
2080       A B C
2081       D E F
2082       G H I
2083 .ve
2084 
2085    Where the submatrices A,B,C are owned by proc0, D,E,F are
2086    owned by proc1, G,H,I are owned by proc2.
2087 
2088    The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2089    The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2090    The 'M','N' parameters are 8,8, and have the same values on all procs.
2091 
2092    The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
2093    submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
2094    corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
2095    Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
2096    part as SeqAIJ matrices. For example, proc1 will store [E] as a SeqAIJ
2097    matrix, and [DF] as another SeqAIJ matrix.
2098 
2099    When d_nz, o_nz parameters are specified, d_nz storage elements are
2100    allocated for every row of the local diagonal submatrix, and o_nz
2101    storage locations are allocated for every row of the OFF-DIAGONAL submat.
2102    One way to choose d_nz and o_nz is to use the maximum number of nonzeros
2103    over the local rows of the DIAGONAL and OFF-DIAGONAL submatrices, respectively.
2104    In this case, the values of d_nz,o_nz are:
2105 .vb
2106      proc0 : d_nz = 2, o_nz = 2
2107      proc1 : d_nz = 3, o_nz = 2
2108      proc2 : d_nz = 1, o_nz = 4
2109 .ve
2110    We are allocating m*(d_nz+o_nz) storage locations for every proc. This
2111    translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, and 2*(1+4)=10
2112    for proc2, i.e., we are using 12+15+10=37 storage locations to store
2113    34 values.
2114 
2115    When d_nnz, o_nnz parameters are specified, the storage is specified
2116    for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
2117    In the above case the values for d_nnz,o_nnz are:
2118 .vb
2119      proc0: d_nnz = [2,2,2] and o_nnz = [2,2,2]
2120      proc1: d_nnz = [3,3,2] and o_nnz = [2,1,1]
2121      proc2: d_nnz = [1,1]   and o_nnz = [4,4]
2122 .ve
2123    Here the space allocated is the sum of all the above values, i.e., 34, and
2124    hence the pre-allocation is perfect.
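
   As a concrete sketch of the call for the example above, on proc1 it could
   look as follows (the array names dnnz and onnz are illustrative, not part
   of the API):
.vb
      int dnnz[3] = {3,3,2},onnz[3] = {2,1,1};  /* proc1 values from the table above */
      ierr = MatMPIAIJSetPreallocation(A,0,dnnz,0,onnz);CHKERRQ(ierr);
.ve
   Since the arrays are given, the scalar d_nz and o_nz arguments (here 0) are
   ignored.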
2125 
2126    Level: intermediate
2127 
2128 .keywords: matrix, aij, compressed row, sparse, parallel
2129 
2130 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2131 @*/
2132 int MatMPIAIJSetPreallocation(Mat B,int d_nz,int *d_nnz,int o_nz,int *o_nnz)
2133 {
2134   Mat_MPIAIJ   *b;
2135   int          ierr,i;
2136   PetscTruth   flg2;
2137 
2138   PetscFunctionBegin;
2139   ierr = PetscTypeCompare((PetscObject)B,MATMPIAIJ,&flg2);CHKERRQ(ierr);
2140   if (!flg2) PetscFunctionReturn(0);
2141   B->preallocated = PETSC_TRUE;
2142   if (d_nz < -2) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than -2: value %d",d_nz);
2143   if (o_nz < -2) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than -2: value %d",o_nz);
2144   if (d_nnz) {
2145     for (i=0; i<B->m; i++) {
2146       if (d_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %d value %d",i,d_nnz[i]);
2147     }
2148   }
2149   if (o_nnz) {
2150     for (i=0; i<B->m; i++) {
2151       if (o_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %d value %d",i,o_nnz[i]);
2152     }
2153   }
2154   b = (Mat_MPIAIJ*)B->data;
2155 
2156   if (d_nz == PETSC_DEFAULT) d_nz = 5;
2157   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,B->m,B->n,d_nz,d_nnz,&b->A);CHKERRQ(ierr);
2158   PetscLogObjectParent(B,b->A);
2159   if (o_nz == PETSC_DEFAULT) o_nz = 0;
2160   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,B->m,B->N,o_nz,o_nnz,&b->B);CHKERRQ(ierr);
2161   PetscLogObjectParent(B,b->B);
2162 
2163   PetscFunctionReturn(0);
2164 }
2165 
2166 #undef __FUNC__
2167 #define __FUNC__ "MatCreateMPIAIJ"
2168 /*@C
2169    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
2170    (the default parallel PETSc format).  For good matrix assembly performance
2171    the user should preallocate the matrix storage by setting the parameters
2172    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
2173    performance can be increased by more than a factor of 50.
2174 
2175    Collective on MPI_Comm
2176 
2177    Input Parameters:
2178 +  comm - MPI communicator
2179 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
2180            This value should be the same as the local size used in creating the
2181            y vector for the matrix-vector product y = Ax.
2182 .  n - This value should be the same as the local size used in creating the
2183        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
2184        calculated if N is given) For square matrices n is almost always m.
2185 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
2186 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
2187 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
2188            (same value is used for all local rows)
2189 .  d_nnz - array containing the number of nonzeros in the various rows of the
2190            DIAGONAL portion of the local submatrix (possibly different for each row)
2191            or PETSC_NULL, if d_nz is used to specify the nonzero structure.
2192            The size of this array is equal to the number of local rows, i.e., 'm'.
2193            You must leave room for the diagonal entry even if it is zero.
2194 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
2195            submatrix (same value is used for all local rows).
2196 -  o_nnz - array containing the number of nonzeros in the various rows of the
2197            OFF-DIAGONAL portion of the local submatrix (possibly different for
2198            each row) or PETSC_NULL, if o_nz is used to specify the nonzero
2199            structure. The size of this array is equal to the number
2200            of local rows, i.e., 'm'.
2201 
2202    Output Parameter:
2203 .  A - the matrix
2204 
2205    Notes:
2206    m,n,M,N parameters specify the size of the matrix, and its partitioning across
2207    processors, while d_nz,d_nnz,o_nz,o_nnz parameters specify the approximate
2208    storage requirements for this matrix.
2209 
2210    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one
2211    processor then it must be used on all processors that share the object for
2212    that argument.
2213 
2214    The AIJ format (also called the Yale sparse matrix format or
2215    compressed row storage), is fully compatible with standard Fortran 77
2216    storage.  That is, the stored row and column indices can begin at
2217    either one (as in Fortran) or zero.  See the users manual for details.
2218 
2219    The user MUST specify either the local or global matrix dimensions
2220    (possibly both).
2221 
2222    The parallel matrix is partitioned such that the first m0 rows belong to
2223    process 0, the next m1 rows belong to process 1, the next m2 rows belong
2224    to process 2, etc., where m0,m1,m2,... are the input parameter 'm'.
2225 
2226    The DIAGONAL portion of the local submatrix of a processor can be defined
2227    as the submatrix which is obtained by extracting the part corresponding
2228    to the rows r1-r2 and columns r1-r2 of the global matrix, where r1 is the
2229    first row that belongs to the processor, and r2 is the last row belonging
2230    to this processor. This is a square mxm matrix. The remaining portion
2231    of the local submatrix (mxN) constitutes the OFF-DIAGONAL portion.
2232 
2233    If d_nnz and o_nnz are specified, then d_nz and o_nz are ignored.
2234 
2235    By default, this format uses inodes (identical nodes) when possible.
2236    We search for consecutive rows with the same nonzero structure, thereby
2237    reusing matrix information to achieve increased efficiency.
2238 
2239    Options Database Keys:
2240 +  -mat_aij_no_inode  - Do not use inodes
2241 .  -mat_aij_inode_limit <limit> - Sets inode limit (max limit=5)
2242 -  -mat_aij_oneindex - Internally use indexing starting at 1
2243         rather than 0.  Note that when calling MatSetValues(),
2244         the user still MUST index entries starting at 0!
2245 
2246 
2247    Example usage:
2248 
2249    Consider the following 8x8 matrix with 34 nonzero values that is
2250    assembled across 3 processors. Let's assume that proc0 owns 3 rows,
2251    proc1 owns 3 rows, and proc2 owns 2 rows. This division can be shown
2252    as follows:
2253 
2254 .vb
2255             1  2  0  |  0  3  0  |  0  4
2256     Proc0   0  5  6  |  7  0  0  |  8  0
2257             9  0 10  | 11  0  0  | 12  0
2258     -------------------------------------
2259            13  0 14  | 15 16 17  |  0  0
2260     Proc1   0 18  0  | 19 20 21  |  0  0
2261             0  0  0  | 22 23  0  | 24  0
2262     -------------------------------------
2263     Proc2  25 26 27  |  0  0 28  | 29  0
2264            30  0  0  | 31 32 33  |  0 34
2265 .ve
2266 
2267    This can be represented as a collection of submatrices as:
2268 
2269 .vb
2270       A B C
2271       D E F
2272       G H I
2273 .ve
2274 
2275    Where the submatrices A,B,C are owned by proc0, D,E,F are
2276    owned by proc1, G,H,I are owned by proc2.
2277 
2278    The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2279    The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2280    The 'M','N' parameters are 8,8, and have the same values on all procs.
2281 
2282    The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
2283    submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
2284    corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
2285    Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
2286    part as SeqAIJ matrices. For example, proc1 will store [E] as a SeqAIJ
2287    matrix, and [DF] as another SeqAIJ matrix.
2288 
2289    When d_nz, o_nz parameters are specified, d_nz storage elements are
2290    allocated for every row of the local diagonal submatrix, and o_nz
2291    storage locations are allocated for every row of the OFF-DIAGONAL submat.
2292    One way to choose d_nz and o_nz is to use the maximum number of nonzeros
2293    over the local rows of the DIAGONAL and OFF-DIAGONAL submatrices, respectively.
2294    In this case, the values of d_nz,o_nz are:
2295 .vb
2296      proc0 : d_nz = 2, o_nz = 2
2297      proc1 : d_nz = 3, o_nz = 2
2298      proc2 : d_nz = 1, o_nz = 4
2299 .ve
2300    We are allocating m*(d_nz+o_nz) storage locations for every proc. This
2301    translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, and 2*(1+4)=10
2302    for proc2, i.e., we are using 12+15+10=37 storage locations to store
2303    34 values.
2304 
2305    When d_nnz, o_nnz parameters are specified, the storage is specified
2306    for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
2307    In the above case the values for d_nnz,o_nnz are:
2308 .vb
2309      proc0: d_nnz = [2,2,2] and o_nnz = [2,2,2]
2310      proc1: d_nnz = [3,3,2] and o_nnz = [2,1,1]
2311      proc2: d_nnz = [1,1]   and o_nnz = [4,4]
2312 .ve
2313    Here the space allocated is the sum of all the above values, i.e., 34, and
2314    hence the pre-allocation is perfect.
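
   As a concrete sketch of the call for the example above, on proc1 it could
   look as follows (comm, dnnz, onnz, and A are illustrative names):
.vb
      int dnnz[3] = {3,3,2},onnz[3] = {2,1,1};  /* proc1 values from the table above */
      ierr = MatCreateMPIAIJ(comm,3,3,8,8,0,dnnz,0,onnz,&A);CHKERRQ(ierr);
.ve
   On a communicator with a single process the routine builds a SeqAIJ matrix
   instead, as the code below shows.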
2315 
2316    Level: intermediate
2317 
2318 .keywords: matrix, aij, compressed row, sparse, parallel
2319 
2320 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2321 @*/
2322 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
2323 {
2324   int ierr,size;
2325 
2326   PetscFunctionBegin;
2327   ierr = MatCreate(comm,m,n,M,N,A);CHKERRQ(ierr);
2328   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2329   if (size > 1) {
2330     ierr = MatSetType(*A,MATMPIAIJ);CHKERRQ(ierr);
2331     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
2332   } else {
2333     ierr = MatSetType(*A,MATSEQAIJ);CHKERRQ(ierr);
2334     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
2335   }
2336   PetscFunctionReturn(0);
2337 }
2338