/*$Id: mpiaij.c,v 1.335 2001/04/21 21:13:28 bsmith Exp bsmith $*/

#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "src/vec/vecimpl.h"
#include "src/inline/spops.h"

EXTERN int MatSetUpMultiply_MPIAIJ(Mat);
EXTERN int DisAssemble_MPIAIJ(Mat);
EXTERN int MatSetValues_SeqAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
EXTERN int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
EXTERN int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
EXTERN int MatPrintHelp_SeqAIJ(Mat);

/*
  Local utility routine that creates a mapping from the global column
  numbers to the local numbers in the off-diagonal part of the local
  storage of the matrix. When PETSC_USE_CTABLE is defined this mapping is
  scalable, at a slightly higher hash-table lookup cost; without it the
  mapping is not scalable (each processor stores an order-N integer array)
  but access is fast.
*/
#undef __FUNCT__
#define __FUNCT__ "CreateColmap_MPIAIJ_Private"
int CreateColmap_MPIAIJ_Private(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
  int        n = aij->B->n,i,ierr;

  PetscFunctionBegin;
#if defined (PETSC_USE_CTABLE)
  ierr = PetscTableCreate(n,&aij->colmap);CHKERRQ(ierr);
  for (i=0; i<n; i++){
    ierr = PetscTableAdd(aij->colmap,aij->garray[i]+1,i+1);CHKERRQ(ierr);
  }
#else
  ierr = PetscMalloc((mat->N+1)*sizeof(int),&aij->colmap);CHKERRQ(ierr);
  PetscLogObjectMemory(mat,mat->N*sizeof(int));
  ierr = PetscMemzero(aij->colmap,mat->N*sizeof(int));CHKERRQ(ierr);
  for (i=0; i<n; i++) aij->colmap[aij->garray[i]] = i+1;
#endif
  PetscFunctionReturn(0);
}

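/*
   Illustrative sketch (not part of this file): how the colmap built above is
   typically consulted to translate a global column number gcol (a hypothetical
   variable) into a local column index of the off-diagonal block B.  In both
   branches the stored value is the local index plus one, so zero means "not
   found" and the decrement yields col < 0 in that case.  This mirrors the
   lookups in MatSetValues_MPIAIJ() and MatGetValues_MPIAIJ() below.

     int col;
   #if defined (PETSC_USE_CTABLE)
     ierr = PetscTableFind(aij->colmap,gcol+1,&col);CHKERRQ(ierr);
     col--;
   #else
     col = aij->colmap[gcol] - 1;
   #endif
*/
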
#define CHUNKSIZE   15
#define MatSetValues_SeqAIJ_A_Private(row,col,value,addv) \
{ \
 \
    rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
    rmax = aimax[row]; nrow = ailen[row]; \
    col1 = col - shift; \
 \
    low = 0; high = nrow; \
    while (high-low > 5) { \
      t = (low+high)/2; \
      if (rp[t] > col1) high = t; \
      else              low  = t; \
    } \
      for (_i=low; _i<high; _i++) { \
        if (rp[_i] > col1) break; \
        if (rp[_i] == col1) { \
          if (addv == ADD_VALUES) ap[_i] += value; \
          else                    ap[_i]  = value; \
          goto a_noinsert; \
        } \
      } \
      if (nonew == 1) goto a_noinsert; \
      else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in the row, therefore enlarge */ \
        int    new_nz = ai[am] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+sizeof(Scalar))+(am+1)*sizeof(int); \
        ierr    = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
        new_j   = (int*)(new_a + new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for (ii=0; ii<row+1; ii++) {new_i[ii] = ai[ii];} \
        for (ii=row+1; ii<am+1; ii++) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        ierr = PetscMemcpy(new_j,aj,(ai[row]+nrow+shift)*sizeof(int));CHKERRQ(ierr); \
        len = (new_nz - CHUNKSIZE - ai[row] - nrow - shift); \
        ierr = PetscMemcpy(new_j+ai[row]+shift+nrow+CHUNKSIZE,aj+ai[row]+shift+nrow, \
                           len*sizeof(int));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a,aa,(ai[row]+nrow+shift)*sizeof(Scalar));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a+ai[row]+shift+nrow+CHUNKSIZE,aa+ai[row]+shift+nrow, \
                           len*sizeof(Scalar));CHKERRQ(ierr); \
        /* free up old matrix storage */ \
 \
        ierr = PetscFree(a->a);CHKERRQ(ierr); \
        if (!a->singlemalloc) { \
          ierr = PetscFree(a->i);CHKERRQ(ierr); \
          ierr = PetscFree(a->j);CHKERRQ(ierr); \
        } \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j; \
        a->singlemalloc = PETSC_TRUE; \
 \
        rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
        rmax = aimax[row] = aimax[row] + CHUNKSIZE; \
        PetscLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
        a->maxnz += CHUNKSIZE; \
        a->reallocs++; \
      } \
      N = nrow++ - 1; a->nz++; \
      /* shift up all the later entries in this row */ \
      for (ii=N; ii>=_i; ii--) { \
        rp[ii+1] = rp[ii]; \
        ap[ii+1] = ap[ii]; \
      } \
      rp[_i] = col1; \
      ap[_i] = value; \
      a_noinsert: ; \
      ailen[row] = nrow; \
}

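/*
   A minimal standalone sketch of the search-then-insert technique the macro
   above implements (plain C, names invented for illustration): a binary search
   narrows the range, a short linear scan finds the column, an existing entry
   is updated in place, and otherwise the tail of the row is shifted up one
   slot and the new entry inserted.  The reallocation path of the macro is
   omitted; the caller must guarantee spare capacity in the row.

     static int InsertSorted(int *cols,double *vals,int nrow,int col,double value,int add)
     {
       int low = 0,high = nrow,t,i;
       while (high-low > 5) {
         t = (low+high)/2;
         if (cols[t] > col) high = t;
         else               low  = t;
       }
       for (i=low; i<high; i++) {
         if (cols[i] > col) break;
         if (cols[i] == col) {
           if (add) vals[i] += value; else vals[i] = value;
           return nrow;
         }
       }
       for (t=nrow-1; t>=i; t--) {
         cols[t+1] = cols[t]; vals[t+1] = vals[t];
       }
       cols[i] = col; vals[i] = value;
       return nrow+1;
     }
*/
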
#define MatSetValues_SeqAIJ_B_Private(row,col,value,addv) \
{ \
 \
    rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
    rmax = bimax[row]; nrow = bilen[row]; \
    col1 = col - shift; \
 \
    low = 0; high = nrow; \
    while (high-low > 5) { \
      t = (low+high)/2; \
      if (rp[t] > col1) high = t; \
      else              low  = t; \
    } \
      for (_i=low; _i<high; _i++) { \
        if (rp[_i] > col1) break; \
        if (rp[_i] == col1) { \
          if (addv == ADD_VALUES) ap[_i] += value; \
          else                    ap[_i]  = value; \
          goto b_noinsert; \
        } \
      } \
      if (nonew == 1) goto b_noinsert; \
      else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in the row, therefore enlarge */ \
        int    new_nz = bi[bm] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space */ \
        len     = new_nz*(sizeof(int)+sizeof(Scalar))+(bm+1)*sizeof(int); \
        ierr    = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
        new_j   = (int*)(new_a + new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots */ \
        for (ii=0; ii<row+1; ii++) {new_i[ii] = bi[ii];} \
        for (ii=row+1; ii<bm+1; ii++) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        ierr = PetscMemcpy(new_j,bj,(bi[row]+nrow+shift)*sizeof(int));CHKERRQ(ierr); \
        len = (new_nz - CHUNKSIZE - bi[row] - nrow - shift); \
        ierr = PetscMemcpy(new_j+bi[row]+shift+nrow+CHUNKSIZE,bj+bi[row]+shift+nrow, \
                           len*sizeof(int));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a,ba,(bi[row]+nrow+shift)*sizeof(Scalar));CHKERRQ(ierr); \
        ierr = PetscMemcpy(new_a+bi[row]+shift+nrow+CHUNKSIZE,ba+bi[row]+shift+nrow, \
                           len*sizeof(Scalar));CHKERRQ(ierr); \
        /* free up old matrix storage */ \
 \
        ierr = PetscFree(b->a);CHKERRQ(ierr); \
        if (!b->singlemalloc) { \
          ierr = PetscFree(b->i);CHKERRQ(ierr); \
          ierr = PetscFree(b->j);CHKERRQ(ierr); \
        } \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j; \
        b->singlemalloc = PETSC_TRUE; \
 \
        rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
        rmax = bimax[row] = bimax[row] + CHUNKSIZE; \
        PetscLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
        b->maxnz += CHUNKSIZE; \
        b->reallocs++; \
      } \
      N = nrow++ - 1; b->nz++; \
      /* shift up all the later entries in this row */ \
      for (ii=N; ii>=_i; ii--) { \
        rp[ii+1] = rp[ii]; \
        ap[ii+1] = ap[ii]; \
      } \
      rp[_i] = col1; \
      ap[_i] = value; \
      b_noinsert: ; \
      bilen[row] = nrow; \
}

#undef __FUNCT__
#define __FUNCT__ "MatSetValues_MPIAIJ"
int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
  Scalar     value;
  int        ierr,i,j,rstart = aij->rstart,rend = aij->rend;
  int        cstart = aij->cstart,cend = aij->cend,row,col;
  PetscTruth roworiented = aij->roworiented;

  /* Some variables required by the macros above */
  Mat        A = aij->A;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  int        *aimax = a->imax,*ai = a->i,*ailen = a->ilen,*aj = a->j;
  Scalar     *aa = a->a;
  PetscTruth ignorezeroentries = (((a->ignorezeroentries)&&(addv==ADD_VALUES))?PETSC_TRUE:PETSC_FALSE);
  Mat        B = aij->B;
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  int        *bimax = b->imax,*bi = b->i,*bilen = b->ilen,*bj = b->j,bm = aij->B->m,am = aij->A->m;
  Scalar     *ba = b->a;

  int        *rp,ii,nrow,_i,rmax,N,col1,low,high,t;
  int        nonew = a->nonew,shift = a->indexshift;
  Scalar     *ap;

  PetscFunctionBegin;
  for (i=0; i<m; i++) {
    if (im[i] < 0) continue;
#if defined(PETSC_USE_BOPT_g)
    if (im[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
#endif
    if (im[i] >= rstart && im[i] < rend) {
      row = im[i] - rstart;
      for (j=0; j<n; j++) {
        if (in[j] >= cstart && in[j] < cend){
          col = in[j] - cstart;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          if (ignorezeroentries && value == 0.0) continue;
          MatSetValues_SeqAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqAIJ(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        } else if (in[j] < 0) continue;
#if defined(PETSC_USE_BOPT_g)
        else if (in[j] >= mat->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");}
#endif
        else {
          if (mat->was_assembled) {
            if (!aij->colmap) {
              ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
            }
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(aij->colmap,in[j]+1,&col);CHKERRQ(ierr);
            col--;
#else
            col = aij->colmap[in[j]] - 1;
#endif
            if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
              ierr = DisAssemble_MPIAIJ(mat);CHKERRQ(ierr);
              col = in[j];
              /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
              B = aij->B;
              b = (Mat_SeqAIJ*)B->data;
              bimax = b->imax; bi = b->i; bilen = b->ilen; bj = b->j;
              ba = b->a;
            }
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          if (ignorezeroentries && value == 0.0) continue;
          MatSetValues_SeqAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqAIJ(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {
      if (!aij->donotstash) {
        if (roworiented) {
          if (ignorezeroentries && v[i*n] == 0.0) continue;
          ierr = MatStashValuesRow_Private(&mat->stash,im[i],n,in,v+i*n);CHKERRQ(ierr);
        } else {
          if (ignorezeroentries && v[i] == 0.0) continue;
          ierr = MatStashValuesCol_Private(&mat->stash,im[i],n,in,v+i,m);CHKERRQ(ierr);
        }
      }
    }
  }
  PetscFunctionReturn(0);
}

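/*
   Typical calling sequence seen from the user side (a sketch; the row and
   column numbers are made up for the example).  Entries whose rows fall
   outside this processor's [rstart,rend) range are stashed and routed to
   their owners during assembly.

     int    row = 0,cols[2] = {0,5};
     Scalar vals[2] = {1.0,2.0};
     ierr = MatSetValues(mat,1,&row,2,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/
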
#undef __FUNCT__
#define __FUNCT__ "MatGetValues_MPIAIJ"
int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
  int        ierr,i,j,rstart = aij->rstart,rend = aij->rend;
  int        cstart = aij->cstart,cend = aij->cend,row,col;

  PetscFunctionBegin;
  for (i=0; i<m; i++) {
    if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative row");
    if (idxm[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
    if (idxm[i] >= rstart && idxm[i] < rend) {
      row = idxm[i] - rstart;
      for (j=0; j<n; j++) {
        if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative column");
        if (idxn[j] >= mat->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");
        if (idxn[j] >= cstart && idxn[j] < cend){
          col = idxn[j] - cstart;
          ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
        } else {
          if (!aij->colmap) {
            ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
          }
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(aij->colmap,idxn[j]+1,&col);CHKERRQ(ierr);
          col--;
#else
          col = aij->colmap[idxn[j]] - 1;
#endif
          if ((col < 0) || (aij->garray[col] != idxn[j])) *(v+i*n+j) = 0.0;
          else {
            ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
          }
        }
      }
    } else {
      SETERRQ(PETSC_ERR_SUP,"Only local values currently supported");
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyBegin_MPIAIJ"
int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)mat->data;
  int         ierr,nstash,reallocs;
  InsertMode  addv;

  PetscFunctionBegin;
  if (aij->donotstash) {
    PetscFunctionReturn(0);
  }

  /* make sure all processors are using either INSERT_VALUES or ADD_VALUES */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,mat->comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  ierr = MatStashScatterBegin_Private(&mat->stash,aij->rowners);CHKERRQ(ierr);
  ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
  PetscLogInfo(aij->A,"MatAssemblyBegin_MPIAIJ:Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
  PetscFunctionReturn(0);
}


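/*
   Note on the reduction above (assuming the usual InsertMode encoding, with
   INSERT_VALUES = 1 and ADD_VALUES = 2): a bitwise-OR reduction over all
   processes yields (ADD_VALUES|INSERT_VALUES) = 3 exactly when some processes
   inserted while others added, which is the conflict being detected.
*/
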
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_MPIAIJ"
int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
  int         i,j,rstart,ncols,n,ierr,flg;
  int         *row,*col,other_disassembled;
  Scalar      *val;
  InsertMode  addv = mat->insertmode;

  PetscFunctionBegin;
  if (!aij->donotstash) {
    while (1) {
      ierr = MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
      if (!flg) break;

      for (i=0; i<n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
        if (j < n) ncols = j-i;
        else       ncols = n-i;
        /* Now assemble all these values with a single function call */
        ierr = MatSetValues_MPIAIJ(mat,1,row+i,ncols,col+i,val+i,addv);CHKERRQ(ierr);
        i = j;
      }
    }
    ierr = MatStashScatterEnd_Private(&mat->stash);CHKERRQ(ierr);
  }

  ierr = MatAssemblyBegin(aij->A,mode);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->A,mode);CHKERRQ(ierr);

  /* determine if any processor has disassembled; if so we must
     also disassemble ourselves, in order that we may reassemble. */
  /*
     if the nonzero structure of submatrix B cannot change then we know that
     no processor disassembled, thus we can skip this stuff
  */
  if (!((Mat_SeqAIJ*)aij->B->data)->nonew)  {
    ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
    if (mat->was_assembled && !other_disassembled) {
      ierr = DisAssemble_MPIAIJ(mat);CHKERRQ(ierr);
    }
  }

  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIAIJ(mat);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(aij->B,mode);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(aij->B,mode);CHKERRQ(ierr);

  if (aij->rowvalues) {
    ierr = PetscFree(aij->rowvalues);CHKERRQ(ierr);
    aij->rowvalues = 0;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatZeroEntries_MPIAIJ"
int MatZeroEntries_MPIAIJ(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatZeroRows_MPIAIJ"
int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  int            i,ierr,N,*rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,idx,nsends,*work,row;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values,rstart=l->rstart;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;
  PetscTruth     found;

  PetscFunctionBegin;
  ierr = ISGetLocalSize(is,&N);CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows);CHKERRQ(ierr);

  /* first count the number of contributors to each processor */
  ierr  = PetscMalloc(2*size*sizeof(int),&nprocs);CHKERRQ(ierr);
  ierr  = PetscMemzero(nprocs,2*size*sizeof(int));CHKERRQ(ierr);
  procs = nprocs + size;
  ierr  = PetscMalloc((N+1)*sizeof(int),&owner);CHKERRQ(ierr); /* see note */
  for (i=0; i<N; i++) {
    idx = rows[i];
    found = PETSC_FALSE;
    for (j=0; j<size; j++) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index out of range");
  }
  nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

  /* inform other processors of the number of messages and the max length */
  ierr   = PetscMalloc(2*size*sizeof(int),&work);CHKERRQ(ierr);
  ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
  nrecvs = work[size+rank];
  nmax   = work[rank];
  ierr   = PetscFree(work);CHKERRQ(ierr);

  /* post receives */
  ierr = PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  ierr = PetscMalloc((N+1)*sizeof(int),&svalues);CHKERRQ(ierr);
  ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
  ierr = PetscMalloc((size+1)*sizeof(int),&starts);CHKERRQ(ierr);
  starts[0] = 0;
  for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
  for (i=0; i<N; i++) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ierr = ISRestoreIndices(is,&rows);CHKERRQ(ierr);

  starts[0] = 0;
  for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for (i=0; i<size; i++) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(starts);CHKERRQ(ierr);

  base = owners[rank];

  /* wait on receives */
  ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);CHKERRQ(ierr);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]    = n;
    slen          += n;
    count--;
  }
  ierr = PetscFree(recv_waits);CHKERRQ(ierr);

  /* move the data into the send scatter */
  ierr = PetscMalloc((slen+1)*sizeof(int),&lrows);CHKERRQ(ierr);
  count = 0;
  for (i=0; i<nrecvs; i++) {
    values = rvalues + i*nmax;
    for (j=0; j<lens[i]; j++) {
      lrows[count++] = values[j] - base;
    }
  }
  ierr = PetscFree(rvalues);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree(nprocs);CHKERRQ(ierr);

  /* actually zap the local rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PetscLogObjectParent(A,istmp);

  /*
        Zero the required rows. If the "diagonal block" of the matrix
     is square and the user wishes to set the diagonal, we use separate
     code so that MatSetValues() is not called for each diagonal entry,
     allocating new memory; that would trigger many mallocs and slow
     things down.

       Contributed by: Mathew Knepley
  */
  /* must zero l->B before l->A because the (diag) case below may put values into l->B */
  ierr = MatZeroRows(l->B,istmp,0);CHKERRQ(ierr);
  if (diag && (l->A->M == l->A->N)) {
    ierr = MatZeroRows(l->A,istmp,diag);CHKERRQ(ierr);
  } else if (diag) {
    ierr = MatZeroRows(l->A,istmp,0);CHKERRQ(ierr);
    if (((Mat_SeqAIJ*)l->A->data)->nonew) {
      SETERRQ(PETSC_ERR_SUP,"MatZeroRows() on rectangular matrices cannot be used with the Mat options\n\
MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
    }
    for (i = 0; i < slen; i++) {
      row  = lrows[i] + rstart;
      ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
    }
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    ierr = MatZeroRows(l->A,istmp,0);CHKERRQ(ierr);
  }
  ierr = ISDestroy(istmp);CHKERRQ(ierr);
  ierr = PetscFree(lrows);CHKERRQ(ierr);

  /* wait on sends */
  if (nsends) {
    ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
    ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    ierr = PetscFree(send_status);CHKERRQ(ierr);
  }
  ierr = PetscFree(send_waits);CHKERRQ(ierr);
  ierr = PetscFree(svalues);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

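/*
   Illustrative usage (a sketch; the row indices are made up): zero global rows
   0 and 7, placing 1.0 on the diagonal of each zeroed row.  Passing a null
   diagonal pointer instead leaves the zeroed rows completely empty.

     IS     is;
     int    rows[2] = {0,7};
     Scalar one = 1.0;
     ierr = ISCreateGeneral(PETSC_COMM_SELF,2,rows,&is);CHKERRQ(ierr);
     ierr = MatZeroRows(A,is,&one);CHKERRQ(ierr);
     ierr = ISDestroy(is);CHKERRQ(ierr);
*/
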
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJ"
int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  int        ierr,nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->n) {
    SETERRQ2(PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%d) and xx (%d)",A->n,nt);
  }
  ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

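/*
   The multiply above overlaps communication with computation: the scatter of
   the needed off-process entries of xx into a->lvec is started, the diagonal
   block A is applied while the messages are in flight, and the off-diagonal
   block B then adds its contribution from the ghost values.  A usage sketch
   (the local size n and global size N are assumptions for the example):

     Vec x,y;
     ierr = VecCreateMPI(PETSC_COMM_WORLD,n,N,&x);CHKERRQ(ierr);
     ierr = VecDuplicate(x,&y);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);
*/
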
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_MPIAIJ"
int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJ"
int MatMultTranspose_MPIAIJ(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  /* do nondiagonal part */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  /* do local part */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually
     inserted into yy until the VecScatterEnd() below, which holds for this
     implementation but may not always be true */
  ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_MPIAIJ"
int MatMultTransposeAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  /* do nondiagonal part */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  /* do local part */
  ierr = (*a->A->ops->multtransposeadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually
     inserted into zz until the VecScatterEnd() below, which holds for this
     implementation but may not always be true */
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  This only works correctly for square matrices where the subblock A->A is the
  diagonal block
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetDiagonal_MPIAIJ"
int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
{
  int        ierr;
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  if (A->M != A->N) SETERRQ(PETSC_ERR_SUP,"Supports only square matrix where A->A is diag block");
  if (a->rstart != a->cstart || a->rend != a->cend) {
    SETERRQ(PETSC_ERR_ARG_SIZ,"row partition must equal col partition");
  }
  ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatScale_MPIAIJ"
int MatScale_MPIAIJ(Scalar *aa,Mat A)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  ierr = MatScale(aa,a->A);CHKERRQ(ierr);
  ierr = MatScale(aa,a->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ"
int MatDestroy_MPIAIJ(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
  int        ierr;

  PetscFunctionBegin;
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",mat->M,mat->N);
#endif
  ierr = MatStashDestroy_Private(&mat->stash);CHKERRQ(ierr);
  ierr = PetscFree(aij->rowners);CHKERRQ(ierr);
  ierr = MatDestroy(aij->A);CHKERRQ(ierr);
  ierr = MatDestroy(aij->B);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  if (aij->colmap) {ierr = PetscTableDelete(aij->colmap);CHKERRQ(ierr);}
#else
  if (aij->colmap) {ierr = PetscFree(aij->colmap);CHKERRQ(ierr);}
#endif
  if (aij->garray) {ierr = PetscFree(aij->garray);CHKERRQ(ierr);}
  if (aij->lvec)   {ierr = VecDestroy(aij->lvec);CHKERRQ(ierr);}
  if (aij->Mvctx)  {ierr = VecScatterDestroy(aij->Mvctx);CHKERRQ(ierr);}
  if (aij->rowvalues) {ierr = PetscFree(aij->rowvalues);CHKERRQ(ierr);}
  ierr = PetscFree(aij);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatView_MPIAIJ_ASCIIorDraworSocket"
int MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat,PetscViewer viewer)
{
  Mat_MPIAIJ        *aij = (Mat_MPIAIJ*)mat->data;
  Mat_SeqAIJ        *C = (Mat_SeqAIJ*)aij->A->data;
  int               ierr,shift = C->indexshift,rank = aij->rank,size = aij->size;
  PetscTruth        isdraw,isascii,flg;
  PetscViewer       sviewer;
  PetscViewerFormat format;

  PetscFunctionBegin;
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
  if (isascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO_LONG) {
      MatInfo info;
      ierr = MPI_Comm_rank(mat->comm,&rank);CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = PetscOptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg);CHKERRQ(ierr);
      if (flg) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
               rank,mat->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);CHKERRQ(ierr);
      } else {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
               rank,mat->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);CHKERRQ(ierr);
      }
      ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used);CHKERRQ(ierr);
      ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used);CHKERRQ(ierr);
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = VecScatterView(aij->Mvctx,viewer);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else if (format == PETSC_VIEWER_ASCII_INFO) {
      PetscFunctionReturn(0);
    }
  } else if (isdraw) {
    PetscDraw  draw;
    PetscTruth isnull;
    ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
    ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
  }

  if (size == 1) {
    ierr = MatView(aij->A,viewer);CHKERRQ(ierr);
  } else {
    /* assemble the entire matrix onto the first processor. */
    Mat         A;
    Mat_SeqAIJ *Aloc;
    int         M = mat->M,N = mat->N,m,*ai,*aj,row,*cols,i,*ct;
    Scalar      *a;

    if (!rank) {
      ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
    } else {
      ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
    }
    PetscLogObjectParent(mat,A);

    /* copy over the A part */
    Aloc = (Mat_SeqAIJ*)aij->A->data;
    m = aij->A->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
    row = aij->rstart;
    for (i=0; i<ai[m]+shift; i++) {aj[i] += aij->cstart + shift;}
    for (i=0; i<m; i++) {
      ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
      row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
    }
    aj = Aloc->j;
    for (i=0; i<ai[m]+shift; i++) {aj[i] -= aij->cstart + shift;}

    /* copy over the B part */
    Aloc = (Mat_SeqAIJ*)aij->B->data;
    m    = aij->B->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
    row  = aij->rstart;
    ierr = PetscMalloc((ai[m]+1)*sizeof(int),&cols);CHKERRQ(ierr);
    ct   = cols;
    for (i=0; i<ai[m]+shift; i++) {cols[i] = aij->garray[aj[i]+shift];}
    for (i=0; i<m; i++) {
      ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
      row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
    }
    ierr = PetscFree(ct);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    /*
       Everyone has to make this call to draw the matrix since the graphics waits
       are synchronized across all processors that share the PetscDraw object
    */
    ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
    if (!rank) {
      ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,sviewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
    ierr = MatDestroy(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatView_MPIAIJ"
int MatView_MPIAIJ(Mat mat,PetscViewer viewer)
{
  int        ierr;
  PetscTruth isascii,isdraw,issocket,isbinary;

  PetscFunctionBegin;
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_BINARY,&isbinary);CHKERRQ(ierr);
  ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_SOCKET,&issocket);CHKERRQ(ierr);
  if (isascii || isdraw || isbinary || issocket) {
    ierr = MatView_MPIAIJ_ASCIIorDraworSocket(mat,viewer);CHKERRQ(ierr);
  } else {
    SETERRQ1(1,"Viewer type %s not supported by MPIAIJ matrices",((PetscObject)viewer)->type_name);
  }
  PetscFunctionReturn(0);
}

/*
    This has to provide several versions.

     2) a) use only local smoothing, updating outer values only once.
        b) local smoothing, updating outer values each inner iteration
     3) coloring, updating outer values between colors.
*/
#undef __FUNCT__
#define __FUNCT__ "MatRelax_MPIAIJ"
int MatRelax_MPIAIJ(Mat matin,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,int its,Vec xx)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;
  Mat        AA = mat->A,BB = mat->B;
  Mat_SeqAIJ *A = (Mat_SeqAIJ*)AA->data,*B = (Mat_SeqAIJ*)BB->data;
  Scalar     *b,*x,*xs,*ls,d,*v,sum;
  int        ierr,*idx,*diag;
  int        n = matin->n,m = matin->m,i,shift = A->indexshift;

  PetscFunctionBegin;
  if (!A->diag) {ierr = MatMarkDiagonal_SeqAIJ(AA);CHKERRQ(ierr);}
  diag = A->diag;
  if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }
    ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
    ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    if (xx != bb) {
      ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    } else {
      b = x;
    }
    ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
    xs = x + shift; /* shift by one for index start of 1 */
    ls = ls + shift;
    while (its--) {
      /* go down through the rows */
      for (i=0; i<m; i++) {
        n    = A->i[i+1] - A->i[i];
        PetscLogFlops(4*n+3);
        idx  = A->j + A->i[i] + shift;
        v    = A->a + A->i[i] + shift;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = fshift + A->a[diag[i]+shift];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] + shift;
        v    = B->a + B->i[i] + shift;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
      }
      /* come up through the rows */
      for (i=m-1; i>-1; i--) {
        n    = A->i[i+1] - A->i[i];
        PetscLogFlops(4*n+3);
        idx  = A->j + A->i[i] + shift;
        v    = A->a + A->i[i] + shift;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = fshift + A->a[diag[i]+shift];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] + shift;
        v    = B->a + B->i[i] + shift;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
      }
    }
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    if (bb != xx) {ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); }
    ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
  } else if (flag & SOR_LOCAL_FORWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }
    ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
    ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    if (xx != bb) {
      ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    } else {
      b = x;
    }
    ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
    xs = x + shift; /* shift by one for index start of 1 */
    ls = ls + shift;
    while (its--) {
      for (i=0; i<m; i++) {
        n    = A->i[i+1] - A->i[i];
        PetscLogFlops(4*n+3);
        idx  = A->j + A->i[i] + shift;
        v    = A->a + A->i[i] + shift;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = fshift + A->a[diag[i]+shift];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] + shift;
        v    = B->a + B->i[i] + shift;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
      }
    }
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    if (bb != xx) {ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); }
    ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
  } else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }
    ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
    ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    if (xx != bb) {
      ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    } else {
      b = x;
    }
    ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
    xs = x + shift; /* shift by one for index start of 1 */
    ls = ls + shift;
    while (its--) {
      for (i=m-1; i>-1; i--) {
        n    = A->i[i+1] - A->i[i];
        PetscLogFlops(4*n+3);
        idx  = A->j + A->i[i] + shift;
        v    = A->a + A->i[i] + shift;
        sum  = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d    = fshift + A->a[diag[i]+shift];
        n    = B->i[i+1] - B->i[i];
        idx  = B->j + B->i[i] + shift;
        v    = B->a + B->i[i] + shift;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
      }
    }
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    if (bb != xx) {ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); }
    ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_SUP,"Parallel SOR not supported");
  }
  PetscFunctionReturn(0);
}

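/*
   For reference, each sweep above applies the standard SOR update; written for
   row i (with fshift the diagonal shift used in the code):

     x_i <- (1 - omega)*x_i + omega*(b_i - sum_{j != i} a_ij*x_j)/(a_ii + fshift)

   The off-process terms of the row sum come from the scattered ghost values in
   mat->lvec; SPARSEDENSEMDOT subtracts a sparse row dotted with a dense vector
   from sum, and the diagonal term is added back before dividing.
*/
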
#undef __FUNCT__
#define __FUNCT__ "MatGetInfo_MPIAIJ"
int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,MatInfo *info)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;
  Mat        A = mat->A,B = mat->B;
  int        ierr;
  PetscReal  isend[5],irecv[5];

  PetscFunctionBegin;
  info->block_size     = 1.0;
  ierr = MatGetInfo(A,MAT_LOCAL,info);CHKERRQ(ierr);
  isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
  isend[3] = info->memory;  isend[4] = info->mallocs;
  ierr = MatGetInfo(B,MAT_LOCAL,info);CHKERRQ(ierr);
  isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
  isend[3] += info->memory;  isend[4] += info->mallocs;
  if (flag == MAT_LOCAL) {
    info->nz_used      = isend[0];
    info->nz_allocated = isend[1];
    info->nz_unneeded  = isend[2];
    info->memory       = isend[3];
    info->mallocs      = isend[4];
  } else if (flag == MAT_GLOBAL_MAX) {
    ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  } else if (flag == MAT_GLOBAL_SUM) {
    ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  }
  info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
  info->fill_ratio_needed = 0;
  info->factor_mallocs    = 0;
  info->rows_global       = (double)matin->M;
  info->columns_global    = (double)matin->N;
  info->rows_local        = (double)matin->m;
  info->columns_local     = (double)matin->N;

  PetscFunctionReturn(0);
}

1025 
1026 #undef __FUNCT__
1027 #define __FUNCT__ "MatSetOption_MPIAIJ"
1028 int MatSetOption_MPIAIJ(Mat A,MatOption op)
1029 {
1030   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1031   int        ierr;
1032 
1033   PetscFunctionBegin;
1034   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1035       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1036       op == MAT_COLUMNS_UNSORTED ||
1037       op == MAT_COLUMNS_SORTED ||
1038       op == MAT_NEW_NONZERO_ALLOCATION_ERR ||
1039       op == MAT_KEEP_ZEROED_ROWS ||
1040       op == MAT_NEW_NONZERO_LOCATION_ERR ||
1041       op == MAT_USE_INODES ||
1042       op == MAT_DO_NOT_USE_INODES ||
1043       op == MAT_IGNORE_ZERO_ENTRIES) {
1044         ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1045         ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1046   } else if (op == MAT_ROW_ORIENTED) {
1047     a->roworiented = PETSC_TRUE;
1048     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1049     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1050   } else if (op == MAT_ROWS_SORTED ||
1051              op == MAT_ROWS_UNSORTED ||
1052              op == MAT_YES_NEW_DIAGONALS) {
1053     PetscLogInfo(A,"MatSetOption_MPIAIJ:Option ignored\n");
1054   } else if (op == MAT_COLUMN_ORIENTED) {
1055     a->roworiented = PETSC_FALSE;
1056     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1057     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1058   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1059     a->donotstash = PETSC_TRUE;
1060   } else if (op == MAT_NO_NEW_DIAGONALS){
1061     SETERRQ(PETSC_ERR_SUP,"MAT_NO_NEW_DIAGONALS");
1062   } else {
1063     SETERRQ(PETSC_ERR_SUP,"unknown option");
1064   }
1065   PetscFunctionReturn(0);
1066 }
1067 
#undef __FUNCT__
#define __FUNCT__ "MatGetOwnershipRange_MPIAIJ"
int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;

  PetscFunctionBegin;
  if (m) *m = mat->rstart;
  if (n) *n = mat->rend;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetRow_MPIAIJ"
int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ*)matin->data;
  Scalar     *vworkA,*vworkB,**pvA,**pvB,*v_p;
  int        i,ierr,*cworkA,*cworkB,**pcA,**pcB,cstart = mat->cstart;
  int        nztot,nzA,nzB,lrow,rstart = mat->rstart,rend = mat->rend;
  int        *cmap,*idx_p;

  PetscFunctionBegin;
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqAIJ *Aa = (Mat_SeqAIJ*)mat->A->data,*Ba = (Mat_SeqAIJ*)mat->B->data;
    int        max = 1,tmp;
    for (i=0; i<matin->m; i++) {
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    ierr = PetscMalloc(max*(sizeof(int)+sizeof(Scalar)),&mat->rowvalues);CHKERRQ(ierr);
    mat->rowindices = (int*)(mat->rowvalues + max);
  }

  if (row < rstart || row >= rend) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Only local rows");
  lrow = row - rstart;

  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
  ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap = mat->garray;
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      int imark = -1;
      if (v) {
        *v = v_p = mat->rowvalues;
        for (i=0; i<nzB; i++) {
          if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for (i=0; i<nzA; i++)     v_p[imark+i] = vworkA[i];
        for (i=imark; i<nzB; i++) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          for (i=0; i<imark; i++) {
            idx_p[i] = cmap[cworkB[i]];
          }
        } else {
          for (i=0; i<nzB; i++) {
            if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
            else break;
          }
          imark = i;
        }
        for (i=0; i<nzA; i++)     idx_p[imark+i] = cstart + cworkA[i];
        for (i=imark; i<nzB; i++) idx_p[nzA+i]   = cmap[cworkB[i]];
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
  ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatRestoreRow_MPIAIJ"
int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;

  PetscFunctionBegin;
  if (aij->getrowactive == PETSC_FALSE) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"MatGetRow not called");
  }
  aij->getrowactive = PETSC_FALSE;
  PetscFunctionReturn(0);
}

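/*
   Sketch of the calling protocol for the two routines above (illustrative):
   the requested row must be owned locally, and each MatGetRow() must be paired
   with a MatRestoreRow() before another row is requested.

     int    nz,*cols;
     Scalar *vals;
     ierr = MatGetRow(mat,row,&nz,&cols,&vals);CHKERRQ(ierr);
     ierr = MatRestoreRow(mat,row,&nz,&cols,&vals);CHKERRQ(ierr);
*/
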
#undef __FUNCT__
#define __FUNCT__ "MatNorm_MPIAIJ"
int MatNorm_MPIAIJ(Mat mat,NormType type,PetscReal *norm)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
  Mat_SeqAIJ *amat = (Mat_SeqAIJ*)aij->A->data,*bmat = (Mat_SeqAIJ*)aij->B->data;
  int        ierr,i,j,cstart = aij->cstart,shift = amat->indexshift;
  PetscReal  sum = 0.0;
  Scalar     *v;

  PetscFunctionBegin;
  if (aij->size == 1) {
    ierr = MatNorm(aij->A,type,norm);CHKERRQ(ierr);
  } else {
    if (type == NORM_FROBENIUS) {
      v = amat->a;
      for (i=0; i<amat->nz; i++) {
#if defined(PETSC_USE_COMPLEX)
        sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
#else
        sum += (*v)*(*v); v++;
#endif
      }
      v = bmat->a;
      for (i=0; i<bmat->nz; i++) {
#if defined(PETSC_USE_COMPLEX)
        sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
#else
        sum += (*v)*(*v); v++;
#endif
      }
      ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
      *norm = sqrt(*norm);
    } else if (type == NORM_1) { /* max column norm */
      PetscReal *tmp,*tmp2;
      int       *jj,*garray = aij->garray;
      ierr = PetscMalloc((mat->N+1)*sizeof(PetscReal),&tmp);CHKERRQ(ierr);
      ierr = PetscMalloc((mat->N+1)*sizeof(PetscReal),&tmp2);CHKERRQ(ierr);
      ierr = PetscMemzero(tmp,mat->N*sizeof(PetscReal));CHKERRQ(ierr);
      *norm = 0.0;
      v = amat->a; jj = amat->j;
      for (j=0; j<amat->nz; j++) {
        tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
      }
      v = bmat->a; jj = bmat->j;
      for (j=0; j<bmat->nz; j++) {
        tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
      }
      ierr = MPI_Allreduce(tmp,tmp2,mat->N,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
      for (j=0; j<mat->N; j++) {
        if (tmp2[j] > *norm) *norm = tmp2[j];
      }
      ierr = PetscFree(tmp);CHKERRQ(ierr);
      ierr = PetscFree(tmp2);CHKERRQ(ierr);
    } else if (type == NORM_INFINITY) { /* max row norm */
      PetscReal ntemp = 0.0;
      for (j=0; j<aij->A->m; j++) {
        v = amat->a + amat->i[j] + shift;
        sum = 0.0;
        for (i=0; i<amat->i[j+1]-amat->i[j]; i++) {
          sum += PetscAbsScalar(*v); v++;
        }
        v = bmat->a + bmat->i[j] + shift;
        for (i=0; i<bmat->i[j+1]-bmat->i[j]; i++) {
          sum += PetscAbsScalar(*v); v++;
        }
        if (sum > ntemp) ntemp = sum;
      }
      ierr = MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_SUP,"No support for two norm");
    }
  }
  PetscFunctionReturn(0);
}

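/*
   The three supported cases above compute, for the global matrix A:

     NORM_FROBENIUS:  sqrt(sum_ij |a_ij|^2)   (local partial sums, MPI_SUM, then sqrt)
     NORM_1:          max_j sum_i |a_ij|      (maximum column sum, via a length-N work array)
     NORM_INFINITY:   max_i sum_j |a_ij|      (maximum row sum, via MPI_MAX)

   This restates what the code computes; no two-norm is available.
*/
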
#undef __FUNCT__
#define __FUNCT__ "MatTranspose_MPIAIJ"
int MatTranspose_MPIAIJ(Mat A,Mat *matout)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ *Aloc = (Mat_SeqAIJ*)a->A->data;
  int        ierr,shift = Aloc->indexshift;
  int        M = A->M,N = A->N,m,*ai,*aj,row,*cols,i,*ct;
  Mat        B;
  Scalar     *array;

  PetscFunctionBegin;
  if (!matout && M != N) {
    SETERRQ(PETSC_ERR_ARG_SIZ,"Square matrix only for in-place");
  }

  ierr = MatCreateMPIAIJ(A->comm,A->n,A->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);

  /* copy over the A part */
  Aloc = (Mat_SeqAIJ*)a->A->data;
  m = a->A->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
  row = a->rstart;
  for (i=0; i<ai[m]+shift; i++) {aj[i] += a->cstart + shift;}
  for (i=0; i<m; i++) {
    ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
    row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
  }
  aj = Aloc->j;
  for (i=0; i<ai[m]+shift; i++) {aj[i] -= a->cstart + shift;}

  /* copy over the B part */
  Aloc = (Mat_SeqAIJ*)a->B->data;
  m = a->B->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
  row  = a->rstart;
  ierr = PetscMalloc((1+ai[m]-shift)*sizeof(int),&cols);CHKERRQ(ierr);
  ct   = cols;
  for (i=0; i<ai[m]+shift; i++) {cols[i] = a->garray[aj[i]+shift];}
  for (i=0; i<m; i++) {
    ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
    row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
  }
  ierr = PetscFree(ct);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  if (matout) {
    *matout = B;
  } else {
    ierr = MatHeaderCopy(A,B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDiagonalScale_MPIAIJ"
int MatDiagonalScale_MPIAIJ(Mat mat,Vec ll,Vec rr)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data;
  Mat        a = aij->A,b = aij->B;
  int        ierr,s1,s2,s3;

  PetscFunctionBegin;
  ierr = MatGetLocalSize(mat,&s2,&s3);CHKERRQ(ierr);
  if (rr) {
    ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
    if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,"right vector non-conforming local size");
    /* Overlap communication with computation. */
    ierr = VecScatterBegin(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx);CHKERRQ(ierr);
  }
  if (ll) {
    ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
    if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,"left vector non-conforming local size");
    ierr = (*b->ops->diagonalscale)(b,ll,0);CHKERRQ(ierr);
  }
  /* scale the diagonal block */
  ierr = (*a->ops->diagonalscale)(a,ll,rr);CHKERRQ(ierr);

  if (rr) {
    /* Do a scatter end and then right scale the off-diagonal block */
    ierr = VecScatterEnd(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx);CHKERRQ(ierr);
    ierr = (*b->ops->diagonalscale)(b,0,aij->lvec);CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}


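/*
   Usage sketch for the routine above (illustrative; l and r are assumed to be
   vectors conforming to the row and column layouts): forms diag(l)*A*diag(r).
   Passing PETSC_NULL for either vector skips scaling on that side, matching
   the if (ll) / if (rr) checks in the code.

     ierr = MatDiagonalScale(A,l,r);CHKERRQ(ierr);
*/
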
#undef __FUNCT__
#define __FUNCT__ "MatPrintHelp_MPIAIJ"
int MatPrintHelp_MPIAIJ(Mat A)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  if (!a->rank) {
    ierr = MatPrintHelp_SeqAIJ(a->A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetBlockSize_MPIAIJ"
int MatGetBlockSize_MPIAIJ(Mat A,int *bs)
{
  PetscFunctionBegin;
  *bs = 1;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetUnfactored_MPIAIJ"
int MatSetUnfactored_MPIAIJ(Mat A)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  int        ierr;

  PetscFunctionBegin;
  ierr = MatSetUnfactored(a->A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatEqual_MPIAIJ"
int MatEqual_MPIAIJ(Mat A,Mat B,PetscTruth *flag)
{
  Mat_MPIAIJ *matB = (Mat_MPIAIJ*)B->data,*matA = (Mat_MPIAIJ*)A->data;
  Mat        a,b,c,d;
  PetscTruth flg;
  int        ierr;

  PetscFunctionBegin;
  ierr = PetscTypeCompare((PetscObject)B,MATMPIAIJ,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
  a = matA->A; b = matA->B;
  c = matB->A; d = matB->B;

  ierr = MatEqual(a,c,&flg);CHKERRQ(ierr);
  if (flg == PETSC_TRUE) {
    ierr = MatEqual(b,d,&flg);CHKERRQ(ierr);
  }
  ierr = MPI_Allreduce(&flg,flag,1,MPI_INT,MPI_LAND,A->comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1393 #undef __FUNCT__
1394 #define __FUNCT__ "MatCopy_MPIAIJ"
1395 int MatCopy_MPIAIJ(Mat A,Mat B,MatStructure str)
1396 {
1397   int        ierr;
1398   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1399   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
1400   PetscTruth flg;
1401 
1402   PetscFunctionBegin;
1403   ierr = PetscTypeCompare((PetscObject)B,MATMPIAIJ,&flg);CHKERRQ(ierr);
1404   if (str != SAME_NONZERO_PATTERN || !flg) {
1405     /* because of the column compression in the off-processor part of the matrix a->B,
1406        the number of columns in a->B and b->B may be different, hence we cannot call
1407        MatCopy() directly on the two parts. If need be, a copy more efficient than
1408        MatCopy_Basic() could be provided by first uncompressing the a->B matrices
1409        and then copying the submatrices */
1410     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1411   } else {
1412     ierr = MatCopy(a->A,b->A,str);CHKERRQ(ierr);
1413     ierr = MatCopy(a->B,b->B,str);CHKERRQ(ierr);
1414   }
1415   PetscFunctionReturn(0);
1416 }
1417 
1418 #undef __FUNCT__
1419 #define __FUNCT__ "MatSetUpPreallocation_MPIAIJ"
1420 int MatSetUpPreallocation_MPIAIJ(Mat A)
1421 {
1422   int        ierr;
1423 
1424   PetscFunctionBegin;
1425   ierr =  MatMPIAIJSetPreallocation(A,PETSC_DEFAULT,0,PETSC_DEFAULT,0);CHKERRQ(ierr);
1426   PetscFunctionReturn(0);
1427 }
1428 
1429 EXTERN int MatDuplicate_MPIAIJ(Mat,MatDuplicateOption,Mat *);
1430 EXTERN int MatIncreaseOverlap_MPIAIJ(Mat,int,IS *,int);
1431 EXTERN int MatFDColoringCreate_MPIAIJ(Mat,ISColoring,MatFDColoring);
1432 EXTERN int MatGetSubMatrices_MPIAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
1433 EXTERN int MatGetSubMatrix_MPIAIJ(Mat,IS,IS,int,MatReuse,Mat *);
1434 #if !defined(PETSC_USE_COMPLEX)
1435 EXTERN int MatLUFactorSymbolic_MPIAIJ_TFS(Mat,IS,IS,MatLUInfo*,Mat*);
1436 #endif
1437 
1438 /* -------------------------------------------------------------------*/
1439 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
1440        MatGetRow_MPIAIJ,
1441        MatRestoreRow_MPIAIJ,
1442        MatMult_MPIAIJ,
1443        MatMultAdd_MPIAIJ,
1444        MatMultTranspose_MPIAIJ,
1445        MatMultTransposeAdd_MPIAIJ,
1446        0,
1447        0,
1448        0,
1449        0,
1450        0,
1451        0,
1452        MatRelax_MPIAIJ,
1453        MatTranspose_MPIAIJ,
1454        MatGetInfo_MPIAIJ,
1455        MatEqual_MPIAIJ,
1456        MatGetDiagonal_MPIAIJ,
1457        MatDiagonalScale_MPIAIJ,
1458        MatNorm_MPIAIJ,
1459        MatAssemblyBegin_MPIAIJ,
1460        MatAssemblyEnd_MPIAIJ,
1461        0,
1462        MatSetOption_MPIAIJ,
1463        MatZeroEntries_MPIAIJ,
1464        MatZeroRows_MPIAIJ,
1465 #if !defined(PETSC_USE_COMPLEX)
1466 				       MatLUFactorSymbolic_MPIAIJ_TFS,
1467 #else
1468        0,
1469 #endif
1470        0,
1471        0,
1472        0,
1473        MatSetUpPreallocation_MPIAIJ,
1474        0,
1475        MatGetOwnershipRange_MPIAIJ,
1476        0,
1477        0,
1478        0,
1479        0,
1480        MatDuplicate_MPIAIJ,
1481        0,
1482        0,
1483        0,
1484        0,
1485        0,
1486        MatGetSubMatrices_MPIAIJ,
1487        MatIncreaseOverlap_MPIAIJ,
1488        MatGetValues_MPIAIJ,
1489        MatCopy_MPIAIJ,
1490        MatPrintHelp_MPIAIJ,
1491        MatScale_MPIAIJ,
1492        0,
1493        0,
1494        0,
1495        MatGetBlockSize_MPIAIJ,
1496        0,
1497        0,
1498        0,
1499        0,
1500        MatFDColoringCreate_MPIAIJ,
1501        0,
1502        MatSetUnfactored_MPIAIJ,
1503        0,
1504        0,
1505        MatGetSubMatrix_MPIAIJ,
1506        MatDestroy_MPIAIJ,
1507        MatView_MPIAIJ,
1508        MatGetMaps_Petsc,
1509        0,
1510        0,
1511        0,
1512        0,
1513        0,
1514        0,
1515        0,
1516        0,
1517        MatSetColoring_MPIAIJ,
1518        MatSetValuesAdic_MPIAIJ,
1519        MatSetValuesAdifor_MPIAIJ
1520 };
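
/*
   Note (illustrative): the generic Mat interface routines dispatch through
   this table; for example MatMult(A,x,y) ends up doing essentially

     ierr = (*A->ops->mult)(A,x,y);CHKERRQ(ierr);

   so a 0 entry above simply means that the corresponding operation is not
   supported by the MPIAIJ implementation.
*/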
1521 
1522 /* ----------------------------------------------------------------------------------------*/
1523 
1524 EXTERN_C_BEGIN
1525 #undef __FUNCT__
1526 #define __FUNCT__ "MatStoreValues_MPIAIJ"
1527 int MatStoreValues_MPIAIJ(Mat mat)
1528 {
1529   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1530   int        ierr;
1531 
1532   PetscFunctionBegin;
1533   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
1534   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
1535   PetscFunctionReturn(0);
1536 }
1537 EXTERN_C_END
1538 
1539 EXTERN_C_BEGIN
1540 #undef __FUNCT__
1541 #define __FUNCT__ "MatRetrieveValues_MPIAIJ"
1542 int MatRetrieveValues_MPIAIJ(Mat mat)
1543 {
1544   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1545   int        ierr;
1546 
1547   PetscFunctionBegin;
1548   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
1549   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
1550   PetscFunctionReturn(0);
1551 }
1552 EXTERN_C_END
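
/*
   Usage sketch (illustrative only): these hooks let a caller with a fixed
   nonzero structure save the numerical values of an assembled matrix and
   restore them later without reassembling; the option shown is assumed to
   have been set so that no new nonzero locations can appear.

     ierr = MatSetOption(A,MAT_NO_NEW_NONZERO_LOCATIONS);CHKERRQ(ierr);
     ierr = MatStoreValues(A);CHKERRQ(ierr);
     (A's values are modified, e.g. by an in-place factorization)
     ierr = MatRetrieveValues(A);CHKERRQ(ierr);
*/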
1553 
1554 #include "petscpc.h"
1555 EXTERN_C_BEGIN
1556 EXTERN int MatGetDiagonalBlock_MPIAIJ(Mat,PetscTruth *,MatReuse,Mat *);
1557 EXTERN_C_END
1558 
1559 EXTERN_C_BEGIN
1560 #undef __FUNCT__
1561 #define __FUNCT__ "MatCreate_MPIAIJ"
1562 int MatCreate_MPIAIJ(Mat B)
1563 {
1564   Mat_MPIAIJ   *b;
1565   int          ierr,i,size;
1566 
1567   PetscFunctionBegin;
1568   ierr = MPI_Comm_size(B->comm,&size);CHKERRQ(ierr);
1569 
1570   ierr            = PetscNew(Mat_MPIAIJ,&b);CHKERRQ(ierr);
1571   B->data         = (void*)b;
1572   ierr            = PetscMemzero(b,sizeof(Mat_MPIAIJ));CHKERRQ(ierr);
1573   ierr            = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1574   B->factor       = 0;
1575   B->assembled    = PETSC_FALSE;
1576   B->mapping      = 0;
1577 
1578   B->insertmode      = NOT_SET_VALUES;
1579   b->size            = size;
1580   ierr = MPI_Comm_rank(B->comm,&b->rank);CHKERRQ(ierr);
1581 
1582   ierr = PetscSplitOwnership(B->comm,&B->m,&B->M);CHKERRQ(ierr);
1583   ierr = PetscSplitOwnership(B->comm,&B->n,&B->N);CHKERRQ(ierr);
1584 
1585   /* the information in the maps duplicates the information computed below; eventually
1586      we should remove the computed information that duplicates what the maps contain */
1587   ierr = MapCreateMPI(B->comm,B->m,B->M,&B->rmap);CHKERRQ(ierr);
1588   ierr = MapCreateMPI(B->comm,B->n,B->N,&B->cmap);CHKERRQ(ierr);
1589 
1590   /* build local table of row and column ownerships */
1591   ierr = PetscMalloc(2*(b->size+2)*sizeof(int),&b->rowners);CHKERRQ(ierr);
1592   PetscLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1593   b->cowners = b->rowners + b->size + 2;
1594   ierr = MPI_Allgather(&B->m,1,MPI_INT,b->rowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1595   b->rowners[0] = 0;
1596   for (i=2; i<=b->size; i++) {
1597     b->rowners[i] += b->rowners[i-1];
1598   }
1599   b->rstart = b->rowners[b->rank];
1600   b->rend   = b->rowners[b->rank+1];
1601   ierr = MPI_Allgather(&B->n,1,MPI_INT,b->cowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1602   b->cowners[0] = 0;
1603   for (i=2; i<=b->size; i++) {
1604     b->cowners[i] += b->cowners[i-1];
1605   }
1606   b->cstart = b->cowners[b->rank];
1607   b->cend   = b->cowners[b->rank+1];
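
  /* for example, with local row counts 3,3,2 the MPI_Allgather() calls plus
     the prefix sums above yield rowners = {0,3,6,8}, so process k owns the
     global rows rowners[k] through rowners[k+1]-1; cowners is analogous */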
1608 
1609   /* build cache for off-processor entries formed during assembly */
1610   ierr = MatStashCreate_Private(B->comm,1,&B->stash);CHKERRQ(ierr);
1611   b->donotstash  = PETSC_FALSE;
1612   b->colmap      = 0;
1613   b->garray      = 0;
1614   b->roworiented = PETSC_TRUE;
1615 
1616   /* stuff used for matrix vector multiply */
1617   b->lvec      = PETSC_NULL;
1618   b->Mvctx     = PETSC_NULL;
1619 
1620   /* stuff for MatGetRow() */
1621   b->rowindices   = 0;
1622   b->rowvalues    = 0;
1623   b->getrowactive = PETSC_FALSE;
1624 
1625   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatStoreValues_C",
1626                                      "MatStoreValues_MPIAIJ",
1627                                      MatStoreValues_MPIAIJ);CHKERRQ(ierr);
1628   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatRetrieveValues_C",
1629                                      "MatRetrieveValues_MPIAIJ",
1630                                      MatRetrieveValues_MPIAIJ);CHKERRQ(ierr);
1631   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetDiagonalBlock_C",
1632                                      "MatGetDiagonalBlock_MPIAIJ",
1633                                      MatGetDiagonalBlock_MPIAIJ);CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 EXTERN_C_END
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatDuplicate_MPIAIJ"
1640 int MatDuplicate_MPIAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1641 {
1642   Mat        mat;
1643   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ*)matin->data;
1644   int        ierr;
1645 
1646   PetscFunctionBegin;
1647   *newmat       = 0;
1648   ierr = MatCreate(matin->comm,matin->m,matin->n,matin->M,matin->N,&mat);CHKERRQ(ierr);
1649   ierr = MatSetType(mat,MATMPIAIJ);CHKERRQ(ierr);
1650   a    = (Mat_MPIAIJ*)mat->data;
1651   ierr              = PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1652   mat->factor       = matin->factor;
1653   mat->assembled    = PETSC_TRUE;
1654   mat->insertmode   = NOT_SET_VALUES;
1655   mat->preallocated = PETSC_TRUE;
1656 
1657   a->rstart       = oldmat->rstart;
1658   a->rend         = oldmat->rend;
1659   a->cstart       = oldmat->cstart;
1660   a->cend         = oldmat->cend;
1661   a->size         = oldmat->size;
1662   a->rank         = oldmat->rank;
1663   a->donotstash   = oldmat->donotstash;
1664   a->roworiented  = oldmat->roworiented;
1665   a->rowindices   = 0;
1666   a->rowvalues    = 0;
1667   a->getrowactive = PETSC_FALSE;
1668 
1669   ierr       = PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));CHKERRQ(ierr);
1670   ierr       = MatStashCreate_Private(matin->comm,1,&mat->stash);CHKERRQ(ierr);
1671   if (oldmat->colmap) {
1672 #if defined (PETSC_USE_CTABLE)
1673     ierr = PetscTableCreateCopy(oldmat->colmap,&a->colmap);CHKERRQ(ierr);
1674 #else
1675     ierr = PetscMalloc((mat->N)*sizeof(int),&a->colmap);CHKERRQ(ierr);
1676     PetscLogObjectMemory(mat,(mat->N)*sizeof(int));
1677     ierr      = PetscMemcpy(a->colmap,oldmat->colmap,(mat->N)*sizeof(int));CHKERRQ(ierr);
1678 #endif
1679   } else a->colmap = 0;
1680   if (oldmat->garray) {
1681     int len;
1682     len  = oldmat->B->n;
1683     ierr = PetscMalloc((len+1)*sizeof(int),&a->garray);CHKERRQ(ierr);
1684     PetscLogObjectMemory(mat,len*sizeof(int));
1685     if (len) { ierr = PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));CHKERRQ(ierr); }
1686   } else a->garray = 0;
1687 
1688   ierr =  VecDuplicate(oldmat->lvec,&a->lvec);CHKERRQ(ierr);
1689   PetscLogObjectParent(mat,a->lvec);
1690   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx);CHKERRQ(ierr);
1691   PetscLogObjectParent(mat,a->Mvctx);
1692   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A);CHKERRQ(ierr);
1693   PetscLogObjectParent(mat,a->A);
1694   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B);CHKERRQ(ierr);
1695   PetscLogObjectParent(mat,a->B);
1696   ierr = PetscFListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr);
1697   *newmat = mat;
1698   PetscFunctionReturn(0);
1699 }
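
/*
   Usage sketch (illustrative only): duplicate a parallel AIJ matrix either
   with or without copying the numerical values; the ownership information
   and the communication scatter are always replicated, as done above.

     Mat B;
     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
*/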
1700 
1701 #include "petscsys.h"
1702 
1703 EXTERN_C_BEGIN
1704 #undef __FUNCT__
1705 #define __FUNCT__ "MatLoad_MPIAIJ"
1706 int MatLoad_MPIAIJ(PetscViewer viewer,MatType type,Mat *newmat)
1707 {
1708   Mat          A;
1709   Scalar       *vals,*svals;
1710   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1711   MPI_Status   status;
1712   int          i,nz,ierr,j,rstart,rend,fd;
1713   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1714   int          *ourlens,*sndcounts = 0,*procsnz = 0,*offlens,jj,*mycols,*smycols;
1715   int          tag = ((PetscObject)viewer)->tag,cend,cstart,n;
1716 
1717   PetscFunctionBegin;
1718   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1719   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1720   if (!rank) {
1721     ierr = PetscViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
1722     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT);CHKERRQ(ierr);
1723     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"not a matrix object");
1724     if (header[3] < 0) {
1725       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"Matrix in special format on disk, cannot load as MPIAIJ");
1726     }
1727   }
1728 
1729   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
1730   M = header[1]; N = header[2];
1731   /* determine ownership of all rows */
1732   m = M/size + ((M % size) > rank);
1733   ierr = PetscMalloc((size+2)*sizeof(int),&rowners);CHKERRQ(ierr);
1734   ierr = MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1735   rowners[0] = 0;
1736   for (i=2; i<=size; i++) {
1737     rowners[i] += rowners[i-1];
1738   }
1739   rstart = rowners[rank];
1740   rend   = rowners[rank+1];
1741 
1742   /* distribute row lengths to all processors */
1743   ierr    = PetscMalloc(2*(rend-rstart+1)*sizeof(int),&ourlens);CHKERRQ(ierr);
1744   offlens = ourlens + (rend-rstart);
1745   if (!rank) {
1746     ierr = PetscMalloc(M*sizeof(int),&rowlengths);CHKERRQ(ierr);
1747     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT);CHKERRQ(ierr);
1748     ierr = PetscMalloc(size*sizeof(int),&sndcounts);CHKERRQ(ierr);
1749     for (i=0; i<size; i++) sndcounts[i] = rowners[i+1] - rowners[i];
1750     ierr = MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
1751     ierr = PetscFree(sndcounts);CHKERRQ(ierr);
1752   } else {
1753     ierr = MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
1754   }
1755 
1756   if (!rank) {
1757     /* calculate the number of nonzeros on each processor */
1758     ierr = PetscMalloc(size*sizeof(int),&procsnz);CHKERRQ(ierr);
1759     ierr = PetscMemzero(procsnz,size*sizeof(int));CHKERRQ(ierr);
1760     for (i=0; i<size; i++) {
1761       for (j=rowners[i]; j< rowners[i+1]; j++) {
1762         procsnz[i] += rowlengths[j];
1763       }
1764     }
1765     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
1766 
1767     /* determine max buffer needed and allocate it */
1768     maxnz = 0;
1769     for (i=0; i<size; i++) {
1770       maxnz = PetscMax(maxnz,procsnz[i]);
1771     }
1772     ierr = PetscMalloc(maxnz*sizeof(int),&cols);CHKERRQ(ierr);
1773 
1774     /* read in my part of the matrix column indices  */
1775     nz   = procsnz[0];
1776     ierr = PetscMalloc(nz*sizeof(int),&mycols);CHKERRQ(ierr);
1777     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT);CHKERRQ(ierr);
1778 
1779     /* read in everybody else's parts and ship them off */
1780     for (i=1; i<size; i++) {
1781       nz   = procsnz[i];
1782       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
1783       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
1784     }
1785     ierr = PetscFree(cols);CHKERRQ(ierr);
1786   } else {
1787     /* determine buffer space needed for message */
1788     nz = 0;
1789     for (i=0; i<m; i++) {
1790       nz += ourlens[i];
1791     }
1792     ierr = PetscMalloc((nz+1)*sizeof(int),&mycols);CHKERRQ(ierr);
1793 
1794     /* receive message of column indices */
1795     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
1796     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
1797     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with the file");
1798   }
1799 
1800   /* determine column ownership if matrix is not square */
1801   if (N != M) {
1802     n      = N/size + ((N % size) > rank);
1803     ierr   = MPI_Scan(&n,&cend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1804     cstart = cend - n;
1805   } else {
1806     cstart = rstart;
1807     cend   = rend;
1808     n      = cend - cstart;
1809   }
1810 
1811   /* loop over local rows, determining number of off diagonal entries */
1812   ierr = PetscMemzero(offlens,m*sizeof(int));CHKERRQ(ierr);
1813   jj = 0;
1814   for (i=0; i<m; i++) {
1815     for (j=0; j<ourlens[i]; j++) {
1816       if (mycols[jj] < cstart || mycols[jj] >= cend) offlens[i]++;
1817       jj++;
1818     }
1819   }
1820 
1821   /* create our matrix */
1822   for (i=0; i<m; i++) {
1823     ourlens[i] -= offlens[i];
1824   }
1825   ierr = MatCreateMPIAIJ(comm,m,n,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1826   A = *newmat;
1827   ierr = MatSetOption(A,MAT_COLUMNS_SORTED);CHKERRQ(ierr);
1828   for (i=0; i<m; i++) {
1829     ourlens[i] += offlens[i];
1830   }
1831 
1832   if (!rank) {
1833     ierr = PetscMalloc(maxnz*sizeof(Scalar),&vals);CHKERRQ(ierr);
1834 
1835     /* read in my part of the matrix numerical values  */
1836     nz   = procsnz[0];
1837     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
1838 
1839     /* insert into matrix */
1840     jj      = rstart;
1841     smycols = mycols;
1842     svals   = vals;
1843     for (i=0; i<m; i++) {
1844       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1845       smycols += ourlens[i];
1846       svals   += ourlens[i];
1847       jj++;
1848     }
1849 
1850     /* read in other processors and ship out */
1851     for (i=1; i<size; i++) {
1852       nz   = procsnz[i];
1853       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
1854       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
1855     }
1856     ierr = PetscFree(procsnz);CHKERRQ(ierr);
1857   } else {
1858     /* receive numeric values */
1859     ierr = PetscMalloc((nz+1)*sizeof(Scalar),&vals);CHKERRQ(ierr);
1860 
1861     /* receive message of values */
1862     ierr = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
1863     ierr = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
1864     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with the file");
1865 
1866     /* insert into matrix */
1867     jj      = rstart;
1868     smycols = mycols;
1869     svals   = vals;
1870     for (i=0; i<m; i++) {
1871       ierr     = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1872       smycols += ourlens[i];
1873       svals   += ourlens[i];
1874       jj++;
1875     }
1876   }
1877   ierr = PetscFree(ourlens);CHKERRQ(ierr);
1878   ierr = PetscFree(vals);CHKERRQ(ierr);
1879   ierr = PetscFree(mycols);CHKERRQ(ierr);
1880   ierr = PetscFree(rowners);CHKERRQ(ierr);
1881 
1882   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1883   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1884   PetscFunctionReturn(0);
1885 }
1886 EXTERN_C_END
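
/*
   Usage sketch (illustrative only; the file name "matrix.dat" is
   hypothetical): load a matrix stored in PETSc binary format, e.g. one
   previously written with MatView() on a binary viewer, as a parallel
   MPIAIJ matrix.

     PetscViewer viewer;
     Mat         A;
     ierr = PetscViewerBinaryOpen(PETSC_COMM_WORLD,"matrix.dat",PETSC_BINARY_RDONLY,&viewer);CHKERRQ(ierr);
     ierr = MatLoad(viewer,MATMPIAIJ,&A);CHKERRQ(ierr);
     ierr = PetscViewerDestroy(viewer);CHKERRQ(ierr);
*/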
1887 
1888 #undef __FUNCT__
1889 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ"
1890 /*
1891     Not great since it makes two copies of the submatrix: first a SeqAIJ
1892   matrix locally, and then the end result by concatenating the local
1893   matrices. Writing it directly would look much like MatGetSubMatrices_MPIAIJ().
1894 */
1895 int MatGetSubMatrix_MPIAIJ(Mat mat,IS isrow,IS iscol,int csize,MatReuse call,Mat *newmat)
1896 {
1897   int        ierr,i,m,n,rstart,row,rend,nz,*cwork,size,rank,j;
1898   int        *ii,*jj,nlocal,*dlens,*olens,dlen,olen,jend;
1899   Mat        *local,M,Mreuse;
1900   Scalar     *vwork,*aa;
1901   MPI_Comm   comm = mat->comm;
1902   Mat_SeqAIJ *aij;
1903 
1904 
1905   PetscFunctionBegin;
1906   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1907   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1908 
1909   if (call ==  MAT_REUSE_MATRIX) {
1910     ierr = PetscObjectQuery((PetscObject)*newmat,"SubMatrix",(PetscObject *)&Mreuse);CHKERRQ(ierr);
1911     if (!Mreuse) SETERRQ(1,"Submatrix passed in was not used before, cannot reuse");
1912     local = &Mreuse;
1913     ierr  = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_REUSE_MATRIX,&local);CHKERRQ(ierr);
1914   } else {
1915     ierr = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&local);CHKERRQ(ierr);
1916     Mreuse = *local;
1917     ierr = PetscFree(local);CHKERRQ(ierr);
1918   }
1919 
1920   /*
1921       m - number of local rows
1922       n - number of columns (same on all processors)
1923       rstart - first row in new global matrix generated
1924   */
1925   ierr = MatGetSize(Mreuse,&m,&n);CHKERRQ(ierr);
1926   if (call == MAT_INITIAL_MATRIX) {
1927     aij = (Mat_SeqAIJ*)(Mreuse)->data;
1928     if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,"No support for index shifted matrix");
1929     ii  = aij->i;
1930     jj  = aij->j;
1931 
1932     /*
1933         Determine the number of non-zeros in the diagonal and off-diagonal
1934         portions of the matrix in order to do correct preallocation
1935     */
1936 
1937     /* first get start and end of "diagonal" columns */
1938     if (csize == PETSC_DECIDE) {
1939       nlocal = n/size + ((n % size) > rank);
1940     } else {
1941       nlocal = csize;
1942     }
1943     ierr   = MPI_Scan(&nlocal,&rend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1944     rstart = rend - nlocal;
1945     if (rank == size - 1 && rend != n) {
1946       SETERRQ(1,"Local column sizes do not add up to total number of columns");
1947     }
1948 
1949     /* next, compute all the lengths */
1950     ierr  = PetscMalloc((2*m+1)*sizeof(int),&dlens);CHKERRQ(ierr);
1951     olens = dlens + m;
1952     for (i=0; i<m; i++) {
1953       jend = ii[i+1] - ii[i];
1954       olen = 0;
1955       dlen = 0;
1956       for (j=0; j<jend; j++) {
1957         if (*jj < rstart || *jj >= rend) olen++;
1958         else dlen++;
1959         jj++;
1960       }
1961       olens[i] = olen;
1962       dlens[i] = dlen;
1963     }
1964     ierr = MatCreateMPIAIJ(comm,m,nlocal,PETSC_DECIDE,n,0,dlens,0,olens,&M);CHKERRQ(ierr);
1965     ierr = PetscFree(dlens);CHKERRQ(ierr);
1966   } else {
1967     int ml,nl;
1968 
1969     M = *newmat;
1970     ierr = MatGetLocalSize(M,&ml,&nl);CHKERRQ(ierr);
1971     if (ml != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Previous matrix must be same size/layout as request");
1972     ierr = MatZeroEntries(M);CHKERRQ(ierr);
1973     /*
1974          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
1975        rather than the slower MatSetValues().
1976     */
1977     M->was_assembled = PETSC_TRUE;
1978     M->assembled     = PETSC_FALSE;
1979   }
1980   ierr = MatGetOwnershipRange(M,&rstart,&rend);CHKERRQ(ierr);
1981   aij = (Mat_SeqAIJ*)(Mreuse)->data;
1982   if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,"No support for index shifted matrix");
1983   ii  = aij->i;
1984   jj  = aij->j;
1985   aa  = aij->a;
1986   for (i=0; i<m; i++) {
1987     row   = rstart + i;
1988     nz    = ii[i+1] - ii[i];
1989     cwork = jj;     jj += nz;
1990     vwork = aa;     aa += nz;
1991     ierr = MatSetValues_MPIAIJ(M,1,&row,nz,cwork,vwork,INSERT_VALUES);CHKERRQ(ierr);
1992   }
1993 
1994   ierr = MatAssemblyBegin(M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1995   ierr = MatAssemblyEnd(M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1996   *newmat = M;
1997 
1998   /* save submatrix used in processor for next request */
1999   if (call ==  MAT_INITIAL_MATRIX) {
2000     ierr = PetscObjectCompose((PetscObject)M,"SubMatrix",(PetscObject)Mreuse);CHKERRQ(ierr);
2001     ierr = PetscObjectDereference((PetscObject)Mreuse);CHKERRQ(ierr);
2002   }
2003 
2004   PetscFunctionReturn(0);
2005 }
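
/*
   Usage sketch (illustrative only; nloc and first are hypothetical local
   sizes/offsets): extract a parallel submatrix from the rows and columns
   given by index sets; csize = PETSC_DECIDE lets PETSc choose the local
   column layout of the result.

     IS  isrow,iscol;
     Mat S;
     ierr = ISCreateStride(PETSC_COMM_WORLD,nloc,first,1,&isrow);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_WORLD,nloc,first,1,&iscol);CHKERRQ(ierr);
     ierr = MatGetSubMatrix(A,isrow,iscol,PETSC_DECIDE,MAT_INITIAL_MATRIX,&S);CHKERRQ(ierr);
*/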
2006 
2007 #undef __FUNCT__
2008 #define __FUNCT__ "MatMPIAIJSetPreallocation"
2009 /*@C
2010    MatMPIAIJSetPreallocation - Creates a sparse parallel matrix in AIJ format
2011    (the default parallel PETSc format).  For good matrix assembly performance
2012    the user should preallocate the matrix storage by setting the parameters
2013    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
2014    performance can be increased by more than a factor of 50.
2015 
2016    Collective on MPI_Comm
2017 
2018    Input Parameters:
2019 +  A - the matrix
2020 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
2021            (same value is used for all local rows)
2022 .  d_nnz - array containing the number of nonzeros in the various rows of the
2023            DIAGONAL portion of the local submatrix (possibly different for each row)
2024            or PETSC_NULL, if d_nz is used to specify the nonzero structure.
2025            The size of this array is equal to the number of local rows, i.e. 'm'.
2026            You must leave room for the diagonal entry even if it is zero.
2027 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
2028            submatrix (same value is used for all local rows).
2029 -  o_nnz - array containing the number of nonzeros in the various rows of the
2030            OFF-DIAGONAL portion of the local submatrix (possibly different for
2031            each row) or PETSC_NULL, if o_nz is used to specify the nonzero
2032            structure. The size of this array is equal to the number
2033            of local rows, i.e. 'm'.
2034 
2035    The AIJ format (also called the Yale sparse matrix format or
2036    compressed row storage) is fully compatible with standard Fortran 77
2037    storage.  That is, the stored row and column indices can begin at
2038    either one (as in Fortran) or zero.  See the users manual for details.
2039 
2040    The user MUST specify either the local or global matrix dimensions
2041    (possibly both).
2042 
2043    The parallel matrix is partitioned such that the first m0 rows belong to
2044    process 0, the next m1 rows belong to process 1, the next m2 rows belong
2045    to process 2, etc., where m0,m1,m2,... are the input parameter 'm'.
2046 
2047    The DIAGONAL portion of the local submatrix of a processor can be defined
2048    as the submatrix which is obtained by extracting the part corresponding
2049    to the rows r1-r2 and columns r1-r2 of the global matrix, where r1 is the
2050    first row that belongs to the processor and r2 is the last row belonging
2051    to this processor. This is a square mxm matrix. The remaining m x (N-m)
2052    portion of the local submatrix constitutes the OFF-DIAGONAL portion.
2053 
2054    If d_nnz and o_nnz are specified, then d_nz and o_nz are ignored.
2055 
2056    By default, this format uses inodes (identical nodes) when possible.
2057    We search for consecutive rows with the same nonzero structure, thereby
2058    reusing matrix information to achieve increased efficiency.
2059 
2060    Options Database Keys:
2061 +  -mat_aij_no_inode  - Do not use inodes
2062 .  -mat_aij_inode_limit <limit> - Sets inode limit (max limit=5)
2063 -  -mat_aij_oneindex - Internally use indexing starting at 1
2064         rather than 0.  Note that when calling MatSetValues(),
2065         the user still MUST index entries starting at 0!
2066 
2067    Example usage:
2068 
2069    Consider the following 8x8 matrix with 34 nonzero values that is
2070    assembled across 3 processors. Let us assume that proc0 owns 3 rows,
2071    proc1 owns 3 rows, and proc2 owns 2 rows. This division can be shown
2072    as follows:
2073 
2074 .vb
2075             1  2  0  |  0  3  0  |  0  4
2076     Proc0   0  5  6  |  7  0  0  |  8  0
2077             9  0 10  | 11  0  0  | 12  0
2078     -------------------------------------
2079            13  0 14  | 15 16 17  |  0  0
2080     Proc1   0 18  0  | 19 20 21  |  0  0
2081             0  0  0  | 22 23  0  | 24  0
2082     -------------------------------------
2083     Proc2  25 26 27  |  0  0 28  | 29  0
2084            30  0  0  | 31 32 33  |  0 34
2085 .ve
2086 
2087    This can be represented as a collection of submatrices as:
2088 
2089 .vb
2090       A B C
2091       D E F
2092       G H I
2093 .ve
2094 
2095    Where the submatrices A,B,C are owned by proc0, D,E,F are
2096    owned by proc1, G,H,I are owned by proc2.
2097 
2098    The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2099    The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2100    The 'M','N' parameters are 8,8, and have the same values on all procs.
2101 
2102    The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
2103    submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
2104    corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
2105    Internally, each processor stores the DIAGONAL part and the OFF-DIAGONAL
2106    part as SeqAIJ matrices. For example, proc1 will store [E] as one SeqAIJ
2107    matrix and [DF] as another SeqAIJ matrix.
2108 
2109    When d_nz, o_nz parameters are specified, d_nz storage elements are
2110    allocated for every row of the local diagonal submatrix, and o_nz
2111    storage locations are allocated for every row of the OFF-DIAGONAL submat.
2112    One way to choose d_nz and o_nz is to use the maximum number of nonzeros
2113    per row of the local DIAGONAL and OFF-DIAGONAL submatrices, respectively.
2114    In this case, the values of d_nz,o_nz are:
2115 .vb
2116      proc0 : d_nz = 2, o_nz = 2
2117      proc1 : d_nz = 3, o_nz = 2
2118      proc2 : d_nz = 1, o_nz = 4
2119 .ve
2120    We are allocating m*(d_nz+o_nz) storage locations for every proc. This
2121    translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, and 2*(1+4)=10
2122    for proc2, i.e. we are using 12+15+10=37 storage locations to store
2123    34 values.
2124 
2125    When the d_nnz, o_nnz parameters are specified, the storage is specified
2126    for every row, corresponding to both the DIAGONAL and OFF-DIAGONAL submatrices.
2127    In the above case the values for d_nnz,o_nnz are:
2128 .vb
2129      proc0: d_nnz = [2,2,2] and o_nnz = [2,2,2]
2130      proc1: d_nnz = [3,3,2] and o_nnz = [2,1,1]
2131      proc2: d_nnz = [1,1]   and o_nnz = [4,4]
2132 .ve
2133    Here the space allocated is the sum of all the above values, i.e. 34, and
2134    hence the preallocation is perfect.
2135 
2136    Level: intermediate
2137 
2138 .keywords: matrix, aij, compressed row, sparse, parallel
2139 
2140 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2141 @*/
2142 int MatMPIAIJSetPreallocation(Mat B,int d_nz,int *d_nnz,int o_nz,int *o_nnz)
2143 {
2144   Mat_MPIAIJ   *b;
2145   int          ierr,i;
2146   PetscTruth   flg2;
2147 
2148   PetscFunctionBegin;
2149   ierr = PetscTypeCompare((PetscObject)B,MATMPIAIJ,&flg2);CHKERRQ(ierr);
2150   if (!flg2) PetscFunctionReturn(0);
2151   B->preallocated = PETSC_TRUE;
2152   if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
2153   if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
2154   if (d_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %d",d_nz);
2155   if (o_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %d",o_nz);
2156   if (d_nnz) {
2157     for (i=0; i<B->m; i++) {
2158       if (d_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %d value %d",i,d_nnz[i]);
2159     }
2160   }
2161   if (o_nnz) {
2162     for (i=0; i<B->m; i++) {
2163       if (o_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %d value %d",i,o_nnz[i]);
2164     }
2165   }
2166   b = (Mat_MPIAIJ*)B->data;
2167 
2168   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,B->m,B->n,d_nz,d_nnz,&b->A);CHKERRQ(ierr);
2169   PetscLogObjectParent(B,b->A);
2170   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,B->m,B->N,o_nz,o_nnz,&b->B);CHKERRQ(ierr);
2171   PetscLogObjectParent(B,b->B);
2172 
2173   PetscFunctionReturn(0);
2174 }
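
/*
   Usage sketch (illustrative only), using the proc1 numbers from the manual
   page above; each process passes its own local sizes and arrays:

     Mat A;
     int d_nnz[3] = {3,3,2},o_nnz[3] = {2,1,1};
     ierr = MatCreate(PETSC_COMM_WORLD,3,3,8,8,&A);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJ);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,0,d_nnz,0,o_nnz);CHKERRQ(ierr);
*/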
2175 
2176 #undef __FUNCT__
2177 #define __FUNCT__ "MatCreateMPIAIJ"
2178 /*@C
2179    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
2180    (the default parallel PETSc format).  For good matrix assembly performance
2181    the user should preallocate the matrix storage by setting the parameters
2182    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
2183    performance can be increased by more than a factor of 50.
2184 
2185    Collective on MPI_Comm
2186 
2187    Input Parameters:
2188 +  comm - MPI communicator
2189 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
2190            This value should be the same as the local size used in creating the
2191            y vector for the matrix-vector product y = Ax.
2192 .  n - This value should be the same as the local size used in creating the
2193        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
2194        calculated if N is given) For square matrices n is almost always m.
2195 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
2196 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
2197 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
2198            (same value is used for all local rows)
2199 .  d_nnz - array containing the number of nonzeros in the various rows of the
2200            DIAGONAL portion of the local submatrix (possibly different for each row)
2201            or PETSC_NULL, if d_nz is used to specify the nonzero structure.
2202            The size of this array is equal to the number of local rows, i.e. 'm'.
2203            You must leave room for the diagonal entry even if it is zero.
2204 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
2205            submatrix (same value is used for all local rows).
2206 -  o_nnz - array containing the number of nonzeros in the various rows of the
2207            OFF-DIAGONAL portion of the local submatrix (possibly different for
2208            each row) or PETSC_NULL, if o_nz is used to specify the nonzero
2209            structure. The size of this array is equal to the number
2210            of local rows, i.e. 'm'.
2211 
2212    Output Parameter:
2213 .  A - the matrix
2214 
2215    Notes:
2216    m,n,M,N parameters specify the size of the matrix, and its partitioning across
2217    processors, while d_nz,d_nnz,o_nz,o_nnz parameters specify the approximate
2218    storage requirements for this matrix.
2219 
2220    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one
2221    processor then it must be used on all processors that share the object for
2222    that argument.
2223 
2224    The AIJ format (also called the Yale sparse matrix format or
2225    compressed row storage) is fully compatible with standard Fortran 77
2226    storage.  That is, the stored row and column indices can begin at
2227    either one (as in Fortran) or zero.  See the users manual for details.
2228 
2229    The user MUST specify either the local or global matrix dimensions
2230    (possibly both).
2231 
2232    The parallel matrix is partitioned such that the first m0 rows belong to
2233    process 0, the next m1 rows belong to process 1, the next m2 rows belong
2234    to process 2, etc., where m0,m1,m2,... are the input parameter 'm'.
2235 
2236    The DIAGONAL portion of the local submatrix of a processor can be defined
2237    as the submatrix which is obtained by extracting the part corresponding
2238    to the rows r1-r2 and columns r1-r2 of the global matrix, where r1 is the
2239    first row that belongs to the processor and r2 is the last row belonging
2240    to this processor. This is a square mxm matrix. The remaining m x (N-m)
2241    portion of the local submatrix constitutes the OFF-DIAGONAL portion.
2242 
2243    If d_nnz and o_nnz are specified, then d_nz and o_nz are ignored.
2244 
2245    By default, this format uses inodes (identical nodes) when possible.
2246    We search for consecutive rows with the same nonzero structure, thereby
2247    reusing matrix information to achieve increased efficiency.
2248 
2249    Options Database Keys:
2250 +  -mat_aij_no_inode  - Do not use inodes
2251 .  -mat_aij_inode_limit <limit> - Sets inode limit (max limit=5)
2252 -  -mat_aij_oneindex - Internally use indexing starting at 1
2253         rather than 0.  Note that when calling MatSetValues(),
2254         the user still MUST index entries starting at 0!
2255 
2256 
2257    Example usage:
2258 
2259    Consider the following 8x8 matrix with 34 nonzero values that is
2260    assembled across 3 processors. Let us assume that proc0 owns 3 rows,
2261    proc1 owns 3 rows, and proc2 owns 2 rows. This division can be shown
2262    as follows:
2263 
2264 .vb
2265             1  2  0  |  0  3  0  |  0  4
2266     Proc0   0  5  6  |  7  0  0  |  8  0
2267             9  0 10  | 11  0  0  | 12  0
2268     -------------------------------------
2269            13  0 14  | 15 16 17  |  0  0
2270     Proc1   0 18  0  | 19 20 21  |  0  0
2271             0  0  0  | 22 23  0  | 24  0
2272     -------------------------------------
2273     Proc2  25 26 27  |  0  0 28  | 29  0
2274            30  0  0  | 31 32 33  |  0 34
2275 .ve
2276 
2277    This can be represented as a collection of submatrices as:
2278 
2279 .vb
2280       A B C
2281       D E F
2282       G H I
2283 .ve
2284 
2285    Where the submatrices A,B,C are owned by proc0, D,E,F are
2286    owned by proc1, G,H,I are owned by proc2.
2287 
2288    The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2289    The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
2290    The 'M','N' parameters are 8,8, and have the same values on all procs.
2291 
2292    The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
2293    submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
2294    corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
2295    Internally, each processor stores the DIAGONAL part and the OFF-DIAGONAL
2296    part as SeqAIJ matrices. For example, proc1 will store [E] as one SeqAIJ
2297    matrix and [DF] as another SeqAIJ matrix.
2298 
2299    When d_nz, o_nz parameters are specified, d_nz storage elements are
2300    allocated for every row of the local diagonal submatrix, and o_nz
2301    storage locations are allocated for every row of the OFF-DIAGONAL submat.
2302    One way to choose d_nz and o_nz is to use the maximum number of nonzeros
2303    per row of the local DIAGONAL and OFF-DIAGONAL submatrices, respectively.
2304    In this case, the values of d_nz,o_nz are:
2305 .vb
2306      proc0 : d_nz = 2, o_nz = 2
2307      proc1 : d_nz = 3, o_nz = 2
2308      proc2 : d_nz = 1, o_nz = 4
2309 .ve
2310    We are allocating m*(d_nz+o_nz) storage locations for every proc. This
2311    translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, and 2*(1+4)=10
2312    for proc2, i.e. we are using 12+15+10=37 storage locations to store
2313    34 values.
2314 
2315    When the d_nnz, o_nnz parameters are specified, the storage is specified
2316    for every row, corresponding to both the DIAGONAL and OFF-DIAGONAL submatrices.
2317    In the above case the values for d_nnz,o_nnz are:
2318 .vb
2319      proc0: d_nnz = [2,2,2] and o_nnz = [2,2,2]
2320      proc1: d_nnz = [3,3,2] and o_nnz = [2,1,1]
2321      proc2: d_nnz = [1,1]   and o_nnz = [4,4]
2322 .ve
2323    Here the space allocated is the sum of all the above values, i.e. 34, and
2324    hence the preallocation is perfect.
2325 
2326    Level: intermediate
2327 
2328 .keywords: matrix, aij, compressed row, sparse, parallel
2329 
2330 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2331 @*/
2332 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
2333 {
2334   int ierr,size;
2335 
2336   PetscFunctionBegin;
2337   ierr = MatCreate(comm,m,n,M,N,A);CHKERRQ(ierr);
2338   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2339   if (size > 1) {
2340     ierr = MatSetType(*A,MATMPIAIJ);CHKERRQ(ierr);
2341     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
2342   } else {
2343     ierr = MatSetType(*A,MATSEQAIJ);CHKERRQ(ierr);
2344     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
2345   }
2346   PetscFunctionReturn(0);
2347 }
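
/*
   Usage sketch (illustrative only) for the 8x8 example in the manual page
   above, using the scalar d_nz/o_nz form; proc0 would call

     Mat A;
     ierr = MatCreateMPIAIJ(PETSC_COMM_WORLD,3,3,8,8,2,PETSC_NULL,2,PETSC_NULL,&A);CHKERRQ(ierr);

   while proc1 and proc2 pass their own local sizes and d_nz/o_nz values.
*/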
2348 
2349 #undef __FUNCT__
2350 #define __FUNCT__ "MatMPIAIJGetSeqAIJ"
2351 int MatMPIAIJGetSeqAIJ(Mat A,Mat *Ad,Mat *Ao,int **colmap)
2352 {
2353   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2354   PetscFunctionBegin;
2355   *Ad     = a->A;
2356   *Ao     = a->B;
2357   *colmap = a->garray;
2358   PetscFunctionReturn(0);
2359 }
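
/*
   Usage sketch (illustrative only): access the two sequential blocks and
   the column map; colmap[j] (i.e. garray[j]) is the global column index of
   local column j of the off-diagonal block Ao.

     Mat Ad,Ao;
     int *colmap;
     ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&colmap);CHKERRQ(ierr);
*/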
2360 
2361 #undef __FUNCT__
2362 #define __FUNCT__ "MatSetColoring_MPIAIJ"
2363 int MatSetColoring_MPIAIJ(Mat A,ISColoring coloring)
2364 {
2365   int        ierr;
2366   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
2367 
2368   PetscFunctionBegin;
2369   if (coloring->ctype == IS_COLORING_LOCAL) {
2370     int        *allcolors,*colors,i;
2371     ISColoring ocoloring;
2372 
2373     /* set coloring for diagonal portion */
2374     ierr = MatSetColoring_SeqAIJ(a->A,coloring);CHKERRQ(ierr);
2375 
2376     /* set coloring for off-diagonal portion */
2377     ierr = ISAllGatherIndices(A->comm,coloring->n,coloring->colors,PETSC_NULL,&allcolors);CHKERRQ(ierr);
2378     ierr = PetscMalloc((a->B->n+1)*sizeof(int),&colors);CHKERRQ(ierr);
2379     for (i=0; i<a->B->n; i++) {
2380       colors[i] = allcolors[a->garray[i]];
2381     }
2382     ierr = PetscFree(allcolors);CHKERRQ(ierr);
2383     ierr = ISColoringCreate(MPI_COMM_SELF,a->B->n,colors,&ocoloring);CHKERRQ(ierr);
2384     ierr = MatSetColoring_SeqAIJ(a->B,ocoloring);CHKERRQ(ierr);
2385     ierr = ISColoringDestroy(ocoloring);CHKERRQ(ierr);
2386   } else if (coloring->ctype == IS_COLORING_GHOSTED) {
2387     int        *colors,i,*larray;
2388     ISColoring ocoloring;
2389 
2390     /* set coloring for diagonal portion */
2391     ierr = PetscMalloc((a->A->n+1)*sizeof(int),&larray);CHKERRQ(ierr);
2392     for (i=0; i<a->A->n; i++) {
2393       larray[i] = i + a->cstart;
2394     }
2395     ierr = ISGlobalToLocalMappingApply(A->mapping,IS_GTOLM_MASK,a->A->n,larray,PETSC_NULL,larray);CHKERRQ(ierr);
2396     ierr = PetscMalloc((a->A->n+1)*sizeof(int),&colors);CHKERRQ(ierr);
2397     for (i=0; i<a->A->n; i++) {
2398       colors[i] = coloring->colors[larray[i]];
2399     }
2400     ierr = PetscFree(larray);CHKERRQ(ierr);
2401     ierr = ISColoringCreate(MPI_COMM_SELF,a->A->n,colors,&ocoloring);CHKERRQ(ierr);
2402     ierr = MatSetColoring_SeqAIJ(a->A,ocoloring);CHKERRQ(ierr);
2403     ierr = ISColoringDestroy(ocoloring);CHKERRQ(ierr);
2404 
2405     /* set coloring for off-diagonal portion */
2406     ierr = PetscMalloc((a->B->n+1)*sizeof(int),&larray);CHKERRQ(ierr);
2407     ierr = ISGlobalToLocalMappingApply(A->mapping,IS_GTOLM_MASK,a->B->n,a->garray,PETSC_NULL,larray);CHKERRQ(ierr);
2408     ierr = PetscMalloc((a->B->n+1)*sizeof(int),&colors);CHKERRQ(ierr);
2409     for (i=0; i<a->B->n; i++) {
2410       colors[i] = coloring->colors[larray[i]];
2411     }
2412     ierr = PetscFree(larray);CHKERRQ(ierr);
2413     ierr = ISColoringCreate(MPI_COMM_SELF,a->B->n,colors,&ocoloring);CHKERRQ(ierr);
2414     ierr = MatSetColoring_SeqAIJ(a->B,ocoloring);CHKERRQ(ierr);
2415     ierr = ISColoringDestroy(ocoloring);CHKERRQ(ierr);
2416   } else {
2417     SETERRQ1(1,"No support for ISColoringType %d",coloring->ctype);
2418   }
2419 
2420   PetscFunctionReturn(0);
2421 }
2422 
2423 #undef __FUNCT__
2424 #define __FUNCT__ "MatSetValuesAdic_MPIAIJ"
2425 int MatSetValuesAdic_MPIAIJ(Mat A,void *advalues)
2426 {
2427   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
2428   int        ierr;
2429 
2430   PetscFunctionBegin;
2431   ierr = MatSetValuesAdic_SeqAIJ(a->A,advalues);CHKERRQ(ierr);
2432   ierr = MatSetValuesAdic_SeqAIJ(a->B,advalues);CHKERRQ(ierr);
2433   PetscFunctionReturn(0);
2434 }
2435 
2436 #undef __FUNCT__
2437 #define __FUNCT__ "MatSetValuesAdifor_MPIAIJ"
2438 int MatSetValuesAdifor_MPIAIJ(Mat A,int nl,void *advalues)
2439 {
2440   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
2441   int        ierr;
2442 
2443   PetscFunctionBegin;
2444   ierr = MatSetValuesAdifor_SeqAIJ(a->A,nl,advalues);CHKERRQ(ierr);
2445   ierr = MatSetValuesAdifor_SeqAIJ(a->B,nl,advalues);CHKERRQ(ierr);
2446   PetscFunctionReturn(0);
2447 }
2448