xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 07fdaae1fdcf82bbf3df981bb743e49930d8d418)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpiaij.c,v 1.268 1998/12/17 22:10:16 bsmith Exp bsmith $";
3 #endif
4 
5 #include "src/mat/impls/aij/mpi/mpiaij.h"
6 #include "src/vec/vecimpl.h"
7 #include "src/inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way since the
12 length of colmap equals the global matrix length.
13 */
14 #undef __FUNC__
15 #define __FUNC__ "CreateColmap_MPIAIJ_Private"
16 int CreateColmap_MPIAIJ_Private(Mat mat)
17 {
18   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
19   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
20   int        n = B->n,i;
21 
22   PetscFunctionBegin;
23   aij->colmap = (int *) PetscMalloc((aij->N+1)*sizeof(int));CHKPTRQ(aij->colmap);
24   PLogObjectMemory(mat,aij->N*sizeof(int));
25   PetscMemzero(aij->colmap,aij->N*sizeof(int));
26   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i+1;
27   PetscFunctionReturn(0);
28 }
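
/*
   A minimal usage sketch (illustrative only, not part of the library) of how the
   colmap built above is consulted elsewhere in this file: colmap[] stores the
   local index plus one, so a value of 0 means "this global column does not occur
   in the off-diagonal block".  The helper name below is hypothetical.
*/
#if 0
static int LookupOffDiagColumn_Sketch(Mat_MPIAIJ *aij,int gcol,int *lcol)
{
  PetscFunctionBegin;
  *lcol = aij->colmap[gcol] - 1;   /* becomes -1 when gcol is not present in B */
  PetscFunctionReturn(0);
}
#endif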
29 
30 extern int DisAssemble_MPIAIJ(Mat);
31 
32 #define CHUNKSIZE   15
33 #define MatSetValues_SeqAIJ_A_Private(row,col,value,addv) \
34 { \
35  \
36     rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
37     rmax = aimax[row]; nrow = ailen[row];  \
38     col1 = col - shift; \
39      \
40     low = 0; high = nrow; \
41     while (high-low > 5) { \
42       t = (low+high)/2; \
43       if (rp[t] > col) high = t; \
44       else             low  = t; \
45     } \
46       for ( _i=0; _i<nrow; _i++ ) { \
47         if (rp[_i] > col1) break; \
48         if (rp[_i] == col1) { \
49           if (addv == ADD_VALUES) ap[_i] += value;   \
50           else                  ap[_i] = value; \
51           goto a_noinsert; \
52         } \
53       }  \
54       if (nonew == 1) goto a_noinsert; \
55       else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
56       if (nrow >= rmax) { \
57         /* there is no extra room in row, therefore enlarge */ \
58         int    new_nz = ai[a->m] + CHUNKSIZE,len,*new_i,*new_j; \
59         Scalar *new_a; \
60  \
61         if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
62  \
63         /* malloc new storage space */ \
64         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(a->m+1)*sizeof(int); \
65         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
66         new_j   = (int *) (new_a + new_nz); \
67         new_i   = new_j + new_nz; \
68  \
69         /* copy over old data into new slots */ \
70         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];} \
71         for ( ii=row+1; ii<a->m+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
72         PetscMemcpy(new_j,aj,(ai[row]+nrow+shift)*sizeof(int)); \
73         len = (new_nz - CHUNKSIZE - ai[row] - nrow - shift); \
74         PetscMemcpy(new_j+ai[row]+shift+nrow+CHUNKSIZE,aj+ai[row]+shift+nrow, \
75                                                            len*sizeof(int)); \
76         PetscMemcpy(new_a,aa,(ai[row]+nrow+shift)*sizeof(Scalar)); \
77         PetscMemcpy(new_a+ai[row]+shift+nrow+CHUNKSIZE,aa+ai[row]+shift+nrow, \
78                                                            len*sizeof(Scalar));  \
79         /* free up old matrix storage */ \
80  \
81         PetscFree(a->a);  \
82         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
83         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
84         a->singlemalloc = 1; \
85  \
86         rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
87         rmax = aimax[row] = aimax[row] + CHUNKSIZE; \
88         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
89         a->maxnz += CHUNKSIZE; \
90         a->reallocs++; \
91       } \
92       N = nrow++ - 1; a->nz++; \
93       /* shift up all the later entries in this row */ \
94       for ( ii=N; ii>=_i; ii-- ) { \
95         rp[ii+1] = rp[ii]; \
96         ap[ii+1] = ap[ii]; \
97       } \
98       rp[_i] = col1;  \
99       ap[_i] = value;  \
100       a_noinsert: ; \
101       ailen[row] = nrow; \
102 }
103 
104 #define MatSetValues_SeqAIJ_B_Private(row,col,value,addv) \
105 { \
106  \
107     rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
108     rmax = bimax[row]; nrow = bilen[row];  \
109     col1 = col - shift; \
110      \
111     low = 0; high = nrow; \
112     while (high-low > 5) { \
113       t = (low+high)/2; \
114       if (rp[t] > col) high = t; \
115       else             low  = t; \
116     } \
117        for ( _i=0; _i<nrow; _i++ ) { \
118         if (rp[_i] > col1) break; \
119         if (rp[_i] == col1) { \
120           if (addv == ADD_VALUES) ap[_i] += value;   \
121           else                  ap[_i] = value; \
122           goto b_noinsert; \
123         } \
124       }  \
125       if (nonew == 1) goto b_noinsert; \
126       else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
127       if (nrow >= rmax) { \
128         /* there is no extra room in row, therefore enlarge */ \
129         int    new_nz = bi[b->m] + CHUNKSIZE,len,*new_i,*new_j; \
130         Scalar *new_a; \
131  \
132         if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
133  \
134         /* malloc new storage space */ \
135         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(b->m+1)*sizeof(int); \
136         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
137         new_j   = (int *) (new_a + new_nz); \
138         new_i   = new_j + new_nz; \
139  \
140         /* copy over old data into new slots */ \
141         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = bi[ii];} \
142         for ( ii=row+1; ii<b->m+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
143         PetscMemcpy(new_j,bj,(bi[row]+nrow+shift)*sizeof(int)); \
144         len = (new_nz - CHUNKSIZE - bi[row] - nrow - shift); \
145         PetscMemcpy(new_j+bi[row]+shift+nrow+CHUNKSIZE,bj+bi[row]+shift+nrow, \
146                                                            len*sizeof(int)); \
147         PetscMemcpy(new_a,ba,(bi[row]+nrow+shift)*sizeof(Scalar)); \
148         PetscMemcpy(new_a+bi[row]+shift+nrow+CHUNKSIZE,ba+bi[row]+shift+nrow, \
149                                                            len*sizeof(Scalar));  \
150         /* free up old matrix storage */ \
151  \
152         PetscFree(b->a);  \
153         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
154         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
155         b->singlemalloc = 1; \
156  \
157         rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
158         rmax = bimax[row] = bimax[row] + CHUNKSIZE; \
159         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
160         b->maxnz += CHUNKSIZE; \
161         b->reallocs++; \
162       } \
163       N = nrow++ - 1; b->nz++; \
164       /* shift up all the later entries in this row */ \
165       for ( ii=N; ii>=_i; ii-- ) { \
166         rp[ii+1] = rp[ii]; \
167         ap[ii+1] = ap[ii]; \
168       } \
169       rp[_i] = col1;  \
170       ap[_i] = value;  \
171       b_noinsert: ; \
172       bilen[row] = nrow; \
173 }
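
/*
   The two macros above share one pattern: search the sorted row for the column,
   add to or overwrite an existing entry, otherwise shift the later entries up and
   insert, enlarging the row storage by CHUNKSIZE when it is full.  The function
   below is a simplified, self-contained sketch of that pattern for a single row
   held in plain arrays; it ignores the indexshift and the reallocation details
   and is not part of the library.
*/
#if 0
static int InsertIntoSortedRow_Sketch(int *cols,Scalar *vals,int *nrow,int rmax,
                                      int col,Scalar value,InsertMode addv)
{
  int i,k;

  for ( i=0; i<*nrow; i++ ) {
    if (cols[i] > col) break;
    if (cols[i] == col) {                /* column already present in the row */
      if (addv == ADD_VALUES) vals[i] += value;
      else                    vals[i]  = value;
      return 0;
    }
  }
  if (*nrow >= rmax) return 1;           /* caller must first enlarge the row (CHUNKSIZE) */
  for ( k=*nrow; k>i; k-- ) {            /* shift later entries up to make a hole */
    cols[k] = cols[k-1];
    vals[k] = vals[k-1];
  }
  cols[i] = col; vals[i] = value; (*nrow)++;
  return 0;
}
#endif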
174 
175 extern int MatSetValues_SeqAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
176 #undef __FUNC__
177 #define __FUNC__ "MatSetValues_MPIAIJ"
178 int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
179 {
180   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
181   Scalar     value;
182   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
183   int        cstart = aij->cstart, cend = aij->cend,row,col;
184   int        roworiented = aij->roworiented;
185 
186   /* Some Variables required in the macro */
187   Mat        A = aij->A;
188   Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data;
189   int        *aimax = a->imax, *ai = a->i, *ailen = a->ilen,*aj = a->j;
190   Scalar     *aa = a->a;
191 
192   Mat        B = aij->B;
193   Mat_SeqAIJ *b = (Mat_SeqAIJ *) B->data;
194   int        *bimax = b->imax, *bi = b->i, *bilen = b->ilen,*bj = b->j;
195   Scalar     *ba = b->a;
196 
197   int        *rp,ii,nrow,_i,rmax, N, col1,low,high,t;
198   int        nonew = a->nonew,shift = a->indexshift;
199   Scalar     *ap;
200 
201   PetscFunctionBegin;
202   for ( i=0; i<m; i++ ) {
203 #if defined(USE_PETSC_BOPT_g)
204     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
205     if (im[i] >= aij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
206 #endif
207     if (im[i] >= rstart && im[i] < rend) {
208       row = im[i] - rstart;
209       for ( j=0; j<n; j++ ) {
210         if (in[j] >= cstart && in[j] < cend){
211           col = in[j] - cstart;
212           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
213           MatSetValues_SeqAIJ_A_Private(row,col,value,addv);
214           /* ierr = MatSetValues_SeqAIJ(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
215         }
216 #if defined(USE_PETSC_BOPT_g)
217         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
218         else if (in[j] >= aij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
219 #endif
220         else {
221           if (mat->was_assembled) {
222             if (!aij->colmap) {
223               ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
224             }
225             col = aij->colmap[in[j]] - 1;
226             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
227               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
228               col =  in[j];
229               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
230               B = aij->B;
231               b = (Mat_SeqAIJ *) B->data;
232               bimax = b->imax; bi = b->i; bilen = b->ilen; bj = b->j;
233               ba = b->a;
234             }
235           } else col = in[j];
236           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
237           MatSetValues_SeqAIJ_B_Private(row,col,value,addv);
238           /* ierr = MatSetValues_SeqAIJ(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
239         }
240       }
241     }
242     else {
243       if (roworiented && !aij->donotstash) {
244         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
245       }
246       else {
247         if (!aij->donotstash) {
248           row = im[i];
249           for ( j=0; j<n; j++ ) {
250             ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
251           }
252         }
253       }
254     }
255   }
256   PetscFunctionReturn(0);
257 }
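
/*
   Usage sketch (illustrative only): callers of MatSetValues() always pass GLOBAL
   row and column indices.  Rows owned by this process are routed above into the
   diagonal (A) or off-diagonal (B) block; rows owned elsewhere are stashed and
   shipped during assembly.  The sizes and values below are made up for the example.
*/
#if 0
static int SetValuesExample_Sketch(void)
{
  Mat    C;
  Scalar one = 1.0;
  int    ierr,rank,row,col;

  PetscFunctionBegin;
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  ierr = MatCreateMPIAIJ(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,
                         5,PETSC_NULL,2,PETSC_NULL,&C);CHKERRQ(ierr);
  row  = rank; col = (rank+1) % 100;      /* possibly an off-process location */
  ierr = MatSetValues(C,1,&row,1,&col,&one,ADD_VALUES);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);  /* ships stashed values */
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatDestroy(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif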
258 
259 #undef __FUNC__
260 #define __FUNC__ "MatGetValues_MPIAIJ"
261 int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
262 {
263   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
264   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
265   int        cstart = aij->cstart, cend = aij->cend,row,col;
266 
267   PetscFunctionBegin;
268   for ( i=0; i<m; i++ ) {
269     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
270     if (idxm[i] >= aij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
271     if (idxm[i] >= rstart && idxm[i] < rend) {
272       row = idxm[i] - rstart;
273       for ( j=0; j<n; j++ ) {
274         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
275         if (idxn[j] >= aij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
276         if (idxn[j] >= cstart && idxn[j] < cend){
277           col = idxn[j] - cstart;
278           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
279         } else {
280           if (!aij->colmap) {
281             ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
282           }
283           col = aij->colmap[idxn[j]] - 1;
284           if ((col < 0) || (aij->garray[col] != idxn[j])) *(v+i*n+j) = 0.0;
285           else {
286             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
287           }
288         }
289       }
290     } else {
291       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
292     }
293   }
294   PetscFunctionReturn(0);
295 }
296 
297 #undef __FUNC__
298 #define __FUNC__ "MatAssemblyBegin_MPIAIJ"
299 int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
300 {
301   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
302   MPI_Comm    comm = mat->comm;
303   int         size = aij->size, *owners = aij->rowners;
304   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
305   MPI_Request *send_waits,*recv_waits;
306   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
307   InsertMode  addv;
308   Scalar      *rvalues,*svalues;
309 
310   PetscFunctionBegin;
311   if (aij->donotstash) {
312     aij->svalues    = 0; aij->rvalues    = 0;
313     aij->nsends     = 0; aij->nrecvs     = 0;
314     aij->send_waits = 0; aij->recv_waits = 0;
315     aij->rmax       = 0;
316     PetscFunctionReturn(0);
317   }
318 
319   /* make sure all processors are using either INSERT_VALUES or ADD_VALUES */
320   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
321   if (addv == (ADD_VALUES|INSERT_VALUES)) {
322     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
323   }
324   mat->insertmode = addv; /* in case this processor had no cache */
325 
326   /*  first count number of contributors to each processor */
327   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
328   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
329   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
330   for ( i=0; i<aij->stash.n; i++ ) {
331     idx = aij->stash.idx[i];
332     for ( j=0; j<size; j++ ) {
333       if (idx >= owners[j] && idx < owners[j+1]) {
334         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
335       }
336     }
337   }
338   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
339 
340   /* inform other processors of number of messages and max length*/
341   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
342   ierr = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
343   nreceives = work[rank];
344   ierr = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
345   nmax = work[rank];
346   PetscFree(work);
347 
348   /* post receives:
349        1) each message will consist of ordered triples
350      (global row index, global column index, value); we store the indices
351      as Scalars (doubles) to simplify the message passing.
352        2) since we don't know how long each individual message is we
353      allocate the largest needed buffer for each receive. Potentially
354      this is a lot of wasted space.
355 
356 
357        This could be done better.
358   */
359   rvalues    = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
360   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
361   for ( i=0; i<nreceives; i++ ) {
362     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
363               comm,recv_waits+i);CHKERRQ(ierr);
364   }
365 
366   /* do sends:
367       1) starts[i] gives the starting index in svalues for stuff going to
368          the ith processor
369   */
370   svalues    = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
371   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
372   starts     = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
373   starts[0]  = 0;
374   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
375   for ( i=0; i<aij->stash.n; i++ ) {
376     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
377     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
378     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
379   }
380   PetscFree(owner);
381   starts[0] = 0;
382   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
383   count = 0;
384   for ( i=0; i<size; i++ ) {
385     if (procs[i]) {
386       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
387     }
388   }
389   PetscFree(starts);
390   PetscFree(nprocs);
391 
392   /* Free cache space */
393   PLogInfo(aij->A,"MatAssemblyBegin_MPIAIJ:Number of off-processor values %d\n",aij->stash.n);
394   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
395 
396   aij->svalues    = svalues;    aij->rvalues    = rvalues;
397   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
398   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
399   aij->rmax       = nmax;
400 
401   PetscFunctionReturn(0);
402 }
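
/*
   Sketch of the message layout used above (illustrative only): every stashed entry
   travels as a triple of Scalars (global row, global column, value), so a message
   carrying n entries occupies 3*n Scalars; MatAssemblyEnd_MPIAIJ() below unpacks
   the triples again.  The helper name is hypothetical.
*/
#if 0
static void PackStashTriple_Sketch(Scalar *buf,int k,int grow,int gcol,Scalar value)
{
  buf[3*k]   = (Scalar) grow;    /* global row index, stored as a Scalar */
  buf[3*k+1] = (Scalar) gcol;    /* global column index, stored as a Scalar */
  buf[3*k+2] = value;            /* the numerical value itself */
}
#endif
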
403 extern int MatSetUpMultiply_MPIAIJ(Mat);
404 
405 #undef __FUNC__
406 #define __FUNC__ "MatAssemblyEnd_MPIAIJ"
407 int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
408 {
409   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
410   MPI_Status  *send_status,recv_status;
411   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
412   int         row,col,other_disassembled;
413   Scalar      *values,val;
414   InsertMode  addv = mat->insertmode;
415 
416   PetscFunctionBegin;
417 
418   /*  wait on receives */
419   while (count) {
420     ierr = MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
421     /* unpack receives into our local space */
422     values = aij->rvalues + 3*imdex*aij->rmax;
423     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
424     n = n/3;
425     for ( i=0; i<n; i++ ) {
426       row = (int) PetscReal(values[3*i]) - aij->rstart;
427       col = (int) PetscReal(values[3*i+1]);
428       val = values[3*i+2];
429       if (col >= aij->cstart && col < aij->cend) {
430         col -= aij->cstart;
431         ierr = MatSetValues(aij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
432       } else {
433         if (mat->was_assembled || mat->assembled) {
434           if (!aij->colmap) {
435             ierr = CreateColmap_MPIAIJ_Private(mat); CHKERRQ(ierr);
436           }
437           col = aij->colmap[col] - 1;
438           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
439             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
440             col = (int) PetscReal(values[3*i+1]);
441           }
442         }
443         ierr = MatSetValues(aij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
444       }
445     }
446     count--;
447   }
448   if (aij->recv_waits) PetscFree(aij->recv_waits);
449   if (aij->rvalues)    PetscFree(aij->rvalues);
450 
451   /* wait on sends */
452   if (aij->nsends) {
453     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
454     ierr        = MPI_Waitall(aij->nsends,aij->send_waits,send_status);CHKERRQ(ierr);
455     PetscFree(send_status);
456   }
457   if (aij->send_waits) PetscFree(aij->send_waits);
458   if (aij->svalues)    PetscFree(aij->svalues);
459 
460   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
461   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
462 
463   /* determine if any processor has disassembled, if so we must
464      also disassemble ourselfs, in order that we may reassemble. */
465   /*
466      if nonzero structure of submatrix B cannot change then we know that
467      no processor disassembled thus we can skip this stuff
468   */
469   if (!((Mat_SeqAIJ*) aij->B->data)->nonew)  {
470     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
471     if (mat->was_assembled && !other_disassembled) {
472       ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
473     }
474   }
475 
476   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
477     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
478   }
479   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
480   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
481 
482   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
483   PetscFunctionReturn(0);
484 }
485 
486 #undef __FUNC__
487 #define __FUNC__ "MatZeroEntries_MPIAIJ"
488 int MatZeroEntries_MPIAIJ(Mat A)
489 {
490   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
491   int        ierr;
492 
493   PetscFunctionBegin;
494   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
495   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
496   PetscFunctionReturn(0);
497 }
498 
499 #undef __FUNC__
500 #define __FUNC__ "MatZeroRows_MPIAIJ"
501 int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
502 {
503   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
504   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
505   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
506   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
507   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
508   int            *lens,imdex,*lrows,*values,rstart=l->rstart;
509   MPI_Comm       comm = A->comm;
510   MPI_Request    *send_waits,*recv_waits;
511   MPI_Status     recv_status,*send_status;
512   IS             istmp;
513   PetscTruth     localdiag;
514 
515   PetscFunctionBegin;
516   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
517   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
518 
519   /*  first count number of contributors to each processor */
520   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
521   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
522   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
523   for ( i=0; i<N; i++ ) {
524     idx = rows[i];
525     found = 0;
526     for ( j=0; j<size; j++ ) {
527       if (idx >= owners[j] && idx < owners[j+1]) {
528         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
529       }
530     }
531     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
532   }
533   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
534 
535   /* inform other processors of number of messages and max length*/
536   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
537   ierr = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
538   nrecvs = work[rank];
539   ierr = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
540   nmax = work[rank];
541   PetscFree(work);
542 
543   /* post receives:   */
544   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int));CHKPTRQ(rvalues);
545   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
546   for ( i=0; i<nrecvs; i++ ) {
547     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
548   }
549 
550   /* do sends:
551       1) starts[i] gives the starting index in svalues for stuff going to
552          the ith processor
553   */
554   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
555   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
556   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
557   starts[0] = 0;
558   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
559   for ( i=0; i<N; i++ ) {
560     svalues[starts[owner[i]]++] = rows[i];
561   }
562   ISRestoreIndices(is,&rows);
563 
564   starts[0] = 0;
565   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
566   count = 0;
567   for ( i=0; i<size; i++ ) {
568     if (procs[i]) {
569       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
570     }
571   }
572   PetscFree(starts);
573 
574   base = owners[rank];
575 
576   /*  wait on receives */
577   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
578   source = lens + nrecvs;
579   count  = nrecvs; slen = 0;
580   while (count) {
581     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
582     /* unpack receives into our local space */
583     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
584     source[imdex]  = recv_status.MPI_SOURCE;
585     lens[imdex]  = n;
586     slen += n;
587     count--;
588   }
589   PetscFree(recv_waits);
590 
591   /* move the data into the send scatter */
592   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
593   count = 0;
594   for ( i=0; i<nrecvs; i++ ) {
595     values = rvalues + i*nmax;
596     for ( j=0; j<lens[i]; j++ ) {
597       lrows[count++] = values[j] - base;
598     }
599   }
600   PetscFree(rvalues); PetscFree(lens);
601   PetscFree(owner); PetscFree(nprocs);
602 
603   /* actually zap the local rows */
604   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
605   PLogObjectParent(A,istmp);
606 
607   /*
608         Zero the required rows. If the "diagonal block" of the matrix
609      is square and the user wishes to set the diagonal we use separate
610      code so that MatSetValues() is not called for each diagonal entry, which
611      would allocate new memory, causing many mallocs and slowing things down.
612 
613        Contributed by: Mathew Knepley
614   */
615   localdiag = PETSC_FALSE;
616   if (diag && (l->A->M == l->A->N)) {
617     localdiag = PETSC_TRUE;
618     ierr      = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
619   } else {
620     ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
621   }
622   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
623   ierr = ISDestroy(istmp); CHKERRQ(ierr);
624 
625   if (diag && (localdiag == PETSC_FALSE)) {
626     for ( i = 0; i < slen; i++ ) {
627       row = lrows[i] + rstart;
628       MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);
629     }
630     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
631     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
632   }
633 
642   PetscFree(lrows);
643 
644   /* wait on sends */
645   if (nsends) {
646     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
647     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
648     PetscFree(send_status);
649   }
650   PetscFree(send_waits); PetscFree(svalues);
651 
652   PetscFunctionReturn(0);
653 }
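
/*
   Usage sketch (illustrative only): zero a couple of locally owned rows of a
   parallel matrix and put 1.0 on their diagonals.  Row numbers are global; the
   index set and values here are made up for the example.
*/
#if 0
static int ZeroRowsExample_Sketch(Mat C)
{
  IS     is;
  int    ierr,rstart,rend,rows[2];
  Scalar diag = 1.0;

  PetscFunctionBegin;
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
  rows[0] = rstart; rows[1] = rstart+1;        /* first two locally owned rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,2,rows,&is);CHKERRQ(ierr);
  ierr = MatZeroRows(C,is,&diag);CHKERRQ(ierr);
  ierr = ISDestroy(is);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif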
654 
655 #undef __FUNC__
656 #define __FUNC__ "MatMult_MPIAIJ"
657 int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
658 {
659   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
660   int        ierr,nt;
661 
662   PetscFunctionBegin;
663   ierr = VecGetLocalSize(xx,&nt);  CHKERRQ(ierr);
664   if (nt != a->n) {
665     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
666   }
667   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
668   ierr = (*a->A->ops->mult)(a->A,xx,yy); CHKERRQ(ierr);
669   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
670   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
671   PetscFunctionReturn(0);
672 }
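
/*
   What MatMult_MPIAIJ above computes, written out (a sketch of the reasoning, not
   library documentation): with the local rows split into the diagonal block A
   (columns owned here) and the off-diagonal block B (columns owned elsewhere),

        y_local = A * x_local + B * x_ghost,

   where x_ghost (a->lvec) is gathered by the VecScatter; the scatter is started
   before the A*x_local product so that communication overlaps with computation.
*/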
673 
674 #undef __FUNC__
675 #define __FUNC__ "MatMultAdd_MPIAIJ"
676 int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
677 {
678   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
679   int        ierr;
680 
681   PetscFunctionBegin;
682   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
683   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
684   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
685   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
686   PetscFunctionReturn(0);
687 }
688 
689 #undef __FUNC__
690 #define __FUNC__ "MatMultTrans_MPIAIJ"
691 int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
692 {
693   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
694   int        ierr;
695 
696   PetscFunctionBegin;
697   /* do nondiagonal part */
698   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
699   /* send it on its way */
700   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
701   /* do local part */
702   ierr = (*a->A->ops->multtrans)(a->A,xx,yy); CHKERRQ(ierr);
703   /* receive remote parts: note this assumes the values are not actually */
704   /* inserted in yy until the next line, which is true for this implementation */
705   /* but may not always be true. */
706   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
707   PetscFunctionReturn(0);
708 }
709 
710 #undef __FUNC__
711 #define __FUNC__ "MatMultTransAdd_MPIAIJ"
712 int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
713 {
714   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
715   int        ierr;
716 
717   PetscFunctionBegin;
718   /* do nondiagonal part */
719   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
720   /* send it on its way */
721   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
722   /* do local part */
723   ierr = (*a->A->ops->multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
724   /* receive remote parts: note this assumes the values are not actually */
725   /* inserted in zz until the next line, which is true for this implementation */
726   /* but may not always be true. */
727   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
728   PetscFunctionReturn(0);
729 }
730 
731 /*
732   This only works correctly for square matrices where the subblock A->A is the
733    diagonal block
734 */
735 #undef __FUNC__
736 #define __FUNC__ "MatGetDiagonal_MPIAIJ"
737 int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
738 {
739   int        ierr;
740   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
741 
742   PetscFunctionBegin;
743   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
744   if (a->rstart != a->cstart || a->rend != a->cend) {
745     SETERRQ(PETSC_ERR_ARG_SIZ,0,"row partition must equal col partition");
746   }
747   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
748   PetscFunctionReturn(0);
749 }
750 
751 #undef __FUNC__
752 #define __FUNC__ "MatScale_MPIAIJ"
753 int MatScale_MPIAIJ(Scalar *aa,Mat A)
754 {
755   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
756   int        ierr;
757 
758   PetscFunctionBegin;
759   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
760   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
761   PetscFunctionReturn(0);
762 }
763 
764 #undef __FUNC__
765 #define __FUNC__ "MatDestroy_MPIAIJ"
766 int MatDestroy_MPIAIJ(Mat mat)
767 {
768   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
769   int        ierr;
770 
771   PetscFunctionBegin;
772   if (--mat->refct > 0) PetscFunctionReturn(0);
773 
774   if (mat->mapping) {
775     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
776   }
777   if (mat->bmapping) {
778     ierr = ISLocalToGlobalMappingDestroy(mat->bmapping); CHKERRQ(ierr);
779   }
780   if (mat->rmap) {
781     ierr = MapDestroy(mat->rmap);CHKERRQ(ierr);
782   }
783   if (mat->cmap) {
784     ierr = MapDestroy(mat->cmap);CHKERRQ(ierr);
785   }
786 #if defined(USE_PETSC_LOG)
787   PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",aij->M,aij->N);
788 #endif
789   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
790   PetscFree(aij->rowners);
791   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
792   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
793   if (aij->colmap) PetscFree(aij->colmap);
794   if (aij->garray) PetscFree(aij->garray);
795   if (aij->lvec)   VecDestroy(aij->lvec);
796   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
797   if (aij->rowvalues) PetscFree(aij->rowvalues);
798   PetscFree(aij);
799   PLogObjectDestroy(mat);
800   PetscHeaderDestroy(mat);
801   PetscFunctionReturn(0);
802 }
803 
804 #undef __FUNC__
805 #define __FUNC__ "MatView_MPIAIJ_Binary"
806 extern int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
807 {
808   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
809   int         ierr;
810 
811   PetscFunctionBegin;
812   if (aij->size == 1) {
813     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
814   }
815   else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
816   PetscFunctionReturn(0);
817 }
818 
819 #undef __FUNC__
820 #define __FUNC__ "MatView_MPIAIJ_ASCIIorDraworMatlab"
821 extern int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
822 {
823   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
824   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
825   int         ierr, format,shift = C->indexshift,rank = aij->rank ;
826   int         size = aij->size;
827   FILE        *fd;
828   ViewerType  vtype;
829 
830   PetscFunctionBegin;
831   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
832   if (PetscTypeCompare(vtype,ASCII_VIEWER)) {
833     ierr = ViewerGetFormat(viewer,&format);
834     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
835       MatInfo info;
836       int     flg;
837       MPI_Comm_rank(mat->comm,&rank);
838       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
839       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
840       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
841       PetscSequentialPhaseBegin(mat->comm,1);
842       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
843          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
844       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
845          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
846       ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);
847       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used);
848       ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);
849       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used);
850       fflush(fd);
851       PetscSequentialPhaseEnd(mat->comm,1);
852       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
853       PetscFunctionReturn(0);
854     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
855       PetscFunctionReturn(0);
856     }
857   } else if (PetscTypeCompare(vtype,DRAW_VIEWER)) {
858     Draw       draw;
859     PetscTruth isnull;
860     ierr = ViewerDrawGetDraw(viewer,0,&draw); CHKERRQ(ierr);
861     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
862   }
863 
864   if (size == 1) {
865     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
866   } else {
867     /* assemble the entire matrix onto first processor. */
868     Mat         A;
869     Mat_SeqAIJ *Aloc;
870     int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
871     Scalar      *a;
872 
873     if (!rank) {
874       ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
875     } else {
876       ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
877     }
878     PLogObjectParent(mat,A);
879 
880     /* copy over the A part */
881     Aloc = (Mat_SeqAIJ*) aij->A->data;
882     m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
883     row = aij->rstart;
884     for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
885     for ( i=0; i<m; i++ ) {
886       ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
887       row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
888     }
889     aj = Aloc->j;
890     for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
891 
892     /* copy over the B part */
893     Aloc = (Mat_SeqAIJ*) aij->B->data;
894     m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
895     row = aij->rstart;
896     ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
897     for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
898     for ( i=0; i<m; i++ ) {
899       ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
900       row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
901     }
902     PetscFree(ct);
903     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
904     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
905     /*
906        Everyone has to call MatView() to draw the matrix since the graphics waits are
907        synchronized across all processors that share the Draw object
908     */
909     if (!rank || PetscTypeCompare(vtype,DRAW_VIEWER)) {
910       ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
911     }
912     ierr = MatDestroy(A); CHKERRQ(ierr);
913   }
914   PetscFunctionReturn(0);
915 }
916 
917 #undef __FUNC__
918 #define __FUNC__ "MatView_MPIAIJ"
919 int MatView_MPIAIJ(Mat mat,Viewer viewer)
920 {
921   int         ierr;
922   ViewerType  vtype;
923 
924   PetscFunctionBegin;
925   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
926   if (PetscTypeCompare(vtype,ASCII_VIEWER) || PetscTypeCompare(vtype,DRAW_VIEWER) ||
927       PetscTypeCompare(vtype,MATLAB_VIEWER)) {
928     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
929   } else if (PetscTypeCompare(vtype,BINARY_VIEWER)) {
930     ierr = MatView_MPIAIJ_Binary(mat,viewer);CHKERRQ(ierr);
931   } else {
932     SETERRQ(1,1,"Viewer type not supported by PETSc object");
933   }
934   PetscFunctionReturn(0);
935 }
936 
937 /*
938     This has to provide several versions.
939 
940      2) a) use only local smoothing updating outer values only once.
941         b) local smoothing updating outer values each inner iteration
942      3) color updating outer values between colors.
943 */
944 #undef __FUNC__
945 #define __FUNC__ "MatRelax_MPIAIJ"
946 int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
947                            double fshift,int its,Vec xx)
948 {
949   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
950   Mat        AA = mat->A, BB = mat->B;
951   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
952   Scalar     *b,*x,*xs,*ls,d,*v,sum;
953   int        ierr,*idx, *diag;
954   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
955 
956   PetscFunctionBegin;
957   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
958   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
959   ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
960   xs = x + shift; /* shift by one for index start of 1 */
961   ls = ls + shift;
962   if (!A->diag) {ierr = MatMarkDiag_SeqAIJ(AA); CHKERRQ(ierr);}
963   diag = A->diag;
964   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
965     if (flag & SOR_ZERO_INITIAL_GUESS) {
966       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
967       ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
968       ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
969       ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
970       PetscFunctionReturn(0);
971     }
972     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
973     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
974     while (its--) {
975       /* go down through the rows */
976       for ( i=0; i<m; i++ ) {
977         n    = A->i[i+1] - A->i[i];
978 	PLogFlops(4*n+3);
979         idx  = A->j + A->i[i] + shift;
980         v    = A->a + A->i[i] + shift;
981         sum  = b[i];
982         SPARSEDENSEMDOT(sum,xs,v,idx,n);
983         d    = fshift + A->a[diag[i]+shift];
984         n    = B->i[i+1] - B->i[i];
985         idx  = B->j + B->i[i] + shift;
986         v    = B->a + B->i[i] + shift;
987         SPARSEDENSEMDOT(sum,ls,v,idx,n);
988         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
989       }
990       /* come up through the rows */
991       for ( i=m-1; i>-1; i-- ) {
992         n    = A->i[i+1] - A->i[i];
993 	PLogFlops(4*n+3);
994         idx  = A->j + A->i[i] + shift;
995         v    = A->a + A->i[i] + shift;
996         sum  = b[i];
997         SPARSEDENSEMDOT(sum,xs,v,idx,n);
998         d    = fshift + A->a[diag[i]+shift];
999         n    = B->i[i+1] - B->i[i];
1000         idx  = B->j + B->i[i] + shift;
1001         v    = B->a + B->i[i] + shift;
1002         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1003         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1004       }
1005     }
1006   } else if (flag & SOR_LOCAL_FORWARD_SWEEP){
1007     if (flag & SOR_ZERO_INITIAL_GUESS) {
1008       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
1009       ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
1010       ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
1011       ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
1012       PetscFunctionReturn(0);
1013     }
1014     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1015     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1016     while (its--) {
1017       for ( i=0; i<m; i++ ) {
1018         n    = A->i[i+1] - A->i[i];
1019 	PLogFlops(4*n+3);
1020         idx  = A->j + A->i[i] + shift;
1021         v    = A->a + A->i[i] + shift;
1022         sum  = b[i];
1023         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1024         d    = fshift + A->a[diag[i]+shift];
1025         n    = B->i[i+1] - B->i[i];
1026         idx  = B->j + B->i[i] + shift;
1027         v    = B->a + B->i[i] + shift;
1028         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1029         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1030       }
1031     }
1032   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
1033     if (flag & SOR_ZERO_INITIAL_GUESS) {
1034       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
1035       ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
1036       ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
1037       ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
1038       PetscFunctionReturn(0);
1039     }
1040     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1041     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1042     while (its--) {
1043       for ( i=m-1; i>-1; i-- ) {
1044         n    = A->i[i+1] - A->i[i];
1045 	PLogFlops(4*n+3);
1046         idx  = A->j + A->i[i] + shift;
1047         v    = A->a + A->i[i] + shift;
1048         sum  = b[i];
1049         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1050         d    = fshift + A->a[diag[i]+shift];
1051         n    = B->i[i+1] - B->i[i];
1052         idx  = B->j + B->i[i] + shift;
1053         v    = B->a + B->i[i] + shift;
1054         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1055         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1056       }
1057     }
1058   } else {
1059     SETERRQ(PETSC_ERR_SUP,0,"Parallel SOR not supported");
1060   }
1061   ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
1062   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
1063   ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
1064   PetscFunctionReturn(0);
1065 }
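
/*
   The update applied in each sweep above, written out (a sketch of the reasoning,
   not library documentation): assuming SPARSEDENSEMDOT subtracts the stored row
   entries, sum = b_i - sum_j a_ij x_j over the whole row (local part against x,
   off-diagonal part against the ghost values ls), so with d = fshift + a_ii the
   statement

        x_i <- (1 - omega) x_i + omega (sum + a_ii x_i) / d

   is the usual relaxation step
   x_i <- (1 - omega) x_i + (omega/d) (b_i - sum_{j != i} a_ij x_j).
*/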
1066 
1067 #undef __FUNC__
1068 #define __FUNC__ "MatGetInfo_MPIAIJ"
1069 int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1070 {
1071   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1072   Mat        A = mat->A, B = mat->B;
1073   int        ierr;
1074   double     isend[5], irecv[5];
1075 
1076   PetscFunctionBegin;
1077   info->block_size     = 1.0;
1078   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1079   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1080   isend[3] = info->memory;  isend[4] = info->mallocs;
1081   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1082   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1083   isend[3] += info->memory;  isend[4] += info->mallocs;
1084   if (flag == MAT_LOCAL) {
1085     info->nz_used      = isend[0];
1086     info->nz_allocated = isend[1];
1087     info->nz_unneeded  = isend[2];
1088     info->memory       = isend[3];
1089     info->mallocs      = isend[4];
1090   } else if (flag == MAT_GLOBAL_MAX) {
1091     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1092     info->nz_used      = irecv[0];
1093     info->nz_allocated = irecv[1];
1094     info->nz_unneeded  = irecv[2];
1095     info->memory       = irecv[3];
1096     info->mallocs      = irecv[4];
1097   } else if (flag == MAT_GLOBAL_SUM) {
1098     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1099     info->nz_used      = irecv[0];
1100     info->nz_allocated = irecv[1];
1101     info->nz_unneeded  = irecv[2];
1102     info->memory       = irecv[3];
1103     info->mallocs      = irecv[4];
1104   }
1105   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1106   info->fill_ratio_needed = 0;
1107   info->factor_mallocs    = 0;
1108   info->rows_global       = (double)mat->M;
1109   info->columns_global    = (double)mat->N;
1110   info->rows_local        = (double)mat->m;
1111   info->columns_local     = (double)mat->N;
1112 
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 #undef __FUNC__
1117 #define __FUNC__ "MatSetOption_MPIAIJ"
1118 int MatSetOption_MPIAIJ(Mat A,MatOption op)
1119 {
1120   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1121 
1122   PetscFunctionBegin;
1123   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1124       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1125       op == MAT_COLUMNS_UNSORTED ||
1126       op == MAT_COLUMNS_SORTED ||
1127       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1128       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1129         MatSetOption(a->A,op);
1130         MatSetOption(a->B,op);
1131   } else if (op == MAT_ROW_ORIENTED) {
1132     a->roworiented = 1;
1133     MatSetOption(a->A,op);
1134     MatSetOption(a->B,op);
1135   } else if (op == MAT_ROWS_SORTED ||
1136              op == MAT_ROWS_UNSORTED ||
1137              op == MAT_SYMMETRIC ||
1138              op == MAT_STRUCTURALLY_SYMMETRIC ||
1139              op == MAT_YES_NEW_DIAGONALS)
1140     PLogInfo(A,"MatSetOption_MPIAIJ:Option ignored\n");
1141   else if (op == MAT_COLUMN_ORIENTED) {
1142     a->roworiented = 0;
1143     MatSetOption(a->A,op);
1144     MatSetOption(a->B,op);
1145   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1146     a->donotstash = 1;
1147   } else if (op == MAT_NO_NEW_DIAGONALS){
1148     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1149   } else {
1150     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1151   }
1152   PetscFunctionReturn(0);
1153 }
1154 
1155 #undef __FUNC__
1156 #define __FUNC__ "MatGetSize_MPIAIJ"
1157 int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1158 {
1159   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1160 
1161   PetscFunctionBegin;
1162   if (m) *m = mat->M;
1163   if (n) *n = mat->N;
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 #undef __FUNC__
1168 #define __FUNC__ "MatGetLocalSize_MPIAIJ"
1169 int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1170 {
1171   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1172 
1173   PetscFunctionBegin;
1174   if (m) *m = mat->m;
1175   if (n) *n = mat->n;
1176   PetscFunctionReturn(0);
1177 }
1178 
1179 #undef __FUNC__
1180 #define __FUNC__ "MatGetOwnershipRange_MPIAIJ"
1181 int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1182 {
1183   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1184 
1185   PetscFunctionBegin;
1186   *m = mat->rstart; *n = mat->rend;
1187   PetscFunctionReturn(0);
1188 }
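
/*
   Usage sketch (illustrative only): the usual way callers use the ownership range
   is to loop over exactly the rows this process stores, for example when filling
   the matrix.  The values below are made up.
*/
#if 0
static int FillOwnedRows_Sketch(Mat C)
{
  int    ierr,row,rstart,rend;
  Scalar v = 1.0;

  PetscFunctionBegin;
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
  for ( row=rstart; row<rend; row++ ) {
    ierr = MatSetValues(C,1,&row,1,&row,&v,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif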
1189 
1190 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1191 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1192 
1193 #undef __FUNC__
1194 #define __FUNC__ "MatGetRow_MPIAIJ"
1195 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1196 {
1197   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1198   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1199   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1200   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1201   int        *cmap, *idx_p;
1202 
1203   PetscFunctionBegin;
1204   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1205   mat->getrowactive = PETSC_TRUE;
1206 
1207   if (!mat->rowvalues && (idx || v)) {
1208     /*
1209         allocate enough space to hold information from the longest row.
1210     */
1211     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1212     int     max = 1,m = mat->m,tmp;
1213     for ( i=0; i<m; i++ ) {
1214       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1215       if (max < tmp) { max = tmp; }
1216     }
1217     mat->rowvalues  = (Scalar *) PetscMalloc(max*(sizeof(int)+sizeof(Scalar)));CHKPTRQ(mat->rowvalues);
1218     mat->rowindices = (int *) (mat->rowvalues + max);
1219   }
1220 
1221   if (row < rstart || row >= rend) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Only local rows")
1222   lrow = row - rstart;
1223 
1224   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1225   if (!v)   {pvA = 0; pvB = 0;}
1226   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1227   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1228   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1229   nztot = nzA + nzB;
1230 
1231   cmap  = mat->garray;
1232   if (v  || idx) {
1233     if (nztot) {
1234       /* Sort by increasing column numbers, assuming A and B already sorted */
1235       int imark = -1;
1236       if (v) {
1237         *v = v_p = mat->rowvalues;
1238         for ( i=0; i<nzB; i++ ) {
1239           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1240           else break;
1241         }
1242         imark = i;
1243         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1244         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1245       }
1246       if (idx) {
1247         *idx = idx_p = mat->rowindices;
1248         if (imark > -1) {
1249           for ( i=0; i<imark; i++ ) {
1250             idx_p[i] = cmap[cworkB[i]];
1251           }
1252         } else {
1253           for ( i=0; i<nzB; i++ ) {
1254             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1255             else break;
1256           }
1257           imark = i;
1258         }
1259         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1260         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1261       }
1262     } else {
1263       if (idx) *idx = 0;
1264       if (v)   *v   = 0;
1265     }
1266   }
1267   *nz = nztot;
1268   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1269   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1270   PetscFunctionReturn(0);
1271 }
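
/*
   Usage sketch (illustrative only): MatGetRow()/MatRestoreRow() must be paired,
   only locally owned rows may be requested, and the returned column indices are
   global (the diagonal and off-diagonal parts are already merged in ascending
   order above).
*/
#if 0
static int InspectOwnedRows_Sketch(Mat C)
{
  int    ierr,row,rstart,rend,nz,*cols;
  Scalar *vals;

  PetscFunctionBegin;
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
  for ( row=rstart; row<rend; row++ ) {
    ierr = MatGetRow(C,row,&nz,&cols,&vals);CHKERRQ(ierr);
    /* ... inspect the nz entries: global columns cols[k], values vals[k] ... */
    ierr = MatRestoreRow(C,row,&nz,&cols,&vals);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#endif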
1272 
1273 #undef __FUNC__
1274 #define __FUNC__ "MatRestoreRow_MPIAIJ"
1275 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1276 {
1277   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1278 
1279   PetscFunctionBegin;
1280   if (aij->getrowactive == PETSC_FALSE) {
1281     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1282   }
1283   aij->getrowactive = PETSC_FALSE;
1284   PetscFunctionReturn(0);
1285 }
1286 
1287 #undef __FUNC__
1288 #define __FUNC__ "MatNorm_MPIAIJ"
1289 int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1290 {
1291   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1292   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1293   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1294   double     sum = 0.0;
1295   Scalar     *v;
1296 
1297   PetscFunctionBegin;
1298   if (aij->size == 1) {
1299     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1300   } else {
1301     if (type == NORM_FROBENIUS) {
1302       v = amat->a;
1303       for (i=0; i<amat->nz; i++ ) {
1304 #if defined(USE_PETSC_COMPLEX)
1305         sum += PetscReal(PetscConj(*v)*(*v)); v++;
1306 #else
1307         sum += (*v)*(*v); v++;
1308 #endif
1309       }
1310       v = bmat->a;
1311       for (i=0; i<bmat->nz; i++ ) {
1312 #if defined(USE_PETSC_COMPLEX)
1313         sum += PetscReal(PetscConj(*v)*(*v)); v++;
1314 #else
1315         sum += (*v)*(*v); v++;
1316 #endif
1317       }
1318       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1319       *norm = sqrt(*norm);
1320     } else if (type == NORM_1) { /* max column norm */
1321       double *tmp, *tmp2;
1322       int    *jj, *garray = aij->garray;
1323       tmp  = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp);
1324       tmp2 = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp2);
1325       PetscMemzero(tmp,aij->N*sizeof(double));
1326       *norm = 0.0;
1327       v = amat->a; jj = amat->j;
1328       for ( j=0; j<amat->nz; j++ ) {
1329         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1330       }
1331       v = bmat->a; jj = bmat->j;
1332       for ( j=0; j<bmat->nz; j++ ) {
1333         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1334       }
1335       ierr = MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1336       for ( j=0; j<aij->N; j++ ) {
1337         if (tmp2[j] > *norm) *norm = tmp2[j];
1338       }
1339       PetscFree(tmp); PetscFree(tmp2);
1340     } else if (type == NORM_INFINITY) { /* max row norm */
1341       double ntemp = 0.0;
1342       for ( j=0; j<amat->m; j++ ) {
1343         v = amat->a + amat->i[j] + shift;
1344         sum = 0.0;
1345         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1346           sum += PetscAbsScalar(*v); v++;
1347         }
1348         v = bmat->a + bmat->i[j] + shift;
1349         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1350           sum += PetscAbsScalar(*v); v++;
1351         }
1352         if (sum > ntemp) ntemp = sum;
1353       }
1354       ierr = MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);CHKERRQ(ierr);
1355     } else {
1356       SETERRQ(PETSC_ERR_SUP,0,"No support for two norm");
1357     }
1358   }
1359   PetscFunctionReturn(0);
1360 }
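
/*
   The three cases above, written out (a sketch of the reasoning, not library
   documentation), for the locally stored entries a_ij of the diagonal and
   off-diagonal blocks:

     NORM_FROBENIUS : ||A||_F   = sqrt( sum_ij |a_ij|^2 )              (MPI_SUM of local sums)
     NORM_1         : ||A||_1   = max_j sum_i |a_ij|   (largest column sum; MPI_SUM, then max)
     NORM_INFINITY  : ||A||_inf = max_i sum_j |a_ij|   (largest row sum;    MPI_MAX of local maxima)
*/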
1361 
1362 #undef __FUNC__
1363 #define __FUNC__ "MatTranspose_MPIAIJ"
1364 int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1365 {
1366   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1367   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1368   int        ierr,shift = Aloc->indexshift;
1369   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1370   Mat        B;
1371   Scalar     *array;
1372 
1373   PetscFunctionBegin;
1374   if (matout == PETSC_NULL && M != N) {
1375     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1376   }
1377 
1378   ierr = MatCreateMPIAIJ(A->comm,a->n,a->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);
1379 
1380   /* copy over the A part */
1381   Aloc = (Mat_SeqAIJ*) a->A->data;
1382   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1383   row = a->rstart;
1384   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1385   for ( i=0; i<m; i++ ) {
1386     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1387     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1388   }
1389   aj = Aloc->j;
1390   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1391 
1392   /* copy over the B part */
1393   Aloc = (Mat_SeqAIJ*) a->B->data;
1394   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1395   row = a->rstart;
1396   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1397   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1398   for ( i=0; i<m; i++ ) {
1399     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1400     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1401   }
1402   PetscFree(ct);
1403   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1404   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1405   if (matout != PETSC_NULL) {
1406     *matout = B;
1407   } else {
1408     PetscOps       *Abops;
1409     struct _MatOps *Aops;
1410 
1411     /* This isn't really an in-place transpose, but we free the data structures of a */
1412     PetscFree(a->rowners);
1413     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1414     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1415     if (a->colmap) PetscFree(a->colmap);
1416     if (a->garray) PetscFree(a->garray);
1417     if (a->lvec) VecDestroy(a->lvec);
1418     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1419     PetscFree(a);
1420 
1421     /*
1422        This is horrible, horrible code. We need to keep the
1423       A pointers for the bops and ops but copy everything
1424       else from B.
1425     */
1426     Abops = A->bops;
1427     Aops  = A->ops;
1428     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1429     A->bops = Abops;
1430     A->ops  = Aops;
1431     PetscHeaderDestroy(B);
1432   }
1433   PetscFunctionReturn(0);
1434 }
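
/*
    Usage sketch for MatTranspose() with an MPIAIJ matrix (illustrative only; A is an
    assembled MATMPIAIJ matrix):

       Mat At;
       ierr = MatTranspose(A,&At);CHKERRQ(ierr);           out-of-place: At = A^T
       ierr = MatTranspose(A,PETSC_NULL);CHKERRQ(ierr);     "in-place": legal only when A is square

    As noted in the code above, the in-place variant actually builds a new matrix and
    copies its header over A, keeping only A's original bops/ops tables.
*/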
1435 
1436 #undef __FUNC__
1437 #define __FUNC__ "MatDiagonalScale_MPIAIJ"
1438 int MatDiagonalScale_MPIAIJ(Mat mat,Vec ll,Vec rr)
1439 {
1440   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1441   Mat a = aij->A, b = aij->B;
1442   int ierr,s1,s2,s3;
1443 
1444   PetscFunctionBegin;
1445   ierr = MatGetLocalSize(mat,&s2,&s3); CHKERRQ(ierr);
1446   if (rr) {
1447     ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
1448     if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,0,"right vector non-conforming local size");
1449     /* Overlap communication with computation. */
1450     ierr = VecScatterBegin(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1451   }
1452   if (ll) {
1453     ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
1454     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"left vector non-conforming local size");
1455     ierr = (*b->ops->diagonalscale)(b,ll,0); CHKERRQ(ierr);
1456   }
1457   /* scale  the diagonal block */
1458   ierr = (*a->ops->diagonalscale)(a,ll,rr); CHKERRQ(ierr);
1459 
1460   if (rr) {
1461     /* Do a scatter end and then right scale the off-diagonal block */
1462     ierr = VecScatterEnd(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1463     ierr = (*b->ops->diagonalscale)(b,0,aij->lvec); CHKERRQ(ierr);
1464   }
1465 
1466   PetscFunctionReturn(0);
1467 }
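
/*
    Usage sketch for MatDiagonalScale() (illustrative only; mat is an assembled MATMPIAIJ
    matrix, l conforms to the local row size and r to the local column size).  Passing
    PETSC_NULL skips the corresponding scaling:

       ierr = MatDiagonalScale(mat,l,r);CHKERRQ(ierr);           mat <- diag(l) * mat * diag(r)
       ierr = MatDiagonalScale(mat,PETSC_NULL,r);CHKERRQ(ierr);  right scaling only

    Note how the routine above overlaps the scatter of rr into aij->lvec with the left
    scaling of the blocks.
*/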
1468 
1469 
1470 extern int MatPrintHelp_SeqAIJ(Mat);
1471 #undef __FUNC__
1472 #define __FUNC__ "MatPrintHelp_MPIAIJ"
1473 int MatPrintHelp_MPIAIJ(Mat A)
1474 {
1475   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1476   int        ierr;
1477 
1478   PetscFunctionBegin;
1479   if (!a->rank) {
1480     ierr = MatPrintHelp_SeqAIJ(a->A);CHKERRQ(ierr);
1481   }
1482   PetscFunctionReturn(0);
1483 }
1484 
1485 #undef __FUNC__
1486 #define __FUNC__ "MatGetBlockSize_MPIAIJ"
1487 int MatGetBlockSize_MPIAIJ(Mat A,int *bs)
1488 {
1489   PetscFunctionBegin;
1490   *bs = 1;
1491   PetscFunctionReturn(0);
1492 }
1493 #undef __FUNC__
1494 #define __FUNC__ "MatSetUnfactored_MPIAIJ"
1495 int MatSetUnfactored_MPIAIJ(Mat A)
1496 {
1497   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1498   int        ierr;
1499 
1500   PetscFunctionBegin;
1501   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1502   PetscFunctionReturn(0);
1503 }
1504 
1505 #undef __FUNC__
1506 #define __FUNC__ "MatEqual_MPIAIJ"
1507 int MatEqual_MPIAIJ(Mat A, Mat B, PetscTruth *flag)
1508 {
1509   Mat_MPIAIJ *matB = (Mat_MPIAIJ *) B->data,*matA = (Mat_MPIAIJ *) A->data;
1510   Mat        a, b, c, d;
1511   PetscTruth flg;
1512   int        ierr;
1513 
1514   PetscFunctionBegin;
1515   if (B->type != MATMPIAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1516   a = matA->A; b = matA->B;
1517   c = matB->A; d = matB->B;
1518 
1519   ierr = MatEqual(a, c, &flg); CHKERRQ(ierr);
1520   if (flg == PETSC_TRUE) {
1521     ierr = MatEqual(b, d, &flg); CHKERRQ(ierr);
1522   }
1523   ierr = MPI_Allreduce(&flg, flag, 1, MPI_INT, MPI_LAND, A->comm);CHKERRQ(ierr);
1524   PetscFunctionReturn(0);
1525 }
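
/*
    Usage sketch for MatEqual() on parallel matrices (illustrative only; A and B are
    MATMPIAIJ matrices on the same communicator):

       PetscTruth eq;
       ierr = MatEqual(A,B,&eq);CHKERRQ(ierr);

    The result is collective: each process compares its diagonal and off-diagonal blocks
    and the local answers are combined with MPI_LAND above.
*/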
1526 
1527 #undef __FUNC__
1528 #define __FUNC__ "MatCopy_MPIAIJ"
1529 int MatCopy_MPIAIJ(Mat A,Mat B,MatStructure str)
1530 {
1531   int        ierr;
1532   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1533   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
1534 
1535   PetscFunctionBegin;
1536   if (str != SAME_NONZERO_PATTERN || B->type != MATMPIAIJ) {
1537     /* Because of the column compression in the off-processor part of the matrix a->B,
1538        the number of columns in a->B and b->B may differ, hence we cannot call
1539        MatCopy() directly on the two parts. If need be, a copy more efficient than
1540        MatCopy_Basic() could be provided by first uncompressing the a->B matrices
1541        and then copying the submatrices */
1542     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1543   } else {
1544     ierr = MatCopy(a->A,b->A,str);CHKERRQ(ierr);
1545     ierr = MatCopy(a->B,b->B,str);CHKERRQ(ierr);
1546   }
1547   PetscFunctionReturn(0);
1548 }
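
/*
    Usage sketch for MatCopy() (illustrative only; A and B are MATMPIAIJ matrices with
    identical layouts).  The fast block-by-block path above is taken only when both
    matrices share the same nonzero pattern:

       ierr = MatCopy(A,B,SAME_NONZERO_PATTERN);CHKERRQ(ierr);        copies a->A and a->B directly
       ierr = MatCopy(A,B,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);   falls back to MatCopy_Basic()
*/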
1549 
1550 extern int MatDuplicate_MPIAIJ(Mat,MatDuplicateOption,Mat *);
1551 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1552 extern int MatFDColoringCreate_MPIAIJ(Mat,ISColoring,MatFDColoring);
1553 extern int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1554 extern int MatGetSubMatrix_MPIAIJ (Mat ,IS,IS,int,MatGetSubMatrixCall,Mat *);
1555 
1556 /* -------------------------------------------------------------------*/
1557 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
1558        MatGetRow_MPIAIJ,
1559        MatRestoreRow_MPIAIJ,
1560        MatMult_MPIAIJ,
1561        MatMultAdd_MPIAIJ,
1562        MatMultTrans_MPIAIJ,
1563        MatMultTransAdd_MPIAIJ,
1564        0,
1565        0,
1566        0,
1567        0,
1568        0,
1569        0,
1570        MatRelax_MPIAIJ,
1571        MatTranspose_MPIAIJ,
1572        MatGetInfo_MPIAIJ,
1573        MatEqual_MPIAIJ,
1574        MatGetDiagonal_MPIAIJ,
1575        MatDiagonalScale_MPIAIJ,
1576        MatNorm_MPIAIJ,
1577        MatAssemblyBegin_MPIAIJ,
1578        MatAssemblyEnd_MPIAIJ,
1579        0,
1580        MatSetOption_MPIAIJ,
1581        MatZeroEntries_MPIAIJ,
1582        MatZeroRows_MPIAIJ,
1583        0,
1584        0,
1585        0,
1586        0,
1587        MatGetSize_MPIAIJ,
1588        MatGetLocalSize_MPIAIJ,
1589        MatGetOwnershipRange_MPIAIJ,
1590        0,
1591        0,
1592        0,
1593        0,
1594        MatDuplicate_MPIAIJ,
1595        0,
1596        0,
1597        0,
1598        0,
1599        0,
1600        MatGetSubMatrices_MPIAIJ,
1601        MatIncreaseOverlap_MPIAIJ,
1602        MatGetValues_MPIAIJ,
1603        MatCopy_MPIAIJ,
1604        MatPrintHelp_MPIAIJ,
1605        MatScale_MPIAIJ,
1606        0,
1607        0,
1608        0,
1609        MatGetBlockSize_MPIAIJ,
1610        0,
1611        0,
1612        0,
1613        0,
1614        MatFDColoringCreate_MPIAIJ,
1615        0,
1616        MatSetUnfactored_MPIAIJ,
1617        0,
1618        0,
1619        MatGetSubMatrix_MPIAIJ,
1620        0,
1621        0,
1622        MatGetMaps_Petsc};
1623 
1624 /* ----------------------------------------------------------------------------------------*/
1625 
1626 EXTERN_C_BEGIN
1627 #undef __FUNC__
1628 #define __FUNC__ "MatStoreValues_MPIAIJ"
1629 int MatStoreValues_MPIAIJ(Mat mat)
1630 {
1631   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1632   int        ierr;
1633 
1634   PetscFunctionBegin;
1635   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
1636   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
1637   PetscFunctionReturn(0);
1638 }
1639 EXTERN_C_END
1640 
1641 EXTERN_C_BEGIN
1642 #undef __FUNC__
1643 #define __FUNC__ "MatRetrieveValues_MPIAIJ"
1644 int MatRetrieveValues_MPIAIJ(Mat mat)
1645 {
1646   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1647   int        ierr;
1648 
1649   PetscFunctionBegin;
1650   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
1651   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
1652   PetscFunctionReturn(0);
1653 }
1654 EXTERN_C_END
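
/*
    The two routines above are reached through the composed functions "MatStoreValues_C"
    and "MatRetrieveValues_C" registered in MatCreateMPIAIJ() below.  A typical usage
    sketch (illustrative only; mat is an assembled MATMPIAIJ matrix whose nonzero
    pattern stays fixed while its numerical values change):

       ierr = MatSetOption(mat,MAT_NO_NEW_NONZERO_LOCATIONS);CHKERRQ(ierr);
       ierr = MatStoreValues(mat);CHKERRQ(ierr);       save a copy of the numerical values
       .... change the values of mat, e.g. during a factorization ....
       ierr = MatRetrieveValues(mat);CHKERRQ(ierr);    restore the saved values
*/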
1655 
1656 #undef __FUNC__
1657 #define __FUNC__ "MatCreateMPIAIJ"
1658 /*@C
1659    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1660    (the default parallel PETSc format).  For good matrix assembly performance
1661    the user should preallocate the matrix storage by setting the parameters
1662    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1663    performance can be increased by more than a factor of 50.
1664 
1665    Collective on MPI_Comm
1666 
1667    Input Parameters:
1668 +  comm - MPI communicator
1669 .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
1670            This value should be the same as the local size used in creating the
1671            y vector for the matrix-vector product y = Ax.
1672 .  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
1673        This value should be the same as the local size used in creating the
1674        x vector for the matrix-vector product y = Ax. For square matrices n is almost always m.
1675 .  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
1676 .  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
1677 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
1678            (same value is used for all local rows)
1679 .  d_nnz - array containing the number of nonzeros in the various rows of the
1680            DIAGONAL portion of the local submatrix (possibly different for each row)
1681            or PETSC_NULL, if d_nz is used to specify the nonzero structure.
1682            The size of this array is equal to the number of local rows, i.e 'm'.
1683            The size of this array is equal to the number of local rows, i.e. 'm'.
1684 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
1685            submatrix (same value is used for all local rows).
1686 -  o_nnz - array containing the number of nonzeros in the various rows of the
1687            OFF-DIAGONAL portion of the local submatrix (possibly different for
1688            each row) or PETSC_NULL, if o_nz is used to specify the nonzero
1689            structure. The size of this array is equal to the number
1690            of local rows, i.e. 'm'.
1691 
1692    Output Parameter:
1693 .  A - the matrix
1694 
1695    Notes:
1696    The m,n,M,N parameters specify the size of the matrix and its partitioning across
1697    processors, while the d_nz,d_nnz,o_nz,o_nnz parameters specify the approximate
1698    storage requirements for this matrix.
1699 
1700    If PETSC_DECIDE or  PETSC_DETERMINE is used for a particular argument on one
1701    processor then it must be used on all processors that share the object for
1702    that argument.
1703 
1704    The AIJ format (also called the Yale sparse matrix format or
1705    compressed row storage) is fully compatible with standard Fortran 77
1706    storage.  That is, the stored row and column indices can begin at
1707    either one (as in Fortran) or zero.  See the users manual for details.
1708 
1709    The user MUST specify either the local or global matrix dimensions
1710    (possibly both).
1711 
1712    The parallel matrix is partitioned such that the first m0 rows belong to
1713    process 0, the next m1 rows belong to process 1, the next m2 rows belong
1714    to process 2, etc., where m0,m1,m2,... are the values of the input parameter 'm' on each process.
1715 
1716    The DIAGONAL portion of the local submatrix of a processor can be defined
1717    as the submatrix which is obtained by extracting the part corresponding
1718    to the rows r1-r2 and columns r1-r2 of the global matrix, where r1 is the
1719    first row that belongs to the processor, and r2 is the last row belonging
1720    to this processor. This is a square mxm matrix. The remaining portion of
1721    the local submatrix (mxN) constitutes the OFF-DIAGONAL portion.
1722 
1723    If d_nnz and o_nnz are specified, then d_nz and o_nz are ignored.
1724 
1725    By default, this format uses inodes (identical nodes) when possible.
1726    We search for consecutive rows with the same nonzero structure, thereby
1727    reusing matrix information to achieve increased efficiency.
1728 
1729    Options Database Keys:
1730 +  -mat_aij_no_inode  - Do not use inodes
1731 .  -mat_aij_inode_limit <limit> - Sets inode limit (max limit=5)
1732 .  -mat_aij_oneindex - Internally use indexing starting at 1
1733         rather than 0.  Note that when calling MatSetValues(),
1734         the user still MUST index entries starting at 0!
1735 -  -mat_mpi - use the parallel matrix data structures even on one processor
1736               (defaults to using SeqAIJ format on one processor)
1739 
1740 
1741    Example usage:
1742 
1743    Consider the following 8x8 matrix with 34 non-zero values, that is
1744    assembled across 3 processors. Let's assume that proc0 owns 3 rows,
1745    proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
1746    as follows:
1747 
1748 .vb
1749             1  2  0  |  0  3  0  |  0  4
1750     Proc0   0  5  6  |  7  0  0  |  8  0
1751             9  0 10  | 11  0  0  | 12  0
1752     -------------------------------------
1753            13  0 14  | 15 16 17  |  0  0
1754     Proc1   0 18  0  | 19 20 21  |  0  0
1755             0  0  0  | 22 23  0  | 24  0
1756     -------------------------------------
1757     Proc2  25 26 27  |  0  0 28  | 29  0
1758            30  0  0  | 31 32 33  |  0 34
1759 .ve
1760 
1761    This can be represented as a collection of submatrices as:
1762 
1763 .vb
1764       A B C
1765       D E F
1766       G H I
1767 .ve
1768 
1769    Where the submatrices A,B,C are owned by proc0, D,E,F are
1770    owned by proc1, G,H,I are owned by proc2.
1771 
1772    The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
1773    The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
1774    The 'M','N' parameters are 8,8, and have the same values on all procs.
1775 
1776    The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
1777    submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
1778    corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
1779    Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
1780    part as SeqAIJ matrices. For example, proc1 will store [E] as a SeqAIJ
1781    matrix, and [DF] as another SeqAIJ matrix.
1782 
1783    When d_nz, o_nz parameters are specified, d_nz storage elements are
1784    allocated for every row of the local diagonal submatrix, and o_nz
1785    storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
1786    One way to choose d_nz and o_nz is to use the maximum number of nonzeros per
1787    local row for each of the local DIAGONAL and OFF-DIAGONAL submatrices.
1788    In this case, the values of d_nz,o_nz are:
1789 .vb
1790      proc0 : d_nz = 2, o_nz = 2
1791      proc1 : d_nz = 3, o_nz = 2
1792      proc2 : d_nz = 1, o_nz = 4
1793 .ve
1794    We are allocating m*(d_nz+o_nz) storage locations for every proc. This
1795    translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
1796    for proc2, i.e. we are using 12+15+10=37 storage locations to store
1797    34 values.
1798 
1799    When d_nnz, o_nnz parameters are specified, the storage is specified
1800    for every row, corresponding to both the DIAGONAL and OFF-DIAGONAL submatrices.
1801    In the above case the values for d_nnz,o_nnz are:
1802 .vb
1803      proc0: d_nnz = [2,2,2] and o_nnz = [2,2,2]
1804      proc1: d_nnz = [3,3,2] and o_nnz = [2,1,1]
1805      proc2: d_nnz = [1,1]   and o_nnz = [4,4]
1806 .ve
1807    Here the space allocated is the sum of all the above values, i.e. 34, and
1808    hence the preallocation is perfect.
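
   Example calling sequence (a minimal sketch with hypothetical names; d_nnz and o_nnz
   are arrays of length m filled by the user as described above):

.vb
      Mat A;
      ierr = MatCreateMPIAIJ(PETSC_COMM_WORLD,m,PETSC_DECIDE,PETSC_DETERMINE,N,
                             0,d_nnz,0,o_nnz,&A);CHKERRQ(ierr);
      .... calls to MatSetValues() ....
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
.ve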
1809 
1810 .keywords: matrix, aij, compressed row, sparse, parallel
1811 
1812 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1813 @*/
1814 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1815 {
1816   Mat          B;
1817   Mat_MPIAIJ   *b;
1818   int          ierr, i,sum[2],work[2],size,flag1 = 0, flag2 = 0;
1819 
1820   PetscFunctionBegin;
1821   MPI_Comm_size(comm,&size);
1822   ierr = OptionsHasName(PETSC_NULL,"-mat_mpiaij",&flag1); CHKERRQ(ierr);
1823   ierr = OptionsHasName(PETSC_NULL,"-mat_mpi",&flag2); CHKERRQ(ierr);
1824   if (!flag1 && !flag2 && size == 1) {
1825     if (M == PETSC_DECIDE) M = m;
1826     if (N == PETSC_DECIDE) N = n;
1827     ierr = MatCreateSeqAIJ(comm,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
1828     PetscFunctionReturn(0);
1829   }
1830 
1831   *A = 0;
1832   PetscHeaderCreate(B,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIAIJ,"Mat",comm,MatDestroy,MatView);
1833   PLogObjectCreate(B);
1834   B->data            = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1835   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1836   PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));
1837   B->ops->destroy    = MatDestroy_MPIAIJ;
1838   B->ops->view       = MatView_MPIAIJ;
1839   B->factor          = 0;
1840   B->assembled       = PETSC_FALSE;
1841   B->mapping         = 0;
1842 
1843   B->insertmode      = NOT_SET_VALUES;
1844   b->size            = size;
1845   MPI_Comm_rank(comm,&b->rank);
1846 
1847   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
1848     SETERRQ(PETSC_ERR_ARG_WRONG,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1849   }
1850 
1851   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1852     work[0] = m; work[1] = n;
1853     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
1854     if (M == PETSC_DECIDE) M = sum[0];
1855     if (N == PETSC_DECIDE) N = sum[1];
1856   }
1857   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1858   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1859   b->m = m; B->m = m;
1860   b->n = n; B->n = n;
1861   b->N = N; B->N = N;
1862   b->M = M; B->M = M;
1863 
1864   /* the information in the maps duplicates the information computed below, eventually
1865      we should remove the duplicate information that is not contained in the maps */
1866   ierr = MapCreateMPI(comm,m,M,&B->rmap);CHKERRQ(ierr);
1867   ierr = MapCreateMPI(comm,n,N,&B->cmap);CHKERRQ(ierr);
1868 
1869   /* build local table of row and column ownerships */
1870   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1871   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1872   b->cowners = b->rowners + b->size + 2;
1873   ierr = MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1874   b->rowners[0] = 0;
1875   for ( i=2; i<=b->size; i++ ) {
1876     b->rowners[i] += b->rowners[i-1];
1877   }
1878   b->rstart = b->rowners[b->rank];
1879   b->rend   = b->rowners[b->rank+1];
1880   ierr = MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1881   b->cowners[0] = 0;
1882   for ( i=2; i<=b->size; i++ ) {
1883     b->cowners[i] += b->cowners[i-1];
1884   }
1885   b->cstart = b->cowners[b->rank];
1886   b->cend   = b->cowners[b->rank+1];
1887 
1888   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1889   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1890   PLogObjectParent(B,b->A);
1891   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1892   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1893   PLogObjectParent(B,b->B);
1894 
1895   /* build cache for off array entries formed */
1896   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1897   b->donotstash  = 0;
1898   b->colmap      = 0;
1899   b->garray      = 0;
1900   b->roworiented = 1;
1901 
1902   /* stuff used for matrix vector multiply */
1903   b->lvec      = 0;
1904   b->Mvctx     = 0;
1905 
1906   /* stuff for MatGetRow() */
1907   b->rowindices   = 0;
1908   b->rowvalues    = 0;
1909   b->getrowactive = PETSC_FALSE;
1910 
1911   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",
1912                                      "MatStoreValues_MPIAIJ",
1913                                      (void*)MatStoreValues_MPIAIJ);CHKERRQ(ierr);
1914   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",
1915                                      "MatRetrieveValues_MPIAIJ",
1916                                      (void*)MatRetrieveValues_MPIAIJ);CHKERRQ(ierr);
1917   *A = B;
1918   PetscFunctionReturn(0);
1919 }
1920 
1921 #undef __FUNC__
1922 #define __FUNC__ "MatDuplicate_MPIAIJ"
1923 int MatDuplicate_MPIAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1924 {
1925   Mat        mat;
1926   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1927   int        ierr, len=0, flg;
1928 
1929   PetscFunctionBegin;
1930   *newmat       = 0;
1931   PetscHeaderCreate(mat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIAIJ,"Mat",matin->comm,MatDestroy,MatView);
1932   PLogObjectCreate(mat);
1933   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1934   PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));
1935   mat->ops->destroy    = MatDestroy_MPIAIJ;
1936   mat->ops->view       = MatView_MPIAIJ;
1937   mat->factor     = matin->factor;
1938   mat->assembled  = PETSC_TRUE;
1939 
1940   a->m = mat->m   = oldmat->m;
1941   a->n = mat->n   = oldmat->n;
1942   a->M = mat->M   = oldmat->M;
1943   a->N = mat->N   = oldmat->N;
1944 
1945   a->rstart       = oldmat->rstart;
1946   a->rend         = oldmat->rend;
1947   a->cstart       = oldmat->cstart;
1948   a->cend         = oldmat->cend;
1949   a->size         = oldmat->size;
1950   a->rank         = oldmat->rank;
1951   mat->insertmode = NOT_SET_VALUES;
1952   a->rowvalues    = 0;
1953   a->getrowactive = PETSC_FALSE;
1954 
1955   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1956   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1957   a->cowners = a->rowners + a->size + 2;
1958   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1959   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1960   if (oldmat->colmap) {
1961     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1962     PLogObjectMemory(mat,(a->N)*sizeof(int));
1963     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1964   } else a->colmap = 0;
1965   if (oldmat->garray) {
1966     len = ((Mat_SeqAIJ *) (oldmat->B->data))->n;
1967     a->garray = (int *) PetscMalloc((len+1)*sizeof(int)); CHKPTRQ(a->garray);
1968     PLogObjectMemory(mat,len*sizeof(int));
1969     if (len) PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1970   } else a->garray = 0;
1971 
1972   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1973   PLogObjectParent(mat,a->lvec);
1974   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1975   PLogObjectParent(mat,a->Mvctx);
1976   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A); CHKERRQ(ierr);
1977   PLogObjectParent(mat,a->A);
1978   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B); CHKERRQ(ierr);
1979   PLogObjectParent(mat,a->B);
1980   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1981   if (flg) {
1982     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1983   }
1984   *newmat = mat;
1985   PetscFunctionReturn(0);
1986 }
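
/*
    Usage sketch for MatDuplicate() (illustrative only; A is an assembled MATMPIAIJ matrix):

       Mat B;
       ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);          same layout, structure and values
       ierr = MatDuplicate(A,MAT_DO_NOT_COPY_VALUES,&B);CHKERRQ(ierr);   same nonzero structure, values zeroed
*/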
1987 
1988 #include "sys.h"
1989 
1990 #undef __FUNC__
1991 #define __FUNC__ "MatLoad_MPIAIJ"
1992 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
1993 {
1994   Mat          A;
1995   Scalar       *vals,*svals;
1996   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1997   MPI_Status   status;
1998   int          i, nz, ierr, j,rstart, rend, fd;
1999   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
2000   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
2001   int          tag = ((PetscObject)viewer)->tag,cend,cstart,n;
2002 
2003   PetscFunctionBegin;
2004   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
2005   if (!rank) {
2006     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2007     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
2008     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2009     if (header[3] < 0) {
2010       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix in special format on disk, cannot load as MPIAIJ");
2011     }
2012   }
2013 
2014   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2015   M = header[1]; N = header[2];
2016   /* determine ownership of all rows */
2017   m = M/size + ((M % size) > rank);
2018   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
2019   ierr = MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2020   rowners[0] = 0;
2021   for ( i=2; i<=size; i++ ) {
2022     rowners[i] += rowners[i-1];
2023   }
2024   rstart = rowners[rank];
2025   rend   = rowners[rank+1];
2026 
2027   /* distribute row lengths to all processors */
2028   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
2029   offlens = ourlens + (rend-rstart);
2030   if (!rank) {
2031     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
2032     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2033     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2034     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
2035     ierr = MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
2036     PetscFree(sndcounts);
2037   } else {
2038     ierr = MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);CHKERRQ(ierr);
2039   }
2040 
2041   if (!rank) {
2042     /* calculate the number of nonzeros on each processor */
2043     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2044     PetscMemzero(procsnz,size*sizeof(int));
2045     for ( i=0; i<size; i++ ) {
2046       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
2047         procsnz[i] += rowlengths[j];
2048       }
2049     }
2050     PetscFree(rowlengths);
2051 
2052     /* determine max buffer needed and allocate it */
2053     maxnz = 0;
2054     for ( i=0; i<size; i++ ) {
2055       maxnz = PetscMax(maxnz,procsnz[i]);
2056     }
2057     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2058 
2059     /* read in my part of the matrix column indices  */
2060     nz     = procsnz[0];
2061     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
2062     ierr   = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2063 
2064     /* read in everyone else's part and ship it off */
2065     for ( i=1; i<size; i++ ) {
2066       nz   = procsnz[i];
2067       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2068       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2069     }
2070     PetscFree(cols);
2071   } else {
2072     /* determine buffer space needed for message */
2073     nz = 0;
2074     for ( i=0; i<m; i++ ) {
2075       nz += ourlens[i];
2076     }
2077     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
2078 
2079     /* receive message of column indices*/
2080     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2081     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2082     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2083   }
2084 
2085   /* determine column ownership if matrix is not square */
2086   if (N != M) {
2087     n      = N/size + ((N % size) > rank);
2088     ierr   = MPI_Scan(&n,&cend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
2089     cstart = cend - n;
2090   } else {
2091     cstart = rstart;
2092     cend   = rend;
2093     n      = cend - cstart;
2094   }
2095 
2096   /* loop over local rows, determining number of off diagonal entries */
2097   PetscMemzero(offlens,m*sizeof(int));
2098   jj = 0;
2099   for ( i=0; i<m; i++ ) {
2100     for ( j=0; j<ourlens[i]; j++ ) {
2101       if (mycols[jj] < cstart || mycols[jj] >= cend) offlens[i]++;
2102       jj++;
2103     }
2104   }
2105 
2106   /* create our matrix */
2107   for ( i=0; i<m; i++ ) {
2108     ourlens[i] -= offlens[i];
2109   }
2110   ierr = MatCreateMPIAIJ(comm,m,n,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
2111   A = *newmat;
2112   ierr = MatSetOption(A,MAT_COLUMNS_SORTED); CHKERRQ(ierr);
2113   for ( i=0; i<m; i++ ) {
2114     ourlens[i] += offlens[i];
2115   }
2116 
2117   if (!rank) {
2118     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
2119 
2120     /* read in my part of the matrix numerical values  */
2121     nz   = procsnz[0];
2122     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2123 
2124     /* insert into matrix */
2125     jj      = rstart;
2126     smycols = mycols;
2127     svals   = vals;
2128     for ( i=0; i<m; i++ ) {
2129       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
2130       smycols += ourlens[i];
2131       svals   += ourlens[i];
2132       jj++;
2133     }
2134 
2135     /* read in other processors and ship out */
2136     for ( i=1; i<size; i++ ) {
2137       nz   = procsnz[i];
2138       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2139       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2140     }
2141     PetscFree(procsnz);
2142   } else {
2143     /* receive numeric values */
2144     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
2145 
2146     /* receive message of values*/
2147     ierr = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2148     ierr = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2149     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2150 
2151     /* insert into matrix */
2152     jj      = rstart;
2153     smycols = mycols;
2154     svals   = vals;
2155     for ( i=0; i<m; i++ ) {
2156       ierr     = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
2157       smycols += ourlens[i];
2158       svals   += ourlens[i];
2159       jj++;
2160     }
2161   }
2162   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
2163 
2164   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2165   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2166   PetscFunctionReturn(0);
2167 }
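
/*
    Usage sketch for loading a parallel AIJ matrix from a PETSc binary file (illustrative
    only; viewer is assumed to be an already opened binary Viewer on PETSC_COMM_WORLD):

       Mat A;
       ierr = MatLoad(viewer,MATMPIAIJ,&A);CHKERRQ(ierr);

    MatLoad() dispatches to MatLoad_MPIAIJ() above: process 0 reads the header and row
    lengths, and the column indices and values are shipped to the owning processes with
    MPI_Send()/MPI_Recv().
*/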
2168 
2169 #undef __FUNC__
2170 #define __FUNC__ "MatGetSubMatrix_MPIAIJ"
2171 /*
2172     Not great since it makes two copies of the submatrix: first a SeqAIJ on each
2173   process, and then the final result formed by concatenating the local matrices.
2174   Writing it directly would be much like MatGetSubMatrices_MPIAIJ()
2175 */
2176 int MatGetSubMatrix_MPIAIJ(Mat mat,IS isrow,IS iscol,int csize,MatGetSubMatrixCall call,Mat *newmat)
2177 {
2178   int        ierr, i, m,n,rstart,row,rend,nz,*cwork,size,rank,j;
2179   Mat        *local,M, Mreuse;
2180   Scalar     *vwork,*aa;
2181   MPI_Comm   comm = mat->comm;
2182   Mat_SeqAIJ *aij;
2183   int        *ii, *jj,nlocal,*dlens,*olens,dlen,olen,jend;
2184 
2185   PetscFunctionBegin;
2186   MPI_Comm_rank(comm,&rank);
2187   MPI_Comm_size(comm,&size);
2188 
2189   if (call ==  MAT_REUSE_MATRIX) {
2190     ierr = PetscObjectQuery((PetscObject)*newmat,"SubMatrix",(PetscObject *)&Mreuse);CHKERRQ(ierr);
2191     if (!Mreuse) SETERRQ(1,1,"Submatrix passed in was not used before, cannot reuse");
2192     local = &Mreuse;
2193     ierr  = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_REUSE_MATRIX,&local);CHKERRQ(ierr);
2194   } else {
2195     ierr = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&local);CHKERRQ(ierr);
2196     Mreuse = *local;
2197     PetscFree(local);
2198   }
2199 
2200   /*
2201       m - number of local rows
2202       n - number of columns (same on all processors)
2203       rstart - first row in new global matrix generated
2204   */
2205   ierr = MatGetSize(Mreuse,&m,&n);CHKERRQ(ierr);
2206   if (call == MAT_INITIAL_MATRIX) {
2207     aij = (Mat_SeqAIJ *) (Mreuse)->data;
2208     if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,1,"No support for index shifted matrix");
2209     ii  = aij->i;
2210     jj  = aij->j;
2211 
2212     /*
2213         Determine the number of non-zeros in the diagonal and off-diagonal
2214         portions of the matrix in order to do correct preallocation
2215     */
2216 
2217     /* first get start and end of "diagonal" columns */
2218     if (csize == PETSC_DECIDE) {
2219       nlocal = n/size + ((n % size) > rank);
2220     } else {
2221       nlocal = csize;
2222     }
2223     ierr   = MPI_Scan(&nlocal,&rend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
2224     rstart = rend - nlocal;
2225     if (rank == size - 1 && rend != n) {
2226       SETERRQ(1,1,"Local column sizes do not add up to total number of columns");
2227     }
2228 
2229     /* next, compute all the lengths */
2230     dlens = (int *) PetscMalloc( (2*m+1)*sizeof(int) );CHKPTRQ(dlens);
2231     olens = dlens + m;
2232     for ( i=0; i<m; i++ ) {
2233       jend = ii[i+1] - ii[i];
2234       olen = 0;
2235       dlen = 0;
2236       for ( j=0; j<jend; j++ ) {
2237         if ( *jj < rstart || *jj >= rend) olen++;
2238         else dlen++;
2239         jj++;
2240       }
2241       olens[i] = olen;
2242       dlens[i] = dlen;
2243     }
2244     ierr = MatCreateMPIAIJ(comm,m,nlocal,PETSC_DECIDE,n,0,dlens,0,olens,&M);CHKERRQ(ierr);
2245     PetscFree(dlens);
2246   } else {
2247     int ml,nl;
2248 
2249     M = *newmat;
2250     ierr = MatGetLocalSize(M,&ml,&nl);CHKERRQ(ierr);
2251     if (ml != m) SETERRQ(PETSC_ERR_ARG_SIZ,1,"Previous matrix must be same size/layout as request");
2252     ierr = MatZeroEntries(M);CHKERRQ(ierr);
2253     /*
2254          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
2255        rather than the slower MatSetValues().
2256     */
2257     M->was_assembled = PETSC_TRUE;
2258     M->assembled     = PETSC_FALSE;
2259   }
2260   ierr = MatGetOwnershipRange(M,&rstart,&rend); CHKERRQ(ierr);
2261   aij = (Mat_SeqAIJ *) (Mreuse)->data;
2262   if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,1,"No support for index shifted matrix");
2263   ii  = aij->i;
2264   jj  = aij->j;
2265   aa  = aij->a;
2266   for (i=0; i<m; i++) {
2267     row   = rstart + i;
2268     nz    = ii[i+1] - ii[i];
2269     cwork = jj;     jj += nz;
2270     vwork = aa;     aa += nz;
2271     ierr = MatSetValues_MPIAIJ(M,1,&row,nz,cwork,vwork,INSERT_VALUES); CHKERRQ(ierr);
2272   }
2273 
2274   ierr = MatAssemblyBegin(M,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2275   ierr = MatAssemblyEnd(M,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2276   *newmat = M;
2277 
2278   /* save submatrix used in processor for next request */
2279   if (call ==  MAT_INITIAL_MATRIX) {
2280     ierr = PetscObjectCompose((PetscObject)M,"SubMatrix",(PetscObject)Mreuse);CHKERRQ(ierr);
2281     ierr = PetscObjectDereference((PetscObject)Mreuse);CHKERRQ(ierr);
2282   }
2283 
2284   PetscFunctionReturn(0);
2285 }
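
/*
    Usage sketch for MatGetSubMatrix() with an MPIAIJ matrix (illustrative only; isrow and
    iscol select the desired rows and columns, and csize is the number of local columns of
    the result, or PETSC_DECIDE):

       Mat sub;
       ierr = MatGetSubMatrix(mat,isrow,iscol,PETSC_DECIDE,MAT_INITIAL_MATRIX,&sub);CHKERRQ(ierr);
       .... later, with the same index sets and layout ....
       ierr = MatGetSubMatrix(mat,isrow,iscol,PETSC_DECIDE,MAT_REUSE_MATRIX,&sub);CHKERRQ(ierr);

    The MAT_REUSE_MATRIX path relies on the sequential submatrix composed into sub under
    the name "SubMatrix" by the MAT_INITIAL_MATRIX call, as done above.
*/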
2286