xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 522c5e43ca724847a26b473814af9ba6607d4c94)
1 
2 #ifdef PETSC_RCS_HEADER
3 static char vcid[] = "$Id: mpiaij.c,v 1.226 1997/11/07 01:58:23 bsmith Exp bsmith $";
4 #endif
5 
6 #include "pinclude/pviewer.h"
7 #include "src/mat/impls/aij/mpi/mpiaij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/spops.h"
10 
11 /* local utility routine that creates a mapping from the global column
12 number to the local number in the off-diagonal part of the local
13 storage of the matrix.  This is done in a non scable way since the
14 length of colmap equals the global matrix length.
15 */
16 #undef __FUNC__
17 #define __FUNC__ "CreateColmap_MPIAIJ_Private"
18 int CreateColmap_MPIAIJ_Private(Mat mat)
19 {
20   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
21   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
22   int        n = B->n,i;
23 
24   PetscFunctionBegin;
25   aij->colmap = (int *) PetscMalloc((aij->N+1)*sizeof(int));CHKPTRQ(aij->colmap);
26   PLogObjectMemory(mat,aij->N*sizeof(int));
27   PetscMemzero(aij->colmap,aij->N*sizeof(int));
28   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i+1;
29   PetscFunctionReturn(0);
30 }
31 
32 extern int DisAssemble_MPIAIJ(Mat);
33 
34 #define CHUNKSIZE   15
35 #define MatSetValues_SeqAIJ_A_Private(row,col,value,addv) \
36 { \
37  \
38     rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
39     rmax = aimax[row]; nrow = ailen[row];  \
40     col1 = col - shift; \
41      \
42     low = 0; high = nrow; \
43     while (high-low > 5) { \
44       t = (low+high)/2; \
45       if (rp[t] > col) high = t; \
46       else             low  = t; \
47     } \
48       for ( _i=0; _i<nrow; _i++ ) { \
49         if (rp[_i] > col1) break; \
50         if (rp[_i] == col1) { \
51           if (addv == ADD_VALUES) ap[_i] += value;   \
52           else                  ap[_i] = value; \
53           goto a_noinsert; \
54         } \
55       }  \
56       if (nonew == 1) goto a_noinsert; \
57       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
58       if (nrow >= rmax) { \
59         /* there is no extra room in row, therefore enlarge */ \
60         int    new_nz = ai[a->m] + CHUNKSIZE,len,*new_i,*new_j; \
61         Scalar *new_a; \
62  \
63         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
64  \
65         /* malloc new storage space */ \
66         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(a->m+1)*sizeof(int); \
67         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
68         new_j   = (int *) (new_a + new_nz); \
69         new_i   = new_j + new_nz; \
70  \
71         /* copy over old data into new slots */ \
72         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];} \
73         for ( ii=row+1; ii<a->m+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
74         PetscMemcpy(new_j,aj,(ai[row]+nrow+shift)*sizeof(int)); \
75         len = (new_nz - CHUNKSIZE - ai[row] - nrow - shift); \
76         PetscMemcpy(new_j+ai[row]+shift+nrow+CHUNKSIZE,aj+ai[row]+shift+nrow, \
77                                                            len*sizeof(int)); \
78         PetscMemcpy(new_a,aa,(ai[row]+nrow+shift)*sizeof(Scalar)); \
79         PetscMemcpy(new_a+ai[row]+shift+nrow+CHUNKSIZE,aa+ai[row]+shift+nrow, \
80                                                            len*sizeof(Scalar));  \
81         /* free up old matrix storage */ \
82  \
83         PetscFree(a->a);  \
84         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
85         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
86         a->singlemalloc = 1; \
87  \
88         rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
89         rmax = aimax[row] = aimax[row] + CHUNKSIZE; \
90         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
91         a->maxnz += CHUNKSIZE; \
92         a->reallocs++; \
93       } \
94       N = nrow++ - 1; a->nz++; \
95       /* shift up all the later entries in this row */ \
96       for ( ii=N; ii>=_i; ii-- ) { \
97         rp[ii+1] = rp[ii]; \
98         ap[ii+1] = ap[ii]; \
99       } \
100       rp[_i] = col1;  \
101       ap[_i] = value;  \
102       a_noinsert: ; \
103       ailen[row] = nrow; \
104 }
105 
106 #define MatSetValues_SeqAIJ_B_Private(row,col,value,addv) \
107 { \
108  \
109     rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
110     rmax = bimax[row]; nrow = bilen[row];  \
111     col1 = col - shift; \
112      \
113     low = 0; high = nrow; \
114     while (high-low > 5) { \
115       t = (low+high)/2; \
116       if (rp[t] > col) high = t; \
117       else             low  = t; \
118     } \
119        for ( _i=0; _i<nrow; _i++ ) { \
120         if (rp[_i] > col1) break; \
121         if (rp[_i] == col1) { \
122           if (addv == ADD_VALUES) ap[_i] += value;   \
123           else                  ap[_i] = value; \
124           goto b_noinsert; \
125         } \
126       }  \
127       if (nonew == 1) goto b_noinsert; \
128       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
129       if (nrow >= rmax) { \
130         /* there is no extra room in row, therefore enlarge */ \
131         int    new_nz = bi[b->m] + CHUNKSIZE,len,*new_i,*new_j; \
132         Scalar *new_a; \
133  \
134         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
135  \
136         /* malloc new storage space */ \
137         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(b->m+1)*sizeof(int); \
138         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
139         new_j   = (int *) (new_a + new_nz); \
140         new_i   = new_j + new_nz; \
141  \
142         /* copy over old data into new slots */ \
143         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = bi[ii];} \
144         for ( ii=row+1; ii<b->m+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
145         PetscMemcpy(new_j,bj,(bi[row]+nrow+shift)*sizeof(int)); \
146         len = (new_nz - CHUNKSIZE - bi[row] - nrow - shift); \
147         PetscMemcpy(new_j+bi[row]+shift+nrow+CHUNKSIZE,bj+bi[row]+shift+nrow, \
148                                                            len*sizeof(int)); \
149         PetscMemcpy(new_a,ba,(bi[row]+nrow+shift)*sizeof(Scalar)); \
150         PetscMemcpy(new_a+bi[row]+shift+nrow+CHUNKSIZE,ba+bi[row]+shift+nrow, \
151                                                            len*sizeof(Scalar));  \
152         /* free up old matrix storage */ \
153  \
154         PetscFree(b->a);  \
155         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
156         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
157         b->singlemalloc = 1; \
158  \
159         rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
160         rmax = bimax[row] = bimax[row] + CHUNKSIZE; \
161         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
162         b->maxnz += CHUNKSIZE; \
163         b->reallocs++; \
164       } \
165       N = nrow++ - 1; b->nz++; \
166       /* shift up all the later entries in this row */ \
167       for ( ii=N; ii>=_i; ii-- ) { \
168         rp[ii+1] = rp[ii]; \
169         ap[ii+1] = ap[ii]; \
170       } \
171       rp[_i] = col1;  \
172       ap[_i] = value;  \
173       b_noinsert: ; \
174       bilen[row] = nrow; \
175 }
176 
177 extern int MatSetValues_SeqAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
178 #undef __FUNC__
179 #define __FUNC__ "MatSetValues_MPIAIJ"
180 int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
181 {
182   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
183   Scalar     value;
184   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
185   int        cstart = aij->cstart, cend = aij->cend,row,col;
186   int        roworiented = aij->roworiented;
187 
188   /* Some Variables required in the macro */
189   Mat        A = aij->A;
190   Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data;
191   int        *aimax = a->imax, *ai = a->i, *ailen = a->ilen,*aj = a->j;
192   Scalar     *aa = a->a;
193 
194   Mat        B = aij->B;
195   Mat_SeqAIJ *b = (Mat_SeqAIJ *) B->data;
196   int        *bimax = b->imax, *bi = b->i, *bilen = b->ilen,*bj = b->j;
197   Scalar     *ba = b->a;
198 
199   int        *rp,ii,nrow,_i,rmax, N, col1,low,high,t;
200   int        nonew = a->nonew,shift = a->indexshift;
201   Scalar     *ap;
202 
203   PetscFunctionBegin;
204   for ( i=0; i<m; i++ ) {
205 #if defined(USE_PETSC_BOPT_g)
206     if (im[i] < 0) SETERRQ(1,0,"Negative row");
207     if (im[i] >= aij->M) SETERRQ(1,0,"Row too large");
208 #endif
209     if (im[i] >= rstart && im[i] < rend) {
210       row = im[i] - rstart;
211       for ( j=0; j<n; j++ ) {
212         if (in[j] >= cstart && in[j] < cend){
213           col = in[j] - cstart;
214           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
215           MatSetValues_SeqAIJ_A_Private(row,col,value,addv);
216           /* ierr = MatSetValues_SeqAIJ(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
217         }
218 #if defined(USE_PETSC_BOPT_g)
219         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
220         else if (in[j] >= aij->N) {SETERRQ(1,0,"Col too large");}
221 #endif
222         else {
223           if (mat->was_assembled) {
224             if (!aij->colmap) {
225               ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
226             }
227             col = aij->colmap[in[j]] - 1;
228             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
229               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
230               col =  in[j];
231               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
232               B = aij->B;
233               b = (Mat_SeqAIJ *) B->data;
234               bimax = b->imax; bi = b->i; bilen = b->ilen; bj = b->j;
235               ba = b->a;
236             }
237           }
238           else col = in[j];
239           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
240           MatSetValues_SeqAIJ_B_Private(row,col,value,addv);
241           /* ierr = MatSetValues_SeqAIJ(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
242         }
243       }
244     }
245     else {
246       if (roworiented && !aij->donotstash) {
247         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
248       }
249       else {
250         if (!aij->donotstash) {
251           row = im[i];
252           for ( j=0; j<n; j++ ) {
253             ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
254           }
255         }
256       }
257     }
258   }
259   PetscFunctionReturn(0);
260 }
261 
262 #undef __FUNC__
263 #define __FUNC__ "MatGetValues_MPIAIJ"
264 int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
265 {
266   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
267   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
268   int        cstart = aij->cstart, cend = aij->cend,row,col;
269 
270   PetscFunctionBegin;
271   for ( i=0; i<m; i++ ) {
272     if (idxm[i] < 0) SETERRQ(1,0,"Negative row");
273     if (idxm[i] >= aij->M) SETERRQ(1,0,"Row too large");
274     if (idxm[i] >= rstart && idxm[i] < rend) {
275       row = idxm[i] - rstart;
276       for ( j=0; j<n; j++ ) {
277         if (idxn[j] < 0) SETERRQ(1,0,"Negative column");
278         if (idxn[j] >= aij->N) SETERRQ(1,0,"Col too large");
279         if (idxn[j] >= cstart && idxn[j] < cend){
280           col = idxn[j] - cstart;
281           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
282         }
283         else {
284           if (!aij->colmap) {
285             ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
286           }
287           col = aij->colmap[idxn[j]] - 1;
288           if ((col < 0) || (aij->garray[col] != idxn[j])) *(v+i*n+j) = 0.0;
289           else {
290             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
291           }
292         }
293       }
294     }
295     else {
296       SETERRQ(1,0,"Only local values currently supported");
297     }
298   }
299   PetscFunctionReturn(0);
300 }
301 
302 #undef __FUNC__
303 #define __FUNC__ "MatAssemblyBegin_MPIAIJ"
304 int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
305 {
306   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
307   MPI_Comm    comm = mat->comm;
308   int         size = aij->size, *owners = aij->rowners;
309   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
310   MPI_Request *send_waits,*recv_waits;
311   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
312   InsertMode  addv;
313   Scalar      *rvalues,*svalues;
314 
315   PetscFunctionBegin;
316   /* make sure all processors are either in INSERTMODE or ADDMODE */
317   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
318   if (addv == (ADD_VALUES|INSERT_VALUES)) {
319     SETERRQ(1,0,"Some processors inserted others added");
320   }
321   mat->insertmode = addv; /* in case this processor had no cache */
322 
323   /*  first count number of contributors to each processor */
324   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
325   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
326   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
327   for ( i=0; i<aij->stash.n; i++ ) {
328     idx = aij->stash.idx[i];
329     for ( j=0; j<size; j++ ) {
330       if (idx >= owners[j] && idx < owners[j+1]) {
331         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
332       }
333     }
334   }
335   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
336 
337   /* inform other processors of number of messages and max length*/
338   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
339   ierr = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
340   nreceives = work[rank];
341   ierr = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
342   nmax = work[rank];
343   PetscFree(work);
344 
345   /* post receives:
346        1) each message will consist of ordered pairs
347      (global index,value) we store the global index as a double
348      to simplify the message passing.
349        2) since we don't know how long each individual message is we
350      allocate the largest needed buffer for each receive. Potentially
351      this is a lot of wasted space.
352 
353 
354        This could be done better.
355   */
356   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
357   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
358   for ( i=0; i<nreceives; i++ ) {
359     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
360               comm,recv_waits+i);CHKERRQ(ierr);
361   }
362 
363   /* do sends:
364       1) starts[i] gives the starting index in svalues for stuff going to
365          the ith processor
366   */
367   svalues    = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
368   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
369   starts     = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
370   starts[0]  = 0;
371   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
372   for ( i=0; i<aij->stash.n; i++ ) {
373     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
374     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
375     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
376   }
377   PetscFree(owner);
378   starts[0] = 0;
379   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
380   count = 0;
381   for ( i=0; i<size; i++ ) {
382     if (procs[i]) {
383       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
384     }
385   }
386   PetscFree(starts); PetscFree(nprocs);
387 
388   /* Free cache space */
389   PLogInfo(mat,"MatAssemblyBegin_MPIAIJ:Number of off-processor values %d\n",aij->stash.n);
390   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
391 
392   aij->svalues    = svalues;    aij->rvalues    = rvalues;
393   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
394   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
395   aij->rmax       = nmax;
396 
397   PetscFunctionReturn(0);
398 }
399 extern int MatSetUpMultiply_MPIAIJ(Mat);
400 
401 #undef __FUNC__
402 #define __FUNC__ "MatAssemblyEnd_MPIAIJ"
403 int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
404 {
405   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
406   MPI_Status  *send_status,recv_status;
407   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
408   int         row,col,other_disassembled;
409   Scalar      *values,val;
410   InsertMode  addv = mat->insertmode;
411 
412   PetscFunctionBegin;
413   /*  wait on receives */
414   while (count) {
415     ierr = MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
416     /* unpack receives into our local space */
417     values = aij->rvalues + 3*imdex*aij->rmax;
418     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
419     n = n/3;
420     for ( i=0; i<n; i++ ) {
421       row = (int) PetscReal(values[3*i]) - aij->rstart;
422       col = (int) PetscReal(values[3*i+1]);
423       val = values[3*i+2];
424       if (col >= aij->cstart && col < aij->cend) {
425         col -= aij->cstart;
426         ierr = MatSetValues(aij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
427       } else {
428         if (mat->was_assembled || mat->assembled) {
429           if (!aij->colmap) {
430             ierr = CreateColmap_MPIAIJ_Private(mat); CHKERRQ(ierr);
431           }
432           col = aij->colmap[col] - 1;
433           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
434             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
435             col = (int) PetscReal(values[3*i+1]);
436           }
437         }
438         ierr = MatSetValues(aij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
439       }
440     }
441     count--;
442   }
443   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
444 
445   /* wait on sends */
446   if (aij->nsends) {
447     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
448     ierr        = MPI_Waitall(aij->nsends,aij->send_waits,send_status);CHKERRQ(ierr);
449     PetscFree(send_status);
450   }
451   PetscFree(aij->send_waits); PetscFree(aij->svalues);
452 
453   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
454   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
455 
456   /* determine if any processor has disassembled, if so we must
457      also disassemble ourselfs, in order that we may reassemble. */
458   ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
459   if (mat->was_assembled && !other_disassembled) {
460     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
461   }
462 
463   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
464     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
465   }
466   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
467   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
468 
469   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
470   PetscFunctionReturn(0);
471 }
472 
473 #undef __FUNC__
474 #define __FUNC__ "MatZeroEntries_MPIAIJ"
475 int MatZeroEntries_MPIAIJ(Mat A)
476 {
477   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
478   int        ierr;
479 
480   PetscFunctionBegin;
481   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
482   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 /* the code does not do the diagonal entries correctly unless the
487    matrix is square and the column and row owerships are identical.
488    This is a BUG. The only way to fix it seems to be to access
489    aij->A and aij->B directly and not through the MatZeroRows()
490    routine.
491 */
492 #undef __FUNC__
493 #define __FUNC__ "MatZeroRows_MPIAIJ"
494 int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
495 {
496   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
497   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
498   int            *procs,*nprocs,j,found,idx,nsends,*work;
499   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
500   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
501   int            *lens,imdex,*lrows,*values;
502   MPI_Comm       comm = A->comm;
503   MPI_Request    *send_waits,*recv_waits;
504   MPI_Status     recv_status,*send_status;
505   IS             istmp;
506 
507   PetscFunctionBegin;
508   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
509   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
510 
511   /*  first count number of contributors to each processor */
512   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
513   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
514   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
515   for ( i=0; i<N; i++ ) {
516     idx = rows[i];
517     found = 0;
518     for ( j=0; j<size; j++ ) {
519       if (idx >= owners[j] && idx < owners[j+1]) {
520         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
521       }
522     }
523     if (!found) SETERRQ(1,0,"Index out of range");
524   }
525   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
526 
527   /* inform other processors of number of messages and max length*/
528   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
529   ierr = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
530   nrecvs = work[rank];
531   ierr = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
532   nmax = work[rank];
533   PetscFree(work);
534 
535   /* post receives:   */
536   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int));CHKPTRQ(rvalues);
537   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
538   for ( i=0; i<nrecvs; i++ ) {
539     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
540   }
541 
542   /* do sends:
543       1) starts[i] gives the starting index in svalues for stuff going to
544          the ith processor
545   */
546   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
547   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
548   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
549   starts[0] = 0;
550   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
551   for ( i=0; i<N; i++ ) {
552     svalues[starts[owner[i]]++] = rows[i];
553   }
554   ISRestoreIndices(is,&rows);
555 
556   starts[0] = 0;
557   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
558   count = 0;
559   for ( i=0; i<size; i++ ) {
560     if (procs[i]) {
561       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
562     }
563   }
564   PetscFree(starts);
565 
566   base = owners[rank];
567 
568   /*  wait on receives */
569   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
570   source = lens + nrecvs;
571   count  = nrecvs; slen = 0;
572   while (count) {
573     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
574     /* unpack receives into our local space */
575     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
576     source[imdex]  = recv_status.MPI_SOURCE;
577     lens[imdex]  = n;
578     slen += n;
579     count--;
580   }
581   PetscFree(recv_waits);
582 
583   /* move the data into the send scatter */
584   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
585   count = 0;
586   for ( i=0; i<nrecvs; i++ ) {
587     values = rvalues + i*nmax;
588     for ( j=0; j<lens[i]; j++ ) {
589       lrows[count++] = values[j] - base;
590     }
591   }
592   PetscFree(rvalues); PetscFree(lens);
593   PetscFree(owner); PetscFree(nprocs);
594 
595   /* actually zap the local rows */
596   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
597   PLogObjectParent(A,istmp);
598   PetscFree(lrows);
599   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
600   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
601   ierr = ISDestroy(istmp); CHKERRQ(ierr);
602 
603   /* wait on sends */
604   if (nsends) {
605     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
606     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
607     PetscFree(send_status);
608   }
609   PetscFree(send_waits); PetscFree(svalues);
610 
611   PetscFunctionReturn(0);
612 }
613 
614 #undef __FUNC__
615 #define __FUNC__ "MatMult_MPIAIJ"
616 int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
617 {
618   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
619   int        ierr,nt;
620 
621   PetscFunctionBegin;
622   ierr = VecGetLocalSize(xx,&nt);  CHKERRQ(ierr);
623   if (nt != a->n) {
624     SETERRQ(1,0,"Incompatible partition of A and xx");
625   }
626   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
627   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
628   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
629   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
630   PetscFunctionReturn(0);
631 }
632 
633 #undef __FUNC__
634 #define __FUNC__ "MatMultAdd_MPIAIJ"
635 int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
636 {
637   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
638   int        ierr;
639 
640   PetscFunctionBegin;
641   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
642   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
643   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
644   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
645   PetscFunctionReturn(0);
646 }
647 
648 #undef __FUNC__
649 #define __FUNC__ "MatMultTrans_MPIAIJ"
650 int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
651 {
652   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
653   int        ierr;
654 
655   PetscFunctionBegin;
656   /* do nondiagonal part */
657   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
658   /* send it on its way */
659   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
660   /* do local part */
661   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
662   /* receive remote parts: note this assumes the values are not actually */
663   /* inserted in yy until the next line, which is true for my implementation*/
664   /* but is not perhaps always true. */
665   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
666   PetscFunctionReturn(0);
667 }
668 
669 #undef __FUNC__
670 #define __FUNC__ "MatMultTransAdd_MPIAIJ"
671 int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
672 {
673   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
674   int        ierr;
675 
676   PetscFunctionBegin;
677   /* do nondiagonal part */
678   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
679   /* send it on its way */
680   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
681   /* do local part */
682   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
683   /* receive remote parts: note this assumes the values are not actually */
684   /* inserted in yy until the next line, which is true for my implementation*/
685   /* but is not perhaps always true. */
686   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
687   PetscFunctionReturn(0);
688 }
689 
690 /*
691   This only works correctly for square matrices where the subblock A->A is the
692    diagonal block
693 */
694 #undef __FUNC__
695 #define __FUNC__ "MatGetDiagonal_MPIAIJ"
696 int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
697 {
698   int        ierr;
699   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
700 
701   PetscFunctionBegin;
702   if (a->M != a->N) SETERRQ(1,0,"Supports only square matrix where A->A is diag block");
703   if (a->rstart != a->cstart || a->rend != a->cend) {
704     SETERRQ(1,0,"row partition must equal col partition");
705   }
706   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
707   PetscFunctionReturn(0);
708 }
709 
710 #undef __FUNC__
711 #define __FUNC__ "MatScale_MPIAIJ"
712 int MatScale_MPIAIJ(Scalar *aa,Mat A)
713 {
714   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
715   int        ierr;
716 
717   PetscFunctionBegin;
718   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
719   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
720   PetscFunctionReturn(0);
721 }
722 
723 #undef __FUNC__
724 #define __FUNC__ "MatDestroy_MPIAIJ"
725 int MatDestroy_MPIAIJ(PetscObject obj)
726 {
727   Mat        mat = (Mat) obj;
728   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
729   int        ierr;
730 
731   PetscFunctionBegin;
732 #if defined(USE_PETSC_LOG)
733   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
734 #endif
735   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
736   PetscFree(aij->rowners);
737   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
738   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
739   if (aij->colmap) PetscFree(aij->colmap);
740   if (aij->garray) PetscFree(aij->garray);
741   if (aij->lvec)   VecDestroy(aij->lvec);
742   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
743   if (aij->rowvalues) PetscFree(aij->rowvalues);
744   PetscFree(aij);
745   PLogObjectDestroy(mat);
746   PetscHeaderDestroy(mat);
747   PetscFunctionReturn(0);
748 }
749 
750 #undef __FUNC__
751 #define __FUNC__ "MatView_MPIAIJ_Binary"
752 extern int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
753 {
754   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
755   int         ierr;
756 
757   PetscFunctionBegin;
758   if (aij->size == 1) {
759     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
760   }
761   else SETERRQ(1,0,"Only uniprocessor output supported");
762   PetscFunctionReturn(0);
763 }
764 
765 #undef __FUNC__
766 #define __FUNC__ "MatView_MPIAIJ_ASCIIorDraworMatlab"
767 extern int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
768 {
769   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
770   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
771   int         ierr, format,shift = C->indexshift,rank;
772   FILE        *fd;
773   ViewerType  vtype;
774 
775   PetscFunctionBegin;
776   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
777   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
778     ierr = ViewerGetFormat(viewer,&format);
779     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
780       MatInfo info;
781       int     flg;
782       MPI_Comm_rank(mat->comm,&rank);
783       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
784       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
785       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
786       PetscSequentialPhaseBegin(mat->comm,1);
787       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
788          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
789       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
790          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
791       ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);
792       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used);
793       ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);
794       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used);
795       fflush(fd);
796       PetscSequentialPhaseEnd(mat->comm,1);
797       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
798       PetscFunctionReturn(0);
799     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
800       PetscFunctionReturn(0);
801     }
802   }
803 
804   if (vtype == DRAW_VIEWER) {
805     Draw       draw;
806     PetscTruth isnull;
807     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
808     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
809   }
810 
811   if (vtype == ASCII_FILE_VIEWER) {
812     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
813     PetscSequentialPhaseBegin(mat->comm,1);
814     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
815            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
816            aij->cend);
817     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
818     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
819     fflush(fd);
820     PetscSequentialPhaseEnd(mat->comm,1);
821   } else {
822     int size = aij->size;
823     rank = aij->rank;
824     if (size == 1) {
825       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
826     } else {
827       /* assemble the entire matrix onto first processor. */
828       Mat         A;
829       Mat_SeqAIJ *Aloc;
830       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
831       Scalar      *a;
832 
833       if (!rank) {
834         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
835       } else {
836         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
837       }
838       PLogObjectParent(mat,A);
839 
840       /* copy over the A part */
841       Aloc = (Mat_SeqAIJ*) aij->A->data;
842       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
843       row = aij->rstart;
844       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
845       for ( i=0; i<m; i++ ) {
846         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
847         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
848       }
849       aj = Aloc->j;
850       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
851 
852       /* copy over the B part */
853       Aloc = (Mat_SeqAIJ*) aij->B->data;
854       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
855       row = aij->rstart;
856       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
857       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
858       for ( i=0; i<m; i++ ) {
859         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
860         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
861       }
862       PetscFree(ct);
863       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
864       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
865       /*
866          Everyone has to call to draw the matrix since the graphics waits are
867          synchronized across all processors that share the Draw object
868       */
869       if (!rank || vtype == DRAW_VIEWER) {
870         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
871       }
872       ierr = MatDestroy(A); CHKERRQ(ierr);
873     }
874   }
875   PetscFunctionReturn(0);
876 }
877 
878 #undef __FUNC__
879 #define __FUNC__ "MatView_MPIAIJ"
880 int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
881 {
882   Mat         mat = (Mat) obj;
883   int         ierr;
884   ViewerType  vtype;
885 
886   PetscFunctionBegin;
887   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
888   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
889       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
890     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
891   }
892   else if (vtype == BINARY_FILE_VIEWER) {
893     ierr = MatView_MPIAIJ_Binary(mat,viewer);CHKERRQ(ierr);
894   }
895   PetscFunctionReturn(0);
896 }
897 
898 /*
899     This has to provide several versions.
900 
901      2) a) use only local smoothing updating outer values only once.
902         b) local smoothing updating outer values each inner iteration
903      3) color updating out values betwen colors.
904 */
905 #undef __FUNC__
906 #define __FUNC__ "MatRelax_MPIAIJ"
907 int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
908                            double fshift,int its,Vec xx)
909 {
910   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
911   Mat        AA = mat->A, BB = mat->B;
912   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
913   Scalar     *b,*x,*xs,*ls,d,*v,sum;
914   int        ierr,*idx, *diag;
915   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
916 
917   PetscFunctionBegin;
918   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
919   xs = x + shift; /* shift by one for index start of 1 */
920   ls = ls + shift;
921   if (!A->diag) {ierr = MatMarkDiag_SeqAIJ(AA); CHKERRQ(ierr);}
922   diag = A->diag;
923   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
924     if (flag & SOR_ZERO_INITIAL_GUESS) {
925       ierr = (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
926       PetscFunctionReturn(0);
927     }
928     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
929     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
930     while (its--) {
931       /* go down through the rows */
932       for ( i=0; i<m; i++ ) {
933         n    = A->i[i+1] - A->i[i];
934         idx  = A->j + A->i[i] + shift;
935         v    = A->a + A->i[i] + shift;
936         sum  = b[i];
937         SPARSEDENSEMDOT(sum,xs,v,idx,n);
938         d    = fshift + A->a[diag[i]+shift];
939         n    = B->i[i+1] - B->i[i];
940         idx  = B->j + B->i[i] + shift;
941         v    = B->a + B->i[i] + shift;
942         SPARSEDENSEMDOT(sum,ls,v,idx,n);
943         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
944       }
945       /* come up through the rows */
946       for ( i=m-1; i>-1; i-- ) {
947         n    = A->i[i+1] - A->i[i];
948         idx  = A->j + A->i[i] + shift;
949         v    = A->a + A->i[i] + shift;
950         sum  = b[i];
951         SPARSEDENSEMDOT(sum,xs,v,idx,n);
952         d    = fshift + A->a[diag[i]+shift];
953         n    = B->i[i+1] - B->i[i];
954         idx  = B->j + B->i[i] + shift;
955         v    = B->a + B->i[i] + shift;
956         SPARSEDENSEMDOT(sum,ls,v,idx,n);
957         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
958       }
959     }
960   } else if (flag & SOR_LOCAL_FORWARD_SWEEP){
961     if (flag & SOR_ZERO_INITIAL_GUESS) {
962       ierr = (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
963       PetscFunctionReturn(0);
964     }
965     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
966     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
967     while (its--) {
968       for ( i=0; i<m; i++ ) {
969         n    = A->i[i+1] - A->i[i];
970         idx  = A->j + A->i[i] + shift;
971         v    = A->a + A->i[i] + shift;
972         sum  = b[i];
973         SPARSEDENSEMDOT(sum,xs,v,idx,n);
974         d    = fshift + A->a[diag[i]+shift];
975         n    = B->i[i+1] - B->i[i];
976         idx  = B->j + B->i[i] + shift;
977         v    = B->a + B->i[i] + shift;
978         SPARSEDENSEMDOT(sum,ls,v,idx,n);
979         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
980       }
981     }
982   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
983     if (flag & SOR_ZERO_INITIAL_GUESS) {
984       ierr = (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
985       PetscFunctionReturn(0);
986     }
987     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,
988                             mat->Mvctx); CHKERRQ(ierr);
989     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,
990                             mat->Mvctx); CHKERRQ(ierr);
991     while (its--) {
992       for ( i=m-1; i>-1; i-- ) {
993         n    = A->i[i+1] - A->i[i];
994         idx  = A->j + A->i[i] + shift;
995         v    = A->a + A->i[i] + shift;
996         sum  = b[i];
997         SPARSEDENSEMDOT(sum,xs,v,idx,n);
998         d    = fshift + A->a[diag[i]+shift];
999         n    = B->i[i+1] - B->i[i];
1000         idx  = B->j + B->i[i] + shift;
1001         v    = B->a + B->i[i] + shift;
1002         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1003         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1004       }
1005     }
1006   } else {
1007     SETERRQ(1,0,"Parallel SOR not supported");
1008   }
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 #undef __FUNC__
1013 #define __FUNC__ "MatGetInfo_MPIAIJ"
1014 int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1015 {
1016   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1017   Mat        A = mat->A, B = mat->B;
1018   int        ierr;
1019   double     isend[5], irecv[5];
1020 
1021   PetscFunctionBegin;
1022   info->block_size     = 1.0;
1023   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1024   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1025   isend[3] = info->memory;  isend[4] = info->mallocs;
1026   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1027   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1028   isend[3] += info->memory;  isend[4] += info->mallocs;
1029   if (flag == MAT_LOCAL) {
1030     info->nz_used      = isend[0];
1031     info->nz_allocated = isend[1];
1032     info->nz_unneeded  = isend[2];
1033     info->memory       = isend[3];
1034     info->mallocs      = isend[4];
1035   } else if (flag == MAT_GLOBAL_MAX) {
1036     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1037     info->nz_used      = irecv[0];
1038     info->nz_allocated = irecv[1];
1039     info->nz_unneeded  = irecv[2];
1040     info->memory       = irecv[3];
1041     info->mallocs      = irecv[4];
1042   } else if (flag == MAT_GLOBAL_SUM) {
1043     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1044     info->nz_used      = irecv[0];
1045     info->nz_allocated = irecv[1];
1046     info->nz_unneeded  = irecv[2];
1047     info->memory       = irecv[3];
1048     info->mallocs      = irecv[4];
1049   }
1050   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1051   info->fill_ratio_needed = 0;
1052   info->factor_mallocs    = 0;
1053   info->rows_global       = (double)mat->M;
1054   info->columns_global    = (double)mat->N;
1055   info->rows_local        = (double)mat->m;
1056   info->columns_local     = (double)mat->N;
1057 
1058   PetscFunctionReturn(0);
1059 }
1060 
1061 #undef __FUNC__
1062 #define __FUNC__ "MatSetOption_MPIAIJ"
1063 int MatSetOption_MPIAIJ(Mat A,MatOption op)
1064 {
1065   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1066 
1067   PetscFunctionBegin;
1068   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1069       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1070       op == MAT_COLUMNS_UNSORTED ||
1071       op == MAT_COLUMNS_SORTED ||
1072       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1073       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1074         MatSetOption(a->A,op);
1075         MatSetOption(a->B,op);
1076   } else if (op == MAT_ROW_ORIENTED) {
1077     a->roworiented = 1;
1078     MatSetOption(a->A,op);
1079     MatSetOption(a->B,op);
1080   } else if (op == MAT_ROWS_SORTED ||
1081              op == MAT_ROWS_UNSORTED ||
1082              op == MAT_SYMMETRIC ||
1083              op == MAT_STRUCTURALLY_SYMMETRIC ||
1084              op == MAT_YES_NEW_DIAGONALS)
1085     PLogInfo(A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1086   else if (op == MAT_COLUMN_ORIENTED) {
1087     a->roworiented = 0;
1088     MatSetOption(a->A,op);
1089     MatSetOption(a->B,op);
1090   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1091     a->donotstash = 1;
1092   } else if (op == MAT_NO_NEW_DIAGONALS){
1093     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1094   } else {
1095     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1096   }
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 #undef __FUNC__
1101 #define __FUNC__ "MatGetSize_MPIAIJ"
1102 int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1103 {
1104   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1105 
1106   PetscFunctionBegin;
1107   if (m) *m = mat->M;
1108   if (n) *n = mat->N;
1109   PetscFunctionReturn(0);
1110 }
1111 
1112 #undef __FUNC__
1113 #define __FUNC__ "MatGetLocalSize_MPIAIJ"
1114 int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1115 {
1116   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1117 
1118   PetscFunctionBegin;
1119   if (m) *m = mat->m;
1120   if (n) *n = mat->N;
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 #undef __FUNC__
1125 #define __FUNC__ "MatGetOwnershipRange_MPIAIJ"
1126 int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1127 {
1128   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1129 
1130   PetscFunctionBegin;
1131   *m = mat->rstart; *n = mat->rend;
1132   PetscFunctionReturn(0);
1133 }
1134 
1135 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1136 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1137 
1138 #undef __FUNC__
1139 #define __FUNC__ "MatGetRow_MPIAIJ"
1140 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1141 {
1142   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1143   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1144   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1145   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1146   int        *cmap, *idx_p;
1147 
1148   PetscFunctionBegin;
1149   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,0,"Already active");
1150   mat->getrowactive = PETSC_TRUE;
1151 
1152   if (!mat->rowvalues && (idx || v)) {
1153     /*
1154         allocate enough space to hold information from the longest row.
1155     */
1156     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1157     int     max = 1,m = mat->m,tmp;
1158     for ( i=0; i<m; i++ ) {
1159       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1160       if (max < tmp) { max = tmp; }
1161     }
1162     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1163     CHKPTRQ(mat->rowvalues);
1164     mat->rowindices = (int *) (mat->rowvalues + max);
1165   }
1166 
1167   if (row < rstart || row >= rend) SETERRQ(1,0,"Only local rows")
1168   lrow = row - rstart;
1169 
1170   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1171   if (!v)   {pvA = 0; pvB = 0;}
1172   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1173   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1174   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1175   nztot = nzA + nzB;
1176 
1177   cmap  = mat->garray;
1178   if (v  || idx) {
1179     if (nztot) {
1180       /* Sort by increasing column numbers, assuming A and B already sorted */
1181       int imark = -1;
1182       if (v) {
1183         *v = v_p = mat->rowvalues;
1184         for ( i=0; i<nzB; i++ ) {
1185           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1186           else break;
1187         }
1188         imark = i;
1189         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1190         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1191       }
1192       if (idx) {
1193         *idx = idx_p = mat->rowindices;
1194         if (imark > -1) {
1195           for ( i=0; i<imark; i++ ) {
1196             idx_p[i] = cmap[cworkB[i]];
1197           }
1198         } else {
1199           for ( i=0; i<nzB; i++ ) {
1200             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1201             else break;
1202           }
1203           imark = i;
1204         }
1205         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1206         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1207       }
1208     }
1209     else {
1210       if (idx) *idx = 0;
1211       if (v)   *v   = 0;
1212     }
1213   }
1214   *nz = nztot;
1215   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1216   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1217   PetscFunctionReturn(0);
1218 }
1219 
1220 #undef __FUNC__
1221 #define __FUNC__ "MatRestoreRow_MPIAIJ"
1222 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1223 {
1224   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1225 
1226   PetscFunctionBegin;
1227   if (aij->getrowactive == PETSC_FALSE) {
1228     SETERRQ(1,0,"MatGetRow not called");
1229   }
1230   aij->getrowactive = PETSC_FALSE;
1231   PetscFunctionReturn(0);
1232 }
1233 
1234 #undef __FUNC__
1235 #define __FUNC__ "MatNorm_MPIAIJ"
1236 int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1237 {
1238   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1239   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1240   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1241   double     sum = 0.0;
1242   Scalar     *v;
1243 
1244   PetscFunctionBegin;
1245   if (aij->size == 1) {
1246     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1247   } else {
1248     if (type == NORM_FROBENIUS) {
1249       v = amat->a;
1250       for (i=0; i<amat->nz; i++ ) {
1251 #if defined(USE_PETSC_COMPLEX)
1252         sum += real(conj(*v)*(*v)); v++;
1253 #else
1254         sum += (*v)*(*v); v++;
1255 #endif
1256       }
1257       v = bmat->a;
1258       for (i=0; i<bmat->nz; i++ ) {
1259 #if defined(USE_PETSC_COMPLEX)
1260         sum += real(conj(*v)*(*v)); v++;
1261 #else
1262         sum += (*v)*(*v); v++;
1263 #endif
1264       }
1265       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1266       *norm = sqrt(*norm);
1267     } else if (type == NORM_1) { /* max column norm */
1268       double *tmp, *tmp2;
1269       int    *jj, *garray = aij->garray;
1270       tmp  = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp);
1271       tmp2 = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp2);
1272       PetscMemzero(tmp,aij->N*sizeof(double));
1273       *norm = 0.0;
1274       v = amat->a; jj = amat->j;
1275       for ( j=0; j<amat->nz; j++ ) {
1276         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1277       }
1278       v = bmat->a; jj = bmat->j;
1279       for ( j=0; j<bmat->nz; j++ ) {
1280         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1281       }
1282       ierr = MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1283       for ( j=0; j<aij->N; j++ ) {
1284         if (tmp2[j] > *norm) *norm = tmp2[j];
1285       }
1286       PetscFree(tmp); PetscFree(tmp2);
1287     } else if (type == NORM_INFINITY) { /* max row norm */
1288       double ntemp = 0.0;
1289       for ( j=0; j<amat->m; j++ ) {
1290         v = amat->a + amat->i[j] + shift;
1291         sum = 0.0;
1292         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1293           sum += PetscAbsScalar(*v); v++;
1294         }
1295         v = bmat->a + bmat->i[j] + shift;
1296         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1297           sum += PetscAbsScalar(*v); v++;
1298         }
1299         if (sum > ntemp) ntemp = sum;
1300       }
1301       ierr = MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);CHKERRQ(ierr);
1302     } else {
1303       SETERRQ(1,0,"No support for two norm");
1304     }
1305   }
1306   PetscFunctionReturn(0);
1307 }
1308 
1309 #undef __FUNC__
1310 #define __FUNC__ "MatTranspose_MPIAIJ"
1311 int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1312 {
1313   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1314   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1315   int        ierr,shift = Aloc->indexshift;
1316   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1317   Mat        B;
1318   Scalar     *array;
1319 
1320   PetscFunctionBegin;
1321   if (matout == PETSC_NULL && M != N) {
1322     SETERRQ(1,0,"Square matrix only for in-place");
1323   }
1324 
1325   ierr = MatCreateMPIAIJ(A->comm,a->n,a->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);
1326 
1327   /* copy over the A part */
1328   Aloc = (Mat_SeqAIJ*) a->A->data;
1329   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1330   row = a->rstart;
1331   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1332   for ( i=0; i<m; i++ ) {
1333     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1334     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1335   }
1336   aj = Aloc->j;
1337   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1338 
1339   /* copy over the B part */
1340   Aloc = (Mat_SeqAIJ*) a->B->data;
1341   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1342   row = a->rstart;
1343   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1344   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1345   for ( i=0; i<m; i++ ) {
1346     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1347     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1348   }
1349   PetscFree(ct);
1350   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1351   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1352   if (matout != PETSC_NULL) {
1353     *matout = B;
1354   } else {
1355     /* This isn't really an in-place transpose .... but free data structures from a */
1356     PetscFree(a->rowners);
1357     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1358     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1359     if (a->colmap) PetscFree(a->colmap);
1360     if (a->garray) PetscFree(a->garray);
1361     if (a->lvec) VecDestroy(a->lvec);
1362     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1363     PetscFree(a);
1364     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1365     PetscHeaderDestroy(B);
1366   }
1367   PetscFunctionReturn(0);
1368 }
1369 
1370 #undef __FUNC__
1371 #define __FUNC__ "MatDiagonalScale_MPIAIJ"
1372 int MatDiagonalScale_MPIAIJ(Mat mat,Vec ll,Vec rr)
1373 {
1374   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1375   Mat a = aij->A, b = aij->B;
1376   int ierr,s1,s2,s3;
1377 
1378   PetscFunctionBegin;
1379   ierr = MatGetLocalSize(mat,&s2,&s3); CHKERRQ(ierr);
1380   if (rr) {
1381     s3 = aij->n;
1382     VecGetLocalSize_Fast(rr,s1);
1383     if (s1!=s3) SETERRQ(1,0,"right vector non-conforming local size");
1384     /* Overlap communication with computation. */
1385     ierr = VecScatterBegin(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1386   }
1387   if (ll) {
1388     VecGetLocalSize_Fast(ll,s1);
1389     if (s1!=s2) SETERRQ(1,0,"left vector non-conforming local size");
1390     ierr = (*b->ops.diagonalscale)(b,ll,0); CHKERRQ(ierr);
1391   }
1392   /* scale  the diagonal block */
1393   ierr = (*a->ops.diagonalscale)(a,ll,rr); CHKERRQ(ierr);
1394 
1395   if (rr) {
1396     /* Do a scatter end and then right scale the off-diagonal block */
1397     ierr = VecScatterEnd(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1398     ierr = (*b->ops.diagonalscale)(b,0,aij->lvec); CHKERRQ(ierr);
1399   }
1400 
1401   PetscFunctionReturn(0);
1402 }
1403 
1404 
1405 extern int MatPrintHelp_SeqAIJ(Mat);
1406 #undef __FUNC__
1407 #define __FUNC__ "MatPrintHelp_MPIAIJ"
1408 int MatPrintHelp_MPIAIJ(Mat A)
1409 {
1410   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1411   int        ierr;
1412 
1413   PetscFunctionBegin;
1414   if (!a->rank) {
1415     ierr = MatPrintHelp_SeqAIJ(a->A);CHKERRQ(ierr);
1416   }
1417   PetscFunctionReturn(0);
1418 }
1419 
1420 #undef __FUNC__
1421 #define __FUNC__ "MatGetBlockSize_MPIAIJ"
1422 int MatGetBlockSize_MPIAIJ(Mat A,int *bs)
1423 {
1424   PetscFunctionBegin;
1425   *bs = 1;
1426   PetscFunctionReturn(0);
1427 }
1428 #undef __FUNC__
1429 #define __FUNC__ "MatSetUnfactored_MPIAIJ"
1430 int MatSetUnfactored_MPIAIJ(Mat A)
1431 {
1432   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1433   int        ierr;
1434 
1435   PetscFunctionBegin;
1436   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1437   PetscFunctionReturn(0);
1438 }
1439 
1440 #undef __FUNC__
1441 #define __FUNC__ "MatEqual_MPIAIJ"
1442 int MatEqual_MPIAIJ(Mat A, Mat B, PetscTruth *flag)
1443 {
1444   Mat_MPIAIJ *matB = (Mat_MPIAIJ *) B->data,*matA = (Mat_MPIAIJ *) A->data;
1445   Mat        a, b, c, d;
1446   PetscTruth flg;
1447   int        ierr;
1448 
1449   PetscFunctionBegin;
1450   if (B->type != MATMPIAIJ) SETERRQ(1,0,"Matrices must be same type");
1451   a = matA->A; b = matA->B;
1452   c = matB->A; d = matB->B;
1453 
1454   ierr = MatEqual(a, c, &flg); CHKERRQ(ierr);
1455   if (flg == PETSC_TRUE) {
1456     ierr = MatEqual(b, d, &flg); CHKERRQ(ierr);
1457   }
1458   ierr = MPI_Allreduce(&flg, flag, 1, MPI_INT, MPI_LAND, A->comm);CHKERRQ(ierr);
1459   PetscFunctionReturn(0);
1460 }
1461 
1462 extern int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1463 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1464 extern int MatFDColoringCreate_MPIAIJ(Mat,ISColoring,MatFDColoring);
1465 extern int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1466 extern int MatGetSubMatrix_MPIAIJ (Mat ,IS,IS,MatGetSubMatrixCall,Mat *);
1467 
1468 /* -------------------------------------------------------------------*/
1469 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1470        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1471        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1472        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1473        0,0,
1474        0,0,
1475        0,0,
1476        MatRelax_MPIAIJ,
1477        MatTranspose_MPIAIJ,
1478        MatGetInfo_MPIAIJ,MatEqual_MPIAIJ,
1479        MatGetDiagonal_MPIAIJ,MatDiagonalScale_MPIAIJ,MatNorm_MPIAIJ,
1480        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1481        0,
1482        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1483        0,0,0,0,
1484        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1485        0,0,
1486        0,0,MatConvertSameType_MPIAIJ,0,0,
1487        0,0,0,
1488        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1489        MatPrintHelp_MPIAIJ,
1490        MatScale_MPIAIJ,0,0,0,
1491        MatGetBlockSize_MPIAIJ,0,0,0,0,
1492        MatFDColoringCreate_MPIAIJ,0,MatSetUnfactored_MPIAIJ,
1493        0,0,MatGetSubMatrix_MPIAIJ};
1494 
1495 
1496 #undef __FUNC__
1497 #define __FUNC__ "MatCreateMPIAIJ"
1498 /*@C
1499    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1500    (the default parallel PETSc format).  For good matrix assembly performance
1501    the user should preallocate the matrix storage by setting the parameters
1502    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1503    performance can be increased by more than a factor of 50.
1504 
1505    Input Parameters:
1506 .  comm - MPI communicator
1507 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1508            This value should be the same as the local size used in creating the
1509            y vector for the matrix-vector product y = Ax.
1510 .  n - This value should be the same as the local size used in creating the
1511        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
1512        calculated if N is given) For square matrices n is almost always m.
1513 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1514 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1515 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1516            (same for all local rows)
1517 .  d_nzz - array containing the number of nonzeros in the various rows of the
1518            diagonal portion of the local submatrix (possibly different for each row)
1519            or PETSC_NULL. You must leave room for the diagonal entry even if
1520            it is zero.
1521 .  o_nz - number of nonzeros per row in the off-diagonal portion of local
1522            submatrix (same for all local rows).
1523 .  o_nzz - array containing the number of nonzeros in the various rows of the
1524            off-diagonal portion of the local submatrix (possibly different for
1525            each row) or PETSC_NULL.
1526 
1527    Output Parameter:
1528 .  A - the matrix
1529 
1530    Notes:
1531    The AIJ format (also called the Yale sparse matrix format or
1532    compressed row storage), is fully compatible with standard Fortran 77
1533    storage.  That is, the stored row and column indices can begin at
1534    either one (as in Fortran) or zero.  See the users manual for details.
1535 
1536    The user MUST specify either the local or global matrix dimensions
1537    (possibly both).
1538 
1539    By default, this format uses inodes (identical nodes) when possible.
1540    We search for consecutive rows with the same nonzero structure, thereby
1541    reusing matrix information to achieve increased efficiency.
1542 
1543    Options Database Keys:
1544 $    -mat_aij_no_inode  - Do not use inodes
1545 $    -mat_aij_inode_limit <limit> - Set inode limit.
1546 $        (max limit=5)
1547 $    -mat_aij_oneindex - Internally use indexing starting at 1
1548 $        rather than 0.  Note: When calling MatSetValues(),
1549 $        the user still MUST index entries starting at 0!
1550 
1551    Storage Information:
1552    For a square global matrix we define each processor's diagonal portion
1553    to be its local rows and the corresponding columns (a square submatrix);
1554    each processor's off-diagonal portion encompasses the remainder of the
1555    local matrix (a rectangular submatrix).
1556 
1557    The user can specify preallocated storage for the diagonal part of
1558    the local submatrix with either d_nz or d_nnz (not both).  Set
1559    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1560    memory allocation.  Likewise, specify preallocated storage for the
1561    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1562 
1563    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1564    the figure below we depict these three local rows and all columns (0-11).
1565 
1566 $          0 1 2 3 4 5 6 7 8 9 10 11
1567 $         -------------------
1568 $  row 3  |  o o o d d d o o o o o o
1569 $  row 4  |  o o o d d d o o o o o o
1570 $  row 5  |  o o o d d d o o o o o o
1571 $         -------------------
1572 $
1573 
1574    Thus, any entries in the d locations are stored in the d (diagonal)
1575    submatrix, and any entries in the o locations are stored in the
1576    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1577    stored simply in the MATSEQAIJ format for compressed row storage.
1578 
1579    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1580    and o_nz should indicate the number of nonzeros per row in the o matrix.
1581    In general, for PDE problems in which most nonzeros are near the diagonal,
1582    one expects d_nz >> o_nz. For large problems you MUST preallocate memory
1583    or you will get TERRIBLE performance; see the users' manual chapter on
1584    matrices.
1585 
1586 .keywords: matrix, aij, compressed row, sparse, parallel
1587 
1588 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1589 @*/
1590 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1591                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1592 {
1593   Mat          B;
1594   Mat_MPIAIJ   *b;
1595   int          ierr, i,sum[2],work[2],size;
1596 
1597   PetscFunctionBegin;
1598   MPI_Comm_size(comm,&size);
1599   if (size == 1) {
1600     if (M == PETSC_DECIDE) M = m;
1601     if (N == PETSC_DECIDE) N = n;
1602     ierr = MatCreateSeqAIJ(comm,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
1603     PetscFunctionReturn(0);
1604   }
1605 
1606   *A = 0;
1607   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIAIJ,comm,MatDestroy,MatView);
1608   PLogObjectCreate(B);
1609   B->data       = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1610   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1611   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1612   B->destroy    = MatDestroy_MPIAIJ;
1613   B->view       = MatView_MPIAIJ;
1614   B->factor     = 0;
1615   B->assembled  = PETSC_FALSE;
1616   B->mapping    = 0;
1617 
1618   B->insertmode = NOT_SET_VALUES;
1619   b->size       = size;
1620   MPI_Comm_rank(comm,&b->rank);
1621 
1622   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
1623     SETERRQ(1,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1624   }
1625 
1626   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1627     work[0] = m; work[1] = n;
1628     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
1629     if (M == PETSC_DECIDE) M = sum[0];
1630     if (N == PETSC_DECIDE) N = sum[1];
1631   }
1632   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1633   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1634   b->m = m; B->m = m;
1635   b->n = n; B->n = n;
1636   b->N = N; B->N = N;
1637   b->M = M; B->M = M;
1638 
1639   /* build local table of row and column ownerships */
1640   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1641   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1642   b->cowners = b->rowners + b->size + 2;
1643   ierr = MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1644   b->rowners[0] = 0;
1645   for ( i=2; i<=b->size; i++ ) {
1646     b->rowners[i] += b->rowners[i-1];
1647   }
1648   b->rstart = b->rowners[b->rank];
1649   b->rend   = b->rowners[b->rank+1];
1650   ierr = MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1651   b->cowners[0] = 0;
1652   for ( i=2; i<=b->size; i++ ) {
1653     b->cowners[i] += b->cowners[i-1];
1654   }
1655   b->cstart = b->cowners[b->rank];
1656   b->cend   = b->cowners[b->rank+1];
1657 
1658   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1659   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1660   PLogObjectParent(B,b->A);
1661   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1662   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1663   PLogObjectParent(B,b->B);
1664 
1665   /* build cache for off array entries formed */
1666   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1667   b->donotstash  = 0;
1668   b->colmap      = 0;
1669   b->garray      = 0;
1670   b->roworiented = 1;
1671 
1672   /* stuff used for matrix vector multiply */
1673   b->lvec      = 0;
1674   b->Mvctx     = 0;
1675 
1676   /* stuff for MatGetRow() */
1677   b->rowindices   = 0;
1678   b->rowvalues    = 0;
1679   b->getrowactive = PETSC_FALSE;
1680 
1681   *A = B;
1682   PetscFunctionReturn(0);
1683 }
1684 
1685 #undef __FUNC__
1686 #define __FUNC__ "MatConvertSameType_MPIAIJ"
1687 int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1688 {
1689   Mat        mat;
1690   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1691   int        ierr, len=0, flg;
1692 
1693   PetscFunctionBegin;
1694   *newmat       = 0;
1695   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm,MatDestroy,MatView);
1696   PLogObjectCreate(mat);
1697   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1698   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1699   mat->destroy    = MatDestroy_MPIAIJ;
1700   mat->view       = MatView_MPIAIJ;
1701   mat->factor     = matin->factor;
1702   mat->assembled  = PETSC_TRUE;
1703 
1704   a->m = mat->m   = oldmat->m;
1705   a->n = mat->n   = oldmat->n;
1706   a->M = mat->M   = oldmat->M;
1707   a->N = mat->N   = oldmat->N;
1708 
1709   a->rstart       = oldmat->rstart;
1710   a->rend         = oldmat->rend;
1711   a->cstart       = oldmat->cstart;
1712   a->cend         = oldmat->cend;
1713   a->size         = oldmat->size;
1714   a->rank         = oldmat->rank;
1715   mat->insertmode = NOT_SET_VALUES;
1716   a->rowvalues    = 0;
1717   a->getrowactive = PETSC_FALSE;
1718 
1719   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1720   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1721   a->cowners = a->rowners + a->size + 2;
1722   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1723   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1724   if (oldmat->colmap) {
1725     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1726     PLogObjectMemory(mat,(a->N)*sizeof(int));
1727     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1728   } else a->colmap = 0;
1729   if (oldmat->garray) {
1730     len = ((Mat_SeqAIJ *) (oldmat->B->data))->n;
1731     a->garray = (int *) PetscMalloc((len+1)*sizeof(int)); CHKPTRQ(a->garray);
1732     PLogObjectMemory(mat,len*sizeof(int));
1733     if (len) PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1734   } else a->garray = 0;
1735 
1736   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1737   PLogObjectParent(mat,a->lvec);
1738   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1739   PLogObjectParent(mat,a->Mvctx);
1740   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1741   PLogObjectParent(mat,a->A);
1742   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1743   PLogObjectParent(mat,a->B);
1744   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1745   if (flg) {
1746     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1747   }
1748   *newmat = mat;
1749   PetscFunctionReturn(0);
1750 }
1751 
1752 #include "sys.h"
1753 
1754 #undef __FUNC__
1755 #define __FUNC__ "MatLoad_MPIAIJ"
1756 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
1757 {
1758   Mat          A;
1759   Scalar       *vals,*svals;
1760   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1761   MPI_Status   status;
1762   int          i, nz, ierr, j,rstart, rend, fd;
1763   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1764   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1765   int          tag = ((PetscObject)viewer)->tag;
1766 
1767   PetscFunctionBegin;
1768   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1769   if (!rank) {
1770     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1771     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
1772     if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not matrix object");
1773     if (header[3] < 0) {
1774       SETERRQ(1,1,"Matrix stored in special format on disk, cannot load as MPIAIJ");
1775     }
1776   }
1777 
1778 
1779   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
1780   M = header[1]; N = header[2];
1781   /* determine ownership of all rows */
1782   m = M/size + ((M % size) > rank);
1783   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1784   ierr = MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1785   rowners[0] = 0;
1786   for ( i=2; i<=size; i++ ) {
1787     rowners[i] += rowners[i-1];
1788   }
1789   rstart = rowners[rank];
1790   rend   = rowners[rank+1];
1791 
1792   /* distribute row lengths to all processors */
1793   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1794   offlens = ourlens + (rend-rstart);
1795   if (!rank) {
1796     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1797     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
1798     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1799     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1800     ierr = MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
1801     PetscFree(sndcounts);
1802   } else {
1803     ierr = MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);CHKERRQ(ierr);
1804   }
1805 
1806   if (!rank) {
1807     /* calculate the number of nonzeros on each processor */
1808     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1809     PetscMemzero(procsnz,size*sizeof(int));
1810     for ( i=0; i<size; i++ ) {
1811       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1812         procsnz[i] += rowlengths[j];
1813       }
1814     }
1815     PetscFree(rowlengths);
1816 
1817     /* determine max buffer needed and allocate it */
1818     maxnz = 0;
1819     for ( i=0; i<size; i++ ) {
1820       maxnz = PetscMax(maxnz,procsnz[i]);
1821     }
1822     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1823 
1824     /* read in my part of the matrix column indices  */
1825     nz     = procsnz[0];
1826     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1827     ierr   = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
1828 
1829     /* read in every one elses and ship off */
1830     for ( i=1; i<size; i++ ) {
1831       nz   = procsnz[i];
1832       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
1833       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
1834     }
1835     PetscFree(cols);
1836   } else {
1837     /* determine buffer space needed for message */
1838     nz = 0;
1839     for ( i=0; i<m; i++ ) {
1840       nz += ourlens[i];
1841     }
1842     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1843 
1844     /* receive message of column indices*/
1845     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
1846     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
1847     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1848   }
1849 
1850   /* loop over local rows, determining number of off diagonal entries */
1851   PetscMemzero(offlens,m*sizeof(int));
1852   jj = 0;
1853   for ( i=0; i<m; i++ ) {
1854     for ( j=0; j<ourlens[i]; j++ ) {
1855       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1856       jj++;
1857     }
1858   }
1859 
1860   /* create our matrix */
1861   for ( i=0; i<m; i++ ) {
1862     ourlens[i] -= offlens[i];
1863   }
1864   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1865   A = *newmat;
1866   MatSetOption(A,MAT_COLUMNS_SORTED);
1867   for ( i=0; i<m; i++ ) {
1868     ourlens[i] += offlens[i];
1869   }
1870 
1871   if (!rank) {
1872     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1873 
1874     /* read in my part of the matrix numerical values  */
1875     nz = procsnz[0];
1876     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
1877 
1878     /* insert into matrix */
1879     jj      = rstart;
1880     smycols = mycols;
1881     svals   = vals;
1882     for ( i=0; i<m; i++ ) {
1883       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1884       smycols += ourlens[i];
1885       svals   += ourlens[i];
1886       jj++;
1887     }
1888 
1889     /* read in other processors and ship out */
1890     for ( i=1; i<size; i++ ) {
1891       nz   = procsnz[i];
1892       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
1893       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
1894     }
1895     PetscFree(procsnz);
1896   } else {
1897     /* receive numeric values */
1898     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1899 
1900     /* receive message of values*/
1901     ierr = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
1902     ierr = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
1903     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1904 
1905     /* insert into matrix */
1906     jj      = rstart;
1907     smycols = mycols;
1908     svals   = vals;
1909     for ( i=0; i<m; i++ ) {
1910       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1911       smycols += ourlens[i];
1912       svals   += ourlens[i];
1913       jj++;
1914     }
1915   }
1916   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1917 
1918   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1919   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1920   PetscFunctionReturn(0);
1921 }
1922 
1923 #undef __FUNC__
1924 #define __FUNC__ "MatGetSubMatrix_MPIAIJ"
1925 /*
1926     Not great since it makes two copies of the submatrix, first an SeqAIJ
1927   in local and then by concatenating the local matrices the end result.
1928   Writing it directly would be much like MatGetSubMatrices_MPIAIJ()
1929 */
1930 int MatGetSubMatrix_MPIAIJ(Mat mat,IS isrow,IS iscol,MatGetSubMatrixCall call,Mat *newmat)
1931 {
1932   int        ierr, i, m,n,rstart,row,rend,nz,*cwork,size,rank,j;
1933   Mat        *local,M;
1934   Scalar     *vwork,*aa;
1935   MPI_Comm   comm = mat->comm;
1936   Mat_SeqAIJ *aij;
1937   int        *ii, *jj,nlocal,*dlens,*olens,dlen,olen,jend;
1938 
1939   PetscFunctionBegin;
1940   MPI_Comm_rank(comm,&rank);
1941   MPI_Comm_size(comm,&size);
1942 
1943   ierr = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&local);CHKERRQ(ierr);
1944 
1945   /*
1946       m - number of local rows
1947       n - number of columns (same on all processors)
1948       rstart - first row in new global matrix generated
1949   */
1950   ierr = MatGetSize(*local,&m,&n);CHKERRQ(ierr);
1951   if (call == MAT_INITIAL_MATRIX) {
1952     aij = (Mat_SeqAIJ *) (*local)->data;
1953     if (aij->indexshift) SETERRQ(1,1,"No support for index shifted matrix");
1954     ii  = aij->i;
1955     jj  = aij->j;
1956 
1957     /*
1958         Determine the number of non-zeros in the diagonal and off-diagonal
1959         portions of the matrix in order to do correct preallocation
1960     */
1961 
1962     /* first get start and end of "diagonal" columns */
1963     nlocal = n/size + ((n % size) > rank);
1964     ierr   = MPI_Scan(&nlocal,&rend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
1965     rstart = rend - nlocal;
1966 
1967     /* next, compute all the lengths */
1968     dlens = (int *) PetscMalloc( (2*m+1)*sizeof(int) );CHKPTRQ(dlens);
1969     olens = dlens + m;
1970     for ( i=0; i<m; i++ ) {
1971       jend = ii[i+1] - ii[i];
1972       olen = 0;
1973       dlen = 0;
1974       for ( j=0; j<jend; j++ ) {
1975         if ( *jj < rstart || *jj >= rend) olen++;
1976         else dlen++;
1977         jj++;
1978       }
1979       olens[i] = olen;
1980       dlens[i] = dlen;
1981     }
1982     ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,PETSC_DECIDE,n,0,dlens,0,olens,&M);CHKERRQ(ierr);
1983     PetscFree(dlens);
1984   } else {
1985     int ml,nl;
1986 
1987     M = *newmat;
1988     ierr = MatGetLocalSize(M,&ml,&nl);CHKERRQ(ierr);
1989     if (ml != m) SETERRQ(1,1,"Previous matrix must be same size/layout as request");
1990     ierr = MatZeroEntries(M);CHKERRQ(ierr);
1991   }
1992   ierr = MatGetOwnershipRange(M,&rstart,&rend); CHKERRQ(ierr);
1993   aij = (Mat_SeqAIJ *) (*local)->data;
1994   if (aij->indexshift) SETERRQ(1,1,"No support for index shifted matrix");
1995   ii  = aij->i;
1996   jj  = aij->j;
1997   aa  = aij->a;
1998   for (i=0; i<m; i++) {
1999     row   = rstart + i;
2000     nz    = ii[i+1] - ii[i];
2001     cwork = jj;     jj += nz;
2002     vwork = aa;     aa += nz;
2003     ierr = MatSetValues_MPIAIJ(M,1,&row,nz,cwork,vwork,INSERT_VALUES); CHKERRQ(ierr);
2004   }
2005 
2006   ierr = MatDestroyMatrices(1,&local);CHKERRQ(ierr);
2007   ierr = MatAssemblyBegin(M,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2008   ierr = MatAssemblyEnd(M,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2009   *newmat = M;
2010   PetscFunctionReturn(0);
2011 }
2012