xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision bbd702db22f56e953ae24ab45c34809dbb746cb2)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpiaij.c,v 1.215 1997/09/11 20:39:34 bsmith Exp bsmith $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/aij/mpi/mpiaij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 
10 /* local utility routine that creates a mapping from the global column
11 number to the local number in the off-diagonal part of the local
12 storage of the matrix.  This is done in a non-scalable way, since the
13 length of colmap equals the global number of columns.
14 */
15 #undef __FUNC__
16 #define __FUNC__ "CreateColmap_MPIAIJ_Private"
17 int CreateColmap_MPIAIJ_Private(Mat mat)
18 {
19   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
20   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
21   int        n = B->n,i;
22 
23   aij->colmap = (int *) PetscMalloc((aij->N+1)*sizeof(int));CHKPTRQ(aij->colmap);
24   PLogObjectMemory(mat,aij->N*sizeof(int));
25   PetscMemzero(aij->colmap,aij->N*sizeof(int));
26   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i+1;
27   return 0;
28 }
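/*
   Usage sketch (an editorial addition, not in the original source): given a
   hypothetical global column number gcol, the local column in the
   off-diagonal block is recovered as below.  colmap stores local index + 1
   so that 0 can mean "column not present".
*/
#if 0
    int lcol = aij->colmap[gcol] - 1;          /* -1 => gcol not in B's pattern */
    if (lcol >= 0 && aij->garray[lcol] == gcol) {
      /* gcol is column lcol of the local off-diagonal block aij->B */
    }
#endif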
29 
30 extern int DisAssemble_MPIAIJ(Mat);
31 
32 #define CHUNKSIZE   15
33 #define MatSetValues_SeqAIJ_A_Private(row,col,value,addv) \
34 { \
35  \
36     rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
37     rmax = aimax[row]; nrow = ailen[row];  \
38     col1 = col - shift; \
39      \
40     low = 0; high = nrow; \
41     while (high-low > 5) { \
42       t = (low+high)/2; \
43       if (rp[t] > col1) high = t; \
44       else              low  = t; \
45     } \
46       for ( _i=low; _i<nrow; _i++ ) { \
47         if (rp[_i] > col1) break; \
48         if (rp[_i] == col1) { \
49           if (addv == ADD_VALUES) ap[_i] += value;   \
50           else                  ap[_i] = value; \
51           goto a_noinsert; \
52         } \
53       }  \
54       if (nonew == 1) goto a_noinsert; \
55       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
56       if (nrow >= rmax) { \
57         /* there is no extra room in row, therefore enlarge */ \
58         int    new_nz = ai[a->m] + CHUNKSIZE,len,*new_i,*new_j; \
59         Scalar *new_a; \
60  \
61         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
62  \
63         /* malloc new storage space */ \
64         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(a->m+1)*sizeof(int); \
65         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
66         new_j   = (int *) (new_a + new_nz); \
67         new_i   = new_j + new_nz; \
68  \
69         /* copy over old data into new slots */ \
70         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];} \
71         for ( ii=row+1; ii<a->m+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
72         PetscMemcpy(new_j,aj,(ai[row]+nrow+shift)*sizeof(int)); \
73         len = (new_nz - CHUNKSIZE - ai[row] - nrow - shift); \
74         PetscMemcpy(new_j+ai[row]+shift+nrow+CHUNKSIZE,aj+ai[row]+shift+nrow, \
75                                                            len*sizeof(int)); \
76         PetscMemcpy(new_a,aa,(ai[row]+nrow+shift)*sizeof(Scalar)); \
77         PetscMemcpy(new_a+ai[row]+shift+nrow+CHUNKSIZE,aa+ai[row]+shift+nrow, \
78                                                            len*sizeof(Scalar));  \
79         /* free up old matrix storage */ \
80  \
81         PetscFree(a->a);  \
82         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
83         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
84         a->singlemalloc = 1; \
85  \
86         rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
87         rmax = aimax[row] = aimax[row] + CHUNKSIZE; \
88         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
89         a->maxnz += CHUNKSIZE; \
90         a->reallocs++; \
91       } \
92       N = nrow++ - 1; a->nz++; \
93       /* shift up all the later entries in this row */ \
94       for ( ii=N; ii>=_i; ii-- ) { \
95         rp[ii+1] = rp[ii]; \
96         ap[ii+1] = ap[ii]; \
97       } \
98       rp[_i] = col1;  \
99       ap[_i] = value;  \
100       a_noinsert: ; \
101       ailen[row] = nrow; \
102 }
103 
104 #define MatSetValues_SeqAIJ_B_Private(row,col,value,addv) \
105 { \
106  \
107     rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
108     rmax = bimax[row]; nrow = bilen[row];  \
109     col1 = col - shift; \
110      \
111     low = 0; high = nrow; \
112     while (high-low > 5) { \
113       t = (low+high)/2; \
114       if (rp[t] > col1) high = t; \
115       else              low  = t; \
116     } \
117       for ( _i=low; _i<nrow; _i++ ) { \
118         if (rp[_i] > col1) break; \
119         if (rp[_i] == col1) { \
120           if (addv == ADD_VALUES) ap[_i] += value;   \
121           else                  ap[_i] = value; \
122           goto b_noinsert; \
123         } \
124       }  \
125       if (nonew == 1) goto b_noinsert; \
126       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
127       if (nrow >= rmax) { \
128         /* there is no extra room in row, therefore enlarge */ \
129         int    new_nz = bi[b->m] + CHUNKSIZE,len,*new_i,*new_j; \
130         Scalar *new_a; \
131  \
132         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
133  \
134         /* malloc new storage space */ \
135         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(b->m+1)*sizeof(int); \
136         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
137         new_j   = (int *) (new_a + new_nz); \
138         new_i   = new_j + new_nz; \
139  \
140         /* copy over old data into new slots */ \
141         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = bi[ii];} \
142         for ( ii=row+1; ii<b->m+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
143         PetscMemcpy(new_j,bj,(bi[row]+nrow+shift)*sizeof(int)); \
144         len = (new_nz - CHUNKSIZE - bi[row] - nrow - shift); \
145         PetscMemcpy(new_j+bi[row]+shift+nrow+CHUNKSIZE,bj+bi[row]+shift+nrow, \
146                                                            len*sizeof(int)); \
147         PetscMemcpy(new_a,ba,(bi[row]+nrow+shift)*sizeof(Scalar)); \
148         PetscMemcpy(new_a+bi[row]+shift+nrow+CHUNKSIZE,ba+bi[row]+shift+nrow, \
149                                                            len*sizeof(Scalar));  \
150         /* free up old matrix storage */ \
151  \
152         PetscFree(b->a);  \
153         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
154         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
155         b->singlemalloc = 1; \
156  \
157         rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
158         rmax = bimax[row] = bimax[row] + CHUNKSIZE; \
159         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
160         b->maxnz += CHUNKSIZE; \
161         b->reallocs++; \
162       } \
163       N = nrow++ - 1; b->nz++; \
164       /* shift up all the later entries in this row */ \
165       for ( ii=N; ii>=_i; ii-- ) { \
166         rp[ii+1] = rp[ii]; \
167         ap[ii+1] = ap[ii]; \
168       } \
169       rp[_i] = col1;  \
170       ap[_i] = value;  \
171       b_noinsert: ; \
172       bilen[row] = nrow; \
173 }
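/*
   Note (editorial): the two macros above are textual twins; one updates the
   diagonal block (a, ai, aj, aimax, ailen), the other the off-diagonal block
   (b, bi, bj, bimax, bilen).  When a row overflows, a single malloc of
   new_nz*(sizeof(int)+sizeof(Scalar)) + (m+1)*sizeof(int) bytes is carved
   into new_a (new_nz Scalars), new_j (new_nz ints), and new_i (m+1 ints),
   which is why singlemalloc is set to 1 afterwards.
*/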
174 
175 extern int MatSetValues_SeqAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
176 #undef __FUNC__
177 #define __FUNC__ "MatSetValues_MPIAIJ"
178 int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
179 {
180   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
181   Scalar     value;
182   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
183   int        cstart = aij->cstart, cend = aij->cend,row,col;
184   int        roworiented = aij->roworiented;
185 
186   /* Some variables required by the two macros above */
187   Mat        A = aij->A;
188   Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data;
189   int        *aimax = a->imax, *ai = a->i, *ailen = a->ilen,*aj = a->j;
190   Scalar     *aa = a->a;
191 
192   Mat        B = aij->B;
193   Mat_SeqAIJ *b = (Mat_SeqAIJ *) B->data;
194   int        *bimax = b->imax, *bi = b->i, *bilen = b->ilen,*bj = b->j;
195   Scalar     *ba = b->a;
196 
197   int        *rp,ii,nrow,_i,rmax, N, col1,low,high,t;
198   int        nonew = a->nonew,shift = a->indexshift;
199   Scalar     *ap;
200 
201   for ( i=0; i<m; i++ ) {
202 #if defined(PETSC_BOPT_g)
203     if (im[i] < 0) SETERRQ(1,0,"Negative row");
204     if (im[i] >= aij->M) SETERRQ(1,0,"Row too large");
205 #endif
206     if (im[i] >= rstart && im[i] < rend) {
207       row = im[i] - rstart;
208       for ( j=0; j<n; j++ ) {
209         if (in[j] >= cstart && in[j] < cend){
210           col = in[j] - cstart;
211           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
212           MatSetValues_SeqAIJ_A_Private(row,col,value,addv);
213           /* ierr = MatSetValues_SeqAIJ(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
214         }
215 #if defined(PETSC_BOPT_g)
216         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
217         else if (in[j] >= aij->N) {SETERRQ(1,0,"Col too large");}
218 #endif
219         else {
220           if (mat->was_assembled) {
221             if (!aij->colmap) {
222               ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
223             }
224             col = aij->colmap[in[j]] - 1;
225             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
226               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
227               col =  in[j];
228               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
229               B = aij->B;
230               b = (Mat_SeqAIJ *) B->data;
231               bimax = b->imax; bi = b->i; bilen = b->ilen; bj = b->j;
232               ba = b->a;
233             }
234           }
235           else col = in[j];
236           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
237           MatSetValues_SeqAIJ_B_Private(row,col,value,addv);
238           /* ierr = MatSetValues_SeqAIJ(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
239         }
240       }
241     }
242     else {
243       if (roworiented && !aij->donotstash) {
244         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
245       }
246       else {
247         if (!aij->donotstash) {
248           row = im[i];
249           for ( j=0; j<n; j++ ) {
250             ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
251           }
252         }
253       }
254     }
255   }
256   return 0;
257 }
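/*
   Usage sketch (an editorial addition): any process may set any global
   entry; off-processor entries are stashed above and shipped during
   MatAssemblyBegin/End().  The indices 0,0 are just an example.
*/
#if 0
    int    ierr,grow = 0,gcol = 0;
    Scalar one  = 1.0;
    ierr = MatSetValues(mat,1,&grow,1,&gcol,&one,ADD_VALUES); CHKERRQ(ierr);
    ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
#endif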
258 
259 #undef __FUNC__
260 #define __FUNC__ "MatGetValues_MPIAIJ"
261 int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
262 {
263   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
264   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
265   int        cstart = aij->cstart, cend = aij->cend,row,col;
266 
267   for ( i=0; i<m; i++ ) {
268     if (idxm[i] < 0) SETERRQ(1,0,"Negative row");
269     if (idxm[i] >= aij->M) SETERRQ(1,0,"Row too large");
270     if (idxm[i] >= rstart && idxm[i] < rend) {
271       row = idxm[i] - rstart;
272       for ( j=0; j<n; j++ ) {
273         if (idxn[j] < 0) SETERRQ(1,0,"Negative column");
274         if (idxn[j] >= aij->N) SETERRQ(1,0,"Col too large");
275         if (idxn[j] >= cstart && idxn[j] < cend){
276           col = idxn[j] - cstart;
277           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
278         }
279         else {
280           if (!aij->colmap) {
281             ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
282           }
283           col = aij->colmap[idxn[j]] - 1;
284           if ((col < 0) || (aij->garray[col] != idxn[j])) *(v+i*n+j) = 0.0;
285           else {
286             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
287           }
288         }
289       }
290     }
291     else {
292       SETERRQ(1,0,"Only local values currently supported");
293     }
294   }
295   return 0;
296 }
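/*
   Usage sketch (an editorial addition): values may only be retrieved for
   locally owned rows; entries not stored locally come back as 0.0.
*/
#if 0
    int    ierr,rstart,rend,gcol = 0;   /* gcol is an illustrative assumption */
    Scalar val;
    ierr = MatGetOwnershipRange(mat,&rstart,&rend); CHKERRQ(ierr);
    ierr = MatGetValues(mat,1,&rstart,1,&gcol,&val); CHKERRQ(ierr);
#endif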
297 
298 #undef __FUNC__
299 #define __FUNC__ "MatAssemblyBegin_MPIAIJ"
300 int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
301 {
302   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
303   MPI_Comm    comm = mat->comm;
304   int         size = aij->size, *owners = aij->rowners;
305   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
306   MPI_Request *send_waits,*recv_waits;
307   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
308   InsertMode  addv;
309   Scalar      *rvalues,*svalues;
310 
311   /* make sure all processors are using either INSERT_VALUES or ADD_VALUES */
312   MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
313   if (addv == (ADD_VALUES|INSERT_VALUES)) {
314     SETERRQ(1,0,"Some processors inserted others added");
315   }
316   mat->insertmode = addv; /* in case this processor had no cache */
317 
318   /*  first count number of contributors to each processor */
319   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
320   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
321   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
322   for ( i=0; i<aij->stash.n; i++ ) {
323     idx = aij->stash.idx[i];
324     for ( j=0; j<size; j++ ) {
325       if (idx >= owners[j] && idx < owners[j+1]) {
326         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
327       }
328     }
329   }
330   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
331 
332   /* inform other processors of the number of messages and the maximum length */
333   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
334   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
335   nreceives = work[rank];
336   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
337   nmax = work[rank];
338   PetscFree(work);
339 
340   /* post receives:
341        1) each message will consist of ordered triples
342      (global row index,global column index,value); we store the
343      indices as Scalars to simplify the message passing.
344        2) since we don't know how long each individual message is, we
345      allocate the largest needed buffer for each receive. Potentially
346      this is a lot of wasted space.
347 
348 
349        This could be done better.
350   */
351   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
352   CHKPTRQ(rvalues);
353   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
354   CHKPTRQ(recv_waits);
355   for ( i=0; i<nreceives; i++ ) {
356     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
357               comm,recv_waits+i);
358   }
359 
360   /* do sends:
361       1) starts[i] gives the starting index in svalues for stuff going to
362          the ith processor
363   */
364   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
365   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
366   CHKPTRQ(send_waits);
367   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
368   starts[0] = 0;
369   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
370   for ( i=0; i<aij->stash.n; i++ ) {
371     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
372     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
373     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
374   }
375   PetscFree(owner);
376   starts[0] = 0;
377   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
378   count = 0;
379   for ( i=0; i<size; i++ ) {
380     if (procs[i]) {
381       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
382                 comm,send_waits+count++);
383     }
384   }
385   PetscFree(starts); PetscFree(nprocs);
386 
387   /* Free cache space */
388   PLogInfo(mat,"MatAssemblyBegin_MPIAIJ:Number of off-processor values %d\n",aij->stash.n);
389   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
390 
391   aij->svalues    = svalues;    aij->rvalues    = rvalues;
392   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
393   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
394   aij->rmax       = nmax;
395 
396   return 0;
397 }
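/*
   Layout of each stashed entry in svalues/rvalues (three Scalars per entry):
     svalues[3*k]   = (Scalar) global row index
     svalues[3*k+1] = (Scalar) global column index
     svalues[3*k+2] = the numerical value
   MatAssemblyEnd_MPIAIJ() below unpacks the triples in the same order.
*/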
398 extern int MatSetUpMultiply_MPIAIJ(Mat);
399 
400 #undef __FUNC__
401 #define __FUNC__ "MatAssemblyEnd_MPIAIJ"
402 int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
403 {
404   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
405   MPI_Status  *send_status,recv_status;
406   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
407   int         row,col,other_disassembled;
408   Scalar      *values,val;
409   InsertMode  addv = mat->insertmode;
410 
411   /*  wait on receives */
412   while (count) {
413     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
414     /* unpack receives into our local space */
415     values = aij->rvalues + 3*imdex*aij->rmax;
416     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
417     n = n/3;
418     for ( i=0; i<n; i++ ) {
419       row = (int) PetscReal(values[3*i]) - aij->rstart;
420       col = (int) PetscReal(values[3*i+1]);
421       val = values[3*i+2];
422       if (col >= aij->cstart && col < aij->cend) {
423         col -= aij->cstart;
424         ierr = MatSetValues(aij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
425       }
426       else {
427         if (mat->was_assembled || mat->assembled) {
428           if (!aij->colmap) {
429             ierr = CreateColmap_MPIAIJ_Private(mat); CHKERRQ(ierr);
430           }
431           col = aij->colmap[col] - 1;
432           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
433             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
434             col = (int) PetscReal(values[3*i+1]);
435           }
436         }
437         ierr = MatSetValues(aij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
438       }
439     }
440     count--;
441   }
442   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
443 
444   /* wait on sends */
445   if (aij->nsends) {
446     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
447     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
448     PetscFree(send_status);
449   }
450   PetscFree(aij->send_waits); PetscFree(aij->svalues);
451 
452   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
453   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
454 
455   /* determine if any processor has disassembled; if so we must
456      also disassemble ourselves, in order that we may reassemble. */
457   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
458   if (mat->was_assembled && !other_disassembled) {
459     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
460   }
461 
462   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
463     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
464   }
465   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
466   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
467 
468   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
469   return 0;
470 }
471 
472 #undef __FUNC__
473 #define __FUNC__ "MatZeroEntries_MPIAIJ"
474 int MatZeroEntries_MPIAIJ(Mat A)
475 {
476   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
477   int        ierr;
478   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
479   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
480   return 0;
481 }
482 
483 /* the code does not do the diagonal entries correctly unless the
484    matrix is square and the column and row ownerships are identical.
485    This is a BUG. The only way to fix it seems to be to access
486    aij->A and aij->B directly and not through the MatZeroRows()
487    routine.
488 */
489 #undef __FUNC__
490 #define __FUNC__ "MatZeroRows_MPIAIJ"
491 int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
492 {
493   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
494   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
495   int            *procs,*nprocs,j,found,idx,nsends,*work;
496   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
497   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
498   int            *lens,imdex,*lrows,*values;
499   MPI_Comm       comm = A->comm;
500   MPI_Request    *send_waits,*recv_waits;
501   MPI_Status     recv_status,*send_status;
502   IS             istmp;
503 
504   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
505   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
506 
507   /*  first count number of contributors to each processor */
508   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
509   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
510   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
511   for ( i=0; i<N; i++ ) {
512     idx = rows[i];
513     found = 0;
514     for ( j=0; j<size; j++ ) {
515       if (idx >= owners[j] && idx < owners[j+1]) {
516         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
517       }
518     }
519     if (!found) SETERRQ(1,0,"Index out of range");
520   }
521   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
522 
523   /* inform other processors of number of messages and max length*/
524   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
525   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
526   nrecvs = work[rank];
527   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
528   nmax = work[rank];
529   PetscFree(work);
530 
531   /* post receives:   */
532   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
533   CHKPTRQ(rvalues);
534   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
535   CHKPTRQ(recv_waits);
536   for ( i=0; i<nrecvs; i++ ) {
537     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
538   }
539 
540   /* do sends:
541       1) starts[i] gives the starting index in svalues for stuff going to
542          the ith processor
543   */
544   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
545   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
546   CHKPTRQ(send_waits);
547   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
548   starts[0] = 0;
549   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
550   for ( i=0; i<N; i++ ) {
551     svalues[starts[owner[i]]++] = rows[i];
552   }
553   ISRestoreIndices(is,&rows);
554 
555   starts[0] = 0;
556   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
557   count = 0;
558   for ( i=0; i<size; i++ ) {
559     if (procs[i]) {
560       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
561     }
562   }
563   PetscFree(starts);
564 
565   base = owners[rank];
566 
567   /*  wait on receives */
568   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
569   source = lens + nrecvs;
570   count  = nrecvs; slen = 0;
571   while (count) {
572     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
573     /* unpack receives into our local space */
574     MPI_Get_count(&recv_status,MPI_INT,&n);
575     source[imdex]  = recv_status.MPI_SOURCE;
576     lens[imdex]  = n;
577     slen += n;
578     count--;
579   }
580   PetscFree(recv_waits);
581 
582   /* move the data into the send scatter */
583   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
584   count = 0;
585   for ( i=0; i<nrecvs; i++ ) {
586     values = rvalues + i*nmax;
587     for ( j=0; j<lens[i]; j++ ) {
588       lrows[count++] = values[j] - base;
589     }
590   }
591   PetscFree(rvalues); PetscFree(lens);
592   PetscFree(owner); PetscFree(nprocs);
593 
594   /* actually zap the local rows */
595   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
596   PLogObjectParent(A,istmp);
597   PetscFree(lrows);
598   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
599   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
600   ierr = ISDestroy(istmp); CHKERRQ(ierr);
601 
602   /* wait on sends */
603   if (nsends) {
604     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
605     CHKPTRQ(send_status);
606     MPI_Waitall(nsends,send_waits,send_status);
607     PetscFree(send_status);
608   }
609   PetscFree(send_waits); PetscFree(svalues);
610 
611   return 0;
612 }
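/*
   Usage sketch (an editorial addition): zero two global rows while keeping a
   unit diagonal.  The row numbers 0 and 5 are illustrative assumptions;
   passing PETSC_NULL for diag (as done for l->B above) leaves the rows
   entirely zero.
*/
#if 0
    IS     is;
    int    ierr,rows[2] = {0,5};
    Scalar one = 1.0;
    ierr = ISCreateGeneral(PETSC_COMM_SELF,2,rows,&is); CHKERRQ(ierr);
    ierr = MatZeroRows(A,is,&one); CHKERRQ(ierr);
    ierr = ISDestroy(is); CHKERRQ(ierr);
#endif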
613 
614 #undef __FUNC__
615 #define __FUNC__ "MatMult_MPIAIJ"
616 int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
617 {
618   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
619   int        ierr,nt;
620 
621   ierr = VecGetLocalSize(xx,&nt);  CHKERRQ(ierr);
622   if (nt != a->n) {
623     SETERRQ(1,0,"Incompatible partition of A and xx");
624   }
625   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
626   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
627   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
628   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
629   return 0;
630 }
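/*
   Usage sketch (an editorial addition): x must share the matrix's column
   layout and y its row layout; m,n,M,N below stand for the sizes passed to
   MatCreateMPIAIJ().
*/
#if 0
    int ierr;
    Vec x,y;
    ierr = VecCreateMPI(A->comm,n,N,&x); CHKERRQ(ierr);
    ierr = VecCreateMPI(A->comm,m,M,&y); CHKERRQ(ierr);
    ierr = MatMult(A,x,y); CHKERRQ(ierr);
#endif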
631 
632 #undef __FUNC__
633 #define __FUNC__ "MatMultAdd_MPIAIJ"
634 int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
635 {
636   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
637   int        ierr;
638   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
639   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
640   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
641   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
642   return 0;
643 }
644 
645 #undef __FUNC__
646 #define __FUNC__ "MatMultTrans_MPIAIJ"
647 int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
648 {
649   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
650   int        ierr;
651 
652   /* do nondiagonal part */
653   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
654   /* send it on its way */
655   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
656   /* do local part */
657   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
658   /* receive remote parts: note this assumes the values are not actually */
659   /* added to yy until the next line, which is true for this implementation */
660   /* but may not always be true. */
661   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
662   return 0;
663 }
664 
665 #undef __FUNC__
666 #define __FUNC__ "MatMultTransAdd_MPIAIJ"
667 int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
668 {
669   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
670   int        ierr;
671 
672   /* do nondiagonal part */
673   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
674   /* send it on its way */
675   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
676   /* do local part */
677   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
678   /* receive remote parts: note this assumes the values are not actually */
679   /* added to zz until the next line, which is true for this implementation */
680   /* but may not always be true. */
681   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
682   return 0;
683 }
684 
685 /*
686   This only works correctly for square matrices where the subblock A->A is the
687    diagonal block
688 */
689 #undef __FUNC__
690 #define __FUNC__ "MatGetDiagonal_MPIAIJ"
691 int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
692 {
693   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
694   if (a->M != a->N)
695     SETERRQ(1,0,"Supports only square matrix where A->A is diag block");
696   if (a->rstart != a->cstart || a->rend != a->cend) {
697     SETERRQ(1,0,"row partition must equal col partition");  }
698   return MatGetDiagonal(a->A,v);
699 }
700 
701 #undef __FUNC__
702 #define __FUNC__ "MatScale_MPIAIJ"
703 int MatScale_MPIAIJ(Scalar *aa,Mat A)
704 {
705   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
706   int        ierr;
707   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
708   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
709   return 0;
710 }
711 
712 #undef __FUNC__
713 #define __FUNC__ "MatDestroy_MPIAIJ"
714 int MatDestroy_MPIAIJ(PetscObject obj)
715 {
716   Mat        mat = (Mat) obj;
717   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
718   int        ierr;
719 
720 #if defined(PETSC_LOG)
721   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
722 #endif
723   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
724   PetscFree(aij->rowners);
725   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
726   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
727   if (aij->colmap) PetscFree(aij->colmap);
728   if (aij->garray) PetscFree(aij->garray);
729   if (aij->lvec)   VecDestroy(aij->lvec);
730   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
731   if (aij->rowvalues) PetscFree(aij->rowvalues);
732   PetscFree(aij);
733   PLogObjectDestroy(mat);
734   PetscHeaderDestroy(mat);
735   return 0;
736 }
737 
738 #undef __FUNC__
739 #define __FUNC__ "MatView_MPIAIJ_Binary"
740 extern int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
741 {
742   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
743   int         ierr;
744 
745   if (aij->size == 1) {
746     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
747   }
748   else SETERRQ(1,0,"Only uniprocessor output supported");
749   return 0;
750 }
751 
752 #undef __FUNC__
753 #define __FUNC__ "MatView_MPIAIJ_ASCIIorDraworMatlab"
754 extern int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
755 {
756   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
757   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
758   int         ierr, format,shift = C->indexshift,rank;
759   FILE        *fd;
760   ViewerType  vtype;
761 
762   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
763   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
764     ierr = ViewerGetFormat(viewer,&format);
765     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
766       MatInfo info;
767       int     flg;
768       MPI_Comm_rank(mat->comm,&rank);
769       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
770       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
771       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
772       PetscSequentialPhaseBegin(mat->comm,1);
773       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz allocated %d mem %d, not using I-node routines\n",
774          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
775       else fprintf(fd,"[%d] Local rows %d nz %d nz allocated %d mem %d, using I-node routines\n",
776          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
777       ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);
778       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used);
779       ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);
780       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used);
781       fflush(fd);
782       PetscSequentialPhaseEnd(mat->comm,1);
783       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
784       return 0;
785     }
786     else if (format == VIEWER_FORMAT_ASCII_INFO) {
787       return 0;
788     }
789   }
790 
791   if (vtype == DRAW_VIEWER) {
792     Draw       draw;
793     PetscTruth isnull;
794     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
795     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
796   }
797 
798   if (vtype == ASCII_FILE_VIEWER) {
799     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
800     PetscSequentialPhaseBegin(mat->comm,1);
801     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
802            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
803            aij->cend);
804     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
805     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
806     fflush(fd);
807     PetscSequentialPhaseEnd(mat->comm,1);
808   }
809   else {
810     int size = aij->size;
811     rank = aij->rank;
812     if (size == 1) {
813       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
814     }
815     else {
816       /* assemble the entire matrix onto first processor. */
817       Mat         A;
818       Mat_SeqAIJ *Aloc;
819       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
820       Scalar      *a;
821 
822       if (!rank) {
823         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
824                CHKERRQ(ierr);
825       }
826       else {
827         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
828                CHKERRQ(ierr);
829       }
830       PLogObjectParent(mat,A);
831 
832       /* copy over the A part */
833       Aloc = (Mat_SeqAIJ*) aij->A->data;
834       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
835       row = aij->rstart;
836       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
837       for ( i=0; i<m; i++ ) {
838         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
839         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
840       }
841       aj = Aloc->j;
842       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
843 
844       /* copy over the B part */
845       Aloc = (Mat_SeqAIJ*) aij->B->data;
846       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
847       row = aij->rstart;
848       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
849       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
850       for ( i=0; i<m; i++ ) {
851         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
852         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
853       }
854       PetscFree(ct);
855       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
856       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
857       if (!rank) {
858         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
859       }
860       ierr = MatDestroy(A); CHKERRQ(ierr);
861     }
862   }
863   return 0;
864 }
865 
866 #undef __FUNC__
867 #define __FUNC__ "MatView_MPIAIJ"
868 int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
869 {
870   Mat         mat = (Mat) obj;
871   int         ierr;
872   ViewerType  vtype;
873 
874   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
875   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
876       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
877     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
878   }
879   else if (vtype == BINARY_FILE_VIEWER) {
880     return MatView_MPIAIJ_Binary(mat,viewer);
881   }
882   return 0;
883 }
884 
885 /*
886     This has to provide several versions.
887 
888      2) a) use only local smoothing, updating outer values only once.
889         b) local smoothing, updating outer values each inner iteration.
890      3) coloring, updating outer values between colors.
891 */
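/*
   Each sweep below applies the standard SOR update (assuming, as its use
   with sum initialized to b[i] suggests, that SPARSEDENSEMDOT() accumulates
   sum -= v[k]*x[idx[k]]):

     x_i <- (1-omega)*x_i + omega*(b_i - sum_{j!=i} a_ij*x_j)/(a_ii + fshift)

   where the row sum runs over both the diagonal block A and the
   off-diagonal block B (using the scattered ghost values in lvec).
*/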
892 #undef __FUNC__
893 #define __FUNC__ "MatRelax_MPIAIJ"
894 int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
895                            double fshift,int its,Vec xx)
896 {
897   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
898   Mat        AA = mat->A, BB = mat->B;
899   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
900   Scalar     *b,*x,*xs,*ls,d,*v,sum;
901   int        ierr,*idx, *diag;
902   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
903 
904   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
905   xs = x + shift; /* shift by one for index start of 1 */
906   ls = ls + shift;
907   if (!A->diag) {ierr = MatMarkDiag_SeqAIJ(AA); CHKERRQ(ierr);}
908   diag = A->diag;
909   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
910     if (flag & SOR_ZERO_INITIAL_GUESS) {
911       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
912     }
913     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
914     CHKERRQ(ierr);
915     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
916     CHKERRQ(ierr);
917     while (its--) {
918       /* go down through the rows */
919       for ( i=0; i<m; i++ ) {
920         n    = A->i[i+1] - A->i[i];
921         idx  = A->j + A->i[i] + shift;
922         v    = A->a + A->i[i] + shift;
923         sum  = b[i];
924         SPARSEDENSEMDOT(sum,xs,v,idx,n);
925         d    = fshift + A->a[diag[i]+shift];
926         n    = B->i[i+1] - B->i[i];
927         idx  = B->j + B->i[i] + shift;
928         v    = B->a + B->i[i] + shift;
929         SPARSEDENSEMDOT(sum,ls,v,idx,n);
930         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
931       }
932       /* come up through the rows */
933       for ( i=m-1; i>-1; i-- ) {
934         n    = A->i[i+1] - A->i[i];
935         idx  = A->j + A->i[i] + shift;
936         v    = A->a + A->i[i] + shift;
937         sum  = b[i];
938         SPARSEDENSEMDOT(sum,xs,v,idx,n);
939         d    = fshift + A->a[diag[i]+shift];
940         n    = B->i[i+1] - B->i[i];
941         idx  = B->j + B->i[i] + shift;
942         v    = B->a + B->i[i] + shift;
943         SPARSEDENSEMDOT(sum,ls,v,idx,n);
944         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
945       }
946     }
947   }
948   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
949     if (flag & SOR_ZERO_INITIAL_GUESS) {
950       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
951     }
952     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
953     CHKERRQ(ierr);
954     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
955     CHKERRQ(ierr);
956     while (its--) {
957       for ( i=0; i<m; i++ ) {
958         n    = A->i[i+1] - A->i[i];
959         idx  = A->j + A->i[i] + shift;
960         v    = A->a + A->i[i] + shift;
961         sum  = b[i];
962         SPARSEDENSEMDOT(sum,xs,v,idx,n);
963         d    = fshift + A->a[diag[i]+shift];
964         n    = B->i[i+1] - B->i[i];
965         idx  = B->j + B->i[i] + shift;
966         v    = B->a + B->i[i] + shift;
967         SPARSEDENSEMDOT(sum,ls,v,idx,n);
968         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
969       }
970     }
971   }
972   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
973     if (flag & SOR_ZERO_INITIAL_GUESS) {
974       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
975     }
976     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,
977                             mat->Mvctx); CHKERRQ(ierr);
978     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,
979                             mat->Mvctx); CHKERRQ(ierr);
980     while (its--) {
981       for ( i=m-1; i>-1; i-- ) {
982         n    = A->i[i+1] - A->i[i];
983         idx  = A->j + A->i[i] + shift;
984         v    = A->a + A->i[i] + shift;
985         sum  = b[i];
986         SPARSEDENSEMDOT(sum,xs,v,idx,n);
987         d    = fshift + A->a[diag[i]+shift];
988         n    = B->i[i+1] - B->i[i];
989         idx  = B->j + B->i[i] + shift;
990         v    = B->a + B->i[i] + shift;
991         SPARSEDENSEMDOT(sum,ls,v,idx,n);
992         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
993       }
994     }
995   }
996   else {
997     SETERRQ(1,0,"Option not supported");
998   }
999   return 0;
1000 }
1001 
1002 #undef __FUNC__
1003 #define __FUNC__ "MatGetInfo_MPIAIJ"
1004 int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1005 {
1006   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1007   Mat        A = mat->A, B = mat->B;
1008   int        ierr;
1009   double     isend[5], irecv[5];
1010 
1011   info->rows_global    = (double)mat->M;
1012   info->columns_global = (double)mat->N;
1013   info->rows_local     = (double)mat->m;
1014   info->columns_local  = (double)mat->N;
1015   info->block_size     = 1.0;
1016   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1017   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1018   isend[3] = info->memory;  isend[4] = info->mallocs;
1019   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1020   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1021   isend[3] += info->memory;  isend[4] += info->mallocs;
1022   if (flag == MAT_LOCAL) {
1023     info->nz_used      = isend[0];
1024     info->nz_allocated = isend[1];
1025     info->nz_unneeded  = isend[2];
1026     info->memory       = isend[3];
1027     info->mallocs      = isend[4];
1028   } else if (flag == MAT_GLOBAL_MAX) {
1029     MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);
1030     info->nz_used      = irecv[0];
1031     info->nz_allocated = irecv[1];
1032     info->nz_unneeded  = irecv[2];
1033     info->memory       = irecv[3];
1034     info->mallocs      = irecv[4];
1035   } else if (flag == MAT_GLOBAL_SUM) {
1036     MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);
1037     info->nz_used      = irecv[0];
1038     info->nz_allocated = irecv[1];
1039     info->nz_unneeded  = irecv[2];
1040     info->memory       = irecv[3];
1041     info->mallocs      = irecv[4];
1042   }
1043   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1044   info->fill_ratio_needed = 0;
1045   info->factor_mallocs    = 0;
1046 
1047   return 0;
1048 }
1049 
1050 #undef __FUNC__
1051 #define __FUNC__ "MatSetOption_MPIAIJ"
1052 int MatSetOption_MPIAIJ(Mat A,MatOption op)
1053 {
1054   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1055 
1056   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1057       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1058       op == MAT_COLUMNS_UNSORTED ||
1059       op == MAT_COLUMNS_SORTED ||
1060       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1061       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1062         MatSetOption(a->A,op);
1063         MatSetOption(a->B,op);
1064   } else if (op == MAT_ROW_ORIENTED) {
1065     a->roworiented = 1;
1066     MatSetOption(a->A,op);
1067     MatSetOption(a->B,op);
1068   } else if (op == MAT_ROWS_SORTED ||
1069              op == MAT_ROWS_UNSORTED ||
1070              op == MAT_SYMMETRIC ||
1071              op == MAT_STRUCTURALLY_SYMMETRIC ||
1072              op == MAT_YES_NEW_DIAGONALS)
1073     PLogInfo(A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1074   else if (op == MAT_COLUMN_ORIENTED) {
1075     a->roworiented = 0;
1076     MatSetOption(a->A,op);
1077     MatSetOption(a->B,op);
1078   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1079     a->donotstash = 1;
1080   } else if (op == MAT_NO_NEW_DIAGONALS)
1081     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
1082   else
1083     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
1084   return 0;
1085 }
1086 
1087 #undef __FUNC__
1088 #define __FUNC__ "MatGetSize_MPIAIJ"
1089 int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1090 {
1091   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1092   if (m) *m = mat->M;
1093   if (n) *n = mat->N;
1094   return 0;
1095 }
1096 
1097 #undef __FUNC__
1098 #define __FUNC__ "MatGetLocalSize_MPIAIJ"
1099 int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1100 {
1101   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1102   if (m) *m = mat->m;
1103   if (n) *n = mat->n;
1104   return 0;
1105 }
1106 
1107 #undef __FUNC__
1108 #define __FUNC__ "MatGetOwnershipRange_MPIAIJ"
1109 int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1110 {
1111   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1112   *m = mat->rstart; *n = mat->rend;
1113   return 0;
1114 }
1115 
1116 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1117 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1118 
1119 #undef __FUNC__
1120 #define __FUNC__ "MatGetRow_MPIAIJ"
1121 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1122 {
1123   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1124   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1125   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1126   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1127   int        *cmap, *idx_p;
1128 
1129   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,0,"Already active");
1130   mat->getrowactive = PETSC_TRUE;
1131 
1132   if (!mat->rowvalues && (idx || v)) {
1133     /*
1134         allocate enough space to hold information from the longest row.
1135     */
1136     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1137     int     max = 1,m = mat->m,tmp;
1138     for ( i=0; i<m; i++ ) {
1139       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1140       if (max < tmp) { max = tmp; }
1141     }
1142     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1143     CHKPTRQ(mat->rowvalues);
1144     mat->rowindices = (int *) (mat->rowvalues + max);
1145   }
1146 
1147   if (row < rstart || row >= rend) SETERRQ(1,0,"Only local rows");
1148   lrow = row - rstart;
1149 
1150   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1151   if (!v)   {pvA = 0; pvB = 0;}
1152   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1153   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1154   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1155   nztot = nzA + nzB;
1156 
1157   cmap  = mat->garray;
1158   if (v  || idx) {
1159     if (nztot) {
1160       /* Sort by increasing column numbers, assuming A and B already sorted */
1161       int imark = -1;
1162       if (v) {
1163         *v = v_p = mat->rowvalues;
1164         for ( i=0; i<nzB; i++ ) {
1165           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1166           else break;
1167         }
1168         imark = i;
1169         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1170         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1171       }
1172       if (idx) {
1173         *idx = idx_p = mat->rowindices;
1174         if (imark > -1) {
1175           for ( i=0; i<imark; i++ ) {
1176             idx_p[i] = cmap[cworkB[i]];
1177           }
1178         } else {
1179           for ( i=0; i<nzB; i++ ) {
1180             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1181             else break;
1182           }
1183           imark = i;
1184         }
1185         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1186         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1187       }
1188     }
1189     else {
1190       if (idx) *idx = 0;
1191       if (v)   *v   = 0;
1192     }
1193   }
1194   *nz = nztot;
1195   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1196   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1197   return 0;
1198 }
1199 
1200 #undef __FUNC__
1201 #define __FUNC__ "MatRestoreRow_MPIAIJ"
1202 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1203 {
1204   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1205   if (aij->getrowactive == PETSC_FALSE) {
1206     SETERRQ(1,0,"MatGetRow not called");
1207   }
1208   aij->getrowactive = PETSC_FALSE;
1209   return 0;
1210 }
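/*
   Usage sketch (a hypothetical helper, not in the original file): print a
   locally owned global row.  Assumes a real (non-complex) build so that a
   Scalar can be printed with %g; stdio is already used elsewhere in this
   file.
*/
#if 0
static int ExamplePrintRow(Mat mat,int grow)
{
  int    ierr,nz,*cols,i;
  Scalar *vals;

  ierr = MatGetRow(mat,grow,&nz,&cols,&vals); CHKERRQ(ierr);
  for ( i=0; i<nz; i++ ) {
    printf("col %d val %g\n",cols[i],vals[i]);
  }
  ierr = MatRestoreRow(mat,grow,&nz,&cols,&vals); CHKERRQ(ierr);
  return 0;
}
#endif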
1211 
1212 #undef __FUNC__
1213 #define __FUNC__ "MatNorm_MPIAIJ"
1214 int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1215 {
1216   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1217   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1218   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1219   double     sum = 0.0;
1220   Scalar     *v;
1221 
1222   if (aij->size == 1) {
1223     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1224   } else {
1225     if (type == NORM_FROBENIUS) {
1226       v = amat->a;
1227       for (i=0; i<amat->nz; i++ ) {
1228 #if defined(PETSC_COMPLEX)
1229         sum += real(conj(*v)*(*v)); v++;
1230 #else
1231         sum += (*v)*(*v); v++;
1232 #endif
1233       }
1234       v = bmat->a;
1235       for (i=0; i<bmat->nz; i++ ) {
1236 #if defined(PETSC_COMPLEX)
1237         sum += real(conj(*v)*(*v)); v++;
1238 #else
1239         sum += (*v)*(*v); v++;
1240 #endif
1241       }
1242       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1243       *norm = sqrt(*norm);
1244     }
1245     else if (type == NORM_1) { /* max column norm */
1246       double *tmp, *tmp2;
1247       int    *jj, *garray = aij->garray;
1248       tmp  = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp);
1249       tmp2 = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp2);
1250       PetscMemzero(tmp,aij->N*sizeof(double));
1251       *norm = 0.0;
1252       v = amat->a; jj = amat->j;
1253       for ( j=0; j<amat->nz; j++ ) {
1254         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1255       }
1256       v = bmat->a; jj = bmat->j;
1257       for ( j=0; j<bmat->nz; j++ ) {
1258         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1259       }
1260       MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1261       for ( j=0; j<aij->N; j++ ) {
1262         if (tmp2[j] > *norm) *norm = tmp2[j];
1263       }
1264       PetscFree(tmp); PetscFree(tmp2);
1265     }
1266     else if (type == NORM_INFINITY) { /* max row norm */
1267       double ntemp = 0.0;
1268       for ( j=0; j<amat->m; j++ ) {
1269         v = amat->a + amat->i[j] + shift;
1270         sum = 0.0;
1271         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1272           sum += PetscAbsScalar(*v); v++;
1273         }
1274         v = bmat->a + bmat->i[j] + shift;
1275         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1276           sum += PetscAbsScalar(*v); v++;
1277         }
1278         if (sum > ntemp) ntemp = sum;
1279       }
1280       MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1281     }
1282     else {
1283       SETERRQ(1,0,"No support for two norm");
1284     }
1285   }
1286   return 0;
1287 }
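/*
   Summary of the norms computed above:
     NORM_FROBENIUS : sqrt(sum_ij |a_ij|^2), accumulated locally over the
                      A and B blocks and combined with MPI_Allreduce();
     NORM_1         : max_j sum_i |a_ij|   (largest column sum);
     NORM_INFINITY  : max_i sum_j |a_ij|   (largest row sum).
*/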
1288 
1289 #undef __FUNC__
1290 #define __FUNC__ "MatTranspose_MPIAIJ"
1291 int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1292 {
1293   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1294   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1295   int        ierr,shift = Aloc->indexshift;
1296   Mat        B;
1297   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1298   Scalar     *array;
1299 
1300   if (matout == PETSC_NULL && M != N) {
1301     SETERRQ(1,0,"Square matrix only for in-place");
1302   }
1303 
1304   ierr = MatCreateMPIAIJ(A->comm,a->n,a->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);
1305 
1306   /* copy over the A part */
1307   Aloc = (Mat_SeqAIJ*) a->A->data;
1308   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1309   row = a->rstart;
1310   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1311   for ( i=0; i<m; i++ ) {
1312     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1313     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1314   }
1315   aj = Aloc->j;
1316   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1317 
1318   /* copy over the B part */
1319   Aloc = (Mat_SeqAIJ*) a->B->data;
1320   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1321   row = a->rstart;
1322   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1323   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1324   for ( i=0; i<m; i++ ) {
1325     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1326     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1327   }
1328   PetscFree(ct);
1329   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1330   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1331   if (matout != PETSC_NULL) {
1332     *matout = B;
1333   } else {
1334     /* Not really an in-place transpose: we free a's data structures and overwrite A's header with B's */
1335     PetscFree(a->rowners);
1336     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1337     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1338     if (a->colmap) PetscFree(a->colmap);
1339     if (a->garray) PetscFree(a->garray);
1340     if (a->lvec) VecDestroy(a->lvec);
1341     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1342     PetscFree(a);
1343     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1344     PetscHeaderDestroy(B);
1345   }
1346   return 0;
1347 }
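/*
   Usage sketch (an editorial addition): out-of-place vs. "in-place"
   transpose.
*/
#if 0
    int ierr;
    Mat At;
    ierr = MatTranspose(A,&At); CHKERRQ(ierr);         /* new matrix At = A^T */
    ierr = MatTranspose(A,PETSC_NULL); CHKERRQ(ierr);  /* replaces A by A^T (square only) */
#endif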
1348 
1349 #undef __FUNC__
1350 #define __FUNC__ "MatDiagonalScale_MPIAIJ"
1351 int MatDiagonalScale_MPIAIJ(Mat mat,Vec ll,Vec rr)
1352 {
1353   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1354   Mat a = aij->A, b = aij->B;
1355   int ierr,s1,s2,s3;
1356 
1357   ierr = MatGetLocalSize(mat,&s2,&s3); CHKERRQ(ierr);
1358   if (rr) {
1359     s3 = aij->n;
1360     VecGetLocalSize_Fast(rr,s1);
1361     if (s1!=s3) SETERRQ(1,0,"right vector non-conforming local size");
1362     /* Overlap communication with computation. */
1363     ierr = VecScatterBegin(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1364   }
1365   if (ll) {
1366     VecGetLocalSize_Fast(ll,s1);
1367     if (s1!=s2) SETERRQ(1,0,"left vector non-conforming local size");
1368     ierr = (*b->ops.diagonalscale)(b,ll,0); CHKERRQ(ierr);
1369   }
1370   /* scale the diagonal block */
1371   ierr = (*a->ops.diagonalscale)(a,ll,rr); CHKERRQ(ierr);
1372 
1373   if (rr) {
1374     /* Do a scatter end and then right scale the off-diagonal block */
1375     ierr = VecScatterEnd(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1376     ierr = (*b->ops.diagonalscale)(b,0,aij->lvec); CHKERRQ(ierr);
1377   }
1378 
1379   return 0;
1380 }
1381 
1382 
1383 extern int MatPrintHelp_SeqAIJ(Mat);
1384 #undef __FUNC__
1385 #define __FUNC__ "MatPrintHelp_MPIAIJ"
1386 int MatPrintHelp_MPIAIJ(Mat A)
1387 {
1388   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1389 
1390   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1391   else return 0;
1392 }
1393 
1394 #undef __FUNC__
1395 #define __FUNC__ "MatGetBlockSize_MPIAIJ"
1396 int MatGetBlockSize_MPIAIJ(Mat A,int *bs)
1397 {
1398   *bs = 1;
1399   return 0;
1400 }
1401 #undef __FUNC__
1402 #define __FUNC__ "MatSetUnfactored_MPIAIJ"
1403 int MatSetUnfactored_MPIAIJ(Mat A)
1404 {
1405   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1406   int        ierr;
1407   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1408   return 0;
1409 }
1410 
1411 #undef __FUNC__
1412 #define __FUNC__ "MatEqual_MPIAIJ"
1413 int MatEqual_MPIAIJ(Mat A, Mat B, PetscTruth *flag)
1414 {
1415   Mat_MPIAIJ *matB = (Mat_MPIAIJ *) B->data,*matA = (Mat_MPIAIJ *) A->data;
1416   Mat        a, b, c, d;
1417   PetscTruth flg;
1418   int        ierr;
1419 
1420   if (B->type != MATMPIAIJ) SETERRQ(1,0,"Matrices must be same type");
1421   a = matA->A; b = matA->B;
1422   c = matB->A; d = matB->B;
1423 
1424   ierr = MatEqual(a, c, &flg); CHKERRQ(ierr);
1425   if (flg == PETSC_TRUE) {
1426     ierr = MatEqual(b, d, &flg); CHKERRQ(ierr);
1427   }
1428   MPI_Allreduce(&flg, flag, 1, MPI_INT, MPI_LAND, A->comm);
1429   return 0;
1430 }
1431 
1432 extern int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1433 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1434 extern int MatFDColoringCreate_MPIAIJ(Mat,ISColoring,MatFDColoring);
1435 extern int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1436 /* -------------------------------------------------------------------*/
1437 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1438        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1439        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1440        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1441        0,0,
1442        0,0,
1443        0,0,
1444        MatRelax_MPIAIJ,
1445        MatTranspose_MPIAIJ,
1446        MatGetInfo_MPIAIJ,MatEqual_MPIAIJ,
1447        MatGetDiagonal_MPIAIJ,MatDiagonalScale_MPIAIJ,MatNorm_MPIAIJ,
1448        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1449        0,
1450        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1451        0,0,0,0,
1452        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1453        0,0,
1454        0,0,MatConvertSameType_MPIAIJ,0,0,
1455        0,0,0,
1456        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1457        MatPrintHelp_MPIAIJ,
1458        MatScale_MPIAIJ,0,0,0,
1459        MatGetBlockSize_MPIAIJ,0,0,0,0,
1460        MatFDColoringCreate_MPIAIJ,0,MatSetUnfactored_MPIAIJ};


#undef __FUNC__
#define __FUNC__ "MatCreateMPIAIJ"
/*@C
   MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
   (the default parallel PETSc format).  For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameters
   d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
   performance can be increased by more than a factor of 50.

   Input Parameters:
.  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is
       given).  This value should be the same as the local size used in
       creating the y vector for the matrix-vector product y = Ax.
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is
       given).  This value should be the same as the local size used in
       creating the x vector for the matrix-vector product y = Ax.  For
       square matrices n is almost always m.
.  M - number of global rows (or PETSC_DECIDE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DECIDE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local
           submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal portion of the local submatrix (possibly different for each row)
           or PETSC_NULL.  You must leave room for the diagonal entry even if
           it is zero.
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local
           submatrix (same for all local rows).
.  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal portion of the local submatrix (possibly different for
           each row) or PETSC_NULL.

   Output Parameter:
.  A - the matrix

   Notes:
   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users manual for details.

   The user MUST specify either the local or global matrix dimensions
   (possibly both).

   By default, this format uses inodes (identical nodes) when possible.
   We search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Options Database Keys:
$    -mat_aij_no_inode  - Do not use inodes
$    -mat_aij_inode_limit <limit> - Set inode limit.
$        (max limit=5)
$    -mat_aij_oneindex - Internally use indexing starting at 1
$        rather than 0.  Note: When calling MatSetValues(),
$        the user still MUST index entries starting at 0!

   Storage Information:
   For a square global matrix we define each processor's diagonal portion
   to be its local rows and the corresponding columns (a square submatrix);
   each processor's off-diagonal portion encompasses the remainder of the
   local matrix (a rectangular submatrix).

   The user can specify preallocated storage for the diagonal part of
   the local submatrix with either d_nz or d_nnz (not both).  Set
   d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
   memory allocation.  Likewise, specify preallocated storage for the
   off-diagonal part of the local submatrix with o_nz or o_nnz (not both).

   Consider a processor that owns rows 3, 4 and 5 of a parallel matrix.  In
   the figure below we depict these three local rows and all columns (0-11).

$          0 1 2 3 4 5 6 7 8 9 10 11
$         --------------------------
$  row 3  |  o o o d d d o o o o o o
$  row 4  |  o o o d d d o o o o o o
$  row 5  |  o o o d d d o o o o o o
$         --------------------------
$

   Thus, any entries in the d locations are stored in the d (diagonal)
   submatrix, and any entries in the o locations are stored in the
   o (off-diagonal) submatrix.  Note that the d and the o submatrices are
   stored simply in the MATSEQAIJ format for compressed row storage.

   Now d_nz should indicate the number of nonzeros per row in the d matrix,
   and o_nz should indicate the number of nonzeros per row in the o matrix.
   In general, for PDE problems in which most nonzeros are near the diagonal,
   one expects d_nz >> o_nz.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance; see the users manual chapter on
   matrices.

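   Example:
   A preallocation sketch (illustrative numbers only): for a 5-point
   stencil discretization one might allow 5 nonzeros per row in the
   diagonal block and 2 in the off-diagonal block,

$      Mat A;
$      ierr = MatCreateMPIAIJ(comm,PETSC_DECIDE,PETSC_DECIDE,M,N,
$                             5,PETSC_NULL,2,PETSC_NULL,&A); CHKERRQ(ierr);
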
.keywords: matrix, aij, compressed row, sparse, parallel

.seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
@*/
int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIAIJ   *b;
  int          ierr,i,sum[2],work[2],size;

  MPI_Comm_size(comm,&size);
  if (size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqAIJ(comm,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    return 0;
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIAIJ,comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  B->data       = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIAIJ));
  PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
  B->destroy    = MatDestroy_MPIAIJ;
  B->view       = MatView_MPIAIJ;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;
  B->mapping    = 0;

  B->insertmode = NOT_SET_VALUES;
  b->size       = size;
  MPI_Comm_rank(comm,&b->rank);

  if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
    SETERRQ(1,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");

  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    MPI_Allreduce(work,sum,2,MPI_INT,MPI_SUM,comm);
    if (M == PETSC_DECIDE) M = sum[0];
    if (N == PETSC_DECIDE) N = sum[1];
  }
  if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
  if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;

  /* build local table of row and column ownerships */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
  b->cowners = b->rowners + b->size + 2;
  MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart = b->rowners[b->rank];
  b->rend   = b->rowners[b->rank+1];
  MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart = b->cowners[b->rank];
  b->cend   = b->cowners[b->rank+1];
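
  /* Worked example (added commentary): after the allgather,
     rowners[1..size] holds each process's local row count; the prefix
     sum turns it into ownership offsets.  E.g. with size=3 and local
     sizes {4,3,3}: rowners = {0,4,7,10}, so rank 1 owns global rows
     4 through 6 (rstart=4, rend=7).  The cowners table is built the
     same way for columns. */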

  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache (stash) for off-processor entries set during assembly */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used for matrix-vector multiply */
  b->lvec      = 0;
  b->Mvctx     = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  *A = B;
  return 0;
}

#undef __FUNC__
#define __FUNC__ "MatConvertSameType_MPIAIJ"
int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
{
  Mat        mat;
  Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
  int        ierr, len=0, flg;

  *newmat       = 0;
  PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm,MatDestroy,MatView);
  PLogObjectCreate(mat);
  mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
  PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
  mat->destroy    = MatDestroy_MPIAIJ;
  mat->view       = MatView_MPIAIJ;
  mat->factor     = matin->factor;
  mat->assembled  = PETSC_TRUE;

  a->m = mat->m   = oldmat->m;
  a->n = mat->n   = oldmat->n;
  a->M = mat->M   = oldmat->M;
  a->N = mat->N   = oldmat->N;

  a->rstart       = oldmat->rstart;
  a->rend         = oldmat->rend;
  a->cstart       = oldmat->cstart;
  a->cend         = oldmat->cend;
  a->size         = oldmat->size;
  a->rank         = oldmat->rank;
  mat->insertmode = NOT_SET_VALUES;
  a->rowvalues    = 0;
  a->getrowactive = PETSC_FALSE;

  a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
  PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
  a->cowners = a->rowners + a->size + 2;
  PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
  ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
  if (oldmat->colmap) {
    a->colmap = (int *) PetscMalloc((a->N)*sizeof(int)); CHKPTRQ(a->colmap);
    PLogObjectMemory(mat,(a->N)*sizeof(int));
    PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
  } else a->colmap = 0;
  if (oldmat->garray) {
    len = ((Mat_SeqAIJ *) (oldmat->B->data))->n;
    a->garray = (int *) PetscMalloc((len+1)*sizeof(int)); CHKPTRQ(a->garray);
    PLogObjectMemory(mat,len*sizeof(int));
    if (len) PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
  } else a->garray = 0;

  ierr = VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
  PLogObjectParent(mat,a->lvec);
  ierr = VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
  PLogObjectParent(mat,a->Mvctx);
  ierr = MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
  PLogObjectParent(mat,a->A);
  ierr = MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
  PLogObjectParent(mat,a->B);
  ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
  if (flg) {
    ierr = MatPrintHelp(mat); CHKERRQ(ierr);
  }
  *newmat = mat;
  return 0;
}
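
/*
   Usage sketch (added commentary): this routine is normally reached
   through the generic MatConvert() front end with MATSAME as the target
   type, yielding a complete duplicate of the parallel matrix:

$     Mat Acopy;
$     ierr = MatConvert(A,MATSAME,&Acopy); CHKERRQ(ierr);
*/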

#include "sys.h"

#undef __FUNC__
#define __FUNC__ "MatLoad_MPIAIJ"
int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i, nz, ierr, j,rstart, rend, fd;
  Scalar       *vals,*svals;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
  int          *ourlens,*sndcounts = 0,*procsnz = 0,*offlens,jj,*mycols,*smycols;
  int          tag = ((PetscObject)viewer)->tag;

  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(1,0,"Not a matrix object");
  }
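
  /* Added commentary: the binary layout assumed by the reads below (a
     sketch inferred from this routine, not a normative description) is
         header = {MAT_COOKIE, M, N, total number of nonzeros}
     followed by the M row lengths, then all column indices, then all
     numerical values; header[3] is not used here.  Only process 0 reads
     the file; the other processes receive their pieces via MPI below. */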

  MPI_Bcast(header+1,3,MPI_INT,0,comm);
  M = header[1]; N = header[2];
  /* determine ownership of all rows */
  m = M/size + ((M % size) > rank);
  rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
  MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) {
    rowners[i] += rowners[i-1];
  }
  rstart = rowners[rank];
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors */
  ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
  offlens = ourlens + (rend-rstart);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
    MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
    PetscFree(sndcounts);
  }
  else {
    MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
  }

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]; j<rowners[i+1]; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine the maximum buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices */
    nz = procsnz[0];
    mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
    ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);

    /* read in everyone else's part and ship it off */
    for ( i=1; i<size; i++ ) {
      nz = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      MPI_Send(cols,nz,MPI_INT,i,tag,comm);
    }
    PetscFree(cols);
  }
  else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<m; i++ ) {
      nz += ourlens[i];
    }
    mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);

    /* receive message of column indices */
    MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
    MPI_Get_count(&status,MPI_INT,&maxnz);
    if (maxnz != nz) SETERRQ(1,0,"Something is wrong with the file");
  }

  /* loop over local rows, determining number of off-diagonal entries */
  PetscMemzero(offlens,m*sizeof(int));
  jj = 0;
  for ( i=0; i<m; i++ ) {
    for ( j=0; j<ourlens[i]; j++ ) {
      if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
      jj++;
    }
  }
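
  /* Note (added commentary): a column index is counted as off-diagonal
     when it lies outside [rstart,rend), this process's ROW ownership
     range.  That matches the diagonal/off-diagonal split used by
     MatCreateMPIAIJ() when the column partitioning equals the row
     partitioning (the usual square-matrix case); otherwise these counts
     are only an approximation used for preallocation. */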

  /* create our matrix */
  for ( i=0; i<m; i++ ) {
    ourlens[i] -= offlens[i];
  }
  ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat); CHKERRQ(ierr);
  A = *newmat;
  MatSetOption(A,MAT_COLUMNS_SORTED);
  for ( i=0; i<m; i++ ) {
    ourlens[i] += offlens[i];
  }

  if (!rank) {
    vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);

    /* read in my part of the matrix numerical values */
    nz = procsnz[0];
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);

    /* insert into matrix */
    jj      = rstart;
    smycols = mycols;
    svals   = vals;
    for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES); CHKERRQ(ierr);
      smycols += ourlens[i];
      svals   += ourlens[i];
      jj++;
    }

    /* read in the other processors' values and ship them out */
    for ( i=1; i<size; i++ ) {
      nz = procsnz[i];
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
    }
    PetscFree(procsnz);
  }
  else {
    /* receive numerical values */
    vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);

    /* receive message of values */
    MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
    MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
    if (maxnz != nz) SETERRQ(1,0,"Something is wrong with the file");

    /* insert into matrix */
    jj      = rstart;
    smycols = mycols;
    svals   = vals;
    for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES); CHKERRQ(ierr);
      smycols += ourlens[i];
      svals   += ourlens[i];
      jj++;
    }
  }
  PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);

  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  return 0;
}
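
/*
   Usage sketch (added commentary; the viewer calls are the binary viewer
   API of this PETSc era and should be checked against your version):

$     Viewer viewer;
$     Mat    A;
$     ierr = ViewerFileOpenBinary(PETSC_COMM_WORLD,"matrix.dat",
$                                 BINARY_RDONLY,&viewer); CHKERRQ(ierr);
$     ierr = MatLoad(viewer,MATMPIAIJ,&A); CHKERRQ(ierr);
$     ierr = ViewerDestroy(viewer); CHKERRQ(ierr);
*/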