xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision d08a53c38dd4bae43ea11adda40bc48bec0e9e9e)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpiaij.c,v 1.213 1997/08/08 18:44:38 balay Exp bsmith $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/aij/mpi/mpiaij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 
10 /* local utility routine that creates a mapping from the global column
11 number to the local number in the off-diagonal part of the local
12 storage of the matrix.  This is done in a non-scalable way, since the
13 length of colmap equals the global number of columns.
14 */
15 #undef __FUNC__
16 #define __FUNC__ "CreateColmap_MPIAIJ_Private"
17 int CreateColmap_MPIAIJ_Private(Mat mat)
18 {
19   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
20   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
21   int        n = B->n,i;
22 
23   aij->colmap = (int *) PetscMalloc((aij->N+1)*sizeof(int));CHKPTRQ(aij->colmap);
24   PLogObjectMemory(mat,aij->N*sizeof(int));
25   PetscMemzero(aij->colmap,aij->N*sizeof(int));
26   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i+1;
27   return 0;
28 }
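/*
   Usage sketch: once colmap has been built, a global column index gcol is
   translated to a local column of the off-diagonal block with

       lcol = aij->colmap[gcol] - 1;

   where lcol < 0 means the column is not present in the off-diagonal block.
   The +1 stored above lets a zeroed colmap entry mean "not present"; this is
   the lookup pattern used by MatSetValues_MPIAIJ() and MatGetValues_MPIAIJ()
   below.
*/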
29 
30 extern int DisAssemble_MPIAIJ(Mat);
31 
32 #define CHUNKSIZE   15
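/*
   The two macros below inline the insertion of a single (row,col,value) entry
   into the diagonal block (A) and the off-diagonal block (B) respectively.
   Each one narrows the search with a short binary search, scans the row
   linearly, adds to or overwrites an existing entry, or shifts later entries
   up to make room; when a row has no free space left the whole block is
   reallocated with CHUNKSIZE extra entries.  They rely on the local variables
   declared in MatSetValues_MPIAIJ() (rp, ap, rmax, nrow, nonew, shift, ...).
*/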
33 #define MatSetValues_SeqAIJ_A_Private(row,col,value,addv) \
34 { \
35  \
36     rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
37     rmax = aimax[row]; nrow = ailen[row];  \
38     col1 = col - shift; \
39      \
40     low = 0; high = nrow; \
41     while (high-low > 5) { \
42       t = (low+high)/2; \
43       if (rp[t] > col) high = t; \
44       else             low  = t; \
45     } \
46       for ( _i=0; _i<nrow; _i++ ) { \
47         if (rp[_i] > col1) break; \
48         if (rp[_i] == col1) { \
49           if (addv == ADD_VALUES) ap[_i] += value;   \
50           else                  ap[_i] = value; \
51           goto a_noinsert; \
52         } \
53       }  \
54       if (nonew == 1) goto a_noinsert; \
55       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
56       if (nrow >= rmax) { \
57         /* there is no extra room in row, therefore enlarge */ \
58         int    new_nz = ai[a->m] + CHUNKSIZE,len,*new_i,*new_j; \
59         Scalar *new_a; \
60  \
61         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
62  \
63         /* malloc new storage space */ \
64         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(a->m+1)*sizeof(int); \
65         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
66         new_j   = (int *) (new_a + new_nz); \
67         new_i   = new_j + new_nz; \
68  \
69         /* copy over old data into new slots */ \
70         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];} \
71         for ( ii=row+1; ii<a->m+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
72         PetscMemcpy(new_j,aj,(ai[row]+nrow+shift)*sizeof(int)); \
73         len = (new_nz - CHUNKSIZE - ai[row] - nrow - shift); \
74         PetscMemcpy(new_j+ai[row]+shift+nrow+CHUNKSIZE,aj+ai[row]+shift+nrow, \
75                                                            len*sizeof(int)); \
76         PetscMemcpy(new_a,aa,(ai[row]+nrow+shift)*sizeof(Scalar)); \
77         PetscMemcpy(new_a+ai[row]+shift+nrow+CHUNKSIZE,aa+ai[row]+shift+nrow, \
78                                                            len*sizeof(Scalar));  \
79         /* free up old matrix storage */ \
80  \
81         PetscFree(a->a);  \
82         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
83         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
84         a->singlemalloc = 1; \
85  \
86         rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
87         rmax = aimax[row] = aimax[row] + CHUNKSIZE; \
88         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
89         a->maxnz += CHUNKSIZE; \
90         a->reallocs++; \
91       } \
92       N = nrow++ - 1; a->nz++; \
93       /* shift up all the later entries in this row */ \
94       for ( ii=N; ii>=_i; ii-- ) { \
95         rp[ii+1] = rp[ii]; \
96         ap[ii+1] = ap[ii]; \
97       } \
98       rp[_i] = col1;  \
99       ap[_i] = value;  \
100       a_noinsert: ; \
101       ailen[row] = nrow; \
102 }
103 
104 #define MatSetValues_SeqAIJ_B_Private(row,col,value,addv) \
105 { \
106  \
107     rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
108     rmax = bimax[row]; nrow = bilen[row];  \
109     col1 = col - shift; \
110      \
111     low = 0; high = nrow; \
112     while (high-low > 5) { \
113       t = (low+high)/2; \
114       if (rp[t] > col) high = t; \
115       else             low  = t; \
116     } \
117        for ( _i=0; _i<nrow; _i++ ) { \
118         if (rp[_i] > col1) break; \
119         if (rp[_i] == col1) { \
120           if (addv == ADD_VALUES) ap[_i] += value;   \
121           else                  ap[_i] = value; \
122           goto b_noinsert; \
123         } \
124       }  \
125       if (nonew == 1) goto b_noinsert; \
126       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
127       if (nrow >= rmax) { \
128         /* there is no extra room in row, therefore enlarge */ \
129         int    new_nz = bi[b->m] + CHUNKSIZE,len,*new_i,*new_j; \
130         Scalar *new_a; \
131  \
132         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
133  \
134         /* malloc new storage space */ \
135         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(b->m+1)*sizeof(int); \
136         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
137         new_j   = (int *) (new_a + new_nz); \
138         new_i   = new_j + new_nz; \
139  \
140         /* copy over old data into new slots */ \
141         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = bi[ii];} \
142         for ( ii=row+1; ii<b->m+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
143         PetscMemcpy(new_j,bj,(bi[row]+nrow+shift)*sizeof(int)); \
144         len = (new_nz - CHUNKSIZE - bi[row] - nrow - shift); \
145         PetscMemcpy(new_j+bi[row]+shift+nrow+CHUNKSIZE,bj+bi[row]+shift+nrow, \
146                                                            len*sizeof(int)); \
147         PetscMemcpy(new_a,ba,(bi[row]+nrow+shift)*sizeof(Scalar)); \
148         PetscMemcpy(new_a+bi[row]+shift+nrow+CHUNKSIZE,ba+bi[row]+shift+nrow, \
149                                                            len*sizeof(Scalar));  \
150         /* free up old matrix storage */ \
151  \
152         PetscFree(b->a);  \
153         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
154         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
155         b->singlemalloc = 1; \
156  \
157         rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
158         rmax = bimax[row] = bimax[row] + CHUNKSIZE; \
159         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
160         b->maxnz += CHUNKSIZE; \
161         b->reallocs++; \
162       } \
163       N = nrow++ - 1; b->nz++; \
164       /* shift up all the later entries in this row */ \
165       for ( ii=N; ii>=_i; ii-- ) { \
166         rp[ii+1] = rp[ii]; \
167         ap[ii+1] = ap[ii]; \
168       } \
169       rp[_i] = col1;  \
170       ap[_i] = value;  \
171       b_noinsert: ; \
172       bilen[row] = nrow; \
173 }
174 
175 extern int MatSetValues_SeqAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
176 #undef __FUNC__
177 #define __FUNC__ "MatSetValues_MPIAIJ"
178 int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
179 {
180   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
181   Scalar     value;
182   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
183   int        cstart = aij->cstart, cend = aij->cend,row,col;
184   int        roworiented = aij->roworiented;
185 
186   /* Some variables required by the macros below */
187   Mat        A = aij->A;
188   Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data;
189   int        *aimax = a->imax, *ai = a->i, *ailen = a->ilen,*aj = a->j;
190   Scalar     *aa = a->a;
191 
192   Mat        B = aij->B;
193   Mat_SeqAIJ *b = (Mat_SeqAIJ *) B->data;
194   int        *bimax = b->imax, *bi = b->i, *bilen = b->ilen,*bj = b->j;
195   Scalar     *ba = b->a;
196 
197   int        *rp,ii,nrow,_i,rmax, N, col1,low,high,t;
198   int        nonew = a->nonew,shift = a->indexshift;
199   Scalar     *ap;
200 
201   for ( i=0; i<m; i++ ) {
202 #if defined(PETSC_BOPT_g)
203     if (im[i] < 0) SETERRQ(1,0,"Negative row");
204     if (im[i] >= aij->M) SETERRQ(1,0,"Row too large");
205 #endif
206     if (im[i] >= rstart && im[i] < rend) {
207       row = im[i] - rstart;
208       for ( j=0; j<n; j++ ) {
209         if (in[j] >= cstart && in[j] < cend){
210           col = in[j] - cstart;
211           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
212           MatSetValues_SeqAIJ_A_Private(row,col,value,addv);
213           /* ierr = MatSetValues_SeqAIJ(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
214         }
215 #if defined(PETSC_BOPT_g)
216         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
217         else if (in[j] >= aij->N) {SETERRQ(1,0,"Col too large");}
218 #endif
219         else {
220           if (mat->was_assembled) {
221             if (!aij->colmap) {
222               ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
223             }
224             col = aij->colmap[in[j]] - 1;
225             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
226               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
227               col =  in[j];
228               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
229               B = aij->B;
230               b = (Mat_SeqAIJ *) B->data;
231               bimax = b->imax; bi = b->i; bilen = b->ilen; bj = b->j;
232               ba = b->a;
233             }
234           }
235           else col = in[j];
236           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
237           MatSetValues_SeqAIJ_B_Private(row,col,value,addv);
238           /* ierr = MatSetValues_SeqAIJ(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
239         }
240       }
241     }
242     else {
243       if (roworiented && !aij->donotstash) {
244         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
245       }
246       else {
247         if (!aij->donotstash) {
248           row = im[i];
249           for ( j=0; j<n; j++ ) {
250             ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
251           }
252         }
253       }
254     }
255   }
256   return 0;
257 }
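/*
   A minimal calling sketch for the routine above (user-level PETSc interface;
   values destined for rows owned by other processors are stashed here and
   only communicated during MatAssemblyBegin()/MatAssemblyEnd()):

       int    row = 0, col = 5, ierr;
       Scalar val = 1.0;
       ierr = MatSetValues(mat,1,&row,1,&col,&val,ADD_VALUES); CHKERRQ(ierr);
       ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
       ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
*/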
258 
259 #undef __FUNC__
260 #define __FUNC__ "MatGetValues_MPIAIJ"
261 int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
262 {
263   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
264   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
265   int        cstart = aij->cstart, cend = aij->cend,row,col;
266 
267   for ( i=0; i<m; i++ ) {
268     if (idxm[i] < 0) SETERRQ(1,0,"Negative row");
269     if (idxm[i] >= aij->M) SETERRQ(1,0,"Row too large");
270     if (idxm[i] >= rstart && idxm[i] < rend) {
271       row = idxm[i] - rstart;
272       for ( j=0; j<n; j++ ) {
273         if (idxn[j] < 0) SETERRQ(1,0,"Negative column");
274         if (idxn[j] >= aij->N) SETERRQ(1,0,"Col too large");
275         if (idxn[j] >= cstart && idxn[j] < cend){
276           col = idxn[j] - cstart;
277           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
278         }
279         else {
280           if (!aij->colmap) {
281             ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
282           }
283           col = aij->colmap[idxn[j]] - 1;
284           if ((col < 0) || (aij->garray[col] != idxn[j])) *(v+i*n+j) = 0.0;
285           else {
286             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
287           }
288         }
289       }
290     }
291     else {
292       SETERRQ(1,0,"Only local values currently supported");
293     }
294   }
295   return 0;
296 }
297 
298 #undef __FUNC__
299 #define __FUNC__ "MatAssemblyBegin_MPIAIJ"
300 int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
301 {
302   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
303   MPI_Comm    comm = mat->comm;
304   int         size = aij->size, *owners = aij->rowners;
305   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
306   MPI_Request *send_waits,*recv_waits;
307   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
308   InsertMode  addv;
309   Scalar      *rvalues,*svalues;
310 
311   /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
312   MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
313   if (addv == (ADD_VALUES|INSERT_VALUES)) {
314     SETERRQ(1,0,"Some processors inserted others added");
315   }
316   mat->insertmode = addv; /* in case this processor had no cache */
317 
318   /*  first count number of contributors to each processor */
319   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
320   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
321   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
322   for ( i=0; i<aij->stash.n; i++ ) {
323     idx = aij->stash.idx[i];
324     for ( j=0; j<size; j++ ) {
325       if (idx >= owners[j] && idx < owners[j+1]) {
326         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
327       }
328     }
329   }
330   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
331 
332   /* inform other processors of number of messages and max length*/
333   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
334   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
335   nreceives = work[rank];
336   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
337   nmax = work[rank];
338   PetscFree(work);
339 
340   /* post receives:
341        1) each message will consist of ordered triples
342      (global row index, global column index, value); the indices are stored
343      as Scalars (doubles) to simplify the message passing.
344        2) since we don't know how long each individual message is we
345      allocate the largest needed buffer for each receive. Potentially
346      this is a lot of wasted space.
347 
348 
349        This could be done better.
350   */
351   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
352   CHKPTRQ(rvalues);
353   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
354   CHKPTRQ(recv_waits);
355   for ( i=0; i<nreceives; i++ ) {
356     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
357               comm,recv_waits+i);
358   }
359 
360   /* do sends:
361       1) starts[i] gives the starting index in svalues for stuff going to
362          the ith processor
363   */
364   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
365   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
366   CHKPTRQ(send_waits);
367   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
368   starts[0] = 0;
369   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
370   for ( i=0; i<aij->stash.n; i++ ) {
371     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
372     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
373     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
374   }
375   PetscFree(owner);
376   starts[0] = 0;
377   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
378   count = 0;
379   for ( i=0; i<size; i++ ) {
380     if (procs[i]) {
381       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
382                 comm,send_waits+count++);
383     }
384   }
385   PetscFree(starts); PetscFree(nprocs);
386 
387   /* Free cache space */
388   PLogInfo(mat,"MatAssemblyBegin_MPIAIJ:Number of off-processor values %d\n",aij->stash.n);
389   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
390 
391   aij->svalues    = svalues;    aij->rvalues    = rvalues;
392   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
393   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
394   aij->rmax       = nmax;
395 
396   return 0;
397 }
398 extern int MatSetUpMultiply_MPIAIJ(Mat);
399 
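/*
   MatAssemblyBegin_MPIAIJ() above posts the receives and sends for all stashed
   off-processor (row,col,value) triples; MatAssemblyEnd_MPIAIJ() below waits
   for those messages, inserts each received triple into the local A (diagonal)
   or B (off-diagonal) block, assembles the two sequential blocks, and on the
   first final assembly builds the scatter needed for MatMult() via
   MatSetUpMultiply_MPIAIJ().
*/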
400 #undef __FUNC__
401 #define __FUNC__ "MatAssemblyEnd_MPIAIJ"
402 int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
403 {
404   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
405   MPI_Status  *send_status,recv_status;
406   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
407   int         row,col,other_disassembled;
408   Scalar      *values,val;
409   InsertMode  addv = mat->insertmode;
410 
411   /*  wait on receives */
412   while (count) {
413     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
414     /* unpack receives into our local space */
415     values = aij->rvalues + 3*imdex*aij->rmax;
416     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
417     n = n/3;
418     for ( i=0; i<n; i++ ) {
419       row = (int) PetscReal(values[3*i]) - aij->rstart;
420       col = (int) PetscReal(values[3*i+1]);
421       val = values[3*i+2];
422       if (col >= aij->cstart && col < aij->cend) {
423         col -= aij->cstart;
424         ierr = MatSetValues(aij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
425       }
426       else {
427         if (mat->was_assembled || mat->assembled) {
428           if (!aij->colmap) {
429             ierr = CreateColmap_MPIAIJ_Private(mat); CHKERRQ(ierr);
430           }
431           col = aij->colmap[col] - 1;
432           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
433             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
434             col = (int) PetscReal(values[3*i+1]);
435           }
436         }
437         ierr = MatSetValues(aij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
438       }
439     }
440     count--;
441   }
442   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
443 
444   /* wait on sends */
445   if (aij->nsends) {
446     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
447     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
448     PetscFree(send_status);
449   }
450   PetscFree(aij->send_waits); PetscFree(aij->svalues);
451 
452   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
453   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
454 
455   /* determine if any processor has disassembled, if so we must
456      also disassemble ourselves, in order that we may reassemble. */
457   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
458   if (mat->was_assembled && !other_disassembled) {
459     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
460   }
461 
462   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
463     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
464   }
465   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
466   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
467 
468   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
469   return 0;
470 }
471 
472 #undef __FUNC__
473 #define __FUNC__ "MatZeroEntries_MPIAIJ"
474 int MatZeroEntries_MPIAIJ(Mat A)
475 {
476   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
477   int        ierr;
478   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
479   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
480   return 0;
481 }
482 
483 /* the code does not do the diagonal entries correctly unless the
484    matrix is square and the column and row ownerships are identical.
485    This is a BUG. The only way to fix it seems to be to access
486    aij->A and aij->B directly and not through the MatZeroRows()
487    routine.
488 */
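/*
   MatZeroRows_MPIAIJ() first routes each requested global row index to the
   processor that owns it (the same send/receive pattern used by the stash
   assembly above); each processor then zeros its local rows by calling
   MatZeroRows() on the diagonal block with the requested diagonal value and
   on the off-diagonal block with a null diag argument.
*/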
489 #undef __FUNC__
490 #define __FUNC__ "MatZeroRows_MPIAIJ"
491 int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
492 {
493   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
494   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
495   int            *procs,*nprocs,j,found,idx,nsends,*work;
496   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
497   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
498   int            *lens,imdex,*lrows,*values;
499   MPI_Comm       comm = A->comm;
500   MPI_Request    *send_waits,*recv_waits;
501   MPI_Status     recv_status,*send_status;
502   IS             istmp;
503 
504   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
505   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
506 
507   /*  first count number of contributors to each processor */
508   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
509   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
510   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
511   for ( i=0; i<N; i++ ) {
512     idx = rows[i];
513     found = 0;
514     for ( j=0; j<size; j++ ) {
515       if (idx >= owners[j] && idx < owners[j+1]) {
516         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
517       }
518     }
519     if (!found) SETERRQ(1,0,"Index out of range");
520   }
521   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
522 
523   /* inform other processors of number of messages and max length*/
524   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
525   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
526   nrecvs = work[rank];
527   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
528   nmax = work[rank];
529   PetscFree(work);
530 
531   /* post receives:   */
532   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
533   CHKPTRQ(rvalues);
534   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
535   CHKPTRQ(recv_waits);
536   for ( i=0; i<nrecvs; i++ ) {
537     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
538   }
539 
540   /* do sends:
541       1) starts[i] gives the starting index in svalues for stuff going to
542          the ith processor
543   */
544   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
545   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
546   CHKPTRQ(send_waits);
547   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
548   starts[0] = 0;
549   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
550   for ( i=0; i<N; i++ ) {
551     svalues[starts[owner[i]]++] = rows[i];
552   }
553   ISRestoreIndices(is,&rows);
554 
555   starts[0] = 0;
556   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
557   count = 0;
558   for ( i=0; i<size; i++ ) {
559     if (procs[i]) {
560       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
561     }
562   }
563   PetscFree(starts);
564 
565   base = owners[rank];
566 
567   /*  wait on receives */
568   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
569   source = lens + nrecvs;
570   count  = nrecvs; slen = 0;
571   while (count) {
572     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
573     /* unpack receives into our local space */
574     MPI_Get_count(&recv_status,MPI_INT,&n);
575     source[imdex]  = recv_status.MPI_SOURCE;
576     lens[imdex]  = n;
577     slen += n;
578     count--;
579   }
580   PetscFree(recv_waits);
581 
582   /* move the data into the send scatter */
583   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
584   count = 0;
585   for ( i=0; i<nrecvs; i++ ) {
586     values = rvalues + i*nmax;
587     for ( j=0; j<lens[i]; j++ ) {
588       lrows[count++] = values[j] - base;
589     }
590   }
591   PetscFree(rvalues); PetscFree(lens);
592   PetscFree(owner); PetscFree(nprocs);
593 
594   /* actually zap the local rows */
595   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
596   PLogObjectParent(A,istmp);
597   PetscFree(lrows);
598   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
599   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
600   ierr = ISDestroy(istmp); CHKERRQ(ierr);
601 
602   /* wait on sends */
603   if (nsends) {
604     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
605     CHKPTRQ(send_status);
606     MPI_Waitall(nsends,send_waits,send_status);
607     PetscFree(send_status);
608   }
609   PetscFree(send_waits); PetscFree(svalues);
610 
611   return 0;
612 }
613 
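/*
   The parallel multiply below is split as y = A*x_local + B*x_ghost, where the
   ghost values (a->lvec) are gathered by the VecScatter a->Mvctx.  The scatter
   is started before the diagonal-block multiply so that the communication
   overlaps with local computation.
*/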
614 #undef __FUNC__
615 #define __FUNC__ "MatMult_MPIAIJ"
616 int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
617 {
618   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
619   int        ierr,nt;
620 
621   ierr = VecGetLocalSize(xx,&nt);  CHKERRQ(ierr);
622   if (nt != a->n) {
623     SETERRQ(1,0,"Incompatible partition of A and xx");
624   }
625   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
626   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
627   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
628   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
629   return 0;
630 }
631 
632 #undef __FUNC__
633 #define __FUNC__ "MatMultAdd_MPIAIJ"
634 int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
635 {
636   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
637   int        ierr;
638   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
639   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
640   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
641   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
642   return 0;
643 }
644 
645 #undef __FUNC__
646 #define __FUNC__ "MatMultTrans_MPIAIJ"
647 int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
648 {
649   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
650   int        ierr;
651 
652   /* do nondiagonal part */
653   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
654   /* send it on its way */
655   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
656   /* do local part */
657   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
658   /* receive remote parts: note this assumes the values are not actually */
659   /* inserted in yy until the next line, which is true for my implementation */
660   /* but may not always be true. */
661   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
662   return 0;
663 }
664 
665 #undef __FUNC__
666 #define __FUNC__ "MatMultTransAdd_MPIAIJ"
667 int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
668 {
669   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
670   int        ierr;
671 
672   /* do nondiagonal part */
673   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
674   /* send it on its way */
675   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
676   /* do local part */
677   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
678   /* receive remote parts: note this assumes the values are not actually */
679   /* inserted in zz until the next line, which is true for my implementation */
680   /* but may not always be true. */
681   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
682   return 0;
683 }
684 
685 /*
686   This only works correctly for square matrices where the subblock A->A is the
687    diagonal block
688 */
689 #undef __FUNC__
690 #define __FUNC__ "MatGetDiagonal_MPIAIJ"
691 int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
692 {
693   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
694   if (a->M != a->N)
695     SETERRQ(1,0,"Supports only square matrix where A->A is diag block");
696   if (a->rstart != a->cstart || a->rend != a->cend) {
697     SETERRQ(1,0,"row partition must equal col partition");  }
698   return MatGetDiagonal(a->A,v);
699 }
700 
701 #undef __FUNC__
702 #define __FUNC__ "MatScale_MPIAIJ"
703 int MatScale_MPIAIJ(Scalar *aa,Mat A)
704 {
705   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
706   int        ierr;
707   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
708   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
709   return 0;
710 }
711 
712 #undef __FUNC__
713 #define __FUNC__ "MatDestroy_MPIAIJ"
714 int MatDestroy_MPIAIJ(PetscObject obj)
715 {
716   Mat        mat = (Mat) obj;
717   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
718   int        ierr;
719 
720 #if defined(PETSC_LOG)
721   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
722 #endif
723   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
724   PetscFree(aij->rowners);
725   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
726   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
727   if (aij->colmap) PetscFree(aij->colmap);
728   if (aij->garray) PetscFree(aij->garray);
729   if (aij->lvec)   VecDestroy(aij->lvec);
730   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
731   if (aij->rowvalues) PetscFree(aij->rowvalues);
732   PetscFree(aij);
733   if (mat->mapping) {
734     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
735   }
736   PLogObjectDestroy(mat);
737   PetscHeaderDestroy(mat);
738   return 0;
739 }
740 
741 #undef __FUNC__
742 #define __FUNC__ "MatView_MPIAIJ_Binary"
743 extern int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
744 {
745   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
746   int         ierr;
747 
748   if (aij->size == 1) {
749     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
750   }
751   else SETERRQ(1,0,"Only uniprocessor output supported");
752   return 0;
753 }
754 
755 #undef __FUNC__
756 #define __FUNC__ "MatView_MPIAIJ_ASCIIorDraworMatlab"
757 extern int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
758 {
759   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
760   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
761   int         ierr, format,shift = C->indexshift,rank;
762   FILE        *fd;
763   ViewerType  vtype;
764 
765   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
766   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
767     ierr = ViewerGetFormat(viewer,&format);
768     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
769       MatInfo info;
770       int     flg;
771       MPI_Comm_rank(mat->comm,&rank);
772       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
773       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
774       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
775       PetscSequentialPhaseBegin(mat->comm,1);
776       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
777          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
778       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
779          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
780       ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);
781       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used);
782       ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);
783       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used);
784       fflush(fd);
785       PetscSequentialPhaseEnd(mat->comm,1);
786       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
787       return 0;
788     }
789     else if (format == VIEWER_FORMAT_ASCII_INFO) {
790       return 0;
791     }
792   }
793 
794   if (vtype == DRAW_VIEWER) {
795     Draw       draw;
796     PetscTruth isnull;
797     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
798     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
799   }
800 
801   if (vtype == ASCII_FILE_VIEWER) {
802     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
803     PetscSequentialPhaseBegin(mat->comm,1);
804     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
805            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
806            aij->cend);
807     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
808     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
809     fflush(fd);
810     PetscSequentialPhaseEnd(mat->comm,1);
811   }
812   else {
813     int size = aij->size;
814     rank = aij->rank;
815     if (size == 1) {
816       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
817     }
818     else {
819       /* assemble the entire matrix onto first processor. */
820       Mat         A;
821       Mat_SeqAIJ *Aloc;
822       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
823       Scalar      *a;
824 
825       if (!rank) {
826         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
827                CHKERRQ(ierr);
828       }
829       else {
830         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
831                CHKERRQ(ierr);
832       }
833       PLogObjectParent(mat,A);
834 
835       /* copy over the A part */
836       Aloc = (Mat_SeqAIJ*) aij->A->data;
837       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
838       row = aij->rstart;
839       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
840       for ( i=0; i<m; i++ ) {
841         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
842         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
843       }
844       aj = Aloc->j;
845       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
846 
847       /* copy over the B part */
848       Aloc = (Mat_SeqAIJ*) aij->B->data;
849       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
850       row = aij->rstart;
851       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
852       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
853       for ( i=0; i<m; i++ ) {
854         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
855         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
856       }
857       PetscFree(ct);
858       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
859       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
860       if (!rank) {
861         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
862       }
863       ierr = MatDestroy(A); CHKERRQ(ierr);
864     }
865   }
866   return 0;
867 }
868 
869 #undef __FUNC__
870 #define __FUNC__ "MatView_MPIAIJ"
871 int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
872 {
873   Mat         mat = (Mat) obj;
874   int         ierr;
875   ViewerType  vtype;
876 
877   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
878   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
879       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
880     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
881   }
882   else if (vtype == BINARY_FILE_VIEWER) {
883     return MatView_MPIAIJ_Binary(mat,viewer);
884   }
885   return 0;
886 }
887 
888 /*
889     This has to provide several versions.
890 
891      2) a) use only local smoothing, updating outer values only once.
892         b) local smoothing, updating outer values each inner iteration.
893      3) coloring, updating outer values between colors.
894 */
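/*
   The local sweeps below apply the standard SOR update to the locally owned
   rows,

       x_i <- (1-omega)*x_i + omega*(b_i - sum_{j!=i} a_ij*x_j)/(a_ii + fshift),

   with off-processor values of x taken from mat->lvec, which is scattered in
   once per call (not once per inner iteration).  This description assumes
   SPARSEDENSEMDOT() accumulates sum -= v[k]*x[idx[k]] over a compressed row.
*/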
895 #undef __FUNC__
896 #define __FUNC__ "MatRelax_MPIAIJ"
897 int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
898                            double fshift,int its,Vec xx)
899 {
900   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
901   Mat        AA = mat->A, BB = mat->B;
902   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
903   Scalar     *b,*x,*xs,*ls,d,*v,sum;
904   int        ierr,*idx, *diag;
905   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
906 
907   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
908   xs = x + shift; /* shift by one for index start of 1 */
909   ls = ls + shift;
910   if (!A->diag) {ierr = MatMarkDiag_SeqAIJ(AA); CHKERRQ(ierr);}
911   diag = A->diag;
912   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
913     if (flag & SOR_ZERO_INITIAL_GUESS) {
914       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
915     }
916     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
917     CHKERRQ(ierr);
918     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
919     CHKERRQ(ierr);
920     while (its--) {
921       /* go down through the rows */
922       for ( i=0; i<m; i++ ) {
923         n    = A->i[i+1] - A->i[i];
924         idx  = A->j + A->i[i] + shift;
925         v    = A->a + A->i[i] + shift;
926         sum  = b[i];
927         SPARSEDENSEMDOT(sum,xs,v,idx,n);
928         d    = fshift + A->a[diag[i]+shift];
929         n    = B->i[i+1] - B->i[i];
930         idx  = B->j + B->i[i] + shift;
931         v    = B->a + B->i[i] + shift;
932         SPARSEDENSEMDOT(sum,ls,v,idx,n);
933         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
934       }
935       /* come up through the rows */
936       for ( i=m-1; i>-1; i-- ) {
937         n    = A->i[i+1] - A->i[i];
938         idx  = A->j + A->i[i] + shift;
939         v    = A->a + A->i[i] + shift;
940         sum  = b[i];
941         SPARSEDENSEMDOT(sum,xs,v,idx,n);
942         d    = fshift + A->a[diag[i]+shift];
943         n    = B->i[i+1] - B->i[i];
944         idx  = B->j + B->i[i] + shift;
945         v    = B->a + B->i[i] + shift;
946         SPARSEDENSEMDOT(sum,ls,v,idx,n);
947         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
948       }
949     }
950   }
951   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
952     if (flag & SOR_ZERO_INITIAL_GUESS) {
953       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
954     }
955     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
956     CHKERRQ(ierr);
957     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);
958     CHKERRQ(ierr);
959     while (its--) {
960       for ( i=0; i<m; i++ ) {
961         n    = A->i[i+1] - A->i[i];
962         idx  = A->j + A->i[i] + shift;
963         v    = A->a + A->i[i] + shift;
964         sum  = b[i];
965         SPARSEDENSEMDOT(sum,xs,v,idx,n);
966         d    = fshift + A->a[diag[i]+shift];
967         n    = B->i[i+1] - B->i[i];
968         idx  = B->j + B->i[i] + shift;
969         v    = B->a + B->i[i] + shift;
970         SPARSEDENSEMDOT(sum,ls,v,idx,n);
971         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
972       }
973     }
974   }
975   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
976     if (flag & SOR_ZERO_INITIAL_GUESS) {
977       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
978     }
979     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,
980                             mat->Mvctx); CHKERRQ(ierr);
981     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,
982                             mat->Mvctx); CHKERRQ(ierr);
983     while (its--) {
984       for ( i=m-1; i>-1; i-- ) {
985         n    = A->i[i+1] - A->i[i];
986         idx  = A->j + A->i[i] + shift;
987         v    = A->a + A->i[i] + shift;
988         sum  = b[i];
989         SPARSEDENSEMDOT(sum,xs,v,idx,n);
990         d    = fshift + A->a[diag[i]+shift];
991         n    = B->i[i+1] - B->i[i];
992         idx  = B->j + B->i[i] + shift;
993         v    = B->a + B->i[i] + shift;
994         SPARSEDENSEMDOT(sum,ls,v,idx,n);
995         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
996       }
997     }
998   }
999   else {
1000     SETERRQ(1,0,"Option not supported");
1001   }
1002   return 0;
1003 }
1004 
1005 #undef __FUNC__
1006 #define __FUNC__ "MatGetInfo_MPIAIJ"
1007 int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1008 {
1009   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1010   Mat        A = mat->A, B = mat->B;
1011   int        ierr;
1012   double     isend[5], irecv[5];
1013 
1014   info->rows_global    = (double)mat->M;
1015   info->columns_global = (double)mat->N;
1016   info->rows_local     = (double)mat->m;
1017   info->columns_local  = (double)mat->N;
1018   info->block_size     = 1.0;
1019   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1020   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1021   isend[3] = info->memory;  isend[4] = info->mallocs;
1022   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1023   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1024   isend[3] += info->memory;  isend[4] += info->mallocs;
1025   if (flag == MAT_LOCAL) {
1026     info->nz_used      = isend[0];
1027     info->nz_allocated = isend[1];
1028     info->nz_unneeded  = isend[2];
1029     info->memory       = isend[3];
1030     info->mallocs      = isend[4];
1031   } else if (flag == MAT_GLOBAL_MAX) {
1032     MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);
1033     info->nz_used      = irecv[0];
1034     info->nz_allocated = irecv[1];
1035     info->nz_unneeded  = irecv[2];
1036     info->memory       = irecv[3];
1037     info->mallocs      = irecv[4];
1038   } else if (flag == MAT_GLOBAL_SUM) {
1039     MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);
1040     info->nz_used      = irecv[0];
1041     info->nz_allocated = irecv[1];
1042     info->nz_unneeded  = irecv[2];
1043     info->memory       = irecv[3];
1044     info->mallocs      = irecv[4];
1045   }
1046   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1047   info->fill_ratio_needed = 0;
1048   info->factor_mallocs    = 0;
1049 
1050   return 0;
1051 }
1052 
1053 #undef __FUNC__
1054 #define __FUNC__ "MatSetOption_MPIAIJ"
1055 int MatSetOption_MPIAIJ(Mat A,MatOption op)
1056 {
1057   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1058 
1059   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1060       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1061       op == MAT_COLUMNS_UNSORTED ||
1062       op == MAT_COLUMNS_SORTED ||
1063       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1064       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1065         MatSetOption(a->A,op);
1066         MatSetOption(a->B,op);
1067   } else if (op == MAT_ROW_ORIENTED) {
1068     a->roworiented = 1;
1069     MatSetOption(a->A,op);
1070     MatSetOption(a->B,op);
1071   } else if (op == MAT_ROWS_SORTED ||
1072              op == MAT_ROWS_UNSORTED ||
1073              op == MAT_SYMMETRIC ||
1074              op == MAT_STRUCTURALLY_SYMMETRIC ||
1075              op == MAT_YES_NEW_DIAGONALS)
1076     PLogInfo(A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1077   else if (op == MAT_COLUMN_ORIENTED) {
1078     a->roworiented = 0;
1079     MatSetOption(a->A,op);
1080     MatSetOption(a->B,op);
1081   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1082     a->donotstash = 1;
1083   } else if (op == MAT_NO_NEW_DIAGONALS)
1084     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
1085   else
1086     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
1087   return 0;
1088 }
1089 
1090 #undef __FUNC__
1091 #define __FUNC__ "MatGetSize_MPIAIJ"
1092 int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1093 {
1094   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1095   *m = mat->M; *n = mat->N;
1096   return 0;
1097 }
1098 
1099 #undef __FUNC__
1100 #define __FUNC__ "MatGetLocalSize_MPIAIJ"
1101 int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1102 {
1103   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1104   *m = mat->m; *n = mat->N;
1105   return 0;
1106 }
1107 
1108 #undef __FUNC__
1109 #define __FUNC__ "MatGetOwnershipRange_MPIAIJ"
1110 int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1111 {
1112   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1113   *m = mat->rstart; *n = mat->rend;
1114   return 0;
1115 }
1116 
1117 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1118 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1119 
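/*
   A minimal usage sketch for the row access routines below (user-level PETSc
   interface).  Only locally owned rows may be requested, and each MatGetRow()
   must be matched by a MatRestoreRow() before another row is fetched:

       int    nz,*cols,ierr;
       Scalar *vals;
       ierr = MatGetRow(mat,row,&nz,&cols,&vals); CHKERRQ(ierr);
       ierr = MatRestoreRow(mat,row,&nz,&cols,&vals); CHKERRQ(ierr);
*/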
1120 #undef __FUNC__
1121 #define __FUNC__ "MatGetRow_MPIAIJ"
1122 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1123 {
1124   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1125   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1126   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1127   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1128   int        *cmap, *idx_p;
1129 
1130   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,0,"Already active");
1131   mat->getrowactive = PETSC_TRUE;
1132 
1133   if (!mat->rowvalues && (idx || v)) {
1134     /*
1135         allocate enough space to hold information from the longest row.
1136     */
1137     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1138     int     max = 1,m = mat->m,tmp;
1139     for ( i=0; i<m; i++ ) {
1140       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1141       if (max < tmp) { max = tmp; }
1142     }
1143     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1144     CHKPTRQ(mat->rowvalues);
1145     mat->rowindices = (int *) (mat->rowvalues + max);
1146   }
1147 
1148   if (row < rstart || row >= rend) SETERRQ(1,0,"Only local rows");
1149   lrow = row - rstart;
1150 
1151   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1152   if (!v)   {pvA = 0; pvB = 0;}
1153   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1154   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1155   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1156   nztot = nzA + nzB;
1157 
1158   cmap  = mat->garray;
1159   if (v  || idx) {
1160     if (nztot) {
1161       /* Sort by increasing column numbers, assuming A and B already sorted */
1162       int imark = -1;
1163       if (v) {
1164         *v = v_p = mat->rowvalues;
1165         for ( i=0; i<nzB; i++ ) {
1166           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1167           else break;
1168         }
1169         imark = i;
1170         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1171         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1172       }
1173       if (idx) {
1174         *idx = idx_p = mat->rowindices;
1175         if (imark > -1) {
1176           for ( i=0; i<imark; i++ ) {
1177             idx_p[i] = cmap[cworkB[i]];
1178           }
1179         } else {
1180           for ( i=0; i<nzB; i++ ) {
1181             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1182             else break;
1183           }
1184           imark = i;
1185         }
1186         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1187         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1188       }
1189     }
1190     else {
1191       if (idx) *idx = 0;
1192       if (v)   *v   = 0;
1193     }
1194   }
1195   *nz = nztot;
1196   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1197   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1198   return 0;
1199 }
1200 
1201 #undef __FUNC__
1202 #define __FUNC__ "MatRestoreRow_MPIAIJ"
1203 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1204 {
1205   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1206   if (aij->getrowactive == PETSC_FALSE) {
1207     SETERRQ(1,0,"MatGetRow not called");
1208   }
1209   aij->getrowactive = PETSC_FALSE;
1210   return 0;
1211 }
1212 
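/*
   MatNorm_MPIAIJ() below computes
     NORM_FROBENIUS: sqrt(sum_ij |a_ij|^2), accumulated over both local blocks
                     and summed across processors;
     NORM_1:         max over columns j of sum_i |a_ij|, using a length-N work
                     array reduced with MPI_SUM;
     NORM_INFINITY:  max over local rows of sum_j |a_ij|, reduced with MPI_MAX.
   In the parallel case the two norm is not supported.
*/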
1213 #undef __FUNC__
1214 #define __FUNC__ "MatNorm_MPIAIJ"
1215 int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1216 {
1217   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1218   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1219   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1220   double     sum = 0.0;
1221   Scalar     *v;
1222 
1223   if (aij->size == 1) {
1224     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1225   } else {
1226     if (type == NORM_FROBENIUS) {
1227       v = amat->a;
1228       for (i=0; i<amat->nz; i++ ) {
1229 #if defined(PETSC_COMPLEX)
1230         sum += real(conj(*v)*(*v)); v++;
1231 #else
1232         sum += (*v)*(*v); v++;
1233 #endif
1234       }
1235       v = bmat->a;
1236       for (i=0; i<bmat->nz; i++ ) {
1237 #if defined(PETSC_COMPLEX)
1238         sum += real(conj(*v)*(*v)); v++;
1239 #else
1240         sum += (*v)*(*v); v++;
1241 #endif
1242       }
1243       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1244       *norm = sqrt(*norm);
1245     }
1246     else if (type == NORM_1) { /* max column norm */
1247       double *tmp, *tmp2;
1248       int    *jj, *garray = aij->garray;
1249       tmp  = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp);
1250       tmp2 = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp2);
1251       PetscMemzero(tmp,aij->N*sizeof(double));
1252       *norm = 0.0;
1253       v = amat->a; jj = amat->j;
1254       for ( j=0; j<amat->nz; j++ ) {
1255         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1256       }
1257       v = bmat->a; jj = bmat->j;
1258       for ( j=0; j<bmat->nz; j++ ) {
1259         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1260       }
1261       MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1262       for ( j=0; j<aij->N; j++ ) {
1263         if (tmp2[j] > *norm) *norm = tmp2[j];
1264       }
1265       PetscFree(tmp); PetscFree(tmp2);
1266     }
1267     else if (type == NORM_INFINITY) { /* max row norm */
1268       double ntemp = 0.0;
1269       for ( j=0; j<amat->m; j++ ) {
1270         v = amat->a + amat->i[j] + shift;
1271         sum = 0.0;
1272         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1273           sum += PetscAbsScalar(*v); v++;
1274         }
1275         v = bmat->a + bmat->i[j] + shift;
1276         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1277           sum += PetscAbsScalar(*v); v++;
1278         }
1279         if (sum > ntemp) ntemp = sum;
1280       }
1281       MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1282     }
1283     else {
1284       SETERRQ(1,0,"No support for two norm");
1285     }
1286   }
1287   return 0;
1288 }
1289 
1290 #undef __FUNC__
1291 #define __FUNC__ "MatTranspose_MPIAIJ"
1292 int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1293 {
1294   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1295   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1296   int        ierr,shift = Aloc->indexshift;
1297   Mat        B;
1298   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1299   Scalar     *array;
1300 
1301   if (matout == PETSC_NULL && M != N) {
1302     SETERRQ(1,0,"Square matrix only for in-place");
1303   }
1304 
1305   ierr = MatCreateMPIAIJ(A->comm,a->n,a->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);
1306 
1307   /* copy over the A part */
1308   Aloc = (Mat_SeqAIJ*) a->A->data;
1309   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1310   row = a->rstart;
1311   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1312   for ( i=0; i<m; i++ ) {
1313     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1314     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1315   }
1316   aj = Aloc->j;
1317   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1318 
1319   /* copy over the B part */
1320   Aloc = (Mat_SeqAIJ*) a->B->data;
1321   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1322   row = a->rstart;
1323   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1324   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1325   for ( i=0; i<m; i++ ) {
1326     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1327     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1328   }
1329   PetscFree(ct);
1330   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1331   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1332   if (matout != PETSC_NULL) {
1333     *matout = B;
1334   } else {
1335     /* This isn't really an in-place transpose, but we free the data structures from a */
1336     PetscFree(a->rowners);
1337     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1338     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1339     if (a->colmap) PetscFree(a->colmap);
1340     if (a->garray) PetscFree(a->garray);
1341     if (a->lvec) VecDestroy(a->lvec);
1342     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1343     PetscFree(a);
1344     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1345     PetscHeaderDestroy(B);
1346   }
1347   return 0;
1348 }
1349 
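/*
   MatDiagonalScale_MPIAIJ() below forms diag(ll)*A*diag(rr).  The left scaling
   ll is applied directly to both local blocks; the right scaling rr is applied
   directly to the diagonal block, while the off-diagonal block needs the ghost
   entries of rr, which are scattered into aij->lvec (the scatter is started
   early so communication overlaps the diagonal-block scaling).
*/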
1350 #undef __FUNC__
1351 #define __FUNC__ "MatDiagonalScale_MPIAIJ"
1352 int MatDiagonalScale_MPIAIJ(Mat mat,Vec ll,Vec rr)
1353 {
1354   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1355   Mat a = aij->A, b = aij->B;
1356   int ierr,s1,s2,s3;
1357 
1358   ierr = MatGetLocalSize(mat,&s2,&s3); CHKERRQ(ierr);
1359   if (rr) {
1360     s3 = aij->n;
1361     VecGetLocalSize_Fast(rr,s1);
1362     if (s1!=s3) SETERRQ(1,0,"right vector non-conforming local size");
1363     /* Overlap communication with computation. */
1364     ierr = VecScatterBegin(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1365   }
1366   if (ll) {
1367     VecGetLocalSize_Fast(ll,s1);
1368     if (s1!=s2) SETERRQ(1,0,"left vector non-conforming local size");
1369     ierr = (*b->ops.diagonalscale)(b,ll,0); CHKERRQ(ierr);
1370   }
1371   /* scale  the diagonal block */
1372   ierr = (*a->ops.diagonalscale)(a,ll,rr); CHKERRQ(ierr);
1373 
1374   if (rr) {
1375     /* Do a scatter end and then right scale the off-diagonal block */
1376     ierr = VecScatterEnd(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1377     ierr = (*b->ops.diagonalscale)(b,0,aij->lvec); CHKERRQ(ierr);
1378   }
1379 
1380   return 0;
1381 }
1382 
1383 
1384 extern int MatPrintHelp_SeqAIJ(Mat);
1385 #undef __FUNC__
1386 #define __FUNC__ "MatPrintHelp_MPIAIJ"
1387 int MatPrintHelp_MPIAIJ(Mat A)
1388 {
1389   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1390 
1391   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1392   else return 0;
1393 }
1394 
1395 #undef __FUNC__
1396 #define __FUNC__ "MatGetBlockSize_MPIAIJ"
1397 int MatGetBlockSize_MPIAIJ(Mat A,int *bs)
1398 {
1399   *bs = 1;
1400   return 0;
1401 }
1402 #undef __FUNC__
1403 #define __FUNC__ "MatSetUnfactored_MPIAIJ"
1404 int MatSetUnfactored_MPIAIJ(Mat A)
1405 {
1406   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1407   int        ierr;
1408   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1409   return 0;
1410 }
1411 
1412 #undef __FUNC__
1413 #define __FUNC__ "MatEqual_MPIAIJ"
1414 int MatEqual_MPIAIJ(Mat A, Mat B, PetscTruth *flag)
1415 {
1416   Mat_MPIAIJ *matB = (Mat_MPIAIJ *) B->data,*matA = (Mat_MPIAIJ *) A->data;
1417   Mat        a, b, c, d;
1418   PetscTruth flg;
1419   int        ierr;
1420 
1421   if (B->type != MATMPIAIJ) SETERRQ(1,0,"Matrices must be same type");
1422   a = matA->A; b = matA->B;
1423   c = matB->A; d = matB->B;
1424 
1425   ierr = MatEqual(a, c, &flg); CHKERRQ(ierr);
1426   if (flg == PETSC_TRUE) {
1427     ierr = MatEqual(b, d, &flg); CHKERRQ(ierr);
1428   }
1429   MPI_Allreduce(&flg, flag, 1, MPI_INT, MPI_LAND, A->comm);
1430   return 0;
1431 }
1432 
1433 extern int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1434 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1435 extern int MatFDColoringCreate_MPIAIJ(Mat,ISColoring,MatFDColoring);
1436 extern int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1437 /* -------------------------------------------------------------------*/
1438 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1439        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1440        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1441        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1442        0,0,
1443        0,0,
1444        0,0,
1445        MatRelax_MPIAIJ,
1446        MatTranspose_MPIAIJ,
1447        MatGetInfo_MPIAIJ,MatEqual_MPIAIJ,
1448        MatGetDiagonal_MPIAIJ,MatDiagonalScale_MPIAIJ,MatNorm_MPIAIJ,
1449        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1450        0,
1451        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1452        0,0,0,0,
1453        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1454        0,0,
1455        0,0,MatConvertSameType_MPIAIJ,0,0,
1456        0,0,0,
1457        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1458        MatPrintHelp_MPIAIJ,
1459        MatScale_MPIAIJ,0,0,0,
1460        MatGetBlockSize_MPIAIJ,0,0,0,0,
1461        MatFDColoringCreate_MPIAIJ,0,MatSetUnfactored_MPIAIJ};
1462 
1463 
1464 #undef __FUNC__
1465 #define __FUNC__ "MatCreateMPIAIJ"
1466 /*@C
1467    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1468    (the default parallel PETSc format).  For good matrix assembly performance
1469    the user should preallocate the matrix storage by setting the parameters
1470    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1471    performance can be increased by more than a factor of 50.
1472 
1473    Input Parameters:
1474 .  comm - MPI communicator
1475 .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given).
1476            This value should be the same as the local size used in creating the
1477            y vector for the matrix-vector product y = Ax.
1478 .  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given).
1479            This value should be the same as the local size used in creating the
1480            x vector for the matrix-vector product y = Ax.  For square matrices n is almost always m.
1481 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1482 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1483 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1484            (same for all local rows)
1485 .  d_nnz - array containing the number of nonzeros in the various rows of the
1486            diagonal portion of the local submatrix (possibly different for each row)
1487            or PETSC_NULL. You must leave room for the diagonal entry even if
1488            it is zero.
1489 .  o_nz - number of nonzeros per row in the off-diagonal portion of local
1490            submatrix (same for all local rows).
1491 .  o_nnz - array containing the number of nonzeros in the various rows of the
1492            off-diagonal portion of the local submatrix (possibly different for
1493            each row) or PETSC_NULL.
1494 
1495    Output Parameter:
1496 .  A - the matrix
1497 
1498    Notes:
1499    The AIJ format (also called the Yale sparse matrix format or
1500    compressed row storage) is fully compatible with standard Fortran 77
1501    storage.  That is, the stored row and column indices can begin at
1502    either one (as in Fortran) or zero.  See the users manual for details.
1503 
1504    The user MUST specify either the local or global matrix dimensions
1505    (possibly both).
1506 
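   For example, a minimal creation call might specify the local sizes and let
   PETSc compute the global ones (this is only a sketch; the local sizes of 3
   and the preallocation guesses of 5 and 2 nonzeros per row are illustrative):

$      Mat A;
$      int ierr;
$      /* 3 local rows and columns per process; guess 5 nonzeros per row in the
$         diagonal block and 2 in the off-diagonal block (see Storage Information) */
$      ierr = MatCreateMPIAIJ(MPI_COMM_WORLD,3,3,PETSC_DECIDE,PETSC_DECIDE,
$                             5,PETSC_NULL,2,PETSC_NULL,&A); CHKERRQ(ierr);
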
1507    By default, this format uses inodes (identical nodes) when possible.
1508    We search for consecutive rows with the same nonzero structure, thereby
1509    reusing matrix information to achieve increased efficiency.
1510 
1511    Options Database Keys:
1512 $    -mat_aij_no_inode  - Do not use inodes
1513 $    -mat_aij_inode_limit <limit> - Set inode limit.
1514 $        (max limit=5)
1515 $    -mat_aij_oneindex - Internally use indexing starting at 1
1516 $        rather than 0.  Note: When calling MatSetValues(),
1517 $        the user still MUST index entries starting at 0!
1518 
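   These are runtime options; for instance, an application could be launched as
   (the executable name and process count here are placeholders only)

$      mpirun -np 4 ./myapp -mat_aij_inode_limit 4
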
1519    Storage Information:
1520    For a square global matrix we define each processor's diagonal portion
1521    to be its local rows and the corresponding columns (a square submatrix);
1522    each processor's off-diagonal portion encompasses the remainder of the
1523    local matrix (a rectangular submatrix).
1524 
1525    The user can specify preallocated storage for the diagonal part of
1526    the local submatrix with either d_nz or d_nnz (not both).  Set
1527    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1528    memory allocation.  Likewise, specify preallocated storage for the
1529    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1530 
1531    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1532    the figure below we depict these three local rows and all columns (0-11).
1533 
1534 $          0 1 2 3 4 5 6 7 8 9 10 11
1535 $         -------------------
1536 $  row 3  |  o o o d d d o o o o o o
1537 $  row 4  |  o o o d d d o o o o o o
1538 $  row 5  |  o o o d d d o o o o o o
1539 $         -------------------
1540 $
1541 
1542    Thus, any entries in the d locations are stored in the d (diagonal)
1543    submatrix, and any entries in the o locations are stored in the
1544    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1545    stored simply in the MATSEQAIJ format for compressed row storage.
1546 
1547    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1548    and o_nz should indicate the number of nonzeros per row in the o matrix.
1549    In general, for PDE problems in which most nonzeros are near the diagonal,
1550    one expects d_nz >> o_nz. For large problems you MUST preallocate memory
1551    or you will get TERRIBLE performance; see the users' manual chapter on
1552    matrices.
1553 
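   As a purely illustrative sketch, suppose the matrix above is 12 by 12, the
   process shown owns rows and columns 3, 4 and 5, and every location marked
   d or o really is nonzero.  That process could then preallocate its rows
   exactly with (comm stands for whatever communicator the matrix lives on):

$      Mat A;
$      int d_nnz[3] = {3,3,3}, o_nnz[3] = {9,9,9}, ierr;
$      ierr = MatCreateMPIAIJ(comm,3,3,12,12,0,d_nnz,0,o_nnz,&A); CHKERRQ(ierr);
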
1554 .keywords: matrix, aij, compressed row, sparse, parallel
1555 
1556 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1557 @*/
1558 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1559                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1560 {
1561   Mat          B;
1562   Mat_MPIAIJ   *b;
1563   int          ierr, i,sum[2],work[2],size;
1564 
1565   MPI_Comm_size(comm,&size);
1566   if (size == 1) {
1567     if (M == PETSC_DECIDE) M = m;
1568     if (N == PETSC_DECIDE) N = n;
1569     ierr = MatCreateSeqAIJ(comm,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
1570     return 0;
1571   }
1572 
1573   *A = 0;
1574   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIAIJ,comm,MatDestroy,MatView);
1575   PLogObjectCreate(B);
1576   B->data       = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1577   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1578   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1579   B->destroy    = MatDestroy_MPIAIJ;
1580   B->view       = MatView_MPIAIJ;
1581   B->factor     = 0;
1582   B->assembled  = PETSC_FALSE;
1583   B->mapping    = 0;
1584 
1585   B->insertmode = NOT_SET_VALUES;
1586   b->size       = size;
1587   MPI_Comm_rank(comm,&b->rank);
1588 
1589   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1590     SETERRQ(1,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1591 
1592   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1593     work[0] = m; work[1] = n;
1594     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1595     if (M == PETSC_DECIDE) M = sum[0];
1596     if (N == PETSC_DECIDE) N = sum[1];
1597   }
1598   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1599   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1600   b->m = m; B->m = m;
1601   b->n = n; B->n = n;
1602   b->N = N; B->N = N;
1603   b->M = M; B->M = M;
1604 
1605   /* build local table of row and column ownerships: rowners[] and cowners[] hold
1605      prefix sums of the local sizes, so process i owns rows rowners[i] to rowners[i+1]-1 */
1606   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1607   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1608   b->cowners = b->rowners + b->size + 2;
1609   MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
1610   b->rowners[0] = 0;
1611   for ( i=2; i<=b->size; i++ ) {
1612     b->rowners[i] += b->rowners[i-1];
1613   }
1614   b->rstart = b->rowners[b->rank];
1615   b->rend   = b->rowners[b->rank+1];
1616   MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1617   b->cowners[0] = 0;
1618   for ( i=2; i<=b->size; i++ ) {
1619     b->cowners[i] += b->cowners[i-1];
1620   }
1621   b->cstart = b->cowners[b->rank];
1622   b->cend   = b->cowners[b->rank+1];
1623 
1624   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1625   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1626   PLogObjectParent(B,b->A);
1627   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1628   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1629   PLogObjectParent(B,b->B);
1630 
1631   /* build cache (stash) for entries destined for rows owned by other processes */
1632   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1633   b->donotstash  = 0;
1634   b->colmap      = 0;
1635   b->garray      = 0;
1636   b->roworiented = 1;
1637 
1638   /* stuff used for matrix vector multiply */
1639   b->lvec      = 0;
1640   b->Mvctx     = 0;
1641 
1642   /* stuff for MatGetRow() */
1643   b->rowindices   = 0;
1644   b->rowvalues    = 0;
1645   b->getrowactive = PETSC_FALSE;
1646 
1647   *A = B;
1648   return 0;
1649 }
1650 
1651 #undef __FUNC__
1652 #define __FUNC__ "MatConvertSameType_MPIAIJ"
1653 int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1654 {
1655   Mat        mat;
1656   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1657   int        ierr, len=0, flg;
1658 
1659   *newmat       = 0;
1660   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm,MatDestroy,MatView);
1661   PLogObjectCreate(mat);
1662   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1663   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1664   mat->destroy    = MatDestroy_MPIAIJ;
1665   mat->view       = MatView_MPIAIJ;
1666   mat->factor     = matin->factor;
1667   mat->assembled  = PETSC_TRUE;
1668 
1669   a->m = mat->m   = oldmat->m;
1670   a->n = mat->n   = oldmat->n;
1671   a->M = mat->M   = oldmat->M;
1672   a->N = mat->N   = oldmat->N;
1673 
1674   a->rstart       = oldmat->rstart;
1675   a->rend         = oldmat->rend;
1676   a->cstart       = oldmat->cstart;
1677   a->cend         = oldmat->cend;
1678   a->size         = oldmat->size;
1679   a->rank         = oldmat->rank;
1680   mat->insertmode = NOT_SET_VALUES;
1681   a->rowvalues    = 0;
1682   a->getrowactive = PETSC_FALSE;
1683 
1684   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1685   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1686   a->cowners = a->rowners + a->size + 2;
1687   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1688   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1689   if (oldmat->colmap) {
1690     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1691     PLogObjectMemory(mat,(a->N)*sizeof(int));
1692     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1693   } else a->colmap = 0;
1694   if (oldmat->garray) {
1695     len = ((Mat_SeqAIJ *) (oldmat->B->data))->n;
1696     a->garray = (int *) PetscMalloc((len+1)*sizeof(int)); CHKPTRQ(a->garray);
1697     PLogObjectMemory(mat,len*sizeof(int));
1698     if (len) PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1699   } else a->garray = 0;
1700 
1701   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1702   PLogObjectParent(mat,a->lvec);
1703   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1704   PLogObjectParent(mat,a->Mvctx);
1705   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1706   PLogObjectParent(mat,a->A);
1707   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1708   PLogObjectParent(mat,a->B);
1709   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1710   if (flg) {
1711     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1712   }
1713   *newmat = mat;
1714   return 0;
1715 }
1716 
1717 #include "sys.h"
1718 
1719 #undef __FUNC__
1720 #define __FUNC__ "MatLoad_MPIAIJ"
1721 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
1722 {
1723   Mat          A;
1724   int          i, nz, ierr, j,rstart, rend, fd;
1725   Scalar       *vals,*svals;
1726   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1727   MPI_Status   status;
1728   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1729   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1730   int          tag = ((PetscObject)viewer)->tag;
1731 
1732   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1733   if (!rank) {
1734     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1735     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1736     if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not matrix object");
1737   }
1738 
1739   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1740   M = header[1]; N = header[2];
1741   /* determine ownership of all rows */
1742   m = M/size + ((M % size) > rank);
1743   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1744   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1745   rowners[0] = 0;
1746   for ( i=2; i<=size; i++ ) {
1747     rowners[i] += rowners[i-1];
1748   }
1749   rstart = rowners[rank];
1750   rend   = rowners[rank+1];
1751 
1752   /* distribute row lengths to all processors */
1753   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1754   offlens = ourlens + (rend-rstart);
1755   if (!rank) {
1756     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1757     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1758     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1759     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1760     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1761     PetscFree(sndcounts);
1762   }
1763   else {
1764     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1765   }
1766 
1767   if (!rank) {
1768     /* calculate the number of nonzeros on each processor */
1769     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1770     PetscMemzero(procsnz,size*sizeof(int));
1771     for ( i=0; i<size; i++ ) {
1772       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1773         procsnz[i] += rowlengths[j];
1774       }
1775     }
1776     PetscFree(rowlengths);
1777 
1778     /* determine max buffer needed and allocate it */
1779     maxnz = 0;
1780     for ( i=0; i<size; i++ ) {
1781       maxnz = PetscMax(maxnz,procsnz[i]);
1782     }
1783     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1784 
1785     /* read in my part of the matrix column indices  */
1786     nz = procsnz[0];
1787     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1788     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1789 
1790     /* read in everyone else's portion and ship it off */
1791     for ( i=1; i<size; i++ ) {
1792       nz = procsnz[i];
1793       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1794       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1795     }
1796     PetscFree(cols);
1797   }
1798   else {
1799     /* determine buffer space needed for message */
1800     nz = 0;
1801     for ( i=0; i<m; i++ ) {
1802       nz += ourlens[i];
1803     }
1804     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1805 
1806     /* receive message of column indices */
1807     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1808     MPI_Get_count(&status,MPI_INT,&maxnz);
1809     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1810   }
1811 
1812   /* loop over local rows, determining number of off-diagonal entries */
1813   PetscMemzero(offlens,m*sizeof(int));
1814   jj = 0;
1815   for ( i=0; i<m; i++ ) {
1816     for ( j=0; j<ourlens[i]; j++ ) {
1817       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1818       jj++;
1819     }
1820   }
1821 
1822   /* create our matrix */
1823   for ( i=0; i<m; i++ ) {
1824     ourlens[i] -= offlens[i];
1825   }
1826   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1827   A = *newmat;
1828   MatSetOption(A,MAT_COLUMNS_SORTED);
1829   for ( i=0; i<m; i++ ) {
1830     ourlens[i] += offlens[i];
1831   }
1832 
1833   if (!rank) {
1834     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1835 
1836     /* read in my part of the matrix numerical values  */
1837     nz = procsnz[0];
1838     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1839 
1840     /* insert into matrix */
1841     jj      = rstart;
1842     smycols = mycols;
1843     svals   = vals;
1844     for ( i=0; i<m; i++ ) {
1845       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1846       smycols += ourlens[i];
1847       svals   += ourlens[i];
1848       jj++;
1849     }
1850 
1851     /* read in other processors and ship out */
1852     for ( i=1; i<size; i++ ) {
1853       nz = procsnz[i];
1854       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1855       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1856     }
1857     PetscFree(procsnz);
1858   }
1859   else {
1860     /* receive numeric values */
1861     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1862 
1863     /* receive message of values */
1864     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1865     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1866     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1867 
1868     /* insert into matrix */
1869     jj      = rstart;
1870     smycols = mycols;
1871     svals   = vals;
1872     for ( i=0; i<m; i++ ) {
1873       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1874       smycols += ourlens[i];
1875       svals   += ourlens[i];
1876       jj++;
1877     }
1878   }
1879   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1880 
1881   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1882   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1883   return 0;
1884 }
1885