xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 7eb4e439e85a62817f0f2ee27461d73cf6c9615d)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpiaij.c,v 1.271 1999/01/08 21:30:25 balay Exp balay $";
3 #endif
4 
5 #include "src/mat/impls/aij/mpi/mpiaij.h"
6 #include "src/vec/vecimpl.h"
7 #include "src/inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way, since the
12 length of colmap equals the global number of columns of the matrix.
13 */
14 #undef __FUNC__
15 #define __FUNC__ "CreateColmap_MPIAIJ_Private"
16 int CreateColmap_MPIAIJ_Private(Mat mat)
17 {
18   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
19   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
20   int        n = B->n,i,ierr;
21 
22   PetscFunctionBegin;
23 #if defined (USE_CTABLE)
24   ierr = TableCreate( &aij->colmap, aij->n/5 ); CHKERRQ(ierr);
25   for ( i=0; i<n; i++ ){
26     ierr = TableAdd( aij->colmap, aij->garray[i] + 1, i+1 );CHKERRQ(ierr);
27   }
28 #else
29   aij->colmap = (int *) PetscMalloc((aij->N+1)*sizeof(int));CHKPTRQ(aij->colmap);
30   PLogObjectMemory(mat,aij->N*sizeof(int));
31   PetscMemzero(aij->colmap,aij->N*sizeof(int));
32   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i+1;
33 #endif
34   PetscFunctionReturn(0);
35 }
36 
37 extern int DisAssemble_MPIAIJ(Mat);
38 
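/*
   The following two macros insert (or add to) a single value at (row,col) of
   the local "diagonal" (A) or "off-diagonal" (B) SeqAIJ block.  If the row
   has no free slot left, the row storage is enlarged by CHUNKSIZE entries
   (a fresh malloc plus copy of the old data), unless the nonew flag forbids
   new nonzero locations.
*/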
39 #define CHUNKSIZE   15
40 #define MatSetValues_SeqAIJ_A_Private(row,col,value,addv) \
41 { \
42  \
43     rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
44     rmax = aimax[row]; nrow = ailen[row];  \
45     col1 = col - shift; \
46      \
47     low = 0; high = nrow; \
48     while (high-low > 5) { \
49       t = (low+high)/2; \
50       if (rp[t] > col) high = t; \
51       else             low  = t; \
52     } \
53       for ( _i=0; _i<nrow; _i++ ) { \
54         if (rp[_i] > col1) break; \
55         if (rp[_i] == col1) { \
56           if (addv == ADD_VALUES) ap[_i] += value;   \
57           else                  ap[_i] = value; \
58           goto a_noinsert; \
59         } \
60       }  \
61       if (nonew == 1) goto a_noinsert; \
62       else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
63       if (nrow >= rmax) { \
64         /* there is no extra room in row, therefore enlarge */ \
65         int    new_nz = ai[a->m] + CHUNKSIZE,len,*new_i,*new_j; \
66         Scalar *new_a; \
67  \
68         if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
69  \
70         /* malloc new storage space */ \
71         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(a->m+1)*sizeof(int); \
72         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
73         new_j   = (int *) (new_a + new_nz); \
74         new_i   = new_j + new_nz; \
75  \
76         /* copy over old data into new slots */ \
77         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];} \
78         for ( ii=row+1; ii<a->m+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
79         PetscMemcpy(new_j,aj,(ai[row]+nrow+shift)*sizeof(int)); \
80         len = (new_nz - CHUNKSIZE - ai[row] - nrow - shift); \
81         PetscMemcpy(new_j+ai[row]+shift+nrow+CHUNKSIZE,aj+ai[row]+shift+nrow, \
82                                                            len*sizeof(int)); \
83         PetscMemcpy(new_a,aa,(ai[row]+nrow+shift)*sizeof(Scalar)); \
84         PetscMemcpy(new_a+ai[row]+shift+nrow+CHUNKSIZE,aa+ai[row]+shift+nrow, \
85                                                            len*sizeof(Scalar));  \
86         /* free up old matrix storage */ \
87  \
88         PetscFree(a->a);  \
89         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
90         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
91         a->singlemalloc = 1; \
92  \
93         rp   = aj + ai[row] + shift; ap = aa + ai[row] + shift; \
94         rmax = aimax[row] = aimax[row] + CHUNKSIZE; \
95         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
96         a->maxnz += CHUNKSIZE; \
97         a->reallocs++; \
98       } \
99       N = nrow++ - 1; a->nz++; \
100       /* shift up all the later entries in this row */ \
101       for ( ii=N; ii>=_i; ii-- ) { \
102         rp[ii+1] = rp[ii]; \
103         ap[ii+1] = ap[ii]; \
104       } \
105       rp[_i] = col1;  \
106       ap[_i] = value;  \
107       a_noinsert: ; \
108       ailen[row] = nrow; \
109 }
110 
111 #define MatSetValues_SeqAIJ_B_Private(row,col,value,addv) \
112 { \
113  \
114     rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
115     rmax = bimax[row]; nrow = bilen[row];  \
116     col1 = col - shift; \
117      \
118     low = 0; high = nrow; \
119     while (high-low > 5) { \
120       t = (low+high)/2; \
121       if (rp[t] > col) high = t; \
122       else             low  = t; \
123     } \
124        for ( _i=0; _i<nrow; _i++ ) { \
125         if (rp[_i] > col1) break; \
126         if (rp[_i] == col1) { \
127           if (addv == ADD_VALUES) ap[_i] += value;   \
128           else                  ap[_i] = value; \
129           goto b_noinsert; \
130         } \
131       }  \
132       if (nonew == 1) goto b_noinsert; \
133       else if (nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
134       if (nrow >= rmax) { \
135         /* there is no extra room in row, therefore enlarge */ \
136         int    new_nz = bi[b->m] + CHUNKSIZE,len,*new_i,*new_j; \
137         Scalar *new_a; \
138  \
139         if (nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
140  \
141         /* malloc new storage space */ \
142         len     = new_nz*(sizeof(int)+sizeof(Scalar))+(b->m+1)*sizeof(int); \
143         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
144         new_j   = (int *) (new_a + new_nz); \
145         new_i   = new_j + new_nz; \
146  \
147         /* copy over old data into new slots */ \
148         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = bi[ii];} \
149         for ( ii=row+1; ii<b->m+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
150         PetscMemcpy(new_j,bj,(bi[row]+nrow+shift)*sizeof(int)); \
151         len = (new_nz - CHUNKSIZE - bi[row] - nrow - shift); \
152         PetscMemcpy(new_j+bi[row]+shift+nrow+CHUNKSIZE,bj+bi[row]+shift+nrow, \
153                                                            len*sizeof(int)); \
154         PetscMemcpy(new_a,ba,(bi[row]+nrow+shift)*sizeof(Scalar)); \
155         PetscMemcpy(new_a+bi[row]+shift+nrow+CHUNKSIZE,ba+bi[row]+shift+nrow, \
156                                                            len*sizeof(Scalar));  \
157         /* free up old matrix storage */ \
158  \
159         PetscFree(b->a);  \
160         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
161         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
162         b->singlemalloc = 1; \
163  \
164         rp   = bj + bi[row] + shift; ap = ba + bi[row] + shift; \
165         rmax = bimax[row] = bimax[row] + CHUNKSIZE; \
166         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + sizeof(Scalar))); \
167         b->maxnz += CHUNKSIZE; \
168         b->reallocs++; \
169       } \
170       N = nrow++ - 1; b->nz++; \
171       /* shift up all the later entries in this row */ \
172       for ( ii=N; ii>=_i; ii-- ) { \
173         rp[ii+1] = rp[ii]; \
174         ap[ii+1] = ap[ii]; \
175       } \
176       rp[_i] = col1;  \
177       ap[_i] = value;  \
178       b_noinsert: ; \
179       bilen[row] = nrow; \
180 }
181 
182 extern int MatSetValues_SeqAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
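/*
   MatSetValues_MPIAIJ - entries whose rows are owned by this processor go
   directly into the diagonal block (A) or the off-diagonal block (B) via the
   macros above; entries for rows owned by other processors are stashed and
   communicated during MatAssemblyBegin/End (unless the user has set
   MAT_IGNORE_OFF_PROC_ENTRIES).
*/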
183 #undef __FUNC__
184 #define __FUNC__ "MatSetValues_MPIAIJ"
185 int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
186 {
187   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
188   Scalar     value;
189   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
190   int        cstart = aij->cstart, cend = aij->cend,row,col;
191   int        roworiented = aij->roworiented;
192 
193   /* Some Variables required in the macro */
194   Mat        A = aij->A;
195   Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data;
196   int        *aimax = a->imax, *ai = a->i, *ailen = a->ilen,*aj = a->j;
197   Scalar     *aa = a->a;
198 
199   Mat        B = aij->B;
200   Mat_SeqAIJ *b = (Mat_SeqAIJ *) B->data;
201   int        *bimax = b->imax, *bi = b->i, *bilen = b->ilen,*bj = b->j;
202   Scalar     *ba = b->a;
203 
204   int        *rp,ii,nrow,_i,rmax, N, col1,low,high,t;
205   int        nonew = a->nonew,shift = a->indexshift;
206   Scalar     *ap;
207 
208   PetscFunctionBegin;
209   for ( i=0; i<m; i++ ) {
210 #if defined(USE_PETSC_BOPT_g)
211     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
212     if (im[i] >= aij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
213 #endif
214     if (im[i] >= rstart && im[i] < rend) {
215       row = im[i] - rstart;
216       for ( j=0; j<n; j++ ) {
217         if (in[j] >= cstart && in[j] < cend){
218           col = in[j] - cstart;
219           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
220           MatSetValues_SeqAIJ_A_Private(row,col,value,addv);
221           /* ierr = MatSetValues_SeqAIJ(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
222         }
223 #if defined(USE_PETSC_BOPT_g)
224         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
225         else if (in[j] >= aij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
226 #endif
227         else {
228           if (mat->was_assembled) {
229             if (!aij->colmap) {
230               ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
231             }
232 #if defined (USE_CTABLE)
233 	    col = TableFind( aij->colmap, in[j] + 1 ) - 1;
234 #else
235             col = aij->colmap[in[j]] - 1;
236 #endif
237             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
238               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
239               col =  in[j];
240               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
241               B = aij->B;
242               b = (Mat_SeqAIJ *) B->data;
243               bimax = b->imax; bi = b->i; bilen = b->ilen; bj = b->j;
244               ba = b->a;
245             }
246           } else col = in[j];
247           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
248           MatSetValues_SeqAIJ_B_Private(row,col,value,addv);
249           /* ierr = MatSetValues_SeqAIJ(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
250         }
251       }
252     }
253     else {
254       if (roworiented && !aij->donotstash) {
255         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
256       }
257       else {
258         if (!aij->donotstash) {
259           row = im[i];
260           for ( j=0; j<n; j++ ) {
261             ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
262           }
263         }
264       }
265     }
266   }
267   PetscFunctionReturn(0);
268 }
269 
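/*
   MatGetValues_MPIAIJ - returns values only for locally owned rows; columns
   in the off-diagonal part are translated through colmap/garray, and entries
   not stored locally are returned as 0.0.
*/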
270 #undef __FUNC__
271 #define __FUNC__ "MatGetValues_MPIAIJ"
272 int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
273 {
274   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
275   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
276   int        cstart = aij->cstart, cend = aij->cend,row,col;
277 
278   PetscFunctionBegin;
279   for ( i=0; i<m; i++ ) {
280     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
281     if (idxm[i] >= aij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
282     if (idxm[i] >= rstart && idxm[i] < rend) {
283       row = idxm[i] - rstart;
284       for ( j=0; j<n; j++ ) {
285         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
286         if (idxn[j] >= aij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
287         if (idxn[j] >= cstart && idxn[j] < cend){
288           col = idxn[j] - cstart;
289           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
290         } else {
291           if (!aij->colmap) {
292             ierr = CreateColmap_MPIAIJ_Private(mat);CHKERRQ(ierr);
293           }
294 #if defined (USE_CTABLE)
295 	  col = TableFind( aij->colmap, idxn[j] + 1 ) - 1;
296 #else
297           col = aij->colmap[idxn[j]] - 1;
298 #endif
299           if ((col < 0) || (aij->garray[col] != idxn[j])) *(v+i*n+j) = 0.0;
300           else {
301             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
302           }
303         }
304       }
305     } else {
306       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
307     }
308   }
309   PetscFunctionReturn(0);
310 }
311 
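/*
   MatAssemblyBegin_MPIAIJ - checks that all processors use the same insert
   mode, counts how many stashed (off-processor) entries go to each owner,
   exchanges the message counts and maximum message length, posts the
   receives, and sends each entry as a (row,col,value) triple of Scalars.
   The outstanding requests are kept in the matrix so that
   MatAssemblyEnd_MPIAIJ() can complete them.
*/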
312 #undef __FUNC__
313 #define __FUNC__ "MatAssemblyBegin_MPIAIJ"
314 int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
315 {
316   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
317   MPI_Comm    comm = mat->comm;
318   int         size = aij->size, *owners = aij->rowners;
319   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
320   MPI_Request *send_waits,*recv_waits;
321   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
322   InsertMode  addv;
323   Scalar      *rvalues,*svalues;
324 
325   PetscFunctionBegin;
326   if (aij->donotstash) {
327     aij->svalues    = 0; aij->rvalues    = 0;
328     aij->nsends     = 0; aij->nrecvs     = 0;
329     aij->send_waits = 0; aij->recv_waits = 0;
330     aij->rmax       = 0;
331     PetscFunctionReturn(0);
332   }
333 
334   /* make sure all processors are either in INSERTMODE or ADDMODE */
335   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
336   if (addv == (ADD_VALUES|INSERT_VALUES)) {
337     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
338   }
339   mat->insertmode = addv; /* in case this processor had no cache */
340 
341   /*  first count number of contributors to each processor */
342   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
343   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
344   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
345   for ( i=0; i<aij->stash.n; i++ ) {
346     idx = aij->stash.idx[i];
347     for ( j=0; j<size; j++ ) {
348       if (idx >= owners[j] && idx < owners[j+1]) {
349         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
350       }
351     }
352   }
353   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
354 
355   /* inform other processors of number of messages and max length*/
356   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
357   ierr = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
358   nreceives = work[rank];
359   ierr = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
360   nmax = work[rank];
361   PetscFree(work);
362 
363   /* post receives:
364        1) each message will consist of ordered triples
365      (global row index, global column index, value); the indices are
366      stored as Scalars to simplify the message passing.
367        2) since we don't know how long each individual message is we
368      allocate the largest needed buffer for each receive. Potentially
369      this is a lot of wasted space.
370 
371 
372        This could be done better.
373   */
374   rvalues    = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
375   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
376   for ( i=0; i<nreceives; i++ ) {
377     ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
378               comm,recv_waits+i);CHKERRQ(ierr);
379   }
380 
381   /* do sends:
382       1) starts[i] gives the starting index in svalues for stuff going to
383          the ith processor
384   */
385   svalues    = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
386   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
387   starts     = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
388   starts[0]  = 0;
389   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
390   for ( i=0; i<aij->stash.n; i++ ) {
391     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
392     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
393     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
394   }
395   PetscFree(owner);
396   starts[0] = 0;
397   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
398   count = 0;
399   for ( i=0; i<size; i++ ) {
400     if (procs[i]) {
401       ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
402     }
403   }
404   PetscFree(starts);
405   PetscFree(nprocs);
406 
407   /* Free cache space */
408   PLogInfo(aij->A,"MatAssemblyBegin_MPIAIJ:Number of off-processor values %d\n",aij->stash.n);
409   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
410 
411   aij->svalues    = svalues;    aij->rvalues    = rvalues;
412   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
413   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
414   aij->rmax       = nmax;
415 
416   PetscFunctionReturn(0);
417 }
418 extern int MatSetUpMultiply_MPIAIJ(Mat);
419 
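/*
   MatAssemblyEnd_MPIAIJ - waits on the receives posted in
   MatAssemblyBegin_MPIAIJ(), unpacks the (row,col,value) triples into the A
   or B block, frees the communication buffers, assembles both blocks, and
   coordinates disassembly across processors when the nonzero pattern of B
   may have changed.
*/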
420 #undef __FUNC__
421 #define __FUNC__ "MatAssemblyEnd_MPIAIJ"
422 int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
423 {
424   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
425   MPI_Status  *send_status,recv_status;
426   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
427   int         row,col,other_disassembled;
428   Scalar      *values,val;
429   InsertMode  addv = mat->insertmode;
430 
431   PetscFunctionBegin;
432 
433   /*  wait on receives */
434   while (count) {
435     ierr = MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
436     /* unpack receives into our local space */
437     values = aij->rvalues + 3*imdex*aij->rmax;
438     ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
439     n = n/3;
440     for ( i=0; i<n; i++ ) {
441       row = (int) PetscReal(values[3*i]) - aij->rstart;
442       col = (int) PetscReal(values[3*i+1]);
443       val = values[3*i+2];
444       if (col >= aij->cstart && col < aij->cend) {
445         col -= aij->cstart;
446         ierr = MatSetValues(aij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
447       } else {
448         if (mat->was_assembled || mat->assembled) {
449           if (!aij->colmap) {
450             ierr = CreateColmap_MPIAIJ_Private(mat); CHKERRQ(ierr);
451           }
452 #if defined (USE_CTABLE)
453           col = TableFind( aij->colmap, col + 1 ) - 1;
454 #else
455           col = aij->colmap[col] - 1;
456 #endif
457           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
458             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
459             col = (int) PetscReal(values[3*i+1]);
460           }
461         }
462         ierr = MatSetValues(aij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr);
463       }
464     }
465     count--;
466   }
467   if (aij->recv_waits) PetscFree(aij->recv_waits);
468   if (aij->rvalues)    PetscFree(aij->rvalues);
469 
470   /* wait on sends */
471   if (aij->nsends) {
472     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
473     ierr        = MPI_Waitall(aij->nsends,aij->send_waits,send_status);CHKERRQ(ierr);
474     PetscFree(send_status);
475   }
476   if (aij->send_waits) PetscFree(aij->send_waits);
477   if (aij->svalues)    PetscFree(aij->svalues);
478 
479   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
480   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
481 
482   /* determine if any processor has disassembled, if so we must
483      also disassemble ourselves, in order that we may reassemble. */
484   /*
485      if the nonzero structure of submatrix B cannot change then we know that
486      no processor disassembled, thus we can skip this stuff
487   */
488   if (!((Mat_SeqAIJ*) aij->B->data)->nonew)  {
489     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
490     if (mat->was_assembled && !other_disassembled) {
491       ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
492     }
493   }
494 
495   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
496     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
497   }
498   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
499   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
500 
501   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
502   PetscFunctionReturn(0);
503 }
504 
505 #undef __FUNC__
506 #define __FUNC__ "MatZeroEntries_MPIAIJ"
507 int MatZeroEntries_MPIAIJ(Mat A)
508 {
509   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
510   int        ierr;
511 
512   PetscFunctionBegin;
513   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
514   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
515   PetscFunctionReturn(0);
516 }
517 
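/*
   MatZeroRows_MPIAIJ - the global row indices in is are scattered to the
   processors that own them; each processor then zeros its local rows in both
   blocks and, if diag is provided, places diag on the diagonal (either
   directly through MatZeroRows() on the diagonal block or through
   MatSetValues() followed by a reassembly).
*/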
518 #undef __FUNC__
519 #define __FUNC__ "MatZeroRows_MPIAIJ"
520 int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
521 {
522   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
523   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
524   int            *procs,*nprocs,j,found,idx,nsends,*work,row;
525   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
526   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
527   int            *lens,imdex,*lrows,*values,rstart=l->rstart;
528   MPI_Comm       comm = A->comm;
529   MPI_Request    *send_waits,*recv_waits;
530   MPI_Status     recv_status,*send_status;
531   IS             istmp;
532   PetscTruth     localdiag;
533 
534   PetscFunctionBegin;
535   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
536   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
537 
538   /*  first count number of contributors to each processor */
539   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
540   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
541   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
542   for ( i=0; i<N; i++ ) {
543     idx = rows[i];
544     found = 0;
545     for ( j=0; j<size; j++ ) {
546       if (idx >= owners[j] && idx < owners[j+1]) {
547         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
548       }
549     }
550     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
551   }
552   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
553 
554   /* inform other processors of number of messages and max length*/
555   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
556   ierr = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
557   nrecvs = work[rank];
558   ierr = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
559   nmax = work[rank];
560   PetscFree(work);
561 
562   /* post receives:   */
563   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int));CHKPTRQ(rvalues);
564   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
565   for ( i=0; i<nrecvs; i++ ) {
566     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
567   }
568 
569   /* do sends:
570       1) starts[i] gives the starting index in svalues for stuff going to
571          the ith processor
572   */
573   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
574   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
575   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
576   starts[0] = 0;
577   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
578   for ( i=0; i<N; i++ ) {
579     svalues[starts[owner[i]]++] = rows[i];
580   }
581   ISRestoreIndices(is,&rows);
582 
583   starts[0] = 0;
584   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
585   count = 0;
586   for ( i=0; i<size; i++ ) {
587     if (procs[i]) {
588       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
589     }
590   }
591   PetscFree(starts);
592 
593   base = owners[rank];
594 
595   /*  wait on receives */
596   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
597   source = lens + nrecvs;
598   count  = nrecvs; slen = 0;
599   while (count) {
600     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
601     /* unpack receives into our local space */
602     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
603     source[imdex]  = recv_status.MPI_SOURCE;
604     lens[imdex]  = n;
605     slen += n;
606     count--;
607   }
608   PetscFree(recv_waits);
609 
610   /* move the data into the send scatter */
611   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
612   count = 0;
613   for ( i=0; i<nrecvs; i++ ) {
614     values = rvalues + i*nmax;
615     for ( j=0; j<lens[i]; j++ ) {
616       lrows[count++] = values[j] - base;
617     }
618   }
619   PetscFree(rvalues); PetscFree(lens);
620   PetscFree(owner); PetscFree(nprocs);
621 
622   /* actually zap the local rows */
623   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
624   PLogObjectParent(A,istmp);
625 
626   /*
627         Zero the required rows. If the "diagonal block" of the matrix
628      is square and the user wishes to set the diagonal we use separate
629      code so that MatSetValues() is not called for each diagonal entry,
630      which would allocate new memory, causing lots of mallocs and slowing things down.
631 
632        Contributed by: Mathew Knepley
633   */
634   localdiag = PETSC_FALSE;
635   if (diag && (l->A->M == l->A->N)) {
636     localdiag = PETSC_TRUE;
637     ierr      = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
638   } else {
639     ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
640   }
641   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
642   ierr = ISDestroy(istmp); CHKERRQ(ierr);
643 
644   if (diag && (localdiag == PETSC_FALSE)) {
645     for ( i = 0; i < slen; i++ ) {
646       row = lrows[i] + rstart;
647       MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);
648     }
649     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
650     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
651   }
652 
661   PetscFree(lrows);
662 
663   /* wait on sends */
664   if (nsends) {
665     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
666     ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
667     PetscFree(send_status);
668   }
669   PetscFree(send_waits); PetscFree(svalues);
670 
671   PetscFunctionReturn(0);
672 }
673 
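/*
   MatMult_MPIAIJ - computes yy = A*xx + B*lvec, overlapping the scatter of
   the needed ghost values of xx into lvec with the multiply by the local
   diagonal block.
*/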
674 #undef __FUNC__
675 #define __FUNC__ "MatMult_MPIAIJ"
676 int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
677 {
678   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
679   int        ierr,nt;
680 
681   PetscFunctionBegin;
682   ierr = VecGetLocalSize(xx,&nt);  CHKERRQ(ierr);
683   if (nt != a->n) {
684     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
685   }
686   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
687   ierr = (*a->A->ops->mult)(a->A,xx,yy); CHKERRQ(ierr);
688   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx); CHKERRQ(ierr);
689   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
690   PetscFunctionReturn(0);
691 }
692 
693 #undef __FUNC__
694 #define __FUNC__ "MatMultAdd_MPIAIJ"
695 int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
696 {
697   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
698   int        ierr;
699 
700   PetscFunctionBegin;
701   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
702   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
703   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
704   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
705   PetscFunctionReturn(0);
706 }
707 
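/*
   MatMultTrans_MPIAIJ - applies the transpose of the local diagonal block to
   xx, while the contributions of the off-diagonal block, B^T*xx, are added
   into yy on their owning processors through a reverse scatter.
*/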
708 #undef __FUNC__
709 #define __FUNC__ "MatMultTrans_MPIAIJ"
710 int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
711 {
712   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
713   int        ierr;
714 
715   PetscFunctionBegin;
716   /* do nondiagonal part */
717   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
718   /* send it on its way */
719   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
720   /* do local part */
721   ierr = (*a->A->ops->multtrans)(a->A,xx,yy); CHKERRQ(ierr);
722   /* receive remote parts: note this assumes the values are not actually */
723   /* inserted in yy until the next line, which is true for my implementation*/
724   /* but may not always be true. */
725   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
726   PetscFunctionReturn(0);
727 }
728 
729 #undef __FUNC__
730 #define __FUNC__ "MatMultTransAdd_MPIAIJ"
731 int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
732 {
733   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
734   int        ierr;
735 
736   PetscFunctionBegin;
737   /* do nondiagonal part */
738   ierr = (*a->B->ops->multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
739   /* send it on its way */
740   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
741   /* do local part */
742   ierr = (*a->A->ops->multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
743   /* receive remote parts: note this assumes the values are not actually */
744   /* inserted in zz until the next line, which is true for my implementation*/
745   /* but may not always be true. */
746   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
747   PetscFunctionReturn(0);
748 }
749 
750 /*
751   This only works correctly for square matrices where the subblock A->A is the
752    diagonal block
753 */
754 #undef __FUNC__
755 #define __FUNC__ "MatGetDiagonal_MPIAIJ"
756 int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
757 {
758   int        ierr;
759   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
760 
761   PetscFunctionBegin;
762   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
763   if (a->rstart != a->cstart || a->rend != a->cend) {
764     SETERRQ(PETSC_ERR_ARG_SIZ,0,"row partition must equal col partition");
765   }
766   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
767   PetscFunctionReturn(0);
768 }
769 
770 #undef __FUNC__
771 #define __FUNC__ "MatScale_MPIAIJ"
772 int MatScale_MPIAIJ(Scalar *aa,Mat A)
773 {
774   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
775   int        ierr;
776 
777   PetscFunctionBegin;
778   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
779   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
780   PetscFunctionReturn(0);
781 }
782 
783 #undef __FUNC__
784 #define __FUNC__ "MatDestroy_MPIAIJ"
785 int MatDestroy_MPIAIJ(Mat mat)
786 {
787   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
788   int        ierr;
789 
790   PetscFunctionBegin;
791   if (--mat->refct > 0) PetscFunctionReturn(0);
792 
793   if (mat->mapping) {
794     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
795   }
796   if (mat->bmapping) {
797     ierr = ISLocalToGlobalMappingDestroy(mat->bmapping); CHKERRQ(ierr);
798   }
799   if (mat->rmap) {
800     ierr = MapDestroy(mat->rmap);CHKERRQ(ierr);
801   }
802   if (mat->cmap) {
803     ierr = MapDestroy(mat->cmap);CHKERRQ(ierr);
804   }
805 #if defined(USE_PETSC_LOG)
806   PLogObjectState((PetscObject)mat,"Rows=%d, Cols=%d",aij->M,aij->N);
807 #endif
808   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
809   PetscFree(aij->rowners);
810   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
811   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
812 #if defined (USE_CTABLE)
813   if (aij->colmap) TableDelete(aij->colmap);
814 #else
815   if (aij->colmap) PetscFree(aij->colmap);
816 #endif
817   if (aij->garray) PetscFree(aij->garray);
818   if (aij->lvec)   VecDestroy(aij->lvec);
819   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
820   if (aij->rowvalues) PetscFree(aij->rowvalues);
821   PetscFree(aij);
822   PLogObjectDestroy(mat);
823   PetscHeaderDestroy(mat);
824   PetscFunctionReturn(0);
825 }
826 
827 #undef __FUNC__
828 #define __FUNC__ "MatView_MPIAIJ_Binary"
829 extern int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
830 {
831   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
832   int         ierr;
833 
834   PetscFunctionBegin;
835   if (aij->size == 1) {
836     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
837   }
838   else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
839   PetscFunctionReturn(0);
840 }
841 
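/*
   MatView_MPIAIJ_ASCIIorDraworMatlab - for the ASCII info formats each
   processor prints its local statistics in sequence; otherwise the whole
   matrix is gathered onto the first processor (as a temporary MPIAIJ matrix
   with all rows on rank 0) and viewed there.
*/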
842 #undef __FUNC__
843 #define __FUNC__ "MatView_MPIAIJ_ASCIIorDraworMatlab"
844 extern int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
845 {
846   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
847   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
848   int         ierr, format,shift = C->indexshift,rank = aij->rank;
849   int         size = aij->size;
850   FILE        *fd;
851   ViewerType  vtype;
852 
853   PetscFunctionBegin;
854   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
855   if (PetscTypeCompare(vtype,ASCII_VIEWER)) {
856     ierr = ViewerGetFormat(viewer,&format);
857     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
858       MatInfo info;
859       int     flg;
860       MPI_Comm_rank(mat->comm,&rank);
861       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
862       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
863       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
864       PetscSequentialPhaseBegin(mat->comm,1);
865       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
866          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
867       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
868          rank,aij->m,(int)info.nz_used,(int)info.nz_allocated,(int)info.memory);
869       ierr = MatGetInfo(aij->A,MAT_LOCAL,&info);
870       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used);
871       ierr = MatGetInfo(aij->B,MAT_LOCAL,&info);
872       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used);
873       fflush(fd);
874       PetscSequentialPhaseEnd(mat->comm,1);
875       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
876       PetscFunctionReturn(0);
877     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
878       PetscFunctionReturn(0);
879     }
880   } else if (PetscTypeCompare(vtype,DRAW_VIEWER)) {
881     Draw       draw;
882     PetscTruth isnull;
883     ierr = ViewerDrawGetDraw(viewer,0,&draw); CHKERRQ(ierr);
884     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
885   }
886 
887   if (size == 1) {
888     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
889   } else {
890     /* assemble the entire matrix onto first processor. */
891     Mat         A;
892     Mat_SeqAIJ *Aloc;
893     int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
894     Scalar      *a;
895 
896     if (!rank) {
897       ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
898     } else {
899       ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
900     }
901     PLogObjectParent(mat,A);
902 
903     /* copy over the A part */
904     Aloc = (Mat_SeqAIJ*) aij->A->data;
905     m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
906     row = aij->rstart;
907     for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
908     for ( i=0; i<m; i++ ) {
909       ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
910       row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
911     }
912     aj = Aloc->j;
913     for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
914 
915     /* copy over the B part */
916     Aloc = (Mat_SeqAIJ*) aij->B->data;
917     m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
918     row = aij->rstart;
919     ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
920     for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
921     for ( i=0; i<m; i++ ) {
922       ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
923       row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
924     }
925     PetscFree(ct);
926     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
927     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
928     /*
929        Everyone has to call to draw the matrix since the graphics waits are
930        synchronized across all processors that share the Draw object
931     */
932     if (!rank || PetscTypeCompare(vtype,DRAW_VIEWER)) {
933       ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
934     }
935     ierr = MatDestroy(A); CHKERRQ(ierr);
936   }
937   PetscFunctionReturn(0);
938 }
939 
940 #undef __FUNC__
941 #define __FUNC__ "MatView_MPIAIJ"
942 int MatView_MPIAIJ(Mat mat,Viewer viewer)
943 {
944   int         ierr;
945   ViewerType  vtype;
946 
947   PetscFunctionBegin;
948   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
949   if (PetscTypeCompare(vtype,ASCII_VIEWER) || PetscTypeCompare(vtype,DRAW_VIEWER) ||
950       PetscTypeCompare(vtype,MATLAB_VIEWER)) {
951     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
952   } else if (PetscTypeCompare(vtype,BINARY_VIEWER)) {
953     ierr = MatView_MPIAIJ_Binary(mat,viewer);CHKERRQ(ierr);
954   } else {
955     SETERRQ(1,1,"Viewer type not supported by PETSc object");
956   }
957   PetscFunctionReturn(0);
958 }
959 
960 /*
961     This has to provide several versions.
962 
963      2) a) use only local smoothing, updating outer values only once.
964         b) local smoothing, updating outer values each inner iteration.
965      3) color updating, with outer values updated between colors.
966 */
967 #undef __FUNC__
968 #define __FUNC__ "MatRelax_MPIAIJ"
969 int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
970                            double fshift,int its,Vec xx)
971 {
972   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
973   Mat        AA = mat->A, BB = mat->B;
974   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
975   Scalar     *b,*x,*xs,*ls,d,*v,sum;
976   int        ierr,*idx, *diag;
977   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
978 
979   PetscFunctionBegin;
980   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
981   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
982   ierr = VecGetArray(mat->lvec,&ls);CHKERRQ(ierr);
983   xs = x + shift; /* shift by one for index start of 1 */
984   ls = ls + shift;
985   if (!A->diag) {ierr = MatMarkDiag_SeqAIJ(AA); CHKERRQ(ierr);}
986   diag = A->diag;
987   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
988     if (flag & SOR_ZERO_INITIAL_GUESS) {
989       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
990       ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
991       ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
992       ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
993       PetscFunctionReturn(0);
994     }
995     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
996     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
997     while (its--) {
998       /* go down through the rows */
999       for ( i=0; i<m; i++ ) {
1000         n    = A->i[i+1] - A->i[i];
1001 	PLogFlops(4*n+3);
1002         idx  = A->j + A->i[i] + shift;
1003         v    = A->a + A->i[i] + shift;
1004         sum  = b[i];
1005         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1006         d    = fshift + A->a[diag[i]+shift];
1007         n    = B->i[i+1] - B->i[i];
1008         idx  = B->j + B->i[i] + shift;
1009         v    = B->a + B->i[i] + shift;
1010         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1011         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1012       }
1013       /* come up through the rows */
1014       for ( i=m-1; i>-1; i-- ) {
1015         n    = A->i[i+1] - A->i[i];
1016 	PLogFlops(4*n+3);
1017         idx  = A->j + A->i[i] + shift;
1018         v    = A->a + A->i[i] + shift;
1019         sum  = b[i];
1020         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1021         d    = fshift + A->a[diag[i]+shift];
1022         n    = B->i[i+1] - B->i[i];
1023         idx  = B->j + B->i[i] + shift;
1024         v    = B->a + B->i[i] + shift;
1025         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1026         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1027       }
1028     }
1029   } else if (flag & SOR_LOCAL_FORWARD_SWEEP){
1030     if (flag & SOR_ZERO_INITIAL_GUESS) {
1031       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
1032       ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
1033       ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
1034       ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
1035       PetscFunctionReturn(0);
1036     }
1037     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1038     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1039     while (its--) {
1040       for ( i=0; i<m; i++ ) {
1041         n    = A->i[i+1] - A->i[i];
1042 	PLogFlops(4*n+3);
1043         idx  = A->j + A->i[i] + shift;
1044         v    = A->a + A->i[i] + shift;
1045         sum  = b[i];
1046         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1047         d    = fshift + A->a[diag[i]+shift];
1048         n    = B->i[i+1] - B->i[i];
1049         idx  = B->j + B->i[i] + shift;
1050         v    = B->a + B->i[i] + shift;
1051         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1052         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1053       }
1054     }
1055   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
1056     if (flag & SOR_ZERO_INITIAL_GUESS) {
1057       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,its,xx);CHKERRQ(ierr);
1058       ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
1059       ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
1060       ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
1061       PetscFunctionReturn(0);
1062     }
1063     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1064     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
1065     while (its--) {
1066       for ( i=m-1; i>-1; i-- ) {
1067         n    = A->i[i+1] - A->i[i];
1068 	PLogFlops(4*n+3);
1069         idx  = A->j + A->i[i] + shift;
1070         v    = A->a + A->i[i] + shift;
1071         sum  = b[i];
1072         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1073         d    = fshift + A->a[diag[i]+shift];
1074         n    = B->i[i+1] - B->i[i];
1075         idx  = B->j + B->i[i] + shift;
1076         v    = B->a + B->i[i] + shift;
1077         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1078         x[i] = (1. - omega)*x[i] + omega*(sum + A->a[diag[i]+shift]*x[i])/d;
1079       }
1080     }
1081   } else {
1082     SETERRQ(PETSC_ERR_SUP,0,"Parallel SOR not supported");
1083   }
1084   ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
1085   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
1086   ierr = VecRestoreArray(mat->lvec,&ls);CHKERRQ(ierr);
1087   PetscFunctionReturn(0);
1088 }
1089 
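/*
   MatGetInfo_MPIAIJ - sums the statistics of the two local blocks and, for
   the global flags, reduces them over the communicator with MPI_MAX or
   MPI_SUM.
*/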
1090 #undef __FUNC__
1091 #define __FUNC__ "MatGetInfo_MPIAIJ"
1092 int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1093 {
1094   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1095   Mat        A = mat->A, B = mat->B;
1096   int        ierr;
1097   double     isend[5], irecv[5];
1098 
1099   PetscFunctionBegin;
1100   info->block_size     = 1.0;
1101   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1102   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1103   isend[3] = info->memory;  isend[4] = info->mallocs;
1104   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1105   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1106   isend[3] += info->memory;  isend[4] += info->mallocs;
1107   if (flag == MAT_LOCAL) {
1108     info->nz_used      = isend[0];
1109     info->nz_allocated = isend[1];
1110     info->nz_unneeded  = isend[2];
1111     info->memory       = isend[3];
1112     info->mallocs      = isend[4];
1113   } else if (flag == MAT_GLOBAL_MAX) {
1114     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_MAX,matin->comm);CHKERRQ(ierr);
1115     info->nz_used      = irecv[0];
1116     info->nz_allocated = irecv[1];
1117     info->nz_unneeded  = irecv[2];
1118     info->memory       = irecv[3];
1119     info->mallocs      = irecv[4];
1120   } else if (flag == MAT_GLOBAL_SUM) {
1121     ierr = MPI_Allreduce(isend,irecv,5,MPI_DOUBLE,MPI_SUM,matin->comm);CHKERRQ(ierr);
1122     info->nz_used      = irecv[0];
1123     info->nz_allocated = irecv[1];
1124     info->nz_unneeded  = irecv[2];
1125     info->memory       = irecv[3];
1126     info->mallocs      = irecv[4];
1127   }
1128   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1129   info->fill_ratio_needed = 0;
1130   info->factor_mallocs    = 0;
1131   info->rows_global       = (double)mat->M;
1132   info->columns_global    = (double)mat->N;
1133   info->rows_local        = (double)mat->m;
1134   info->columns_local     = (double)mat->N;
1135 
1136   PetscFunctionReturn(0);
1137 }
1138 
1139 #undef __FUNC__
1140 #define __FUNC__ "MatSetOption_MPIAIJ"
1141 int MatSetOption_MPIAIJ(Mat A,MatOption op)
1142 {
1143   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1144 
1145   PetscFunctionBegin;
1146   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1147       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1148       op == MAT_COLUMNS_UNSORTED ||
1149       op == MAT_COLUMNS_SORTED ||
1150       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1151       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1152         MatSetOption(a->A,op);
1153         MatSetOption(a->B,op);
1154   } else if (op == MAT_ROW_ORIENTED) {
1155     a->roworiented = 1;
1156     MatSetOption(a->A,op);
1157     MatSetOption(a->B,op);
1158   } else if (op == MAT_ROWS_SORTED ||
1159              op == MAT_ROWS_UNSORTED ||
1160              op == MAT_SYMMETRIC ||
1161              op == MAT_STRUCTURALLY_SYMMETRIC ||
1162              op == MAT_YES_NEW_DIAGONALS)
1163     PLogInfo(A,"MatSetOption_MPIAIJ:Option ignored\n");
1164   else if (op == MAT_COLUMN_ORIENTED) {
1165     a->roworiented = 0;
1166     MatSetOption(a->A,op);
1167     MatSetOption(a->B,op);
1168   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1169     a->donotstash = 1;
1170   } else if (op == MAT_NO_NEW_DIAGONALS){
1171     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1172   } else {
1173     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1174   }
1175   PetscFunctionReturn(0);
1176 }
1177 
1178 #undef __FUNC__
1179 #define __FUNC__ "MatGetSize_MPIAIJ"
1180 int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1181 {
1182   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1183 
1184   PetscFunctionBegin;
1185   if (m) *m = mat->M;
1186   if (n) *n = mat->N;
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 #undef __FUNC__
1191 #define __FUNC__ "MatGetLocalSize_MPIAIJ"
1192 int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1193 {
1194   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1195 
1196   PetscFunctionBegin;
1197   if (m) *m = mat->m;
1198   if (n) *n = mat->n;
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 #undef __FUNC__
1203 #define __FUNC__ "MatGetOwnershipRange_MPIAIJ"
1204 int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1205 {
1206   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1207 
1208   PetscFunctionBegin;
1209   *m = mat->rstart; *n = mat->rend;
1210   PetscFunctionReturn(0);
1211 }
1212 
1213 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1214 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1215 
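/*
   MatGetRow_MPIAIJ - merges the requested local row of the A and B blocks
   into a single work array (allocated once, sized for the longest local
   row), translating B's column indices to global numbering through garray so
   that the result is sorted by increasing global column.
*/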
1216 #undef __FUNC__
1217 #define __FUNC__ "MatGetRow_MPIAIJ"
1218 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1219 {
1220   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1221   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1222   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1223   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1224   int        *cmap, *idx_p;
1225 
1226   PetscFunctionBegin;
1227   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
1228   mat->getrowactive = PETSC_TRUE;
1229 
1230   if (!mat->rowvalues && (idx || v)) {
1231     /*
1232         allocate enough space to hold information from the longest row.
1233     */
1234     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1235     int     max = 1,m = mat->m,tmp;
1236     for ( i=0; i<m; i++ ) {
1237       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1238       if (max < tmp) { max = tmp; }
1239     }
1240     mat->rowvalues  = (Scalar *) PetscMalloc(max*(sizeof(int)+sizeof(Scalar)));CHKPTRQ(mat->rowvalues);
1241     mat->rowindices = (int *) (mat->rowvalues + max);
1242   }
1243 
1244   if (row < rstart || row >= rend) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Only local rows");
1245   lrow = row - rstart;
1246 
1247   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1248   if (!v)   {pvA = 0; pvB = 0;}
1249   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1250   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1251   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1252   nztot = nzA + nzB;
1253 
1254   cmap  = mat->garray;
1255   if (v  || idx) {
1256     if (nztot) {
1257       /* Sort by increasing column numbers, assuming A and B already sorted */
1258       int imark = -1;
1259       if (v) {
1260         *v = v_p = mat->rowvalues;
1261         for ( i=0; i<nzB; i++ ) {
1262           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1263           else break;
1264         }
1265         imark = i;
1266         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1267         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1268       }
1269       if (idx) {
1270         *idx = idx_p = mat->rowindices;
1271         if (imark > -1) {
1272           for ( i=0; i<imark; i++ ) {
1273             idx_p[i] = cmap[cworkB[i]];
1274           }
1275         } else {
1276           for ( i=0; i<nzB; i++ ) {
1277             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1278             else break;
1279           }
1280           imark = i;
1281         }
1282         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1283         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1284       }
1285     } else {
1286       if (idx) *idx = 0;
1287       if (v)   *v   = 0;
1288     }
1289   }
1290   *nz = nztot;
1291   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1292   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1293   PetscFunctionReturn(0);
1294 }
1295 
1296 #undef __FUNC__
1297 #define __FUNC__ "MatRestoreRow_MPIAIJ"
1298 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1299 {
1300   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1301 
1302   PetscFunctionBegin;
1303   if (aij->getrowactive == PETSC_FALSE) {
1304     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1305   }
1306   aij->getrowactive = PETSC_FALSE;
1307   PetscFunctionReturn(0);
1308 }
1309 
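/*
   MatNorm_MPIAIJ - the Frobenius, one, and infinity norms are formed from
   the two local blocks and combined across processors with MPI_Allreduce();
   the two norm is not supported.
*/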
1310 #undef __FUNC__
1311 #define __FUNC__ "MatNorm_MPIAIJ"
1312 int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1313 {
1314   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1315   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1316   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1317   double     sum = 0.0;
1318   Scalar     *v;
1319 
1320   PetscFunctionBegin;
1321   if (aij->size == 1) {
1322     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1323   } else {
1324     if (type == NORM_FROBENIUS) {
1325       v = amat->a;
1326       for (i=0; i<amat->nz; i++ ) {
1327 #if defined(USE_PETSC_COMPLEX)
1328         sum += PetscReal(PetscConj(*v)*(*v)); v++;
1329 #else
1330         sum += (*v)*(*v); v++;
1331 #endif
1332       }
1333       v = bmat->a;
1334       for (i=0; i<bmat->nz; i++ ) {
1335 #if defined(USE_PETSC_COMPLEX)
1336         sum += PetscReal(PetscConj(*v)*(*v)); v++;
1337 #else
1338         sum += (*v)*(*v); v++;
1339 #endif
1340       }
1341       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1342       *norm = sqrt(*norm);
1343     } else if (type == NORM_1) { /* max column norm */
1344       double *tmp, *tmp2;
1345       int    *jj, *garray = aij->garray;
1346       tmp  = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp);
1347       tmp2 = (double *) PetscMalloc( (aij->N+1)*sizeof(double) ); CHKPTRQ(tmp2);
1348       PetscMemzero(tmp,aij->N*sizeof(double));
1349       *norm = 0.0;
1350       v = amat->a; jj = amat->j;
1351       for ( j=0; j<amat->nz; j++ ) {
1352         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1353       }
1354       v = bmat->a; jj = bmat->j;
1355       for ( j=0; j<bmat->nz; j++ ) {
1356         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1357       }
1358       ierr = MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
1359       for ( j=0; j<aij->N; j++ ) {
1360         if (tmp2[j] > *norm) *norm = tmp2[j];
1361       }
1362       PetscFree(tmp); PetscFree(tmp2);
1363     } else if (type == NORM_INFINITY) { /* max row norm */
1364       double ntemp = 0.0;
1365       for ( j=0; j<amat->m; j++ ) {
1366         v = amat->a + amat->i[j] + shift;
1367         sum = 0.0;
1368         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1369           sum += PetscAbsScalar(*v); v++;
1370         }
1371         v = bmat->a + bmat->i[j] + shift;
1372         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1373           sum += PetscAbsScalar(*v); v++;
1374         }
1375         if (sum > ntemp) ntemp = sum;
1376       }
1377       ierr = MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);CHKERRQ(ierr);
1378     } else {
1379       SETERRQ(PETSC_ERR_SUP,0,"No support for two norm");
1380     }
1381   }
1382   PetscFunctionReturn(0);
1383 }
1384 
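/*
   MatTranspose_MPIAIJ - builds the transpose by inserting the rows of the
   local A and B blocks as columns of a new parallel matrix; the "in-place"
   variant replaces the guts of A with the new matrix rather than truly
   transposing in place.
*/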
1385 #undef __FUNC__
1386 #define __FUNC__ "MatTranspose_MPIAIJ"
1387 int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1388 {
1389   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1390   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1391   int        ierr,shift = Aloc->indexshift;
1392   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1393   Mat        B;
1394   Scalar     *array;
1395 
1396   PetscFunctionBegin;
1397   if (matout == PETSC_NULL && M != N) {
1398     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1399   }
1400 
1401   ierr = MatCreateMPIAIJ(A->comm,a->n,a->m,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);CHKERRQ(ierr);
1402 
1403   /* copy over the A part */
1404   Aloc = (Mat_SeqAIJ*) a->A->data;
1405   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1406   row = a->rstart;
1407   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1408   for ( i=0; i<m; i++ ) {
1409     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1410     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1411   }
1412   aj = Aloc->j;
1413   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1414 
1415   /* copy over the B part */
1416   Aloc = (Mat_SeqAIJ*) a->B->data;
1417   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1418   row = a->rstart;
1419   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1420   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1421   for ( i=0; i<m; i++ ) {
1422     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1423     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1424   }
1425   PetscFree(ct);
1426   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1427   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1428   if (matout != PETSC_NULL) {
1429     *matout = B;
1430   } else {
1431     PetscOps       *Abops;
1432     struct _MatOps *Aops;
1433 
1434     /* This isn't really an in-place transpose ... but we free the data structures from a */
1435     PetscFree(a->rowners);
1436     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1437     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1438 #if defined (USE_CTABLE)
1439     if (a->colmap) TableDelete(a->colmap);
1440 #else
1441     if (a->colmap) PetscFree(a->colmap);
1442 #endif
1443     if (a->garray) PetscFree(a->garray);
1444     if (a->lvec) VecDestroy(a->lvec);
1445     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1446     PetscFree(a);
1447 
1448     /*
1449        This is horrible, horrible code. We need to keep the
1450       A pointers for the bops and ops but copy everything
1451       else from B.
1452     */
1453     Abops = A->bops;
1454     Aops  = A->ops;
1455     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1456     A->bops = Abops;
1457     A->ops  = Aops;
1458     PetscHeaderDestroy(B);
1459   }
1460   PetscFunctionReturn(0);
1461 }
1462 
1463 #undef __FUNC__
1464 #define __FUNC__ "MatDiagonalScale_MPIAIJ"
1465 int MatDiagonalScale_MPIAIJ(Mat mat,Vec ll,Vec rr)
1466 {
1467   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1468   Mat a = aij->A, b = aij->B;
1469   int ierr,s1,s2,s3;
1470 
1471   PetscFunctionBegin;
1472   ierr = MatGetLocalSize(mat,&s2,&s3); CHKERRQ(ierr);
1473   if (rr) {
1474     ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
1475     if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,0,"right vector non-conforming local size");
1476     /* Overlap communication with computation. */
1477     ierr = VecScatterBegin(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1478   }
1479   if (ll) {
1480     ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
1481     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"left vector non-conforming local size");
1482     ierr = (*b->ops->diagonalscale)(b,ll,0); CHKERRQ(ierr);
1483   }
1484   /* scale  the diagonal block */
1485   ierr = (*a->ops->diagonalscale)(a,ll,rr); CHKERRQ(ierr);
1486 
1487   if (rr) {
1488     /* Do a scatter end and then right scale the off-diagonal block */
1489     ierr = VecScatterEnd(rr,aij->lvec,INSERT_VALUES,SCATTER_FORWARD,aij->Mvctx); CHKERRQ(ierr);
1490     ierr = (*b->ops->diagonalscale)(b,0,aij->lvec); CHKERRQ(ierr);
1491   }
1492 
1493   PetscFunctionReturn(0);
1494 }
1495 
1496 
1497 extern int MatPrintHelp_SeqAIJ(Mat);
1498 #undef __FUNC__
1499 #define __FUNC__ "MatPrintHelp_MPIAIJ"
1500 int MatPrintHelp_MPIAIJ(Mat A)
1501 {
1502   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1503   int        ierr;
1504 
1505   PetscFunctionBegin;
1506   if (!a->rank) {
1507     ierr = MatPrintHelp_SeqAIJ(a->A);CHKERRQ(ierr);
1508   }
1509   PetscFunctionReturn(0);
1510 }
1511 
1512 #undef __FUNC__
1513 #define __FUNC__ "MatGetBlockSize_MPIAIJ"
1514 int MatGetBlockSize_MPIAIJ(Mat A,int *bs)
1515 {
1516   PetscFunctionBegin;
1517   *bs = 1;
1518   PetscFunctionReturn(0);
1519 }
1520 #undef __FUNC__
1521 #define __FUNC__ "MatSetUnfactored_MPIAIJ"
1522 int MatSetUnfactored_MPIAIJ(Mat A)
1523 {
1524   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1525   int        ierr;
1526 
1527   PetscFunctionBegin;
1528   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1529   PetscFunctionReturn(0);
1530 }
1531 
1532 #undef __FUNC__
1533 #define __FUNC__ "MatEqual_MPIAIJ"
1534 int MatEqual_MPIAIJ(Mat A, Mat B, PetscTruth *flag)
1535 {
1536   Mat_MPIAIJ *matB = (Mat_MPIAIJ *) B->data,*matA = (Mat_MPIAIJ *) A->data;
1537   Mat        a, b, c, d;
1538   PetscTruth flg;
1539   int        ierr;
1540 
1541   PetscFunctionBegin;
1542   if (B->type != MATMPIAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1543   a = matA->A; b = matA->B;
1544   c = matB->A; d = matB->B;
1545 
1546   ierr = MatEqual(a, c, &flg); CHKERRQ(ierr);
1547   if (flg == PETSC_TRUE) {
1548     ierr = MatEqual(b, d, &flg); CHKERRQ(ierr);
1549   }
1550   ierr = MPI_Allreduce(&flg, flag, 1, MPI_INT, MPI_LAND, A->comm);CHKERRQ(ierr);
1551   PetscFunctionReturn(0);
1552 }
1553 
1554 #undef __FUNC__
1555 #define __FUNC__ "MatCopy_MPIAIJ"
1556 int MatCopy_MPIAIJ(Mat A,Mat B,MatStructure str)
1557 {
1558   int        ierr;
1559   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1560   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
1561 
1562   PetscFunctionBegin;
1563   if (str != SAME_NONZERO_PATTERN || B->type != MATMPIAIJ) {
1564     /* because of the column compression in the off-processor part of the matrix a->B,
1565        the number of columns in a->B and b->B may be different, hence we cannot call
1566        the MatCopy() directly on the two parts. If need be, we can provide a more
1567        efficient copy than the MatCopy_Basic() by first uncompressing the a->B matrices
1568        then copying the submatrices */
1569     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1570   } else {
1571     ierr = MatCopy(a->A,b->A,str);CHKERRQ(ierr);
1572     ierr = MatCopy(a->B,b->B,str);CHKERRQ(ierr);
1573   }
1574   PetscFunctionReturn(0);
1575 }
1576 
1577 extern int MatDuplicate_MPIAIJ(Mat,MatDuplicateOption,Mat *);
1578 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1579 extern int MatFDColoringCreate_MPIAIJ(Mat,ISColoring,MatFDColoring);
1580 extern int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1581 extern int MatGetSubMatrix_MPIAIJ (Mat ,IS,IS,int,MatGetSubMatrixCall,Mat *);
1582 
1583 /* -------------------------------------------------------------------*/
1584 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
1585        MatGetRow_MPIAIJ,
1586        MatRestoreRow_MPIAIJ,
1587        MatMult_MPIAIJ,
1588        MatMultAdd_MPIAIJ,
1589        MatMultTrans_MPIAIJ,
1590        MatMultTransAdd_MPIAIJ,
1591        0,
1592        0,
1593        0,
1594        0,
1595        0,
1596        0,
1597        MatRelax_MPIAIJ,
1598        MatTranspose_MPIAIJ,
1599        MatGetInfo_MPIAIJ,
1600        MatEqual_MPIAIJ,
1601        MatGetDiagonal_MPIAIJ,
1602        MatDiagonalScale_MPIAIJ,
1603        MatNorm_MPIAIJ,
1604        MatAssemblyBegin_MPIAIJ,
1605        MatAssemblyEnd_MPIAIJ,
1606        0,
1607        MatSetOption_MPIAIJ,
1608        MatZeroEntries_MPIAIJ,
1609        MatZeroRows_MPIAIJ,
1610        0,
1611        0,
1612        0,
1613        0,
1614        MatGetSize_MPIAIJ,
1615        MatGetLocalSize_MPIAIJ,
1616        MatGetOwnershipRange_MPIAIJ,
1617        0,
1618        0,
1619        0,
1620        0,
1621        MatDuplicate_MPIAIJ,
1622        0,
1623        0,
1624        0,
1625        0,
1626        0,
1627        MatGetSubMatrices_MPIAIJ,
1628        MatIncreaseOverlap_MPIAIJ,
1629        MatGetValues_MPIAIJ,
1630        MatCopy_MPIAIJ,
1631        MatPrintHelp_MPIAIJ,
1632        MatScale_MPIAIJ,
1633        0,
1634        0,
1635        0,
1636        MatGetBlockSize_MPIAIJ,
1637        0,
1638        0,
1639        0,
1640        0,
1641        MatFDColoringCreate_MPIAIJ,
1642        0,
1643        MatSetUnfactored_MPIAIJ,
1644        0,
1645        0,
1646        MatGetSubMatrix_MPIAIJ,
1647        0,
1648        0,
1649        MatGetMaps_Petsc};
1650 
1651 /* ----------------------------------------------------------------------------------------*/
1652 
1653 EXTERN_C_BEGIN
1654 #undef __FUNC__
1655 #define __FUNC__ "MatStoreValues_MPIAIJ"
1656 int MatStoreValues_MPIAIJ(Mat mat)
1657 {
1658   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1659   int        ierr;
1660 
1661   PetscFunctionBegin;
1662   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
1663   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
1664   PetscFunctionReturn(0);
1665 }
1666 EXTERN_C_END
1667 
1668 EXTERN_C_BEGIN
1669 #undef __FUNC__
1670 #define __FUNC__ "MatRetrieveValues_MPIAIJ"
1671 int MatRetrieveValues_MPIAIJ(Mat mat)
1672 {
1673   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1674   int        ierr;
1675 
1676   PetscFunctionBegin;
1677   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
1678   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
1679   PetscFunctionReturn(0);
1680 }
1681 EXTERN_C_END
1682 
1683 #include "pc.h"
1684 EXTERN_C_BEGIN
1685 extern int PCSetUp_BJacobi_AIJ(PC);
1686 EXTERN_C_END
1687 
1688 #undef __FUNC__
1689 #define __FUNC__ "MatCreateMPIAIJ"
1690 /*@C
1691    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1692    (the default parallel PETSc format).  For good matrix assembly performance
1693    the user should preallocate the matrix storage by setting the parameters
1694    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1695    performance can be increased by more than a factor of 50.
1696 
1697    Collective on MPI_Comm
1698 
1699    Input Parameters:
1700 +  comm - MPI communicator
1701 .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given).
1702            This value should be the same as the local size used in creating the
1703            y vector for the matrix-vector product y = Ax.
1704 .  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given).
1705        This value should be the same as the local size used in creating the
1706        x vector for the matrix-vector product y = Ax. For square matrices n is almost always m.
1707 .  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
1708 .  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
1709 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
1710            (same value is used for all local rows)
1711 .  d_nnz - array containing the number of nonzeros in the various rows of the
1712            DIAGONAL portion of the local submatrix (possibly different for each row)
1713            or PETSC_NULL, if d_nz is used to specify the nonzero structure.
1714            The size of this array is equal to the number of local rows, i.e., 'm'.
1715            You must leave room for the diagonal entry even if it is zero.
1716 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
1717            submatrix (same value is used for all local rows).
1718 -  o_nnz - array containing the number of nonzeros in the various rows of the
1719            OFF-DIAGONAL portion of the local submatrix (possibly different for
1720            each row) or PETSC_NULL, if o_nz is used to specify the nonzero
1721            structure. The size of this array is equal to the number
1722            of local rows, i.e., 'm'.
1723 
1724    Output Parameter:
1725 .  A - the matrix
1726 
1727    Notes:
1728    m,n,M,N parameters specify the size of the matrix, and its partitioning across
1729    processors, while d_nz,d_nnz,o_nz,o_nnz parameters specify the approximate
1730    storage requirements for this matrix.
1731 
1732    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one
1733    processor, then it must be used on all processors that share the object for
1734    that argument.
1735 
1736    The AIJ format (also called the Yale sparse matrix format or
1737    compressed row storage), is fully compatible with standard Fortran 77
1738    compressed row storage) is fully compatible with standard Fortran 77
1739    either one (as in Fortran) or zero.  See the users manual for details.
1740 
1741    The user MUST specify either the local or global matrix dimensions
1742    (possibly both).
1743 
1744    The parallel matrix is partitioned such that the first m0 rows belong to
1745    process 0, the next m1 rows belong to process 1, the next m2 rows belong
1746    to process 2, etc., where m0,m1,m2,... are the values of the input parameter 'm' on each process.
1747 
1748    The DIAGONAL portion of the local submatrix of a processor can be defined
1749    as the submatrix which is obtained by extracting the part corresponding
1750    to the rows r1-r2 and columns r1-r2 of the global matrix, where r1 is the
1751    first row that belongs to the processor, and r2 is the last row belonging
1752    to this processor. This is a square mxm matrix. The remaining portion
1753    of the local submatrix (mxN) constitutes the OFF-DIAGONAL portion.
1754 
1755    If o_nnz, d_nnz are specified, then o_nz, and d_nz are ignored.
1756    If d_nnz and o_nnz are specified, then d_nz and o_nz are ignored.
1757    By default, this format uses inodes (identical nodes) when possible.
1758    We search for consecutive rows with the same nonzero structure, thereby
1759    reusing matrix information to achieve increased efficiency.
1760 
1761    Options Database Keys:
1762 +  -mat_aij_no_inode  - Do not use inodes
1763 .  -mat_aij_inode_limit <limit> - Sets inode limit (max limit=5)
1764 .  -mat_aij_oneindex - Internally use indexing starting at 1
1765         rather than 0.  Note that when calling MatSetValues(),
1766         the user still MUST index entries starting at 0!
1767 -  -mat_mpi - use the parallel matrix data structures even on one processor
1768                (defaults to using SeqAIJ format on one processor)
1771 
1772 
1773    Example usage:
1774 
1775    Consider the following 8x8 matrix with 34 non-zero values, that is
1776    assembled across 3 processors. Lets assume that proc0 owns 3 rows,
1777    assembled across 3 processors. Let's assume that proc0 owns 3 rows,
1778    as follows:
1779 
1780 .vb
1781             1  2  0  |  0  3  0  |  0  4
1782     Proc0   0  5  6  |  7  0  0  |  8  0
1783             9  0 10  | 11  0  0  | 12  0
1784     -------------------------------------
1785            13  0 14  | 15 16 17  |  0  0
1786     Proc1   0 18  0  | 19 20 21  |  0  0
1787             0  0  0  | 22 23  0  | 24  0
1788     -------------------------------------
1789     Proc2  25 26 27  |  0  0 28  | 29  0
1790            30  0  0  | 31 32 33  |  0 34
1791 .ve
1792 
1793    This can be represented as a collection of submatrices as:
1794 
1795 .vb
1796       A B C
1797       D E F
1798       G H I
1799 .ve
1800 
1801    Where the submatrices A,B,C are owned by proc0, D,E,F are
1802    owned by proc1, G,H,I are owned by proc2.
1803 
1804    The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
1805    The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
1806    The 'M','N' parameters are 8,8, and have the same values on all procs.
1807 
1808    The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
1809    submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
1810    corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
1811    Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
1812    part as SeqAIJ matrices. For example, proc1 will store [E] as a SeqAIJ
1813    matrix, and [DF] as another SeqAIJ matrix.
1814 
1815    When d_nz, o_nz parameters are specified, d_nz storage elements are
1816    allocated for every row of the local diagonal submatrix, and o_nz
1817    storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
1818    One way to choose d_nz and o_nz is to use the maximum number of nonzeros per
1819    local row in the local DIAGONAL and OFF-DIAGONAL submatrices, respectively.
1820    In this case, the values of d_nz,o_nz are:
1821 .vb
1822      proc0 : d_nz = 2, o_nz = 2
1823      proc1 : d_nz = 3, o_nz = 2
1824      proc2 : d_nz = 1, o_nz = 4
1825 .ve
1826    We are allocating m*(d_nz+o_nz) storage locations for every proc. This
1827    translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
1828    for proc2, i.e., we are using 12+15+10=37 storage locations to store
1829    34 values.
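
   As a rough sketch (the names A and ierr are illustrative and assumed to be
   declared elsewhere; m here is that processor's number of local rows and
   columns), the call with these scalar preallocation values would look like:
.vb
     ierr = MatCreateMPIAIJ(PETSC_COMM_WORLD,m,m,8,8,d_nz,PETSC_NULL,o_nz,PETSC_NULL,&A);
.ve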
1830 
1831    When d_nnz, o_nnz parameters are specified, the storage is specified
1832    for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
1833    In the above case the values for d_nnz,o_nnz are:
1834 .vb
1835      proc0: d_nnz = [2,2,2] and o_nnz = [2,2,2]
1836      proc1: d_nnz = [3,3,2] and o_nnz = [2,1,1]
1837      proc2: d_nnz = [1,1]   and o_nnz = [4,4]
1838 .ve
1839    Here the space allocated is the sum of all of the above values, i.e., 34, and
1840    hence the preallocation is perfect.
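
   For instance, a sketch of the corresponding call on proc1 (again, A and ierr
   are assumed to be declared elsewhere) would be:
.vb
     int d_nnz[] = {3,3,2}, o_nnz[] = {2,1,1};
     ierr = MatCreateMPIAIJ(PETSC_COMM_WORLD,3,3,8,8,0,d_nnz,0,o_nnz,&A);
.ve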
1841 
1842 .keywords: matrix, aij, compressed row, sparse, parallel
1843 
1844 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1845 @*/
1846 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1847 {
1848   Mat          B;
1849   Mat_MPIAIJ   *b;
1850   int          ierr, i,sum[2],work[2],size,flag1 = 0, flag2 = 0;
1851 
1852   PetscFunctionBegin;
1853   MPI_Comm_size(comm,&size);
1854   ierr = OptionsHasName(PETSC_NULL,"-mat_mpiaij",&flag1); CHKERRQ(ierr);
1855   ierr = OptionsHasName(PETSC_NULL,"-mat_mpi",&flag2); CHKERRQ(ierr);
1856   if (!flag1 && !flag2 && size == 1) {
1857     if (M == PETSC_DECIDE) M = m;
1858     if (N == PETSC_DECIDE) N = n;
1859     ierr = MatCreateSeqAIJ(comm,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
1860     PetscFunctionReturn(0);
1861   }
1862 
1863   *A = 0;
1864   PetscHeaderCreate(B,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIAIJ,"Mat",comm,MatDestroy,MatView);
1865   PLogObjectCreate(B);
1866   B->data            = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1867   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1868   PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));
1869   B->ops->destroy    = MatDestroy_MPIAIJ;
1870   B->ops->view       = MatView_MPIAIJ;
1871   B->factor          = 0;
1872   B->assembled       = PETSC_FALSE;
1873   B->mapping         = 0;
1874 
1875   B->insertmode      = NOT_SET_VALUES;
1876   b->size            = size;
1877   MPI_Comm_rank(comm,&b->rank);
1878 
1879   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
1880     SETERRQ(PETSC_ERR_ARG_WRONG,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1881   }
1882 
1883   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1884     work[0] = m; work[1] = n;
1885     ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
1886     if (M == PETSC_DECIDE) M = sum[0];
1887     if (N == PETSC_DECIDE) N = sum[1];
1888   }
1889   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1890   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1891   b->m = m; B->m = m;
1892   b->n = n; B->n = n;
1893   b->N = N; B->N = N;
1894   b->M = M; B->M = M;
1895 
1896   /* the information in the maps duplicates the information computed below, eventually
1897      we should remove the duplicate information that is not contained in the maps */
1898   ierr = MapCreateMPI(comm,m,M,&B->rmap);CHKERRQ(ierr);
1899   ierr = MapCreateMPI(comm,n,N,&B->cmap);CHKERRQ(ierr);
1900 
1901   /* build local table of row and column ownerships */
1902   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1903   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1904   b->cowners = b->rowners + b->size + 2;
1905   ierr = MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1906   b->rowners[0] = 0;
1907   for ( i=2; i<=b->size; i++ ) {
1908     b->rowners[i] += b->rowners[i-1];
1909   }
1910   b->rstart = b->rowners[b->rank];
1911   b->rend   = b->rowners[b->rank+1];
1912   ierr = MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
1913   b->cowners[0] = 0;
1914   for ( i=2; i<=b->size; i++ ) {
1915     b->cowners[i] += b->cowners[i-1];
1916   }
1917   b->cstart = b->cowners[b->rank];
1918   b->cend   = b->cowners[b->rank+1];
1919 
1920   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1921   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1922   PLogObjectParent(B,b->A);
1923   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1924   ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1925   PLogObjectParent(B,b->B);
1926 
1927   /* build cache for off array entries formed */
1928   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1929   b->donotstash  = 0;
1930   b->colmap      = 0;
1931   b->garray      = 0;
1932   b->roworiented = 1;
1933 
1934   /* stuff used for matrix vector multiply */
1935   b->lvec      = 0;
1936   b->Mvctx     = 0;
1937 
1938   /* stuff for MatGetRow() */
1939   b->rowindices   = 0;
1940   b->rowvalues    = 0;
1941   b->getrowactive = PETSC_FALSE;
1942 
1943   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",
1944                                      "MatStoreValues_MPIAIJ",
1945                                      (void*)MatStoreValues_MPIAIJ);CHKERRQ(ierr);
1946   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",
1947                                      "MatRetrieveValues_MPIAIJ",
1948                                      (void*)MatRetrieveValues_MPIAIJ);CHKERRQ(ierr);
1949   ierr = PetscObjectComposeFunction((PetscObject)B,"PCSetUp_BJacobi_C",
1950                                      "PCSetUp_BJacobi_AIJ",
1951                                      (void*)PCSetUp_BJacobi_AIJ);CHKERRQ(ierr);
1952   *A = B;
1953   PetscFunctionReturn(0);
1954 }
1955 
1956 #undef __FUNC__
1957 #define __FUNC__ "MatDuplicate_MPIAIJ"
1958 int MatDuplicate_MPIAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1959 {
1960   Mat        mat;
1961   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1962   int        ierr, len=0, flg;
1963 
1964   PetscFunctionBegin;
1965   *newmat       = 0;
1966   PetscHeaderCreate(mat,_p_Mat,struct _MatOps,MAT_COOKIE,MATMPIAIJ,"Mat",matin->comm,MatDestroy,MatView);
1967   PLogObjectCreate(mat);
1968   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1969   PetscMemcpy(mat->ops,&MatOps_Values,sizeof(struct _MatOps));
1970   mat->ops->destroy    = MatDestroy_MPIAIJ;
1971   mat->ops->view       = MatView_MPIAIJ;
1972   mat->factor     = matin->factor;
1973   mat->assembled  = PETSC_TRUE;
1974 
1975   a->m = mat->m   = oldmat->m;
1976   a->n = mat->n   = oldmat->n;
1977   a->M = mat->M   = oldmat->M;
1978   a->N = mat->N   = oldmat->N;
1979 
1980   a->rstart       = oldmat->rstart;
1981   a->rend         = oldmat->rend;
1982   a->cstart       = oldmat->cstart;
1983   a->cend         = oldmat->cend;
1984   a->size         = oldmat->size;
1985   a->rank         = oldmat->rank;
1986   mat->insertmode = NOT_SET_VALUES;
1987   a->rowvalues    = 0;
1988   a->getrowactive = PETSC_FALSE;
1989 
1990   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1991   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIAIJ));
1992   a->cowners = a->rowners + a->size + 2;
1993   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1994   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1995   if (oldmat->colmap) {
1996 #if defined (USE_CTABLE)
1997     ierr = TableCreateCopy( &a->colmap, oldmat->colmap ); CHKERRQ(ierr);
1998 #else
1999     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
2000     PLogObjectMemory(mat,(a->N)*sizeof(int));
2001     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
2002 #endif
2003   } else a->colmap = 0;
2004   if (oldmat->garray) {
2005     len = ((Mat_SeqAIJ *) (oldmat->B->data))->n;
2006     a->garray = (int *) PetscMalloc((len+1)*sizeof(int)); CHKPTRQ(a->garray);
2007     PLogObjectMemory(mat,len*sizeof(int));
2008     if (len) PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
2009   } else a->garray = 0;
2010 
2011   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
2012   PLogObjectParent(mat,a->lvec);
2013   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
2014   PLogObjectParent(mat,a->Mvctx);
2015   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A); CHKERRQ(ierr);
2016   PLogObjectParent(mat,a->A);
2017   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B); CHKERRQ(ierr);
2018   PLogObjectParent(mat,a->B);
2019   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
2020   if (flg) {
2021     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
2022   }
2023   ierr = FListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr);
2024   *newmat = mat;
2025   PetscFunctionReturn(0);
2026 }
2027 
2028 #include "sys.h"
2029 
2030 #undef __FUNC__
2031 #define __FUNC__ "MatLoad_MPIAIJ"
2032 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
2033 {
2034   Mat          A;
2035   Scalar       *vals,*svals;
2036   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2037   MPI_Status   status;
2038   int          i, nz, ierr, j,rstart, rend, fd;
2039   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
2040   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
2041   int          tag = ((PetscObject)viewer)->tag,cend,cstart,n;
2042 
2043   PetscFunctionBegin;
2044   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
2045   if (!rank) {
2046     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2047     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
2048     if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
2049     if (header[3] < 0) {
2050       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix in special format on disk, cannot load as MPIAIJ");
2051     }
2052   }
2053 
2054   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2055   M = header[1]; N = header[2];
2056   /* determine ownership of all rows */
2057   m = M/size + ((M % size) > rank);
2058   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
2059   ierr = MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2060   rowners[0] = 0;
2061   for ( i=2; i<=size; i++ ) {
2062     rowners[i] += rowners[i-1];
2063   }
2064   rstart = rowners[rank];
2065   rend   = rowners[rank+1];
2066 
2067   /* distribute row lengths to all processors */
2068   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
2069   offlens = ourlens + (rend-rstart);
2070   if (!rank) {
2071     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
2072     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2073     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
2074     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
2075     ierr = MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);CHKERRQ(ierr);
2076     PetscFree(sndcounts);
2077   } else {
2078     ierr = MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);CHKERRQ(ierr);
2079   }
2080 
2081   if (!rank) {
2082     /* calculate the number of nonzeros on each processor */
2083     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
2084     PetscMemzero(procsnz,size*sizeof(int));
2085     for ( i=0; i<size; i++ ) {
2086       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
2087         procsnz[i] += rowlengths[j];
2088       }
2089     }
2090     PetscFree(rowlengths);
2091 
2092     /* determine max buffer needed and allocate it */
2093     maxnz = 0;
2094     for ( i=0; i<size; i++ ) {
2095       maxnz = PetscMax(maxnz,procsnz[i]);
2096     }
2097     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
2098 
2099     /* read in my part of the matrix column indices  */
2100     nz     = procsnz[0];
2101     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
2102     ierr   = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
2103 
2104     /* read in everyone else's and ship them off */
2105     for ( i=1; i<size; i++ ) {
2106       nz   = procsnz[i];
2107       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
2108       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2109     }
2110     PetscFree(cols);
2111   } else {
2112     /* determine buffer space needed for message */
2113     nz = 0;
2114     for ( i=0; i<m; i++ ) {
2115       nz += ourlens[i];
2116     }
2117     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
2118 
2119     /* receive message of column indices*/
2120     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2121     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2122     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2123   }
2124 
2125   /* determine column ownership if matrix is not square */
2126   if (N != M) {
2127     n      = N/size + ((N % size) > rank);
2128     ierr   = MPI_Scan(&n,&cend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
2129     cstart = cend - n;
2130   } else {
2131     cstart = rstart;
2132     cend   = rend;
2133     n      = cend - cstart;
2134   }
2135 
2136   /* loop over local rows, determining number of off diagonal entries */
2137   PetscMemzero(offlens,m*sizeof(int));
2138   jj = 0;
2139   for ( i=0; i<m; i++ ) {
2140     for ( j=0; j<ourlens[i]; j++ ) {
2141       if (mycols[jj] < cstart || mycols[jj] >= cend) offlens[i]++;
2142       jj++;
2143     }
2144   }
2145 
2146   /* create our matrix */
2147   for ( i=0; i<m; i++ ) {
2148     ourlens[i] -= offlens[i];
2149   }
2150   ierr = MatCreateMPIAIJ(comm,m,n,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
2151   A = *newmat;
2152   ierr = MatSetOption(A,MAT_COLUMNS_SORTED); CHKERRQ(ierr);
2153   for ( i=0; i<m; i++ ) {
2154     ourlens[i] += offlens[i];
2155   }
2156 
2157   if (!rank) {
2158     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
2159 
2160     /* read in my part of the matrix numerical values  */
2161     nz   = procsnz[0];
2162     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2163 
2164     /* insert into matrix */
2165     jj      = rstart;
2166     smycols = mycols;
2167     svals   = vals;
2168     for ( i=0; i<m; i++ ) {
2169       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
2170       smycols += ourlens[i];
2171       svals   += ourlens[i];
2172       jj++;
2173     }
2174 
2175     /* read in other processors and ship out */
2176     for ( i=1; i<size; i++ ) {
2177       nz   = procsnz[i];
2178       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
2179       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2180     }
2181     PetscFree(procsnz);
2182   } else {
2183     /* receive numeric values */
2184     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
2185 
2186     /* receive message of values*/
2187     ierr = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2188     ierr = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2189     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
2190 
2191     /* insert into matrix */
2192     jj      = rstart;
2193     smycols = mycols;
2194     svals   = vals;
2195     for ( i=0; i<m; i++ ) {
2196       ierr     = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
2197       smycols += ourlens[i];
2198       svals   += ourlens[i];
2199       jj++;
2200     }
2201   }
2202   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
2203 
2204   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2205   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2206   PetscFunctionReturn(0);
2207 }
2208 
2209 #undef __FUNC__
2210 #define __FUNC__ "MatGetSubMatrix_MPIAIJ"
2211 /*
2212     Not great since it makes two copies of the submatrix: first a local SeqAIJ copy,
2213   and then the end result obtained by concatenating the local matrices.
2214   Writing it directly would be much like MatGetSubMatrices_MPIAIJ().
2215 */
2216 int MatGetSubMatrix_MPIAIJ(Mat mat,IS isrow,IS iscol,int csize,MatGetSubMatrixCall call,Mat *newmat)
2217 {
2218   int        ierr, i, m,n,rstart,row,rend,nz,*cwork,size,rank,j;
2219   Mat        *local,M, Mreuse;
2220   Scalar     *vwork,*aa;
2221   MPI_Comm   comm = mat->comm;
2222   Mat_SeqAIJ *aij;
2223   int        *ii, *jj,nlocal,*dlens,*olens,dlen,olen,jend;
2224 
2225   PetscFunctionBegin;
2226   MPI_Comm_rank(comm,&rank);
2227   MPI_Comm_size(comm,&size);
2228 
2229   if (call ==  MAT_REUSE_MATRIX) {
2230     ierr = PetscObjectQuery((PetscObject)*newmat,"SubMatrix",(PetscObject *)&Mreuse);CHKERRQ(ierr);
2231     if (!Mreuse) SETERRQ(1,1,"Submatrix passed in was not used before, cannot reuse");
2232     local = &Mreuse;
2233     ierr  = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_REUSE_MATRIX,&local);CHKERRQ(ierr);
2234   } else {
2235     ierr = MatGetSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&local);CHKERRQ(ierr);
2236     Mreuse = *local;
2237     PetscFree(local);
2238   }
2239 
2240   /*
2241       m - number of local rows
2242       n - number of columns (same on all processors)
2243       rstart - first row in new global matrix generated
2244   */
2245   ierr = MatGetSize(Mreuse,&m,&n);CHKERRQ(ierr);
2246   if (call == MAT_INITIAL_MATRIX) {
2247     aij = (Mat_SeqAIJ *) (Mreuse)->data;
2248     if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,1,"No support for index shifted matrix");
2249     ii  = aij->i;
2250     jj  = aij->j;
2251 
2252     /*
2253         Determine the number of non-zeros in the diagonal and off-diagonal
2254         portions of the matrix in order to do correct preallocation
2255     */
2256 
2257     /* first get start and end of "diagonal" columns */
2258     if (csize == PETSC_DECIDE) {
2259       nlocal = n/size + ((n % size) > rank);
2260     } else {
2261       nlocal = csize;
2262     }
2263     ierr   = MPI_Scan(&nlocal,&rend,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
2264     rstart = rend - nlocal;
2265     if (rank == size - 1 && rend != n) {
2266       SETERRQ(1,1,"Local column sizes do not add up to total number of columns");
2267     }
2268 
2269     /* next, compute all the lengths */
2270     dlens = (int *) PetscMalloc( (2*m+1)*sizeof(int) );CHKPTRQ(dlens);
2271     olens = dlens + m;
2272     for ( i=0; i<m; i++ ) {
2273       jend = ii[i+1] - ii[i];
2274       olen = 0;
2275       dlen = 0;
2276       for ( j=0; j<jend; j++ ) {
2277         if ( *jj < rstart || *jj >= rend) olen++;
2278         else dlen++;
2279         jj++;
2280       }
2281       olens[i] = olen;
2282       dlens[i] = dlen;
2283     }
2284     ierr = MatCreateMPIAIJ(comm,m,nlocal,PETSC_DECIDE,n,0,dlens,0,olens,&M);CHKERRQ(ierr);
2285     PetscFree(dlens);
2286   } else {
2287     int ml,nl;
2288 
2289     M = *newmat;
2290     ierr = MatGetLocalSize(M,&ml,&nl);CHKERRQ(ierr);
2291     if (ml != m) SETERRQ(PETSC_ERR_ARG_SIZ,1,"Previous matrix must be same size/layout as request");
2292     ierr = MatZeroEntries(M);CHKERRQ(ierr);
2293     /*
2294          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
2295        rather than the slower MatSetValues().
2296     */
2297     M->was_assembled = PETSC_TRUE;
2298     M->assembled     = PETSC_FALSE;
2299   }
2300   ierr = MatGetOwnershipRange(M,&rstart,&rend); CHKERRQ(ierr);
2301   aij = (Mat_SeqAIJ *) (Mreuse)->data;
2302   if (aij->indexshift) SETERRQ(PETSC_ERR_SUP,1,"No support for index shifted matrix");
2303   ii  = aij->i;
2304   jj  = aij->j;
2305   aa  = aij->a;
2306   for (i=0; i<m; i++) {
2307     row   = rstart + i;
2308     nz    = ii[i+1] - ii[i];
2309     cwork = jj;     jj += nz;
2310     vwork = aa;     aa += nz;
2311     ierr = MatSetValues_MPIAIJ(M,1,&row,nz,cwork,vwork,INSERT_VALUES); CHKERRQ(ierr);
2312   }
2313 
2314   ierr = MatAssemblyBegin(M,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2315   ierr = MatAssemblyEnd(M,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2316   *newmat = M;
2317 
2318   /* save submatrix used in processor for next request */
2319   if (call ==  MAT_INITIAL_MATRIX) {
2320     ierr = PetscObjectCompose((PetscObject)M,"SubMatrix",(PetscObject)Mreuse);CHKERRQ(ierr);
2321     ierr = PetscObjectDereference((PetscObject)Mreuse);CHKERRQ(ierr);
2322   }
2323 
2324   PetscFunctionReturn(0);
2325 }
2326