xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 3d323bbd40742d99bafa2d05bcb8f2ce5d4cc8ea)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.144 1996/06/26 18:11:34 curfman Exp bsmith $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "src/vec/vecimpl.h"
7 #include "src/inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way since the
12 length of colmap equals the number of global columns.
13 */
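/*
   A small illustration (hypothetical values): if the off-diagonal block B on
   this process has three columns whose global indices are garray[] = {2,7,11}
   and indexshift is 0, the routine below produces
        colmap[2] = 0,  colmap[7] = 1,  colmap[11] = 2,
   with every other entry of colmap left 0 by the PetscMemzero(). Callers then
   recover the local column of a global column g as colmap[g] + shift.
*/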
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int        ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
39 static int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
42   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
43   Scalar     value;
44   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
45   int        cstart = aij->cstart, cend = aij->cend,row,col;
46   int        shift = C->indexshift,roworiented = aij->roworiented;
47 
48   if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
49     SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
50   }
51   aij->insertmode = addv;
52   for ( i=0; i<m; i++ ) {
53     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
54     if (im[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
55     if (im[i] >= rstart && im[i] < rend) {
56       row = im[i] - rstart;
57       for ( j=0; j<n; j++ ) {
58         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
59         if (in[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
60         if (in[j] >= cstart && in[j] < cend){
61           col = in[j] - cstart;
62           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
63           ierr = MatSetValues(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
64         }
65         else {
66           if (mat->was_assembled) {
67             if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
68             col = aij->colmap[in[j]] + shift;
69             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
70               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
71               col =  in[j];
72             }
73           }
74           else col = in[j];
75           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
76           ierr = MatSetValues(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
77         }
78       }
79     }
80     else {
81       if (roworiented) {
82         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
83       }
84       else {
85         row = im[i];
86         for ( j=0; j<n; j++ ) {
87           ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
88         }
89       }
90     }
91   }
92   return 0;
93 }
94 
95 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
96 {
97   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
98   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
99   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
100   int        cstart = aij->cstart, cend = aij->cend,row,col;
101   int        shift = C->indexshift;
102 
103   for ( i=0; i<m; i++ ) {
104     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
105     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
106     if (idxm[i] >= rstart && idxm[i] < rend) {
107       row = idxm[i] - rstart;
108       for ( j=0; j<n; j++ ) {
109         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
110         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
111         if (idxn[j] >= cstart && idxn[j] < cend){
112           col = idxn[j] - cstart;
113           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
114         }
115         else {
116           col = aij->colmap[idxn[j]] + shift;
117           ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
118         }
119       }
120     }
121     else {
122       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
123     }
124   }
125   return 0;
126 }
127 
128 static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
129 {
130   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
131   MPI_Comm    comm = mat->comm;
132   int         size = aij->size, *owners = aij->rowners;
133   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
134   MPI_Request *send_waits,*recv_waits;
135   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
136   InsertMode  addv;
137   Scalar      *rvalues,*svalues;
138 
139   /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
140   MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
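  /* The bitwise-OR reduction works because processors that set no values
     contribute NOT_SET_VALUES, which does not change the OR, while a mix of
     INSERT_VALUES and ADD_VALUES produces their OR, caught by the test below.
     (This relies on the layout of the InsertMode enumeration.) */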
141   if (addv == (ADD_VALUES|INSERT_VALUES)) {
142     SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
143   }
144   aij->insertmode = addv; /* in case this processor had no cache */
145 
146   /*  first count number of contributors to each processor */
147   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
148   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
149   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
150   for ( i=0; i<aij->stash.n; i++ ) {
151     idx = aij->stash.idx[i];
152     for ( j=0; j<size; j++ ) {
153       if (idx >= owners[j] && idx < owners[j+1]) {
154         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
155       }
156     }
157   }
158   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
159 
160   /* inform other processors of number of messages and max length*/
161   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
162   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
163   nreceives = work[rank];
164   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
165   nmax = work[rank];
166   PetscFree(work);
167 
168   /* post receives:
169        1) each message will consist of ordered triples
170      (global row index, global column index, value); the indices are stored
171      as Scalars to simplify the message passing.
172        2) since we don't know how long each individual message is, we
173      allocate the largest needed buffer for each receive. Potentially
174      this is a lot of wasted space.
175 
176 
177        This could be done better.
178   */
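  /*
     Concretely (a sketch of the packing used below), entry k of a message
     occupies three consecutive Scalars:
         msg[3*k]   = global row index    (cast to a Scalar)
         msg[3*k+1] = global column index (cast to a Scalar)
         msg[3*k+2] = the numerical value
     which is why the buffers are sized and indexed in multiples of 3.
  */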
179   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
180   CHKPTRQ(rvalues);
181   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
182   CHKPTRQ(recv_waits);
183   for ( i=0; i<nreceives; i++ ) {
184     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
185               comm,recv_waits+i);
186   }
187 
188   /* do sends:
189       1) starts[i] gives the starting index in svalues for stuff going to
190          the ith processor
191   */
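  /* Note that starts[] is consumed as a set of running cursors while packing
     (the starts[owner[i]]++ below), so it is rebuilt before posting the sends. */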
192   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
193   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
194   CHKPTRQ(send_waits);
195   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
196   starts[0] = 0;
197   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
198   for ( i=0; i<aij->stash.n; i++ ) {
199     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
200     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
201     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
202   }
203   PetscFree(owner);
204   starts[0] = 0;
205   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
206   count = 0;
207   for ( i=0; i<size; i++ ) {
208     if (procs[i]) {
209       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
210                 comm,send_waits+count++);
211     }
212   }
213   PetscFree(starts); PetscFree(nprocs);
214 
215   /* Free cache space */
216   PLogInfo(0,"[%d]MatAssemblyBegin_MPIAIJ:Number of off processor values %d\n",rank,aij->stash.n);
217   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
218 
219   aij->svalues    = svalues;    aij->rvalues    = rvalues;
220   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
221   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
222   aij->rmax       = nmax;
223 
224   return 0;
225 }
226 extern int MatSetUpMultiply_MPIAIJ(Mat);
227 
228 static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
229 {
230   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
231   Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
232   MPI_Status  *send_status,recv_status;
233   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
234   int         row,col,other_disassembled,shift = C->indexshift;
235   Scalar      *values,val;
236   InsertMode  addv = aij->insertmode;
237 
238   /*  wait on receives */
239   while (count) {
240     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
241     /* unpack receives into our local space */
242     values = aij->rvalues + 3*imdex*aij->rmax;
243     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
244     n = n/3;
245     for ( i=0; i<n; i++ ) {
246       row = (int) PetscReal(values[3*i]) - aij->rstart;
247       col = (int) PetscReal(values[3*i+1]);
248       val = values[3*i+2];
249       if (col >= aij->cstart && col < aij->cend) {
250         col -= aij->cstart;
251         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
252       }
253       else {
254         if (mat->was_assembled) {
255           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
256           col = aij->colmap[col] + shift;
257           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
258             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
259             col = (int) PetscReal(values[3*i+1]);
260           }
261         }
262         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
263       }
264     }
265     count--;
266   }
267   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
268 
269   /* wait on sends */
270   if (aij->nsends) {
271     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
272     CHKPTRQ(send_status);
273     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
274     PetscFree(send_status);
275   }
276   PetscFree(aij->send_waits); PetscFree(aij->svalues);
277 
278   aij->insertmode = NOT_SET_VALUES;
279   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
280   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
281 
282   /* determine if any processor has disassembled; if so, we must
283      also disassemble ourselves so that we may reassemble. */
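  /* Sketch of the logic (assuming was_assembled is 0 or 1): the product over
     all processes is 1 only if every process still counts as assembled; if any
     process had to disassemble (for example to admit a new off-diagonal nonzero
     location), other_disassembled comes back 0 and every process follows suit. */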
284   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
285   if (mat->was_assembled && !other_disassembled) {
286     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
287   }
288 
289   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
290     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
291   }
292   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
293   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
294 
295   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
296   return 0;
297 }
298 
299 static int MatZeroEntries_MPIAIJ(Mat A)
300 {
301   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
302   int        ierr;
303   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
304   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
305   return 0;
306 }
307 
308 /* the code does not do the diagonal entries correctly unless the
309    matrix is square and the column and row ownerships are identical.
310    This is a BUG. The only way to fix it seems to be to access
311    aij->A and aij->B directly and not through the MatZeroRows()
312    routine.
313 */
314 static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
315 {
316   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
317   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
318   int            *procs,*nprocs,j,found,idx,nsends,*work;
319   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
320   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
321   int            *lens,imdex,*lrows,*values;
322   MPI_Comm       comm = A->comm;
323   MPI_Request    *send_waits,*recv_waits;
324   MPI_Status     recv_status,*send_status;
325   IS             istmp;
326 
327   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
328   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
329 
330   /*  first count number of contributors to each processor */
331   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
332   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
333   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
334   for ( i=0; i<N; i++ ) {
335     idx = rows[i];
336     found = 0;
337     for ( j=0; j<size; j++ ) {
338       if (idx >= owners[j] && idx < owners[j+1]) {
339         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
340       }
341     }
342     if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
343   }
344   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
345 
346   /* inform other processors of number of messages and max length*/
347   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
348   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
349   nrecvs = work[rank];
350   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
351   nmax = work[rank];
352   PetscFree(work);
353 
354   /* post receives:   */
355   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
356   CHKPTRQ(rvalues);
357   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
358   CHKPTRQ(recv_waits);
359   for ( i=0; i<nrecvs; i++ ) {
360     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
361   }
362 
363   /* do sends:
364       1) starts[i] gives the starting index in svalues for stuff going to
365          the ith processor
366   */
367   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
368   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
369   CHKPTRQ(send_waits);
370   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
371   starts[0] = 0;
372   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
373   for ( i=0; i<N; i++ ) {
374     svalues[starts[owner[i]]++] = rows[i];
375   }
376   ISRestoreIndices(is,&rows);
377 
378   starts[0] = 0;
379   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
380   count = 0;
381   for ( i=0; i<size; i++ ) {
382     if (procs[i]) {
383       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
384     }
385   }
386   PetscFree(starts);
387 
388   base = owners[rank];
389 
390   /*  wait on receives */
391   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
392   source = lens + nrecvs;
393   count  = nrecvs; slen = 0;
394   while (count) {
395     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
396     /* unpack receives into our local space */
397     MPI_Get_count(&recv_status,MPI_INT,&n);
398     source[imdex]  = recv_status.MPI_SOURCE;
399     lens[imdex]  = n;
400     slen += n;
401     count--;
402   }
403   PetscFree(recv_waits);
404 
405   /* convert the received global row indices to local indices in one list */
406   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
407   count = 0;
408   for ( i=0; i<nrecvs; i++ ) {
409     values = rvalues + i*nmax;
410     for ( j=0; j<lens[i]; j++ ) {
411       lrows[count++] = values[j] - base;
412     }
413   }
414   PetscFree(rvalues); PetscFree(lens);
415   PetscFree(owner); PetscFree(nprocs);
416 
417   /* actually zap the local rows */
418   ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
419   PLogObjectParent(A,istmp);
420   PetscFree(lrows);
421   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
422   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
423   ierr = ISDestroy(istmp); CHKERRQ(ierr);
424 
425   /* wait on sends */
426   if (nsends) {
427     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
428     CHKPTRQ(send_status);
429     MPI_Waitall(nsends,send_waits,send_status);
430     PetscFree(send_status);
431   }
432   PetscFree(send_waits); PetscFree(svalues);
433 
434   return 0;
435 }
436 
437 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
438 {
439   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
440   int        ierr;
441 
442   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
443   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
444   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
445   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
446   return 0;
447 }
448 
449 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
450 {
451   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
452   int        ierr;
453   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
454   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
455   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
456   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
457   return 0;
458 }
459 
460 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
461 {
462   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
463   int        ierr;
464 
465   /* do nondiagonal part */
466   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
467   /* send it on its way */
468   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
469                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
470   /* do local part */
471   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
472   /* receive remote parts: note this assumes the values are not actually */
473   /* inserted in yy until the next line, which is true for my implementation*/
474   /* but may not always be true. */
475   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
476                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
477   return 0;
478 }
479 
480 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
481 {
482   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
483   int        ierr;
484 
485   /* do nondiagonal part */
486   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
487   /* send it on its way */
488   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
489                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
490   /* do local part */
491   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
492   /* receive remote parts: note this assumes the values are not actually */
493   /* inserted in zz until the next line, which is true for my implementation*/
494   /* but may not always be true. */
495   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
496                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
497   return 0;
498 }
499 
500 /*
501   This only works correctly for square matrices where the subblock A->A is the
502    diagonal block
503 */
504 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
505 {
506   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
507   if (a->M != a->N)
508     SETERRQ(1,"MatGetDiagonal_MPIAIJ:Supports only square matrix where A->A is diag block");
509   return MatGetDiagonal(a->A,v);
510 }
511 
512 static int MatScale_MPIAIJ(Scalar *aa,Mat A)
513 {
514   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
515   int        ierr;
516   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
517   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
518   return 0;
519 }
520 
521 static int MatDestroy_MPIAIJ(PetscObject obj)
522 {
523   Mat        mat = (Mat) obj;
524   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
525   int        ierr;
526 #if defined(PETSC_LOG)
527   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
528 #endif
529   PetscFree(aij->rowners);
530   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
531   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
532   if (aij->colmap) PetscFree(aij->colmap);
533   if (aij->garray) PetscFree(aij->garray);
534   if (aij->lvec)   VecDestroy(aij->lvec);
535   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
536   if (aij->rowvalues) PetscFree(aij->rowvalues);
537   PetscFree(aij);
538   PLogObjectDestroy(mat);
539   PetscHeaderDestroy(mat);
540   return 0;
541 }
542 #include "draw.h"
543 #include "pinclude/pviewer.h"
544 
545 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
546 {
547   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
548   int         ierr;
549 
550   if (aij->size == 1) {
551     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
552   }
553   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
554   return 0;
555 }
556 
557 static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
558 {
559   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
560   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
561   int         ierr, format,shift = C->indexshift,rank;
562   FILE        *fd;
563   ViewerType  vtype;
564 
565   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
566   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
567     ierr = ViewerGetFormat(viewer,&format);
568     if (format == ASCII_FORMAT_INFO_DETAILED) {
569       int nz, nzalloc, mem, flg;
570       MPI_Comm_rank(mat->comm,&rank);
571       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
572       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
573       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
574       PetscSequentialPhaseBegin(mat->comm,1);
575       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
576          rank,aij->m,nz,nzalloc,mem);
577       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
578          rank,aij->m,nz,nzalloc,mem);
579       ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
580       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz);
581       ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
582       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz);
583       fflush(fd);
584       PetscSequentialPhaseEnd(mat->comm,1);
585       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
586       return 0;
587     }
588     else if (format == ASCII_FORMAT_INFO) {
589       return 0;
590     }
591   }
592 
593   if (vtype == DRAW_VIEWER) {
594     Draw       draw;
595     PetscTruth isnull;
596     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
597     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
598   }
599 
600   if (vtype == ASCII_FILE_VIEWER) {
601     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
602     PetscSequentialPhaseBegin(mat->comm,1);
603     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
604            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
605            aij->cend);
606     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
607     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
608     fflush(fd);
609     PetscSequentialPhaseEnd(mat->comm,1);
610   }
611   else {
612     int size = aij->size;
613     rank = aij->rank;
614     if (size == 1) {
615       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
616     }
617     else {
618       /* assemble the entire matrix onto first processor. */
619       Mat         A;
620       Mat_SeqAIJ *Aloc;
621       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
622       Scalar      *a;
623 
624       if (!rank) {
625         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
626                CHKERRQ(ierr);
627       }
628       else {
629         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
630                CHKERRQ(ierr);
631       }
632       PLogObjectParent(mat,A);
633 
634       /* copy over the A part */
635       Aloc = (Mat_SeqAIJ*) aij->A->data;
636       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
637       row = aij->rstart;
638       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
639       for ( i=0; i<m; i++ ) {
640         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
641         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
642       }
643       aj = Aloc->j;
644       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
645 
646       /* copy over the B part */
647       Aloc = (Mat_SeqAIJ*) aij->B->data;
648       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
649       row = aij->rstart;
650       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
651       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
652       for ( i=0; i<m; i++ ) {
653         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
654         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
655       }
656       PetscFree(ct);
657       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
658       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
659       if (!rank) {
660         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
661       }
662       ierr = MatDestroy(A); CHKERRQ(ierr);
663     }
664   }
665   return 0;
666 }
667 
668 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
669 {
670   Mat         mat = (Mat) obj;
671   int         ierr;
672   ViewerType  vtype;
673 
674   if (!viewer) {
675     viewer = STDOUT_VIEWER_SELF;
676   }
677   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
678   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
679       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
680     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
681   }
682   else if (vtype == BINARY_FILE_VIEWER) {
683     return MatView_MPIAIJ_Binary(mat,viewer);
684   }
685   return 0;
686 }
687 
688 /*
689     This has to provide several versions.
690 
691      1) per sequential
692      2) a) use only local smoothing updating outer values only once.
693         b) local smoothing updating outer values each inner iteration
694      3) color updating out values betwen colors.
695 */
696 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
697                            double fshift,int its,Vec xx)
698 {
699   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
700   Mat        AA = mat->A, BB = mat->B;
701   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
702   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
703   int        ierr,*idx, *diag;
704   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
705   Vec        tt;
706 
707   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
708   xs = x + shift; /* shift by one for index start of 1 */
709   ls = ls + shift;
710   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
711   diag = A->diag;
712   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
713     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
714   }
715   if (flag & SOR_EISENSTAT) {
716     /* Let  A = L + U + D, where L is strictly lower triangular,
717     U is strictly upper triangular, and D is diagonal. This routine applies
718 
719             (L + E)^{-1} A (U + E)^{-1}
720 
721     to a vector efficiently using Eisenstat's trick. This is for
722     the case of the SSOR preconditioner, so E is D/omega, where omega
723     is the relaxation factor.
724     */
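    /* A sketch of the algorithm as implemented below, with E = D/omega:
           1)  x <- (E + U)^{-1} b
           2)  t <- b - (2E - D) x    (note 2E - D = (2/omega - 1) D = scale*D)
           3)  t <- (E + L)^{-1} t
           4)  x <- x + t
       Steps 1) and 3) also use the off-processor part B through lvec. */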
725     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
726     PLogObjectParent(matin,tt);
727     VecGetArray(tt,&t);
728     scale = (2.0/omega) - 1.0;
729     /*  x = (E + U)^{-1} b */
730     VecSet(&zero,mat->lvec);
731     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
732                               mat->Mvctx); CHKERRQ(ierr);
733     for ( i=m-1; i>-1; i-- ) {
734       n    = A->i[i+1] - diag[i] - 1;
735       idx  = A->j + diag[i] + !shift;
736       v    = A->a + diag[i] + !shift;
737       sum  = b[i];
738       SPARSEDENSEMDOT(sum,xs,v,idx,n);
739       d    = fshift + A->a[diag[i]+shift];
740       n    = B->i[i+1] - B->i[i];
741       idx  = B->j + B->i[i] + shift;
742       v    = B->a + B->i[i] + shift;
743       SPARSEDENSEMDOT(sum,ls,v,idx,n);
744       x[i] = omega*(sum/d);
745     }
746     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
747                             mat->Mvctx); CHKERRQ(ierr);
748 
749     /*  t = b - (2*E - D)x */
750     v = A->a;
751     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
752 
753     /*  t = (E + L)^{-1}t */
754     ts = t + shift; /* shifted by one when A->a and A->j use an index start of 1 */
755     diag = A->diag;
756     VecSet(&zero,mat->lvec);
757     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
758                                                  mat->Mvctx); CHKERRQ(ierr);
759     for ( i=0; i<m; i++ ) {
760       n    = diag[i] - A->i[i];
761       idx  = A->j + A->i[i] + shift;
762       v    = A->a + A->i[i] + shift;
763       sum  = t[i];
764       SPARSEDENSEMDOT(sum,ts,v,idx,n);
765       d    = fshift + A->a[diag[i]+shift];
766       n    = B->i[i+1] - B->i[i];
767       idx  = B->j + B->i[i] + shift;
768       v    = B->a + B->i[i] + shift;
769       SPARSEDENSEMDOT(sum,ls,v,idx,n);
770       t[i] = omega*(sum/d);
771     }
772     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
773                                                     mat->Mvctx); CHKERRQ(ierr);
774     /*  x = x + t */
775     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
776     VecDestroy(tt);
777     return 0;
778   }
779 
780 
781   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
782     if (flag & SOR_ZERO_INITIAL_GUESS) {
783       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
784     }
785     else {
786       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
787       CHKERRQ(ierr);
788       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
789       CHKERRQ(ierr);
790     }
791     while (its--) {
792       /* go down through the rows */
793       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
794                               mat->Mvctx); CHKERRQ(ierr);
795       for ( i=0; i<m; i++ ) {
796         n    = A->i[i+1] - A->i[i];
797         idx  = A->j + A->i[i] + shift;
798         v    = A->a + A->i[i] + shift;
799         sum  = b[i];
800         SPARSEDENSEMDOT(sum,xs,v,idx,n);
801         d    = fshift + A->a[diag[i]+shift];
802         n    = B->i[i+1] - B->i[i];
803         idx  = B->j + B->i[i] + shift;
804         v    = B->a + B->i[i] + shift;
805         SPARSEDENSEMDOT(sum,ls,v,idx,n);
806         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
807       }
808       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
809                             mat->Mvctx); CHKERRQ(ierr);
810       /* come up through the rows */
811       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
812                               mat->Mvctx); CHKERRQ(ierr);
813       for ( i=m-1; i>-1; i-- ) {
814         n    = A->i[i+1] - A->i[i];
815         idx  = A->j + A->i[i] + shift;
816         v    = A->a + A->i[i] + shift;
817         sum  = b[i];
818         SPARSEDENSEMDOT(sum,xs,v,idx,n);
819         d    = fshift + A->a[diag[i]+shift];
820         n    = B->i[i+1] - B->i[i];
821         idx  = B->j + B->i[i] + shift;
822         v    = B->a + B->i[i] + shift;
823         SPARSEDENSEMDOT(sum,ls,v,idx,n);
824         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
825       }
826       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
827                             mat->Mvctx); CHKERRQ(ierr);
828     }
829   }
830   else if (flag & SOR_FORWARD_SWEEP){
831     if (flag & SOR_ZERO_INITIAL_GUESS) {
832       VecSet(&zero,mat->lvec);
833       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
834                               mat->Mvctx); CHKERRQ(ierr);
835       for ( i=0; i<m; i++ ) {
836         n    = diag[i] - A->i[i];
837         idx  = A->j + A->i[i] + shift;
838         v    = A->a + A->i[i] + shift;
839         sum  = b[i];
840         SPARSEDENSEMDOT(sum,xs,v,idx,n);
841         d    = fshift + A->a[diag[i]+shift];
842         n    = B->i[i+1] - B->i[i];
843         idx  = B->j + B->i[i] + shift;
844         v    = B->a + B->i[i] + shift;
845         SPARSEDENSEMDOT(sum,ls,v,idx,n);
846         x[i] = omega*(sum/d);
847       }
848       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
849                             mat->Mvctx); CHKERRQ(ierr);
850       its--;
851     }
852     while (its--) {
853       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
854       CHKERRQ(ierr);
855       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
856       CHKERRQ(ierr);
857       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
858                               mat->Mvctx); CHKERRQ(ierr);
859       for ( i=0; i<m; i++ ) {
860         n    = A->i[i+1] - A->i[i];
861         idx  = A->j + A->i[i] + shift;
862         v    = A->a + A->i[i] + shift;
863         sum  = b[i];
864         SPARSEDENSEMDOT(sum,xs,v,idx,n);
865         d    = fshift + A->a[diag[i]+shift];
866         n    = B->i[i+1] - B->i[i];
867         idx  = B->j + B->i[i] + shift;
868         v    = B->a + B->i[i] + shift;
869         SPARSEDENSEMDOT(sum,ls,v,idx,n);
870         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
871       }
872       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
873                             mat->Mvctx); CHKERRQ(ierr);
874     }
875   }
876   else if (flag & SOR_BACKWARD_SWEEP){
877     if (flag & SOR_ZERO_INITIAL_GUESS) {
878       VecSet(&zero,mat->lvec);
879       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
880                               mat->Mvctx); CHKERRQ(ierr);
881       for ( i=m-1; i>-1; i-- ) {
882         n    = A->i[i+1] - diag[i] - 1;
883         idx  = A->j + diag[i] + !shift;
884         v    = A->a + diag[i] + !shift;
885         sum  = b[i];
886         SPARSEDENSEMDOT(sum,xs,v,idx,n);
887         d    = fshift + A->a[diag[i]+shift];
888         n    = B->i[i+1] - B->i[i];
889         idx  = B->j + B->i[i] + shift;
890         v    = B->a + B->i[i] + shift;
891         SPARSEDENSEMDOT(sum,ls,v,idx,n);
892         x[i] = omega*(sum/d);
893       }
894       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
895                             mat->Mvctx); CHKERRQ(ierr);
896       its--;
897     }
898     while (its--) {
899       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
900                             mat->Mvctx); CHKERRQ(ierr);
901       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
902                             mat->Mvctx); CHKERRQ(ierr);
903       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
904                               mat->Mvctx); CHKERRQ(ierr);
905       for ( i=m-1; i>-1; i-- ) {
906         n    = A->i[i+1] - A->i[i];
907         idx  = A->j + A->i[i] + shift;
908         v    = A->a + A->i[i] + shift;
909         sum  = b[i];
910         SPARSEDENSEMDOT(sum,xs,v,idx,n);
911         d    = fshift + A->a[diag[i]+shift];
912         n    = B->i[i+1] - B->i[i];
913         idx  = B->j + B->i[i] + shift;
914         v    = B->a + B->i[i] + shift;
915         SPARSEDENSEMDOT(sum,ls,v,idx,n);
916         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
917       }
918       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
919                             mat->Mvctx); CHKERRQ(ierr);
920     }
921   }
922   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
923     if (flag & SOR_ZERO_INITIAL_GUESS) {
924       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
925     }
926     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
927     CHKERRQ(ierr);
928     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
929     CHKERRQ(ierr);
930     while (its--) {
931       /* go down through the rows */
932       for ( i=0; i<m; i++ ) {
933         n    = A->i[i+1] - A->i[i];
934         idx  = A->j + A->i[i] + shift;
935         v    = A->a + A->i[i] + shift;
936         sum  = b[i];
937         SPARSEDENSEMDOT(sum,xs,v,idx,n);
938         d    = fshift + A->a[diag[i]+shift];
939         n    = B->i[i+1] - B->i[i];
940         idx  = B->j + B->i[i] + shift;
941         v    = B->a + B->i[i] + shift;
942         SPARSEDENSEMDOT(sum,ls,v,idx,n);
943         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
944       }
945       /* come up through the rows */
946       for ( i=m-1; i>-1; i-- ) {
947         n    = A->i[i+1] - A->i[i];
948         idx  = A->j + A->i[i] + shift;
949         v    = A->a + A->i[i] + shift;
950         sum  = b[i];
951         SPARSEDENSEMDOT(sum,xs,v,idx,n);
952         d    = fshift + A->a[diag[i]+shift];
953         n    = B->i[i+1] - B->i[i];
954         idx  = B->j + B->i[i] + shift;
955         v    = B->a + B->i[i] + shift;
956         SPARSEDENSEMDOT(sum,ls,v,idx,n);
957         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
958       }
959     }
960   }
961   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
962     if (flag & SOR_ZERO_INITIAL_GUESS) {
963       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
964     }
965     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
966     CHKERRQ(ierr);
967     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
968     CHKERRQ(ierr);
969     while (its--) {
970       for ( i=0; i<m; i++ ) {
971         n    = A->i[i+1] - A->i[i];
972         idx  = A->j + A->i[i] + shift;
973         v    = A->a + A->i[i] + shift;
974         sum  = b[i];
975         SPARSEDENSEMDOT(sum,xs,v,idx,n);
976         d    = fshift + A->a[diag[i]+shift];
977         n    = B->i[i+1] - B->i[i];
978         idx  = B->j + B->i[i] + shift;
979         v    = B->a + B->i[i] + shift;
980         SPARSEDENSEMDOT(sum,ls,v,idx,n);
981         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
982       }
983     }
984   }
985   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
986     if (flag & SOR_ZERO_INITIAL_GUESS) {
987       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
988     }
989     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
990                             mat->Mvctx); CHKERRQ(ierr);
991     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
992                             mat->Mvctx); CHKERRQ(ierr);
993     while (its--) {
994       for ( i=m-1; i>-1; i-- ) {
995         n    = A->i[i+1] - A->i[i];
996         idx  = A->j + A->i[i] + shift;
997         v    = A->a + A->i[i] + shift;
998         sum  = b[i];
999         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1000         d    = fshift + A->a[diag[i]+shift];
1001         n    = B->i[i+1] - B->i[i];
1002         idx  = B->j + B->i[i] + shift;
1003         v    = B->a + B->i[i] + shift;
1004         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1005         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
1006       }
1007     }
1008   }
1009   return 0;
1010 }
1011 
1012 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
1013                              int *nzalloc,int *mem)
1014 {
1015   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1016   Mat        A = mat->A, B = mat->B;
1017   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1018 
1019   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1020   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1021   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1022   if (flag == MAT_LOCAL) {
1023     if (nz)       *nz      = isend[0];
1024     if (nzalloc)  *nzalloc = isend[1];
1025     if (mem)      *mem     = isend[2];
1026   } else if (flag == MAT_GLOBAL_MAX) {
1027     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1028     if (nz)      *nz      = irecv[0];
1029     if (nzalloc) *nzalloc = irecv[1];
1030     if (mem)     *mem     = irecv[2];
1031   } else if (flag == MAT_GLOBAL_SUM) {
1032     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1033     if (nz)      *nz      = irecv[0];
1034     if (nzalloc) *nzalloc = irecv[1];
1035     if (mem)     *mem     = irecv[2];
1036   }
1037   return 0;
1038 }
1039 
1040 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1041 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1042 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1043 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1044 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1045 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1046 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1047 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1048 
1049 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1050 {
1051   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1052 
1053   if (op == NO_NEW_NONZERO_LOCATIONS ||
1054       op == YES_NEW_NONZERO_LOCATIONS ||
1055       op == COLUMNS_SORTED ||
1056       op == ROW_ORIENTED) {
1057         MatSetOption(a->A,op);
1058         MatSetOption(a->B,op);
1059   }
1060   else if (op == ROWS_SORTED ||
1061            op == SYMMETRIC_MATRIX ||
1062            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1063            op == YES_NEW_DIAGONALS)
1064     PLogInfo(A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1065   else if (op == COLUMN_ORIENTED) {
1066     a->roworiented = 0;
1067     MatSetOption(a->A,op);
1068     MatSetOption(a->B,op);
1069   }
1070   else if (op == NO_NEW_DIAGONALS)
1071     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1072   else
1073     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1074   return 0;
1075 }
1076 
1077 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1078 {
1079   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1080   *m = mat->M; *n = mat->N;
1081   return 0;
1082 }
1083 
1084 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1085 {
1086   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1087   *m = mat->m; *n = mat->N;
1088   return 0;
1089 }
1090 
1091 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1092 {
1093   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1094   *m = mat->rstart; *n = mat->rend;
1095   return 0;
1096 }
1097 
1098 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1099 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1100 
1101 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1102 {
1103   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1104   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1105   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1106   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1107   int        *cmap, *idx_p;
1108 
1109   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,"MatGetRow_MPIAIJ:Already active");
1110   mat->getrowactive = PETSC_TRUE;
1111 
1112   if (!mat->rowvalues && (idx || v)) {
1113     /*
1114         allocate enough space to hold information from the longest row.
1115     */
1116     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1117     int     max = 1,n = mat->n,tmp;
1118     for ( i=0; i<n; i++ ) {
1119       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1120       if (max < tmp) { max = tmp; }
1121     }
1122     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1123     CHKPTRQ(mat->rowvalues);
1124     mat->rowindices = (int *) (mat->rowvalues + max);
1125   }
1126 
1127 
1128   if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows");
1129   lrow = row - rstart;
1130 
1131   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1132   if (!v)   {pvA = 0; pvB = 0;}
1133   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1134   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1135   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1136   nztot = nzA + nzB;
1137 
1138   cmap  = mat->garray;
1139   if (v  || idx) {
1140     if (nztot) {
1141       /* Sort by increasing column numbers, assuming A and B already sorted */
1142       int imark = -1;
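      /* imark will count the B (off-diagonal) columns whose global index is
         below cstart; those come first, then the nzA diagonal-block columns,
         then the remaining B columns, so the result is in increasing order. */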
1143       if (v) {
1144         *v = v_p = mat->rowvalues;
1145         for ( i=0; i<nzB; i++ ) {
1146           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1147           else break;
1148         }
1149         imark = i;
1150         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1151         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1152       }
1153       if (idx) {
1154         *idx = idx_p = mat->rowindices;
1155         if (imark > -1) {
1156           for ( i=0; i<imark; i++ ) {
1157             idx_p[i] = cmap[cworkB[i]];
1158           }
1159         } else {
1160           for ( i=0; i<nzB; i++ ) {
1161             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1162             else break;
1163           }
1164           imark = i;
1165         }
1166         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1167         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1168       }
1169     }
1170     else {if (idx) *idx = 0; if (v) *v = 0;}
1171   }
1172   *nz = nztot;
1173   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1174   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1175   return 0;
1176 }
1177 
1178 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1179 {
1180   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1181   if (aij->getrowactive == PETSC_FALSE) {
1182     SETERRQ(1,"MatRestoreRow_MPIAIJ:MatGetRow not called");
1183   }
1184   aij->getrowactive = PETSC_FALSE;
1185   return 0;
1186 }
1187 
1188 static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1189 {
1190   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1191   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1192   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1193   double     sum = 0.0;
1194   Scalar     *v;
1195 
1196   if (aij->size == 1) {
1197     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1198   } else {
1199     if (type == NORM_FROBENIUS) {
1200       v = amat->a;
1201       for (i=0; i<amat->nz; i++ ) {
1202 #if defined(PETSC_COMPLEX)
1203         sum += real(conj(*v)*(*v)); v++;
1204 #else
1205         sum += (*v)*(*v); v++;
1206 #endif
1207       }
1208       v = bmat->a;
1209       for (i=0; i<bmat->nz; i++ ) {
1210 #if defined(PETSC_COMPLEX)
1211         sum += real(conj(*v)*(*v)); v++;
1212 #else
1213         sum += (*v)*(*v); v++;
1214 #endif
1215       }
1216       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1217       *norm = sqrt(*norm);
1218     }
1219     else if (type == NORM_1) { /* max column norm */
1220       double *tmp, *tmp2;
1221       int    *jj, *garray = aij->garray;
1222       tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1223       tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1224       PetscMemzero(tmp,aij->N*sizeof(double));
1225       *norm = 0.0;
1226       v = amat->a; jj = amat->j;
1227       for ( j=0; j<amat->nz; j++ ) {
1228         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1229       }
1230       v = bmat->a; jj = bmat->j;
1231       for ( j=0; j<bmat->nz; j++ ) {
1232         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1233       }
1234       MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1235       for ( j=0; j<aij->N; j++ ) {
1236         if (tmp2[j] > *norm) *norm = tmp2[j];
1237       }
1238       PetscFree(tmp); PetscFree(tmp2);
1239     }
1240     else if (type == NORM_INFINITY) { /* max row norm */
1241       double ntemp = 0.0;
1242       for ( j=0; j<amat->m; j++ ) {
1243         v = amat->a + amat->i[j] + shift;
1244         sum = 0.0;
1245         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1246           sum += PetscAbsScalar(*v); v++;
1247         }
1248         v = bmat->a + bmat->i[j] + shift;
1249         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1250           sum += PetscAbsScalar(*v); v++;
1251         }
1252         if (sum > ntemp) ntemp = sum;
1253       }
1254       MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1255     }
1256     else {
1257       SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
1258     }
1259   }
1260   return 0;
1261 }
1262 
1263 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1264 {
1265   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1266   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1267   int        ierr,shift = Aloc->indexshift;
1268   Mat        B;
1269   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1270   Scalar     *array;
1271 
1272   if (matout == PETSC_NULL && M != N)
1273     SETERRQ(1,"MatTranspose_MPIAIJ:Square matrix only for in-place");
1274   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,
1275          PETSC_NULL,&B); CHKERRQ(ierr);
1276 
1277   /* copy over the A part */
1278   Aloc = (Mat_SeqAIJ*) a->A->data;
1279   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1280   row = a->rstart;
1281   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1282   for ( i=0; i<m; i++ ) {
1283     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1284     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1285   }
1286   aj = Aloc->j;
1287   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1288 
1289   /* copy over the B part */
1290   Aloc = (Mat_SeqAIJ*) a->B->data;
1291   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1292   row = a->rstart;
1293   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1294   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1295   for ( i=0; i<m; i++ ) {
1296     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1297     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1298   }
1299   PetscFree(ct);
1300   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1301   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1302   if (matout != PETSC_NULL) {
1303     *matout = B;
1304   } else {
1305     /* This isn't really an in-place transpose; we free the data structures of a and overwrite A with B */
1306     PetscFree(a->rowners);
1307     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1308     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1309     if (a->colmap) PetscFree(a->colmap);
1310     if (a->garray) PetscFree(a->garray);
1311     if (a->lvec) VecDestroy(a->lvec);
1312     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1313     PetscFree(a);
1314     PetscMemcpy(A,B,sizeof(struct _Mat));
1315     PetscHeaderDestroy(B);
1316   }
1317   return 0;
1318 }
1319 
1320 extern int MatPrintHelp_SeqAIJ(Mat);
1321 static int MatPrintHelp_MPIAIJ(Mat A)
1322 {
1323   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1324 
1325   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1326   else return 0;
1327 }
1328 
1329 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1330 static int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1331 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1332 int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1333 /* -------------------------------------------------------------------*/
1334 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1335        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1336        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1337        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1338        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1339        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1340        MatLUFactor_MPIAIJ,0,
1341        MatRelax_MPIAIJ,
1342        MatTranspose_MPIAIJ,
1343        MatGetInfo_MPIAIJ,0,
1344        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1345        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1346        0,
1347        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1348        MatGetReordering_MPIAIJ,
1349        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1350        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1351        MatILUFactorSymbolic_MPIAIJ,0,
1352        0,0,MatConvert_MPIAIJ,0,0,MatConvertSameType_MPIAIJ,0,0,
1353        0,0,0,
1354        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1355        MatPrintHelp_MPIAIJ,
1356        MatScale_MPIAIJ};
1357 
1358 /*@C
1359    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1360    (the default parallel PETSc format).  For good matrix assembly performance
1361    the user should preallocate the matrix storage by setting the parameters
1362    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1363    performance can be increased by more than a factor of 50.
1364 
1365    Input Parameters:
1366 .  comm - MPI communicator
1367 .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
1368 .  n - number of local columns (or PETSC_DECIDE to have it calculated
1369            if N is given)
1370 .  M - number of global rows (or PETSC_DECIDE to have it calculated if m is given)
1371 .  N - number of global columns (or PETSC_DECIDE to have it calculated
1372            if n is given)
1373 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1374            (same for all local rows)
1375 .  d_nnz - array containing the number of nonzeros in the various rows of the
1376            diagonal portion of local submatrix (possibly different for each row)
1377            or PETSC_NULL. You must leave room for the diagonal entry even if
1378            it is zero.
1379 .  o_nz - number of nonzeros per row in the off-diagonal portion of local
1380            submatrix (same for all local rows).
1381 .  o_nnz - array containing the number of nonzeros in the various rows of the
1382            off-diagonal portion of the local submatrix (possibly different for
1383            each row) or PETSC_NULL.
1384 
1385    Output Parameter:
1386 .  A - the matrix
1387 
1388    Notes:
1389    The AIJ format (also called the Yale sparse matrix format or
1390    compressed row storage) is fully compatible with standard Fortran 77
1391    storage.  That is, the stored row and column indices can begin at
1392    either one (as in Fortran) or zero.  See the users manual for details.
1393 
1394    The user MUST specify either the local or global matrix dimensions
1395    (possibly both).
1396 
1397    By default, this format uses inodes (identical nodes) when possible.
1398    We search for consecutive rows with the same nonzero structure, thereby
1399    reusing matrix information to achieve increased efficiency.
1400 
1401    Options Database Keys:
1402 $    -mat_aij_no_inode  - Do not use inodes
1403 $    -mat_aij_inode_limit <limit> - Set inode limit.
1404 $        (max limit=5)
1405 $    -mat_aij_oneindex - Internally use indexing starting at 1
1406 $        rather than 0.  Note: When calling MatSetValues(),
1407 $        the user still MUST index entries starting at 0!
1408 
1409    Storage Information:
1410    For a square global matrix we define each processor's diagonal portion
1411    to be its local rows and the corresponding columns (a square submatrix);
1412    each processor's off-diagonal portion encompasses the remainder of the
1413    local matrix (a rectangular submatrix).
1414 
1415    The user can specify preallocated storage for the diagonal part of
1416    the local submatrix with either d_nz or d_nnz (not both).  Set
1417    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1418    memory allocation.  Likewise, specify preallocated storage for the
1419    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1420 
1421    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1422    the figure below we depict these three local rows and all columns (0-11).
1423 
1424 $            0 1 2 3 4 5 6 7 8 9 10 11
1425 $           --------------------------
1426 $  row 3  |  o o o d d d o o o o o o
1427 $  row 4  |  o o o d d d o o o o o o
1428 $  row 5  |  o o o d d d o o o o o o
1429 $           --------------------------
1430 $
1431 
1432    Thus, any entries in the d locations are stored in the d (diagonal)
1433    submatrix, and any entries in the o locations are stored in the
1434    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1435    stored simply in the MATSEQAIJ format for compressed row storage.
1436 
1437    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1438    and o_nz should indicate the number of nonzeros per row in the o matrix.
1439    In general, for PDE problems in which most nonzeros are near the diagonal,
1440    one expects d_nz >> o_nz. For large problems you MUST preallocate memory
1441    or you will get TERRIBLE performance; see the users' manual chapter on
1442    matrices and the file $(PETSC_DIR)/Performance.
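
   For example, a minimal sketch of creating the matrix depicted above
   (assuming a 12x12 global matrix spread over 4 processes, 3 rows per
   process, with 3 nonzeros per row in the diagonal block and 9 in the
   off-diagonal block; comm is an assumed MPI communicator):

$      Mat A;
$      int ierr;
$      ierr = MatCreateMPIAIJ(comm,3,PETSC_DECIDE,PETSC_DECIDE,12,
$                             3,PETSC_NULL,9,PETSC_NULL,&A); CHKERRQ(ierr);

   When the row lengths vary, pass arrays of length m (the number of local
   rows) as d_nnz and o_nnz instead of the constant d_nz and o_nz values.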
1443 
1444 .keywords: matrix, aij, compressed row, sparse, parallel
1445 
1446 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1447 @*/
1448 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1449                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1450 {
1451   Mat          B;
1452   Mat_MPIAIJ   *b;
1453   int          ierr, i,sum[2],work[2];
1454 
1455   *A = 0;
1456   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1457   PLogObjectCreate(B);
1458   B->data       = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1459   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1460   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1461   B->destroy    = MatDestroy_MPIAIJ;
1462   B->view       = MatView_MPIAIJ;
1463   B->factor     = 0;
1464   B->assembled  = PETSC_FALSE;
1465 
1466   b->insertmode = NOT_SET_VALUES;
1467   MPI_Comm_rank(comm,&b->rank);
1468   MPI_Comm_size(comm,&b->size);
1469 
1470   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1471     SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1472 
1473   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1474     work[0] = m; work[1] = n;
1475     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1476     if (M == PETSC_DECIDE) M = sum[0];
1477     if (N == PETSC_DECIDE) N = sum[1];
1478   }
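  /* split rows (columns) as evenly as possible: the first M % size (resp. N % size) processes each get one extra */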
1479   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1480   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1481   b->m = m; B->m = m;
1482   b->n = n; B->n = n;
1483   b->N = N; B->N = N;
1484   b->M = M; B->M = M;
1485 
1486   /* build local table of row and column ownerships */
1487   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1488   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1489   b->cowners = b->rowners + b->size + 1;
1490   MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
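  /* rowners[i+1] now holds process i's local row count; the prefix sum below turns these counts
     into offsets, so rows [rowners[rank],rowners[rank+1]) belong to this process */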
1491   b->rowners[0] = 0;
1492   for ( i=2; i<=b->size; i++ ) {
1493     b->rowners[i] += b->rowners[i-1];
1494   }
1495   b->rstart = b->rowners[b->rank];
1496   b->rend   = b->rowners[b->rank+1];
1497   MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1498   b->cowners[0] = 0;
1499   for ( i=2; i<=b->size; i++ ) {
1500     b->cowners[i] += b->cowners[i-1];
1501   }
1502   b->cstart = b->cowners[b->rank];
1503   b->cend   = b->cowners[b->rank+1];
1504 
1505   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1506   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1507   PLogObjectParent(B,b->A);
1508   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1509   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1510   PLogObjectParent(B,b->B);
1511 
1512   /* build cache (stash) for entries destined for rows owned by other processors */
1513   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1514   b->colmap      = 0;
1515   b->garray      = 0;
1516   b->roworiented = 1;
1517 
1518   /* stuff used for matrix vector multiply */
1519   b->lvec      = 0;
1520   b->Mvctx     = 0;
1521 
1522   /* stuff for MatGetRow() */
1523   b->rowindices   = 0;
1524   b->rowvalues    = 0;
1525   b->getrowactive = PETSC_FALSE;
1526 
1527   *A = B;
1528   return 0;
1529 }
1530 
1531 static int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1532 {
1533   Mat        mat;
1534   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1535   int        ierr, len=0, flg;
1536 
1537   *newmat       = 0;
1538   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
1539   PLogObjectCreate(mat);
1540   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1541   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1542   mat->destroy    = MatDestroy_MPIAIJ;
1543   mat->view       = MatView_MPIAIJ;
1544   mat->factor     = matin->factor;
1545   mat->assembled  = PETSC_TRUE;
1546 
1547   a->m = mat->m   = oldmat->m;
1548   a->n = mat->n   = oldmat->n;
1549   a->M = mat->M   = oldmat->M;
1550   a->N = mat->N   = oldmat->N;
1551 
1552   a->rstart       = oldmat->rstart;
1553   a->rend         = oldmat->rend;
1554   a->cstart       = oldmat->cstart;
1555   a->cend         = oldmat->cend;
1556   a->size         = oldmat->size;
1557   a->rank         = oldmat->rank;
1558   a->insertmode   = NOT_SET_VALUES;
1559   a->rowvalues    = 0;
1560   a->getrowactive = PETSC_FALSE;
1561 
1562   a->rowners = (int *) PetscMalloc((a->size+1)*sizeof(int)); CHKPTRQ(a->rowners);
1563   PLogObjectMemory(mat,(a->size+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1564   PetscMemcpy(a->rowners,oldmat->rowners,(a->size+1)*sizeof(int));
1565   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1566   if (oldmat->colmap) {
1567     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1568     PLogObjectMemory(mat,(a->N)*sizeof(int));
1569     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1570   } else a->colmap = 0;
1571   if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
1572     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1573     PLogObjectMemory(mat,len*sizeof(int));
1574     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1575   } else a->garray = 0;
1576 
1577   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1578   PLogObjectParent(mat,a->lvec);
1579   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1580   PLogObjectParent(mat,a->Mvctx);
1581   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1582   PLogObjectParent(mat,a->A);
1583   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1584   PLogObjectParent(mat,a->B);
1585   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1586   if (flg) {
1587     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1588   }
1589   *newmat = mat;
1590   return 0;
1591 }
1592 
1593 #include "sys.h"
1594 
1595 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
1596 {
1597   Mat          A;
1598   int          i, nz, ierr, j,rstart, rend, fd;
1599   Scalar       *vals,*svals;
1600   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1601   MPI_Status   status;
1602   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1603   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1604   int          tag = ((PetscObject)viewer)->tag;
1605 
1606   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1607   if (!rank) {
1608     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1609     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1610     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJ:not matrix object");
1611   }
1612 
1613   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1614   M = header[1]; N = header[2];
1615   /* determine ownership of all rows */
1616   m = M/size + ((M % size) > rank);
1617   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1618   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1619   rowners[0] = 0;
1620   for ( i=2; i<=size; i++ ) {
1621     rowners[i] += rowners[i-1];
1622   }
1623   rstart = rowners[rank];
1624   rend   = rowners[rank+1];
1625 
1626   /* distribute row lengths to all processors */
1627   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1628   offlens = ourlens + (rend-rstart);
1629   if (!rank) {
1630     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1631     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1632     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1633     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1634     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1635     PetscFree(sndcounts);
1636   }
1637   else {
1638     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1639   }
1640 
1641   if (!rank) {
1642     /* calculate the number of nonzeros on each processor */
1643     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1644     PetscMemzero(procsnz,size*sizeof(int));
1645     for ( i=0; i<size; i++ ) {
1646       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1647         procsnz[i] += rowlengths[j];
1648       }
1649     }
1650     PetscFree(rowlengths);
1651 
1652     /* determine max buffer needed and allocate it */
1653     maxnz = 0;
1654     for ( i=0; i<size; i++ ) {
1655       maxnz = PetscMax(maxnz,procsnz[i]);
1656     }
1657     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1658 
1659     /* read in my part of the matrix column indices  */
1660     nz = procsnz[0];
1661     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1662     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1663 
1664     /* read in everyone else's column indices and ship them off */
1665     for ( i=1; i<size; i++ ) {
1666       nz = procsnz[i];
1667       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1668       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1669     }
1670     PetscFree(cols);
1671   }
1672   else {
1673     /* determine buffer space needed for message */
1674     nz = 0;
1675     for ( i=0; i<m; i++ ) {
1676       nz += ourlens[i];
1677     }
1678     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1679 
1680     /* receive message of column indices */
1681     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1682     MPI_Get_count(&status,MPI_INT,&maxnz);
1683     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1684   }
1685 
1686   /* loop over local rows, counting the off-diagonal entries (columns outside this process's ownership range) */
1687   PetscMemzero(offlens,m*sizeof(int));
1688   jj = 0;
1689   for ( i=0; i<m; i++ ) {
1690     for ( j=0; j<ourlens[i]; j++ ) {
1691       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1692       jj++;
1693     }
1694   }
1695 
1696   /* create our matrix: temporarily reduce ourlens to the diagonal-block lengths for preallocation */
1697   for ( i=0; i<m; i++ ) {
1698     ourlens[i] -= offlens[i];
1699   }
1700   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1701   A = *newmat;
1702   MatSetOption(A,COLUMNS_SORTED);
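  /* restore ourlens to the full row lengths for the value-insertion loops below */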
1703   for ( i=0; i<m; i++ ) {
1704     ourlens[i] += offlens[i];
1705   }
1706 
1707   if (!rank) {
1708     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1709 
1710     /* read in my part of the matrix numerical values  */
1711     nz = procsnz[0];
1712     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1713 
1714     /* insert into matrix */
1715     jj      = rstart;
1716     smycols = mycols;
1717     svals   = vals;
1718     for ( i=0; i<m; i++ ) {
1719       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1720       smycols += ourlens[i];
1721       svals   += ourlens[i];
1722       jj++;
1723     }
1724 
1725     /* read in other processors and ship out */
1726     for ( i=1; i<size; i++ ) {
1727       nz = procsnz[i];
1728       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1729       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1730     }
1731     PetscFree(procsnz);
1732   }
1733   else {
1734     /* receive numeric values */
1735     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1736 
1737     /* receive message of values */
1738     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1739     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1740     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1741 
1742     /* insert into matrix */
1743     jj      = rstart;
1744     smycols = mycols;
1745     svals   = vals;
1746     for ( i=0; i<m; i++ ) {
1747       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1748       smycols += ourlens[i];
1749       svals   += ourlens[i];
1750       jj++;
1751     }
1752   }
1753   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1754 
1755   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1756   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1757   return 0;
1758 }
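
/*
   A minimal usage sketch for the loader above (normally reached through the
   generic MatLoad() interface).  The binary viewer routines named here are
   assumptions about the surrounding PETSc API, and "matrix.dat" is a
   hypothetical file name.

      Viewer viewer;
      Mat    A;
      int    ierr;

      ierr = ViewerFileOpenBinary(MPI_COMM_WORLD,"matrix.dat",BINARY_RDONLY,&viewer); CHKERRQ(ierr);
      ierr = MatLoad(viewer,MATMPIAIJ,&A); CHKERRQ(ierr);
      ierr = ViewerDestroy(viewer); CHKERRQ(ierr);
*/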
1759