xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 7a0afa10060dc57fa05a230d6d4cfef77f884957)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.127 1996/02/23 23:06:31 curfman Exp bsmith $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "vec/vecimpl.h"
7 #include "inline/spops.h"
8 
9 /* Local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way, since the
12 length of colmap equals the global number of columns of the matrix.
13 */
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
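/*
   Illustrative example (not part of the original source; the values are
   hypothetical): if this process's off-diagonal block references the global
   columns garray = {3,7,12}, then after CreateColmap_Private()

       colmap[3]  = 0 - shift
       colmap[7]  = 1 - shift
       colmap[12] = 2 - shift

   so the local column of a global column g is recovered as colmap[g] + shift;
   all other entries of colmap remain zero from the PetscMemzero() above.
*/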
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int        ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
39 static int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
42   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
43   Scalar     value;
44   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
45   int        cstart = aij->cstart, cend = aij->cend,row,col;
46   int        shift = C->indexshift,roworiented = aij->roworiented;
47 
48   if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
49     SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
50   }
51   aij->insertmode = addv;
52   for ( i=0; i<m; i++ ) {
53     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
54     if (im[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
55     if (im[i] >= rstart && im[i] < rend) {
56       row = im[i] - rstart;
57       for ( j=0; j<n; j++ ) {
58         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
59         if (in[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
60         if (in[j] >= cstart && in[j] < cend){
61           col = in[j] - cstart;
62           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
63           ierr = MatSetValues(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
64         }
65         else {
66           if (mat->was_assembled) {
67             if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
68             col = aij->colmap[in[j]] + shift;
69             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
70               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
71               col =  in[j];
72             }
73           }
74           else col = in[j];
75           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
76           ierr = MatSetValues(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
77         }
78       }
79     }
80     else {
81       if (roworiented) {
82         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
83       }
84       else {
85         row = im[i];
86         for ( j=0; j<n; j++ ) {
87           ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
88         }
89       }
90     }
91   }
92   return 0;
93 }
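/*
   Usage sketch (illustrative only; the indices and values are hypothetical):
   the caller addresses entries by global indices and this routine routes each
   one to aij->A (diagonal block), aij->B (off-diagonal block), or the stash
   for the owning process, e.g.

       int    row = 3, cols[2] = {4, 9};
       Scalar vals[2] = {1.0, 2.0};
       ierr = MatSetValues(mat,1,&row,2,cols,vals,INSERT_VALUES); CHKERRQ(ierr);

   assuming this process owns rows [rstart,rend) containing row 3 and columns
   [cstart,cend) containing column 4 but not column 9.
*/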
94 
95 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
96 {
97   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
98   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
99   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
100   int        cstart = aij->cstart, cend = aij->cend,row,col;
101   int        shift = C->indexshift;
102 
103   for ( i=0; i<m; i++ ) {
104     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
105     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
106     if (idxm[i] >= rstart && idxm[i] < rend) {
107       row = idxm[i] - rstart;
108       for ( j=0; j<n; j++ ) {
109         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
110         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
111         if (idxn[j] >= cstart && idxn[j] < cend){
112           col = idxn[j] - cstart;
113           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
114         }
115         else {
116           col = aij->colmap[idxn[j]] + shift;
117           ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
118         }
119       }
120     }
121     else {
122       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
123     }
124   }
125   return 0;
126 }
127 
128 static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
129 {
130   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
131   MPI_Comm    comm = mat->comm;
132   int         size = aij->size, *owners = aij->rowners;
133   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
134   MPI_Request *send_waits,*recv_waits;
135   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
136   InsertMode  addv;
137   Scalar      *rvalues,*svalues;
138 
139   /* make sure all processors are using the same insert mode, either INSERT_VALUES or ADD_VALUES */
140   MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
141   if (addv == (ADD_VALUES|INSERT_VALUES)) {
142     SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
143   }
144   aij->insertmode = addv; /* in case this processor had no cache */
145 
146   /*  first count number of contributors to each processor */
147   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
148   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
149   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
150   for ( i=0; i<aij->stash.n; i++ ) {
151     idx = aij->stash.idx[i];
152     for ( j=0; j<size; j++ ) {
153       if (idx >= owners[j] && idx < owners[j+1]) {
154         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
155       }
156     }
157   }
158   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
159 
160   /* inform other processors of number of messages and max length*/
161   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
162   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
163   nreceives = work[rank];
164   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
165   nmax = work[rank];
166   PetscFree(work);
167 
168   /* post receives:
169        1) each message will consist of ordered triples
170      (global row index, global column index, value); the indices are stored
171      as Scalars to simplify the message passing.
172        2) since we don't know how long each individual message is, we
173      allocate the largest needed buffer for each receive.  Potentially
174      this is a lot of wasted space.
175 
176 
177        This could be done better.
178   */
179   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
180   CHKPTRQ(rvalues);
181   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
182   CHKPTRQ(recv_waits);
183   for ( i=0; i<nreceives; i++ ) {
184     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
185               comm,recv_waits+i);
186   }
187 
188   /* do sends:
189       1) starts[i] gives the starting index in svalues for stuff going to
190          the ith processor
191   */
192   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
193   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
194   CHKPTRQ(send_waits);
195   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
196   starts[0] = 0;
197   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
198   for ( i=0; i<aij->stash.n; i++ ) {
199     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
200     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
201     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
202   }
203   PetscFree(owner);
204   starts[0] = 0;
205   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
206   count = 0;
207   for ( i=0; i<size; i++ ) {
208     if (procs[i]) {
209       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
210                 comm,send_waits+count++);
211     }
212   }
213   PetscFree(starts); PetscFree(nprocs);
214 
215   /* Free cache space */
216   PLogInfo(0,"[%d]MatAssemblyBegin_MPIAIJ:Number of off processor values %d\n",rank,aij->stash.n);
217   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
218 
219   aij->svalues    = svalues;    aij->rvalues    = rvalues;
220   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
221   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
222   aij->rmax       = nmax;
223 
224   return 0;
225 }
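/*
   Message layout sketch (a reading of the packing loop above, not new
   behavior): each stashed entry occupies three consecutive Scalars in
   svalues,

       svalues[3*k]   = (Scalar) global row index
       svalues[3*k+1] = (Scalar) global column index
       svalues[3*k+2] = numerical value

   so a message carrying p entries has length 3*p in units of MPIU_SCALAR,
   which is how MatAssemblyEnd_MPIAIJ() recovers p from MPI_Get_count()/3.
*/
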
226 extern int MatSetUpMultiply_MPIAIJ(Mat);
227 
228 static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
229 {
230   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
231   Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
232   MPI_Status  *send_status,recv_status;
233   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
234   int         row,col,other_disassembled,shift = C->indexshift;
235   Scalar      *values,val;
236   InsertMode  addv = aij->insertmode;
237 
238   /*  wait on receives */
239   while (count) {
240     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
241     /* unpack receives into our local space */
242     values = aij->rvalues + 3*imdex*aij->rmax;
243     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
244     n = n/3;
245     for ( i=0; i<n; i++ ) {
246       row = (int) PetscReal(values[3*i]) - aij->rstart;
247       col = (int) PetscReal(values[3*i+1]);
248       val = values[3*i+2];
249       if (col >= aij->cstart && col < aij->cend) {
250         col -= aij->cstart;
251         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
252       }
253       else {
254         if (mat->was_assembled) {
255           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
256           col = aij->colmap[col] + shift;
257           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
258             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
259             col = (int) PetscReal(values[3*i+1]);
260           }
261         }
262         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
263       }
264     }
265     count--;
266   }
267   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
268 
269   /* wait on sends */
270   if (aij->nsends) {
271     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
272     CHKPTRQ(send_status);
273     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
274     PetscFree(send_status);
275   }
276   PetscFree(aij->send_waits); PetscFree(aij->svalues);
277 
278   aij->insertmode = NOT_SET_VALUES;
279   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
280   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
281 
282   /* determine if any processor has disassembled; if so, we must
283      also disassemble ourselves so that we can reassemble. */
284   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
285   if (mat->was_assembled && !other_disassembled) {
286     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
287   }
288 
289   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
290     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
291   }
292   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
293   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
294 
295   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
296   return 0;
297 }
298 
299 static int MatZeroEntries_MPIAIJ(Mat A)
300 {
301   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
302   int        ierr;
303   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
304   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
305   return 0;
306 }
307 
308 /* the code does not handle the diagonal entries correctly unless the
309    matrix is square and the column and row ownerships are identical.
310    This is a BUG. The only way to fix it seems to be to access
311    aij->A and aij->B directly and not through the MatZeroRows()
312    routine.
313 */
314 static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
315 {
316   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
317   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
318   int            *procs,*nprocs,j,found,idx,nsends,*work;
319   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
320   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
321   int            *lens,imdex,*lrows,*values;
322   MPI_Comm       comm = A->comm;
323   MPI_Request    *send_waits,*recv_waits;
324   MPI_Status     recv_status,*send_status;
325   IS             istmp;
326 
327   ierr = ISGetLocalSize(is,&N); CHKERRQ(ierr);
328   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
329 
330   /*  first count number of contributors to each processor */
331   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
332   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
333   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
334   for ( i=0; i<N; i++ ) {
335     idx = rows[i];
336     found = 0;
337     for ( j=0; j<size; j++ ) {
338       if (idx >= owners[j] && idx < owners[j+1]) {
339         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
340       }
341     }
342     if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
343   }
344   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
345 
346   /* inform other processors of number of messages and max length*/
347   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
348   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
349   nrecvs = work[rank];
350   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
351   nmax = work[rank];
352   PetscFree(work);
353 
354   /* post receives:   */
355   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
356   CHKPTRQ(rvalues);
357   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
358   CHKPTRQ(recv_waits);
359   for ( i=0; i<nrecvs; i++ ) {
360     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
361   }
362 
363   /* do sends:
364       1) starts[i] gives the starting index in svalues for stuff going to
365          the ith processor
366   */
367   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
368   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
369   CHKPTRQ(send_waits);
370   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
371   starts[0] = 0;
372   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
373   for ( i=0; i<N; i++ ) {
374     svalues[starts[owner[i]]++] = rows[i];
375   }
376   ISRestoreIndices(is,&rows);
377 
378   starts[0] = 0;
379   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
380   count = 0;
381   for ( i=0; i<size; i++ ) {
382     if (procs[i]) {
383       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
384     }
385   }
386   PetscFree(starts);
387 
388   base = owners[rank];
389 
390   /*  wait on receives */
391   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
392   source = lens + nrecvs;
393   count  = nrecvs; slen = 0;
394   while (count) {
395     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
396     /* unpack receives into our local space */
397     MPI_Get_count(&recv_status,MPI_INT,&n);
398     source[imdex]  = recv_status.MPI_SOURCE;
399     lens[imdex]  = n;
400     slen += n;
401     count--;
402   }
403   PetscFree(recv_waits);
404 
405   /* move the received row indices into a local (zero-based) list */
406   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
407   count = 0;
408   for ( i=0; i<nrecvs; i++ ) {
409     values = rvalues + i*nmax;
410     for ( j=0; j<lens[i]; j++ ) {
411       lrows[count++] = values[j] - base;
412     }
413   }
414   PetscFree(rvalues); PetscFree(lens);
415   PetscFree(owner); PetscFree(nprocs);
416 
417   /* actually zap the local rows */
418   ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
419   PLogObjectParent(A,istmp);
420   PetscFree(lrows);
421   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
422   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
423   ierr = ISDestroy(istmp); CHKERRQ(ierr);
424 
425   /* wait on sends */
426   if (nsends) {
427     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
428     CHKPTRQ(send_status);
429     MPI_Waitall(nsends,send_waits,send_status);
430     PetscFree(send_status);
431   }
432   PetscFree(send_waits); PetscFree(svalues);
433 
434   return 0;
435 }
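/*
   Usage sketch (illustrative only; the index set below is hypothetical): for
   an assembled MATMPIAIJ matrix A, zero global rows 0 and 5 and place 1.0 on
   their diagonals,

       IS     is;
       int    rows[2] = {0, 5};
       Scalar one = 1.0;
       ierr = ISCreateSeq(MPI_COMM_SELF,2,rows,&is); CHKERRQ(ierr);
       ierr = MatZeroRows(A,is,&one); CHKERRQ(ierr);
       ierr = ISDestroy(is); CHKERRQ(ierr);

   Each process forwards the rows it does not own to the owning process, so
   the routine is collective and any process may list any global row.
*/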
436 
437 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
438 {
439   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
440   int        ierr;
441 
442   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
443   ierr = MatMult(a->A,xx,yy); CHKERRQ(ierr);
444   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
445   ierr = MatMultAdd(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
446   return 0;
447 }
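/*
   In block form the product above is (notation only, not code):

       y = A_d * x_local + A_o * x_ghost

   where A_d = a->A holds the locally owned columns, A_o = a->B holds the
   remaining columns (compressed through garray), and x_ghost = a->lvec is
   filled by the scatter that is overlapped with the local multiply.
*/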
448 
449 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
450 {
451   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
452   int        ierr;
453   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
454   ierr = MatMultAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
455   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
456   ierr = MatMultAdd(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
457   return 0;
458 }
459 
460 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
461 {
462   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
463   int        ierr;
464 
465   /* do nondiagonal part */
466   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
467   /* send it on its way */
468   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
469                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
470   /* do local part */
471   ierr = MatMultTrans(a->A,xx,yy); CHKERRQ(ierr);
472   /* receive remote parts: note this assumes the values are not actually */
473   /* inserted into yy until the next line, which is true for this        */
474   /* implementation but may not always be true.                          */
475   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
476                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
477   return 0;
478 }
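/*
   In block form the transpose product above works as follows (notation only):
   each process computes its diagonal-block contribution A_d^T * x_local
   directly into yy, while its off-diagonal contribution A_o^T * x_local is
   formed in a->lvec and then added into the owning processes' entries of yy
   by the reverse scatter with ADD_VALUES.
*/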
479 
480 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
481 {
482   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
483   int        ierr;
484 
485   /* do nondiagonal part */
486   ierr = MatMultTrans(a->B,xx,a->lvec); CHKERRQ(ierr);
487   /* send it on its way */
488   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
489                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
490   /* do local part */
491   ierr = MatMultTransAdd(a->A,xx,yy,zz); CHKERRQ(ierr);
492   /* receive remote parts: note this assumes the values are not actually */
493   /* inserted into zz until the next line, which is true for this        */
494   /* implementation but may not always be true.                          */
495   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
496                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
497   return 0;
498 }
499 
500 /*
501   This only works correctly for square matrices where the subblock a->A is the
502    diagonal block.
503 */
504 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
505 {
506   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
507   return MatGetDiagonal(a->A,v);
508 }
509 
510 static int MatScale_MPIAIJ(Scalar *aa,Mat A)
511 {
512   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
513   int        ierr;
514   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
515   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
516   return 0;
517 }
518 
519 static int MatDestroy_MPIAIJ(PetscObject obj)
520 {
521   Mat        mat = (Mat) obj;
522   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
523   int        ierr;
524 #if defined(PETSC_LOG)
525   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
526 #endif
527   PetscFree(aij->rowners);
528   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
529   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
530   if (aij->colmap) PetscFree(aij->colmap);
531   if (aij->garray) PetscFree(aij->garray);
532   if (aij->lvec)   VecDestroy(aij->lvec);
533   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
534   if (aij->rowvalues) PetscFree(aij->rowvalues);
535   PetscFree(aij);
536   PLogObjectDestroy(mat);
537   PetscHeaderDestroy(mat);
538   return 0;
539 }
540 #include "draw.h"
541 #include "pinclude/pviewer.h"
542 
543 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
544 {
545   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
546   int         ierr;
547 
548   if (aij->size == 1) {
549     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
550   }
551   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
552   return 0;
553 }
554 
555 static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
556 {
557   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
558   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
559   int         ierr, format,shift = C->indexshift,rank;
560   PetscObject vobj = (PetscObject) viewer;
561   FILE        *fd;
562 
563   if (vobj->type == ASCII_FILE_VIEWER || vobj->type == ASCII_FILES_VIEWER) {
564     ierr = ViewerFileGetFormat_Private(viewer,&format);
565     if (format == FILE_FORMAT_INFO_DETAILED) {
566       int nz, nzalloc, mem, flg;
567       MPI_Comm_rank(mat->comm,&rank);
568       ierr = ViewerFileGetPointer(viewer,&fd); CHKERRQ(ierr);
569       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
570       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
571       MPIU_Seq_begin(mat->comm,1);
572       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
573          rank,aij->m,nz,nzalloc,mem);
574       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
575          rank,aij->m,nz,nzalloc,mem);
576       ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
577       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz);
578       ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
579       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz);
580       fflush(fd);
581       MPIU_Seq_end(mat->comm,1);
582       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
583       return 0;
584     }
585     else if (format == FILE_FORMAT_INFO) {
586       return 0;
587     }
588   }
589 
590   if (vobj->type == ASCII_FILE_VIEWER) {
591     ierr = ViewerFileGetPointer(viewer,&fd); CHKERRQ(ierr);
592     MPIU_Seq_begin(mat->comm,1);
593     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
594            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
595            aij->cend);
596     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
597     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
598     fflush(fd);
599     MPIU_Seq_end(mat->comm,1);
600   }
601   else {
602     int size = aij->size;
603     rank = aij->rank;
604     if (size == 1) {
605       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
606     }
607     else {
608       /* assemble the entire matrix onto first processor. */
609       Mat         A;
610       Mat_SeqAIJ *Aloc;
611       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
612       Scalar      *a;
613 
614       if (!rank) {
615         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
616                CHKERRQ(ierr);
617       }
618       else {
619         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
620                CHKERRQ(ierr);
621       }
622       PLogObjectParent(mat,A);
623 
624       /* copy over the A part */
625       Aloc = (Mat_SeqAIJ*) aij->A->data;
626       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
627       row = aij->rstart;
628       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
629       for ( i=0; i<m; i++ ) {
630         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
631         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
632       }
633       aj = Aloc->j;
634       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
635 
636       /* copy over the B part */
637       Aloc = (Mat_SeqAIJ*) aij->B->data;
638       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
639       row = aij->rstart;
640       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
641       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
642       for ( i=0; i<m; i++ ) {
643         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
644         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
645       }
646       PetscFree(ct);
647       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
648       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
649       if (!rank) {
650         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
651       }
652       ierr = MatDestroy(A); CHKERRQ(ierr);
653     }
654   }
655   return 0;
656 }
657 
658 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
659 {
660   Mat         mat = (Mat) obj;
661   int         ierr;
662   PetscObject vobj = (PetscObject) viewer;
663 
664   if (!viewer) {
665     viewer = STDOUT_VIEWER_SELF; vobj = (PetscObject) viewer;
666   }
667   if (vobj->cookie == DRAW_COOKIE && vobj->type == NULLWINDOW) return 0;
668   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILE_VIEWER) {
669     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
670   }
671   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == ASCII_FILES_VIEWER) {
672     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
673   }
674   else if (vobj->cookie == VIEWER_COOKIE && vobj->type == MATLAB_VIEWER) {
675     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
676   }
677   else if (vobj->cookie == DRAW_COOKIE) {
678     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
679   }
680   else if (vobj->type == BINARY_FILE_VIEWER) {
681     return MatView_MPIAIJ_Binary(mat,viewer);
682   }
683   return 0;
684 }
685 
686 extern int MatMarkDiag_SeqAIJ(Mat);
687 /*
688     This has to provide several versions:
689 
690      1) as in the sequential case
691      2) a) use only local smoothing, updating the outer values only once
692         b) local smoothing, updating the outer values each inner iteration
693      3) coloring, updating the outer values between colors
694 */
695 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
696                            double fshift,int its,Vec xx)
697 {
698   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
699   Mat        AA = mat->A, BB = mat->B;
700   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
701   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
702   int        ierr,*idx, *diag;
703   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
704   Vec        tt;
705 
706   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
707   xs = x + shift; /* shift by one for index start of 1 */
708   ls = ls + shift;
709   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
710   diag = A->diag;
711   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
712     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
713   }
714   if (flag & SOR_EISENSTAT) {
715     /* Let  A = L + U + D, where L is lower triangular,
716     U is upper triangular, and D is diagonal; this routine applies
717 
718             (L + E)^{-1} A (U + E)^{-1}
719 
720     to a vector efficiently using Eisenstat's trick.  This is for
721     the case of the SSOR preconditioner, so E is D/omega where omega
722     is the relaxation factor.
723     */
724     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
725     PLogObjectParent(matin,tt);
726     VecGetArray(tt,&t);
727     scale = (2.0/omega) - 1.0;
728     /*  x = (E + U)^{-1} b */
729     VecSet(&zero,mat->lvec);
730     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
731                               mat->Mvctx); CHKERRQ(ierr);
732     for ( i=m-1; i>-1; i-- ) {
733       n    = A->i[i+1] - diag[i] - 1;
734       idx  = A->j + diag[i] + !shift;
735       v    = A->a + diag[i] + !shift;
736       sum  = b[i];
737       SPARSEDENSEMDOT(sum,xs,v,idx,n);
738       d    = fshift + A->a[diag[i]+shift];
739       n    = B->i[i+1] - B->i[i];
740       idx  = B->j + B->i[i] + shift;
741       v    = B->a + B->i[i] + shift;
742       SPARSEDENSEMDOT(sum,ls,v,idx,n);
743       x[i] = omega*(sum/d);
744     }
745     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
746                             mat->Mvctx); CHKERRQ(ierr);
747 
748     /*  t = b - (2*E - D)x */
749     v = A->a;
750     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
751 
752     /*  t = (E + L)^{-1}t */
753     ts = t + shift; /* shifted by one when the indices in A->a and A->j start at 1 */
754     diag = A->diag;
755     VecSet(&zero,mat->lvec);
756     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
757                                                  mat->Mvctx); CHKERRQ(ierr);
758     for ( i=0; i<m; i++ ) {
759       n    = diag[i] - A->i[i];
760       idx  = A->j + A->i[i] + shift;
761       v    = A->a + A->i[i] + shift;
762       sum  = t[i];
763       SPARSEDENSEMDOT(sum,ts,v,idx,n);
764       d    = fshift + A->a[diag[i]+shift];
765       n    = B->i[i+1] - B->i[i];
766       idx  = B->j + B->i[i] + shift;
767       v    = B->a + B->i[i] + shift;
768       SPARSEDENSEMDOT(sum,ls,v,idx,n);
769       t[i] = omega*(sum/d);
770     }
771     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
772                                                     mat->Mvctx); CHKERRQ(ierr);
773     /*  x = x + t */
774     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
775     VecDestroy(tt);
776     return 0;
777   }
778 
779 
780   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
781     if (flag & SOR_ZERO_INITIAL_GUESS) {
782       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
783     }
784     else {
785       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
786       CHKERRQ(ierr);
787       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
788       CHKERRQ(ierr);
789     }
790     while (its--) {
791       /* go down through the rows */
792       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
793                               mat->Mvctx); CHKERRQ(ierr);
794       for ( i=0; i<m; i++ ) {
795         n    = A->i[i+1] - A->i[i];
796         idx  = A->j + A->i[i] + shift;
797         v    = A->a + A->i[i] + shift;
798         sum  = b[i];
799         SPARSEDENSEMDOT(sum,xs,v,idx,n);
800         d    = fshift + A->a[diag[i]+shift];
801         n    = B->i[i+1] - B->i[i];
802         idx  = B->j + B->i[i] + shift;
803         v    = B->a + B->i[i] + shift;
804         SPARSEDENSEMDOT(sum,ls,v,idx,n);
805         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
806       }
807       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
808                             mat->Mvctx); CHKERRQ(ierr);
809       /* come up through the rows */
810       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
811                               mat->Mvctx); CHKERRQ(ierr);
812       for ( i=m-1; i>-1; i-- ) {
813         n    = A->i[i+1] - A->i[i];
814         idx  = A->j + A->i[i] + shift;
815         v    = A->a + A->i[i] + shift;
816         sum  = b[i];
817         SPARSEDENSEMDOT(sum,xs,v,idx,n);
818         d    = fshift + A->a[diag[i]+shift];
819         n    = B->i[i+1] - B->i[i];
820         idx  = B->j + B->i[i] + shift;
821         v    = B->a + B->i[i] + shift;
822         SPARSEDENSEMDOT(sum,ls,v,idx,n);
823         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
824       }
825       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
826                             mat->Mvctx); CHKERRQ(ierr);
827     }
828   }
829   else if (flag & SOR_FORWARD_SWEEP){
830     if (flag & SOR_ZERO_INITIAL_GUESS) {
831       VecSet(&zero,mat->lvec);
832       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
833                               mat->Mvctx); CHKERRQ(ierr);
834       for ( i=0; i<m; i++ ) {
835         n    = diag[i] - A->i[i];
836         idx  = A->j + A->i[i] + shift;
837         v    = A->a + A->i[i] + shift;
838         sum  = b[i];
839         SPARSEDENSEMDOT(sum,xs,v,idx,n);
840         d    = fshift + A->a[diag[i]+shift];
841         n    = B->i[i+1] - B->i[i];
842         idx  = B->j + B->i[i] + shift;
843         v    = B->a + B->i[i] + shift;
844         SPARSEDENSEMDOT(sum,ls,v,idx,n);
845         x[i] = omega*(sum/d);
846       }
847       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
848                             mat->Mvctx); CHKERRQ(ierr);
849       its--;
850     }
851     while (its--) {
852       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
853       CHKERRQ(ierr);
854       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
855       CHKERRQ(ierr);
856       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
857                               mat->Mvctx); CHKERRQ(ierr);
858       for ( i=0; i<m; i++ ) {
859         n    = A->i[i+1] - A->i[i];
860         idx  = A->j + A->i[i] + shift;
861         v    = A->a + A->i[i] + shift;
862         sum  = b[i];
863         SPARSEDENSEMDOT(sum,xs,v,idx,n);
864         d    = fshift + A->a[diag[i]+shift];
865         n    = B->i[i+1] - B->i[i];
866         idx  = B->j + B->i[i] + shift;
867         v    = B->a + B->i[i] + shift;
868         SPARSEDENSEMDOT(sum,ls,v,idx,n);
869         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
870       }
871       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
872                             mat->Mvctx); CHKERRQ(ierr);
873     }
874   }
875   else if (flag & SOR_BACKWARD_SWEEP){
876     if (flag & SOR_ZERO_INITIAL_GUESS) {
877       VecSet(&zero,mat->lvec);
878       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
879                               mat->Mvctx); CHKERRQ(ierr);
880       for ( i=m-1; i>-1; i-- ) {
881         n    = A->i[i+1] - diag[i] - 1;
882         idx  = A->j + diag[i] + !shift;
883         v    = A->a + diag[i] + !shift;
884         sum  = b[i];
885         SPARSEDENSEMDOT(sum,xs,v,idx,n);
886         d    = fshift + A->a[diag[i]+shift];
887         n    = B->i[i+1] - B->i[i];
888         idx  = B->j + B->i[i] + shift;
889         v    = B->a + B->i[i] + shift;
890         SPARSEDENSEMDOT(sum,ls,v,idx,n);
891         x[i] = omega*(sum/d);
892       }
893       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
894                             mat->Mvctx); CHKERRQ(ierr);
895       its--;
896     }
897     while (its--) {
898       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
899                             mat->Mvctx); CHKERRQ(ierr);
900       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
901                             mat->Mvctx); CHKERRQ(ierr);
902       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
903                               mat->Mvctx); CHKERRQ(ierr);
904       for ( i=m-1; i>-1; i-- ) {
905         n    = A->i[i+1] - A->i[i];
906         idx  = A->j + A->i[i] + shift;
907         v    = A->a + A->i[i] + shift;
908         sum  = b[i];
909         SPARSEDENSEMDOT(sum,xs,v,idx,n);
910         d    = fshift + A->a[diag[i]+shift];
911         n    = B->i[i+1] - B->i[i];
912         idx  = B->j + B->i[i] + shift;
913         v    = B->a + B->i[i] + shift;
914         SPARSEDENSEMDOT(sum,ls,v,idx,n);
915         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
916       }
917       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
918                             mat->Mvctx); CHKERRQ(ierr);
919     }
920   }
921   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
922     if (flag & SOR_ZERO_INITIAL_GUESS) {
923       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
924     }
925     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
926     CHKERRQ(ierr);
927     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
928     CHKERRQ(ierr);
929     while (its--) {
930       /* go down through the rows */
931       for ( i=0; i<m; i++ ) {
932         n    = A->i[i+1] - A->i[i];
933         idx  = A->j + A->i[i] + shift;
934         v    = A->a + A->i[i] + shift;
935         sum  = b[i];
936         SPARSEDENSEMDOT(sum,xs,v,idx,n);
937         d    = fshift + A->a[diag[i]+shift];
938         n    = B->i[i+1] - B->i[i];
939         idx  = B->j + B->i[i] + shift;
940         v    = B->a + B->i[i] + shift;
941         SPARSEDENSEMDOT(sum,ls,v,idx,n);
942         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
943       }
944       /* come up through the rows */
945       for ( i=m-1; i>-1; i-- ) {
946         n    = A->i[i+1] - A->i[i];
947         idx  = A->j + A->i[i] + shift;
948         v    = A->a + A->i[i] + shift;
949         sum  = b[i];
950         SPARSEDENSEMDOT(sum,xs,v,idx,n);
951         d    = fshift + A->a[diag[i]+shift];
952         n    = B->i[i+1] - B->i[i];
953         idx  = B->j + B->i[i] + shift;
954         v    = B->a + B->i[i] + shift;
955         SPARSEDENSEMDOT(sum,ls,v,idx,n);
956         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
957       }
958     }
959   }
960   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
961     if (flag & SOR_ZERO_INITIAL_GUESS) {
962       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
963     }
964     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
965     CHKERRQ(ierr);
966     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
967     CHKERRQ(ierr);
968     while (its--) {
969       for ( i=0; i<m; i++ ) {
970         n    = A->i[i+1] - A->i[i];
971         idx  = A->j + A->i[i] + shift;
972         v    = A->a + A->i[i] + shift;
973         sum  = b[i];
974         SPARSEDENSEMDOT(sum,xs,v,idx,n);
975         d    = fshift + A->a[diag[i]+shift];
976         n    = B->i[i+1] - B->i[i];
977         idx  = B->j + B->i[i] + shift;
978         v    = B->a + B->i[i] + shift;
979         SPARSEDENSEMDOT(sum,ls,v,idx,n);
980         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
981       }
982     }
983   }
984   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
985     if (flag & SOR_ZERO_INITIAL_GUESS) {
986       return MatRelax(mat->A,bb,omega,flag,fshift,its,xx);
987     }
988     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
989                             mat->Mvctx); CHKERRQ(ierr);
990     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
991                             mat->Mvctx); CHKERRQ(ierr);
992     while (its--) {
993       for ( i=m-1; i>-1; i-- ) {
994         n    = A->i[i+1] - A->i[i];
995         idx  = A->j + A->i[i] + shift;
996         v    = A->a + A->i[i] + shift;
997         sum  = b[i];
998         SPARSEDENSEMDOT(sum,xs,v,idx,n);
999         d    = fshift + A->a[diag[i]+shift];
1000         n    = B->i[i+1] - B->i[i];
1001         idx  = B->j + B->i[i] + shift;
1002         v    = B->a + B->i[i] + shift;
1003         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1004         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
1005       }
1006     }
1007   }
1008   return 0;
1009 }
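/*
   Up to the simplifications taken in the zero-initial-guess branches, each
   sweep above applies the damped SOR/Gauss-Seidel update (notation only, and
   assuming SPARSEDENSEMDOT(sum,xs,v,idx,n) subtracts the sparse dot product
   from sum, as its name suggests):

       x_i <- (1 - omega)*x_i + omega*( x_i + (b_i - sum_j a_ij x_j)/d_i )

   where the row sum is split between the diagonal block A (using xs) and the
   off-diagonal block B (using the ghost values ls), and d_i = fshift + a_ii.
*/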
1010 
1011 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
1012                              int *nzalloc,int *mem)
1013 {
1014   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1015   Mat        A = mat->A, B = mat->B;
1016   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1017 
1018   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1019   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1020   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1021   if (flag == MAT_LOCAL) {
1022     *nz = isend[0]; *nzalloc = isend[1]; *mem = isend[2];
1023   } else if (flag == MAT_GLOBAL_MAX) {
1024     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1025     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1026   } else if (flag == MAT_GLOBAL_SUM) {
1027     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1028     *nz = irecv[0]; *nzalloc = irecv[1]; *mem = irecv[2];
1029   }
1030   return 0;
1031 }
1032 
1033 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1034 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1035 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1036 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1037 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1038 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1039 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1040 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1041 
1042 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1043 {
1044   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1045 
1046   if (op == NO_NEW_NONZERO_LOCATIONS ||
1047       op == YES_NEW_NONZERO_LOCATIONS ||
1048       op == COLUMNS_SORTED ||
1049       op == ROW_ORIENTED) {
1050         MatSetOption(a->A,op);
1051         MatSetOption(a->B,op);
1052   }
1053   else if (op == ROWS_SORTED ||
1054            op == SYMMETRIC_MATRIX ||
1055            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1056            op == YES_NEW_DIAGONALS)
1057     PLogInfo((PetscObject)A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1058   else if (op == COLUMN_ORIENTED) {
1059     a->roworiented = 0;
1060     MatSetOption(a->A,op);
1061     MatSetOption(a->B,op);
1062   }
1063   else if (op == NO_NEW_DIAGONALS)
1064     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1065   else
1066     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1067   return 0;
1068 }
1069 
1070 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1071 {
1072   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1073   *m = mat->M; *n = mat->N;
1074   return 0;
1075 }
1076 
1077 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1078 {
1079   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1080   *m = mat->m; *n = mat->n;
1081   return 0;
1082 }
1083 
1084 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1085 {
1086   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1087   *m = mat->rstart; *n = mat->rend;
1088   return 0;
1089 }
1090 
1091 static int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1092 {
1093   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1094   Scalar     *vworkA, *vworkB, **pvA, **pvB;
1095   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1096   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1097 
1098   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,"MatGetRow_MPIAIJ:Already active");
1099   mat->getrowactive = PETSC_TRUE;
1100 
1101   if (!mat->rowvalues) {
1102     /*
1103         allocate enough space to hold information from the longest row.
1104     */
1105     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1106     int     max = 1,n = mat->n,tmp;
1107     for ( i=0; i<n; i++ ) {
1108       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1109       if (max < tmp) { max = tmp; }
1110     }
1111     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1112     CHKPTRQ(mat->rowvalues);
1113     mat->rowindices = (int *) (mat->rowvalues + max);
1114   }
1115 
1116 
1117   if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows")
1118   lrow = row - rstart;
1119 
1120   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1121   if (!v)   {pvA = 0; pvB = 0;}
1122   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1123   ierr = MatGetRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1124   ierr = MatGetRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1125   nztot = nzA + nzB;
1126 
1127   if (v  || idx) {
1128     if (nztot) {
1129       /* Sort by increasing column numbers, assuming A and B already sorted */
1130       int imark;
1131       for (i=0; i<nzB; i++) cworkB[i] = mat->garray[cworkB[i]];
1132       if (v) {
1133         *v = mat->rowvalues;
1134         for ( i=0; i<nzB; i++ ) {
1135           if (cworkB[i] < cstart)   (*v)[i] = vworkB[i];
1136           else break;
1137         }
1138         imark = i;
1139         for ( i=0; i<nzA; i++ )     (*v)[imark+i] = vworkA[i];
1140         for ( i=imark; i<nzB; i++ ) (*v)[nzA+i] = vworkB[i];
1141       }
1142       if (idx) {
1143         *idx = mat->rowindices;
1144         for (i=0; i<nzA; i++) cworkA[i] += cstart;
1145         for ( i=0; i<nzB; i++ ) {
1146           if (cworkB[i] < cstart)   (*idx)[i] = cworkB[i];
1147           else break;
1148         }
1149         imark = i;
1150         for ( i=0; i<nzA; i++ )     (*idx)[imark+i] = cworkA[i];
1151         for ( i=imark; i<nzB; i++ ) (*idx)[nzA+i] = cworkB[i];
1152       }
1153     }
1154     else {*idx = 0; *v=0;}
1155   }
1156   *nz = nztot;
1157   ierr = MatRestoreRow(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1158   ierr = MatRestoreRow(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1159   return 0;
1160 }
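/*
   Merge sketch (illustrative, hypothetical numbers): if the diagonal block
   contributes local columns {1,3} (global {cstart+1,cstart+3}) and the
   off-diagonal block contributes global columns {2,cstart+20} through garray,
   the returned row lists them in increasing global order,

       idx = { 2, cstart+1, cstart+3, cstart+20 }

   i.e. the B entries with columns below cstart come first, then all the A
   entries, then the remaining B entries.
*/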
1161 
1162 static int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1163 {
1164   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1165   if (aij->getrowactive == PETSC_FALSE) {
1166     SETERRQ(1,"MatRestoreRow_MPIAIJ:MatGetRow not called");
1167   }
1168   aij->getrowactive = PETSC_FALSE;
1169   return 0;
1170 }
1171 
1172 static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1173 {
1174   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1175   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1176   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1177   double     sum = 0.0;
1178   Scalar     *v;
1179 
1180   if (aij->size == 1) {
1181     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1182   } else {
1183     if (type == NORM_FROBENIUS) {
1184       v = amat->a;
1185       for (i=0; i<amat->nz; i++ ) {
1186 #if defined(PETSC_COMPLEX)
1187         sum += real(conj(*v)*(*v)); v++;
1188 #else
1189         sum += (*v)*(*v); v++;
1190 #endif
1191       }
1192       v = bmat->a;
1193       for (i=0; i<bmat->nz; i++ ) {
1194 #if defined(PETSC_COMPLEX)
1195         sum += real(conj(*v)*(*v)); v++;
1196 #else
1197         sum += (*v)*(*v); v++;
1198 #endif
1199       }
1200       MPI_Allreduce((void*)&sum,(void*)norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1201       *norm = sqrt(*norm);
1202     }
1203     else if (type == NORM_1) { /* max column norm */
1204       double *tmp, *tmp2;
1205       int    *jj, *garray = aij->garray;
1206       tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1207       tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1208       PetscMemzero(tmp,aij->N*sizeof(double));
1209       *norm = 0.0;
1210       v = amat->a; jj = amat->j;
1211       for ( j=0; j<amat->nz; j++ ) {
1212         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1213       }
1214       v = bmat->a; jj = bmat->j;
1215       for ( j=0; j<bmat->nz; j++ ) {
1216         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1217       }
1218       MPI_Allreduce((void*)tmp,(void*)tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1219       for ( j=0; j<aij->N; j++ ) {
1220         if (tmp2[j] > *norm) *norm = tmp2[j];
1221       }
1222       PetscFree(tmp); PetscFree(tmp2);
1223     }
1224     else if (type == NORM_INFINITY) { /* max row norm */
1225       double ntemp = 0.0;
1226       for ( j=0; j<amat->m; j++ ) {
1227         v = amat->a + amat->i[j] + shift;
1228         sum = 0.0;
1229         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1230           sum += PetscAbsScalar(*v); v++;
1231         }
1232         v = bmat->a + bmat->i[j] + shift;
1233         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1234           sum += PetscAbsScalar(*v); v++;
1235         }
1236         if (sum > ntemp) ntemp = sum;
1237       }
1238       MPI_Allreduce((void*)&ntemp,(void*)norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1239     }
1240     else {
1241       SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
1242     }
1243   }
1244   return 0;
1245 }
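/*
   The three supported norms above are (notation only):

       NORM_FROBENIUS: sqrt( sum over all entries |a_ij|^2 ), summed across processes
       NORM_1:         max_j sum_i |a_ij|   (global column sums via MPI_Allreduce)
       NORM_INFINITY:  max_i sum_j |a_ij|   (local row sums, then a global max)
*/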
1246 
1247 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1248 {
1249   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1250   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1251   int        ierr,shift = Aloc->indexshift;
1252   Mat        B;
1253   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1254   Scalar     *array;
1255 
1256   if (matout == PETSC_NULL && M != N)
1257     SETERRQ(1,"MatTranspose_MPIAIJ:Square matrix only for in-place");
1258   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,
1259          PETSC_NULL,&B); CHKERRQ(ierr);
1260 
1261   /* copy over the A part */
1262   Aloc = (Mat_SeqAIJ*) a->A->data;
1263   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1264   row = a->rstart;
1265   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1266   for ( i=0; i<m; i++ ) {
1267     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1268     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1269   }
1270   aj = Aloc->j;
1271   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1272 
1273   /* copy over the B part */
1274   Aloc = (Mat_SeqAIJ*) a->B->data;
1275   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1276   row = a->rstart;
1277   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1278   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1279   for ( i=0; i<m; i++ ) {
1280     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1281     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1282   }
1283   PetscFree(ct);
1284   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1285   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1286   if (matout != PETSC_NULL) {
1287     *matout = B;
1288   } else {
1289     /* This isn't really an in-place transpose, but we free the data structures from a */
1290     PetscFree(a->rowners);
1291     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1292     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1293     if (a->colmap) PetscFree(a->colmap);
1294     if (a->garray) PetscFree(a->garray);
1295     if (a->lvec) VecDestroy(a->lvec);
1296     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1297     PetscFree(a);
1298     PetscMemcpy(A,B,sizeof(struct _Mat));
1299     PetscHeaderDestroy(B);
1300   }
1301   return 0;
1302 }
1303 
1304 extern int MatPrintHelp_SeqAIJ(Mat);
1305 static int MatPrintHelp_MPIAIJ(Mat A)
1306 {
1307   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1308 
1309   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1310   else return 0;
1311 }
1312 
1313 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1314 static int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1315 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1316 int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1317 /* -------------------------------------------------------------------*/
1318 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1319        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1320        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1321        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1322        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1323        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1324        MatLUFactor_MPIAIJ,0,
1325        MatRelax_MPIAIJ,
1326        MatTranspose_MPIAIJ,
1327        MatGetInfo_MPIAIJ,0,
1328        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1329        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1330        0,
1331        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1332        MatGetReordering_MPIAIJ,
1333        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1334        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1335        MatILUFactorSymbolic_MPIAIJ,0,
1336        0,0,MatConvert_MPIAIJ,0,0,MatConvertSameType_MPIAIJ,0,0,
1337        0,0,0,
1338        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1339        MatPrintHelp_MPIAIJ,
1340        MatScale_MPIAIJ};
1341 
1342 /*@C
1343    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1344    (the default parallel PETSc format).  For good matrix assembly performance
1345    the user should preallocate the matrix storage by setting the parameters
1346    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1347    performance can be increased by more than a factor of 50.
1348 
1349    Input Parameters:
1350 .  comm - MPI communicator
1351 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1352 .  n - number of local columns (or PETSC_DECIDE to have calculated
1353            if N is given)
1354 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1355 .  N - number of global columns (or PETSC_DECIDE to have calculated
1356            if n is given)
1357 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1358            (same for all local rows)
1359 .  d_nnz - number of nonzeros per row in diagonal portion of local submatrix
1360            or null (possibly different for each row).  You must leave room
1361            for the diagonal entry even if it is zero.
1362 .  o_nz - number of nonzeros per row in off-diagonal portion of local
1363            submatrix (same for all local rows).
1364 .  o_nnz - number of nonzeros per row in off-diagonal portion of local
1365            submatrix or null (possibly different for each row).
1366 
1367    Output Parameter:
1368 .  newmat - the matrix
1369 
1370    Notes:
1371    The AIJ format (also called the Yale sparse matrix format or
1372    compressed row storage) is fully compatible with standard Fortran 77
1373    storage.  That is, the stored row and column indices can begin at
1374    either one (as in Fortran) or zero.  See the users manual for details.
1375 
1376    The user MUST specify either the local or global matrix dimensions
1377    (possibly both).
1378 
1379    By default, this format uses inodes (identical nodes) when possible.
1380    We search for consecutive rows with the same nonzero structure, thereby
1381    reusing matrix information to achieve increased efficiency.
1382 
1383    Options Database Keys:
1384 $    -mat_aij_no_inode  - Do not use inodes
1385 $    -mat_aij_inode_limit <limit> - Set inode limit.
1386 $        (max limit=5)
1387 $    -mat_aij_oneindex - Internally use indexing starting at 1
1388 $        rather than 0.  Note: When calling MatSetValues(),
1389 $        the user still MUST index entries starting at 0!
1390 
1391    Storage Information:
1392    For a square global matrix we define each processor's diagonal portion
1393    to be its local rows and the corresponding columns (a square submatrix);
1394    each processor's off-diagonal portion encompasses the remainder of the
1395    local matrix (a rectangular submatrix).
1396 
1397    The user can specify preallocated storage for the diagonal part of
1398    the local submatrix with either d_nz or d_nnz (not both).  Set
1399    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1400    memory allocation.  Likewise, specify preallocated storage for the
1401    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1402 
1403    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1404    the figure below we depict these three local rows and all columns (0-11).
1405 
1406 $            0 1 2 3 4 5 6 7 8 9 10 11
1407 $          -------------------------
1408 $  row 3  |  o o o d d d o o o o o o
1409 $  row 4  |  o o o d d d o o o o o o
1410 $  row 5  |  o o o d d d o o o o o o
1411 $          -------------------------
1412 $
1413 
1414    Thus, any entries in the d locations are stored in the d (diagonal)
1415    submatrix, and any entries in the o locations are stored in the
1416    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1417    stored simply in the MATSEQAIJ format for compressed row storage.
1418 
1419    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1420    and o_nz should indicate the number of nonzeros per row in the o matrix.
1421    In general, for PDE problems in which most nonzeros are near the diagonal,
1422    one expects d_nz >> o_nz.   For additional details, see the users manual
1423    chapter on matrices and the file $(PETSC_DIR)/Performance.
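
   For instance, for the three local rows depicted above (assuming, purely for
   illustration, a 12-by-12 global matrix distributed across four processors,
   each owning three rows and three columns), the matching preallocation call
   would be roughly
$      ierr = MatCreateMPIAIJ(comm,3,3,12,12,3,PETSC_NULL,9,PETSC_NULL,&A);
   since every local row has 3 nonzeros in the d portion and 9 in the o portion.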
1424 
1425 .keywords: matrix, aij, compressed row, sparse, parallel
1426 
1427 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1428 @*/
1429 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1430                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *newmat)
1431 {
1432   Mat          mat;
1433   Mat_MPIAIJ   *a;
1434   int          ierr, i,sum[2],work[2];
1435 
1436   *newmat         = 0;
1437   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1438   PLogObjectCreate(mat);
1439   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1440   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1441   mat->destroy    = MatDestroy_MPIAIJ;
1442   mat->view       = MatView_MPIAIJ;
1443   mat->factor     = 0;
1444   mat->assembled  = PETSC_FALSE;
1445 
1446   a->insertmode = NOT_SET_VALUES;
1447   MPI_Comm_rank(comm,&a->rank);
1448   MPI_Comm_size(comm,&a->size);
1449 
1450   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1451     SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSc decide rows but set d_nnz or o_nnz");
1452 
1453   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1454     work[0] = m; work[1] = n;
1455     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1456     if (M == PETSC_DECIDE) M = sum[0];
1457     if (N == PETSC_DECIDE) N = sum[1];
1458   }
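  /* split the rows (columns) as evenly as possible: the first M%size (N%size)
     processors each receive one extra row (column) */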
1459   if (m == PETSC_DECIDE) {m = M/a->size + ((M % a->size) > a->rank);}
1460   if (n == PETSC_DECIDE) {n = N/a->size + ((N % a->size) > a->rank);}
1461   a->m = m;
1462   a->n = n;
1463   a->N = N;
1464   a->M = M;
1465 
1466   /* build local table of row and column ownerships */
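  /* rowners[i] ends up holding the first global row owned by processor i (so
     rowners[size] == M): the local sizes are gathered and then summed into a
     running total.  cowners is the analogous table for columns and shares the
     same allocation. */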
1467   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1468   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1469   a->cowners = a->rowners + a->size + 1;
1470   MPI_Allgather(&m,1,MPI_INT,a->rowners+1,1,MPI_INT,comm);
1471   a->rowners[0] = 0;
1472   for ( i=2; i<=a->size; i++ ) {
1473     a->rowners[i] += a->rowners[i-1];
1474   }
1475   a->rstart = a->rowners[a->rank];
1476   a->rend   = a->rowners[a->rank+1];
1477   MPI_Allgather(&n,1,MPI_INT,a->cowners+1,1,MPI_INT,comm);
1478   a->cowners[0] = 0;
1479   for ( i=2; i<=a->size; i++ ) {
1480     a->cowners[i] += a->cowners[i-1];
1481   }
1482   a->cstart = a->cowners[a->rank];
1483   a->cend   = a->cowners[a->rank+1];
1484 
1485   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1486   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&a->A); CHKERRQ(ierr);
1487   PLogObjectParent(mat,a->A);
1488   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1489   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&a->B); CHKERRQ(ierr);
1490   PLogObjectParent(mat,a->B);
1491 
1492   /* build cache (stash) for matrix entries destined for other processors */
1493   ierr = StashBuild_Private(&a->stash); CHKERRQ(ierr);
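  /* colmap (global-to-local column map for B) and garray (local-to-global) are
     created later, on demand; see CreateColmap_Private() and the assembly routines */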
1494   a->colmap      = 0;
1495   a->garray      = 0;
1496   a->roworiented = 1;
1497 
1498   /* stuff used for the parallel matrix-vector multiply */
1499   a->lvec      = 0;
1500   a->Mvctx     = 0;
1501 
1502   /* stuff for MatGetRow() */
1503   a->rowindices   = 0;
1504   a->rowvalues    = 0;
1505   a->getrowactive = PETSC_FALSE;
1506 
1507   *newmat = mat;
1508   return 0;
1509 }
1510 
1511 static int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1512 {
1513   Mat        mat;
1514   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1515   int        ierr, len,flg;
1516 
1517   *newmat       = 0;
1518   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
1519   PLogObjectCreate(mat);
1520   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1521   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1522   mat->destroy    = MatDestroy_MPIAIJ;
1523   mat->view       = MatView_MPIAIJ;
1524   mat->factor     = matin->factor;
1525   mat->assembled  = PETSC_TRUE;
1526 
1527   a->m          = oldmat->m;
1528   a->n          = oldmat->n;
1529   a->M          = oldmat->M;
1530   a->N          = oldmat->N;
1531 
1532   a->rstart     = oldmat->rstart;
1533   a->rend       = oldmat->rend;
1534   a->cstart     = oldmat->cstart;
1535   a->cend       = oldmat->cend;
1536   a->size       = oldmat->size;
1537   a->rank       = oldmat->rank;
1538   a->insertmode = NOT_SET_VALUES;
1539 
1540   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1541   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1542   a->cowners = a->rowners + a->size + 1; PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+1)*sizeof(int));
1543   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1544   if (oldmat->colmap) {
1545     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1546     PLogObjectMemory(mat,(a->N)*sizeof(int));
1547     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1548   } else a->colmap = 0;
1549   if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
1550     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1551     PLogObjectMemory(mat,len*sizeof(int));
1552     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1553   } else a->garray = 0;
1554 
1555   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1556   PLogObjectParent(mat,a->lvec);
1557   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1558   PLogObjectParent(mat,a->Mvctx);
1559   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1560   PLogObjectParent(mat,a->A);
1561   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1562   PLogObjectParent(mat,a->B);
1563   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1564   if (flg) {
1565     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1566   }
1567   *newmat = mat;
1568   return 0;
1569 }
1570 
1571 #include "sysio.h"
1572 
1573 int MatLoad_MPIAIJ(Viewer bview,MatType type,Mat *newmat)
1574 {
1575   Mat          A;
1576   int          i, nz, ierr, j,rstart, rend, fd;
1577   Scalar       *vals,*svals;
1578   PetscObject  vobj = (PetscObject) bview;
1579   MPI_Comm     comm = vobj->comm;
1580   MPI_Status   status;
1581   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1582   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1583   int          tag = ((PetscObject)bview)->tag;
1584 
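  /* processor 0 alone reads the binary file; it scatters the row lengths, then sends each
     processor its column indices and finally its numerical values, which every processor
     inserts locally with MatSetValues() */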
1585   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1586   if (!rank) {
1587     ierr = ViewerFileGetDescriptor_Private(bview,&fd); CHKERRQ(ierr);
1588     ierr = SYRead(fd,(char *)header,4,SYINT); CHKERRQ(ierr);
1589     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJ:not matrix object");
1590   }
1591 
1592   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1593   M = header[1]; N = header[2];
1594   /* determine ownership of all rows */
1595   m = M/size + ((M % size) > rank);
1596   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1597   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1598   rowners[0] = 0;
1599   for ( i=2; i<=size; i++ ) {
1600     rowners[i] += rowners[i-1];
1601   }
1602   rstart = rowners[rank];
1603   rend   = rowners[rank+1];
1604 
1605   /* distribute row lengths to all processors */
1606   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1607   offlens = ourlens + (rend-rstart);
1608   if (!rank) {
1609     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1610     ierr = SYRead(fd,rowlengths,M,SYINT); CHKERRQ(ierr);
1611     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1612     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1613     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1614     PetscFree(sndcounts);
1615   }
1616   else {
1617     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1618   }
1619 
1620   if (!rank) {
1621     /* calculate the number of nonzeros on each processor */
1622     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1623     PetscMemzero(procsnz,size*sizeof(int));
1624     for ( i=0; i<size; i++ ) {
1625       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1626         procsnz[i] += rowlengths[j];
1627       }
1628     }
1629     PetscFree(rowlengths);
1630 
1631     /* determine max buffer needed and allocate it */
1632     maxnz = 0;
1633     for ( i=0; i<size; i++ ) {
1634       maxnz = PetscMax(maxnz,procsnz[i]);
1635     }
1636     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1637 
1638     /* read in my part of the matrix column indices  */
1639     nz = procsnz[0];
1640     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1641     ierr = SYRead(fd,mycols,nz,SYINT); CHKERRQ(ierr);
1642 
1643     /* read in everyone else's part and ship it off */
1644     for ( i=1; i<size; i++ ) {
1645       nz = procsnz[i];
1646       ierr = SYRead(fd,cols,nz,SYINT); CHKERRQ(ierr);
1647       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1648     }
1649     PetscFree(cols);
1650   }
1651   else {
1652     /* determine buffer space needed for message */
1653     nz = 0;
1654     for ( i=0; i<m; i++ ) {
1655       nz += ourlens[i];
1656     }
1657     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1658 
1659     /* receive message of column indices */
1660     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1661     MPI_Get_count(&status,MPI_INT,&maxnz);
1662     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1663   }
1664 
1665   /* loop over local rows, determining number of off-diagonal entries */
1666   PetscMemzero(offlens,m*sizeof(int));
1667   jj = 0;
1668   for ( i=0; i<m; i++ ) {
1669     for ( j=0; j<ourlens[i]; j++ ) {
1670       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1671       jj++;
1672     }
1673   }
1674 
1675   /* create our matrix */
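  /* ourlens[] is temporarily reduced to the diagonal-block lengths so that it can be
     passed as d_nnz (with offlens[] as o_nnz); the full row lengths are restored right
     after the matrix is created, for use in the MatSetValues() loops below */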
1676   for ( i=0; i<m; i++ ) {
1677     ourlens[i] -= offlens[i];
1678   }
1679   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1680   A = *newmat;
1681   MatSetOption(A,COLUMNS_SORTED);
1682   for ( i=0; i<m; i++ ) {
1683     ourlens[i] += offlens[i];
1684   }
1685 
1686   if (!rank) {
1687     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1688 
1689     /* read in my part of the matrix numerical values  */
1690     nz = procsnz[0];
1691     ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1692 
1693     /* insert into matrix */
1694     jj      = rstart;
1695     smycols = mycols;
1696     svals   = vals;
1697     for ( i=0; i<m; i++ ) {
1698       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1699       smycols += ourlens[i];
1700       svals   += ourlens[i];
1701       jj++;
1702     }
1703 
1704     /* read in the other processors' values and ship them out */
1705     for ( i=1; i<size; i++ ) {
1706       nz = procsnz[i];
1707       ierr = SYRead(fd,vals,nz,SYSCALAR); CHKERRQ(ierr);
1708       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1709     }
1710     PetscFree(procsnz);
1711   }
1712   else {
1713     /* receive numeric values */
1714     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1715 
1716     /* receive message of values */
1717     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1718     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1719     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1720 
1721     /* insert into matrix */
1722     jj      = rstart;
1723     smycols = mycols;
1724     svals   = vals;
1725     for ( i=0; i<m; i++ ) {
1726       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1727       smycols += ourlens[i];
1728       svals   += ourlens[i];
1729       jj++;
1730     }
1731   }
1732   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1733 
1734   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1735   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1736   return 0;
1737 }
1738