xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision d9d09a02e1e5c0c12eda52f6d75965a49fd2c915)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.146 1996/07/01 15:33:56 bsmith Exp balay $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "src/vec/vecimpl.h"
7 #include "src/inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way since the
12 length of colmap equals the number of global columns of the matrix.
13 */
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
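
/*
   Illustrative example (added commentary, not part of the original code):
   if this process's off-diagonal block B references the global columns
   garray[] = {4,9,17} and indexshift is 0, then CreateColmap_Private() sets
        colmap[4] = 0,  colmap[9] = 1,  colmap[17] = 2
   (all other entries stay 0), so that a global column number g can later be
   converted to a local column of B with  col = colmap[g] + indexshift,
   as done in MatSetValues_MPIAIJ() and MatAssemblyEnd_MPIAIJ() below.
*/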
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatReordering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int        ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
39 static int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
42   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
43   Scalar     value;
44   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
45   int        cstart = aij->cstart, cend = aij->cend,row,col;
46   int        shift = C->indexshift,roworiented = aij->roworiented;
47 
48   if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
49     SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
50   }
51   aij->insertmode = addv;
52   for ( i=0; i<m; i++ ) {
53     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
54     if (im[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
55     if (im[i] >= rstart && im[i] < rend) {
56       row = im[i] - rstart;
57       for ( j=0; j<n; j++ ) {
58         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
59         if (in[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
60         if (in[j] >= cstart && in[j] < cend){
61           col = in[j] - cstart;
62           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
63           ierr = MatSetValues(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
64         }
65         else {
66           if (mat->was_assembled) {
67             if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
68             col = aij->colmap[in[j]] + shift;
69             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
70               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
71               col =  in[j];
72             }
73           }
74           else col = in[j];
75           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
76           ierr = MatSetValues(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
77         }
78       }
79     }
80     else {
81       if (roworiented) {
82         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
83       }
84       else {
85         row = im[i];
86         for ( j=0; j<n; j++ ) {
87           ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
88         }
89       }
90     }
91   }
92   return 0;
93 }
94 
95 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
96 {
97   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
98   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
99   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
100   int        cstart = aij->cstart, cend = aij->cend,row,col;
101   int        shift = C->indexshift;
102 
103   for ( i=0; i<m; i++ ) {
104     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
105     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
106     if (idxm[i] >= rstart && idxm[i] < rend) {
107       row = idxm[i] - rstart;
108       for ( j=0; j<n; j++ ) {
109         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
110         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
111         if (idxn[j] >= cstart && idxn[j] < cend){
112           col = idxn[j] - cstart;
113           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
114         }
115         else {
116           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
117           col = aij->colmap[idxn[j]] + shift;
118           if ( aij->garray[col] != idxn[j] ) *(v+i*n+j) = 0.0;
119           else {
120             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
121           }
122         }
123       }
124     }
125     else {
126       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
127     }
128   }
129   return 0;
130 }
131 
132 static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
133 {
134   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
135   MPI_Comm    comm = mat->comm;
136   int         size = aij->size, *owners = aij->rowners;
137   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
138   MPI_Request *send_waits,*recv_waits;
139   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
140   InsertMode  addv;
141   Scalar      *rvalues,*svalues;
142 
143   /* make sure all processors are in the same insert mode (either INSERT_VALUES or ADD_VALUES) */
144   MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
145   if (addv == (ADD_VALUES|INSERT_VALUES)) {
146     SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
147   }
148   aij->insertmode = addv; /* in case this processor had no cache */
149 
150   /*  first count number of contributors to each processor */
151   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
152   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
153   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
154   for ( i=0; i<aij->stash.n; i++ ) {
155     idx = aij->stash.idx[i];
156     for ( j=0; j<size; j++ ) {
157       if (idx >= owners[j] && idx < owners[j+1]) {
158         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
159       }
160     }
161   }
162   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
163 
164   /* inform other processors of number of messages and max length*/
165   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
166   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
167   nreceives = work[rank];
168   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
169   nmax = work[rank];
170   PetscFree(work);
171 
172   /* post receives:
173        1) each message will consist of ordered triples
174      (global row index, global column index, value); the indices are
175      stored as Scalars (doubles) to simplify the message passing.
176        2) since we don't know how long each individual message is we
177      allocate the largest needed buffer for each receive. Potentially
178      this is a lot of wasted space.
179 
180 
181        This could be done better.
182   */
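
  /*
     Sketch of the message layout used below (added commentary): each stashed
     entry occupies three consecutive Scalars in svalues/rvalues,
         svalues[3*k]   = global row index     (stored as a Scalar)
         svalues[3*k+1] = global column index  (stored as a Scalar)
         svalues[3*k+2] = numerical value
     so a message carrying n entries is 3*n Scalars long, and
     MatAssemblyEnd_MPIAIJ() recovers n from MPI_Get_count()/3.
  */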
183   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
184   CHKPTRQ(rvalues);
185   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
186   CHKPTRQ(recv_waits);
187   for ( i=0; i<nreceives; i++ ) {
188     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
189               comm,recv_waits+i);
190   }
191 
192   /* do sends:
193       1) starts[i] gives the starting index in svalues for stuff going to
194          the ith processor
195   */
196   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
197   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
198   CHKPTRQ(send_waits);
199   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
200   starts[0] = 0;
201   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
202   for ( i=0; i<aij->stash.n; i++ ) {
203     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
204     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
205     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
206   }
207   PetscFree(owner);
208   starts[0] = 0;
209   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
210   count = 0;
211   for ( i=0; i<size; i++ ) {
212     if (procs[i]) {
213       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
214                 comm,send_waits+count++);
215     }
216   }
217   PetscFree(starts); PetscFree(nprocs);
218 
219   /* Free cache space */
220   PLogInfo(0,"[%d]MatAssemblyBegin_MPIAIJ:Number of off processor values %d\n",rank,aij->stash.n);
221   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
222 
223   aij->svalues    = svalues;    aij->rvalues    = rvalues;
224   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
225   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
226   aij->rmax       = nmax;
227 
228   return 0;
229 }
230 extern int MatSetUpMultiply_MPIAIJ(Mat);
231 
232 static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
233 {
234   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
235   Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
236   MPI_Status  *send_status,recv_status;
237   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
238   int         row,col,other_disassembled,shift = C->indexshift;
239   Scalar      *values,val;
240   InsertMode  addv = aij->insertmode;
241 
242   /*  wait on receives */
243   while (count) {
244     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
245     /* unpack receives into our local space */
246     values = aij->rvalues + 3*imdex*aij->rmax;
247     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
248     n = n/3;
249     for ( i=0; i<n; i++ ) {
250       row = (int) PetscReal(values[3*i]) - aij->rstart;
251       col = (int) PetscReal(values[3*i+1]);
252       val = values[3*i+2];
253       if (col >= aij->cstart && col < aij->cend) {
254         col -= aij->cstart;
255         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
256       }
257       else {
258         if (mat->was_assembled) {
259           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
260           col = aij->colmap[col] + shift;
261           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
262             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
263             col = (int) PetscReal(values[3*i+1]);
264           }
265         }
266         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
267       }
268     }
269     count--;
270   }
271   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
272 
273   /* wait on sends */
274   if (aij->nsends) {
275     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
276     CHKPTRQ(send_status);
277     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
278     PetscFree(send_status);
279   }
280   PetscFree(aij->send_waits); PetscFree(aij->svalues);
281 
282   aij->insertmode = NOT_SET_VALUES;
283   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
285 
286   /* determine if any processor has disassembled; if so, we must
287      also disassemble ourselves so that we may reassemble. */
288   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
289   if (mat->was_assembled && !other_disassembled) {
290     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
291   }
292 
293   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
294     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
295   }
296   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
297   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
298 
299   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
300   return 0;
301 }
302 
303 static int MatZeroEntries_MPIAIJ(Mat A)
304 {
305   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
306   int        ierr;
307   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
308   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
309   return 0;
310 }
311 
312 /* the code does not do the diagonal entries correctly unless the
313    matrix is square and the column and row ownerships are identical.
314    This is a BUG. The only way to fix it seems to be to access
315    aij->A and aij->B directly and not through the MatZeroRows()
316    routine.
317 */
318 static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
319 {
320   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
321   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
322   int            *procs,*nprocs,j,found,idx,nsends,*work;
323   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
324   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
325   int            *lens,imdex,*lrows,*values;
326   MPI_Comm       comm = A->comm;
327   MPI_Request    *send_waits,*recv_waits;
328   MPI_Status     recv_status,*send_status;
329   IS             istmp;
330 
331   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
332   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
333 
334   /*  first count number of contributors to each processor */
335   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
336   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
337   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
338   for ( i=0; i<N; i++ ) {
339     idx = rows[i];
340     found = 0;
341     for ( j=0; j<size; j++ ) {
342       if (idx >= owners[j] && idx < owners[j+1]) {
343         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
344       }
345     }
346     if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
347   }
348   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
349 
350   /* inform other processors of number of messages and max length*/
351   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
352   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
353   nrecvs = work[rank];
354   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
355   nmax = work[rank];
356   PetscFree(work);
357 
358   /* post receives:   */
359   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
360   CHKPTRQ(rvalues);
361   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
362   CHKPTRQ(recv_waits);
363   for ( i=0; i<nrecvs; i++ ) {
364     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
365   }
366 
367   /* do sends:
368       1) starts[i] gives the starting index in svalues for stuff going to
369          the ith processor
370   */
371   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
372   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
373   CHKPTRQ(send_waits);
374   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
375   starts[0] = 0;
376   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
377   for ( i=0; i<N; i++ ) {
378     svalues[starts[owner[i]]++] = rows[i];
379   }
380   ISRestoreIndices(is,&rows);
381 
382   starts[0] = 0;
383   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
384   count = 0;
385   for ( i=0; i<size; i++ ) {
386     if (procs[i]) {
387       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
388     }
389   }
390   PetscFree(starts);
391 
392   base = owners[rank];
393 
394   /*  wait on receives */
395   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
396   source = lens + nrecvs;
397   count  = nrecvs; slen = 0;
398   while (count) {
399     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
400     /* unpack receives into our local space */
401     MPI_Get_count(&recv_status,MPI_INT,&n);
402     source[imdex]  = recv_status.MPI_SOURCE;
403     lens[imdex]  = n;
404     slen += n;
405     count--;
406   }
407   PetscFree(recv_waits);
408 
409   /* move the data into the send scatter */
410   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
411   count = 0;
412   for ( i=0; i<nrecvs; i++ ) {
413     values = rvalues + i*nmax;
414     for ( j=0; j<lens[i]; j++ ) {
415       lrows[count++] = values[j] - base;
416     }
417   }
418   PetscFree(rvalues); PetscFree(lens);
419   PetscFree(owner); PetscFree(nprocs);
420 
421   /* actually zap the local rows */
422   ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
423   PLogObjectParent(A,istmp);
424   PetscFree(lrows);
425   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
426   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
427   ierr = ISDestroy(istmp); CHKERRQ(ierr);
428 
429   /* wait on sends */
430   if (nsends) {
431     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
432     CHKPTRQ(send_status);
433     MPI_Waitall(nsends,send_waits,send_status);
434     PetscFree(send_status);
435   }
436   PetscFree(send_waits); PetscFree(svalues);
437 
438   return 0;
439 }
440 
441 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
442 {
443   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
444   int        ierr;
445 
446   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
447   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
448   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
449   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
450   return 0;
451 }
452 
453 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
454 {
455   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
456   int        ierr;
457   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
458   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
459   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
460   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
461   return 0;
462 }
463 
464 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
465 {
466   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
467   int        ierr;
468 
469   /* do nondiagonal part */
470   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
471   /* send it on its way */
472   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
473                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
474   /* do local part */
475   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
476   /* receive remote parts: note this assumes the values are not actually */
477   /* inserted in yy until the next line, which is true for my implementation*/
478   /* but may not always be true. */
479   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
480                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
481   return 0;
482 }
483 
484 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
485 {
486   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
487   int        ierr;
488 
489   /* do nondiagonal part */
490   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
491   /* send it on its way */
492   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
493                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
494   /* do local part */
495   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
496   /* receive remote parts: note this assumes the values are not actually */
497   /* inserted in zz until the next line, which is true for my implementation*/
498   /* but may not always be true. */
499   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
500                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
501   return 0;
502 }
503 
504 /*
505   This only works correctly for square matrices where the subblock A->A is the
506    diagonal block
507 */
508 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
509 {
510   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
511   if (a->M != a->N)
512     SETERRQ(1,"MatGetDiagonal_MPIAIJ:Supports only square matrix where A->A is diag block");
513   return MatGetDiagonal(a->A,v);
514 }
515 
516 static int MatScale_MPIAIJ(Scalar *aa,Mat A)
517 {
518   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
519   int        ierr;
520   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
521   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
522   return 0;
523 }
524 
525 static int MatDestroy_MPIAIJ(PetscObject obj)
526 {
527   Mat        mat = (Mat) obj;
528   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
529   int        ierr;
530 #if defined(PETSC_LOG)
531   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
532 #endif
533   PetscFree(aij->rowners);
534   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
535   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
536   if (aij->colmap) PetscFree(aij->colmap);
537   if (aij->garray) PetscFree(aij->garray);
538   if (aij->lvec)   VecDestroy(aij->lvec);
539   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
540   if (aij->rowvalues) PetscFree(aij->rowvalues);
541   PetscFree(aij);
542   PLogObjectDestroy(mat);
543   PetscHeaderDestroy(mat);
544   return 0;
545 }
546 #include "draw.h"
547 #include "pinclude/pviewer.h"
548 
549 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
550 {
551   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
552   int         ierr;
553 
554   if (aij->size == 1) {
555     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
556   }
557   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
558   return 0;
559 }
560 
561 static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
562 {
563   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
564   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
565   int         ierr, format,shift = C->indexshift,rank;
566   FILE        *fd;
567   ViewerType  vtype;
568 
569   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
570   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
571     ierr = ViewerGetFormat(viewer,&format);
572     if (format == ASCII_FORMAT_INFO_DETAILED) {
573       int nz, nzalloc, mem, flg;
574       MPI_Comm_rank(mat->comm,&rank);
575       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
576       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
577       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
578       PetscSequentialPhaseBegin(mat->comm,1);
579       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
580          rank,aij->m,nz,nzalloc,mem);
581       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
582          rank,aij->m,nz,nzalloc,mem);
583       ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
584       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz);
585       ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
586       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz);
587       fflush(fd);
588       PetscSequentialPhaseEnd(mat->comm,1);
589       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
590       return 0;
591     }
592     else if (format == ASCII_FORMAT_INFO) {
593       return 0;
594     }
595   }
596 
597   if (vtype == DRAW_VIEWER) {
598     Draw       draw;
599     PetscTruth isnull;
600     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
601     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
602   }
603 
604   if (vtype == ASCII_FILE_VIEWER) {
605     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
606     PetscSequentialPhaseBegin(mat->comm,1);
607     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
608            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
609            aij->cend);
610     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
611     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
612     fflush(fd);
613     PetscSequentialPhaseEnd(mat->comm,1);
614   }
615   else {
616     int size = aij->size;
617     rank = aij->rank;
618     if (size == 1) {
619       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
620     }
621     else {
622       /* assemble the entire matrix onto first processor. */
623       Mat         A;
624       Mat_SeqAIJ *Aloc;
625       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
626       Scalar      *a;
627 
628       if (!rank) {
629         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
630                CHKERRQ(ierr);
631       }
632       else {
633         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
634                CHKERRQ(ierr);
635       }
636       PLogObjectParent(mat,A);
637 
638       /* copy over the A part */
639       Aloc = (Mat_SeqAIJ*) aij->A->data;
640       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
641       row = aij->rstart;
642       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
643       for ( i=0; i<m; i++ ) {
644         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
645         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
646       }
647       aj = Aloc->j;
648       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
649 
650       /* copy over the B part */
651       Aloc = (Mat_SeqAIJ*) aij->B->data;
652       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
653       row = aij->rstart;
654       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
655       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
656       for ( i=0; i<m; i++ ) {
657         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
658         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
659       }
660       PetscFree(ct);
661       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
662       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
663       if (!rank) {
664         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
665       }
666       ierr = MatDestroy(A); CHKERRQ(ierr);
667     }
668   }
669   return 0;
670 }
671 
672 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
673 {
674   Mat         mat = (Mat) obj;
675   int         ierr;
676   ViewerType  vtype;
677 
678   if (!viewer) {
679     viewer = STDOUT_VIEWER_SELF;
680   }
681   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
682   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
683       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
684     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
685   }
686   else if (vtype == BINARY_FILE_VIEWER) {
687     return MatView_MPIAIJ_Binary(mat,viewer);
688   }
689   return 0;
690 }
691 
692 /*
693     This has to provide several versions.
694 
695      1) as in the sequential case
696      2) a) use only local smoothing, updating outer values only once
697         b) local smoothing, updating outer values each inner iteration
698      3) coloring, updating outer values between colors.
699 */
700 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
701                            double fshift,int its,Vec xx)
702 {
703   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
704   Mat        AA = mat->A, BB = mat->B;
705   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
706   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
707   int        ierr,*idx, *diag;
708   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
709   Vec        tt;
710 
711   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
712   xs = x + shift; /* shift by one for index start of 1 */
713   ls = ls + shift;
714   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
715   diag = A->diag;
716   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
717     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
718   }
719   if (flag & SOR_EISENSTAT) {
720     /* Let  A = L + D + U, where L is strictly lower triangular,
721     U is strictly upper triangular, and D is diagonal. This routine applies
722 
723             (L + E)^{-1} A (U + E)^{-1}
724 
725     to a vector efficiently using Eisenstat's trick. This is for
726     the case of the SSOR preconditioner, so E = D/omega, where omega
727     is the relaxation factor.
728     */
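
    /*
       The three stages below implement this as follows (added commentary,
       reading off the code; E = D/omega):
           x <- (E + U)^{-1} b                      (backward sweep)
           t <- (E + L)^{-1} ( b - (2E - D) x )     (forward sweep)
           x <- x + t
       with the off-processor couplings (the B block) brought in through
       mat->lvec via the VecPipelineBegin/End calls.
    */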
729     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
730     PLogObjectParent(matin,tt);
731     VecGetArray(tt,&t);
732     scale = (2.0/omega) - 1.0;
733     /*  x = (E + U)^{-1} b */
734     VecSet(&zero,mat->lvec);
735     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
736                               mat->Mvctx); CHKERRQ(ierr);
737     for ( i=m-1; i>-1; i-- ) {
738       n    = A->i[i+1] - diag[i] - 1;
739       idx  = A->j + diag[i] + !shift;
740       v    = A->a + diag[i] + !shift;
741       sum  = b[i];
742       SPARSEDENSEMDOT(sum,xs,v,idx,n);
743       d    = fshift + A->a[diag[i]+shift];
744       n    = B->i[i+1] - B->i[i];
745       idx  = B->j + B->i[i] + shift;
746       v    = B->a + B->i[i] + shift;
747       SPARSEDENSEMDOT(sum,ls,v,idx,n);
748       x[i] = omega*(sum/d);
749     }
750     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
751                             mat->Mvctx); CHKERRQ(ierr);
752 
753     /*  t = b - (2*E - D)x */
754     v = A->a;
755     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
756 
757     /*  t = (E + L)^{-1}t */
758     ts = t + shift; /* shifted by one for index start of a or mat->j*/
759     diag = A->diag;
760     VecSet(&zero,mat->lvec);
761     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
762                                                  mat->Mvctx); CHKERRQ(ierr);
763     for ( i=0; i<m; i++ ) {
764       n    = diag[i] - A->i[i];
765       idx  = A->j + A->i[i] + shift;
766       v    = A->a + A->i[i] + shift;
767       sum  = t[i];
768       SPARSEDENSEMDOT(sum,ts,v,idx,n);
769       d    = fshift + A->a[diag[i]+shift];
770       n    = B->i[i+1] - B->i[i];
771       idx  = B->j + B->i[i] + shift;
772       v    = B->a + B->i[i] + shift;
773       SPARSEDENSEMDOT(sum,ls,v,idx,n);
774       t[i] = omega*(sum/d);
775     }
776     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
777                                                     mat->Mvctx); CHKERRQ(ierr);
778     /*  x = x + t */
779     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
780     VecDestroy(tt);
781     return 0;
782   }
783 
784 
785   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
786     if (flag & SOR_ZERO_INITIAL_GUESS) {
787       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
788     }
789     else {
790       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
791       CHKERRQ(ierr);
792       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
793       CHKERRQ(ierr);
794     }
795     while (its--) {
796       /* go down through the rows */
797       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
798                               mat->Mvctx); CHKERRQ(ierr);
799       for ( i=0; i<m; i++ ) {
800         n    = A->i[i+1] - A->i[i];
801         idx  = A->j + A->i[i] + shift;
802         v    = A->a + A->i[i] + shift;
803         sum  = b[i];
804         SPARSEDENSEMDOT(sum,xs,v,idx,n);
805         d    = fshift + A->a[diag[i]+shift];
806         n    = B->i[i+1] - B->i[i];
807         idx  = B->j + B->i[i] + shift;
808         v    = B->a + B->i[i] + shift;
809         SPARSEDENSEMDOT(sum,ls,v,idx,n);
810         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
811       }
812       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
813                             mat->Mvctx); CHKERRQ(ierr);
814       /* come up through the rows */
815       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
816                               mat->Mvctx); CHKERRQ(ierr);
817       for ( i=m-1; i>-1; i-- ) {
818         n    = A->i[i+1] - A->i[i];
819         idx  = A->j + A->i[i] + shift;
820         v    = A->a + A->i[i] + shift;
821         sum  = b[i];
822         SPARSEDENSEMDOT(sum,xs,v,idx,n);
823         d    = fshift + A->a[diag[i]+shift];
824         n    = B->i[i+1] - B->i[i];
825         idx  = B->j + B->i[i] + shift;
826         v    = B->a + B->i[i] + shift;
827         SPARSEDENSEMDOT(sum,ls,v,idx,n);
828         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
829       }
830       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
831                             mat->Mvctx); CHKERRQ(ierr);
832     }
833   }
834   else if (flag & SOR_FORWARD_SWEEP){
835     if (flag & SOR_ZERO_INITIAL_GUESS) {
836       VecSet(&zero,mat->lvec);
837       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
838                               mat->Mvctx); CHKERRQ(ierr);
839       for ( i=0; i<m; i++ ) {
840         n    = diag[i] - A->i[i];
841         idx  = A->j + A->i[i] + shift;
842         v    = A->a + A->i[i] + shift;
843         sum  = b[i];
844         SPARSEDENSEMDOT(sum,xs,v,idx,n);
845         d    = fshift + A->a[diag[i]+shift];
846         n    = B->i[i+1] - B->i[i];
847         idx  = B->j + B->i[i] + shift;
848         v    = B->a + B->i[i] + shift;
849         SPARSEDENSEMDOT(sum,ls,v,idx,n);
850         x[i] = omega*(sum/d);
851       }
852       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
853                             mat->Mvctx); CHKERRQ(ierr);
854       its--;
855     }
856     while (its--) {
857       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
858       CHKERRQ(ierr);
859       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
860       CHKERRQ(ierr);
861       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
862                               mat->Mvctx); CHKERRQ(ierr);
863       for ( i=0; i<m; i++ ) {
864         n    = A->i[i+1] - A->i[i];
865         idx  = A->j + A->i[i] + shift;
866         v    = A->a + A->i[i] + shift;
867         sum  = b[i];
868         SPARSEDENSEMDOT(sum,xs,v,idx,n);
869         d    = fshift + A->a[diag[i]+shift];
870         n    = B->i[i+1] - B->i[i];
871         idx  = B->j + B->i[i] + shift;
872         v    = B->a + B->i[i] + shift;
873         SPARSEDENSEMDOT(sum,ls,v,idx,n);
874         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
875       }
876       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
877                             mat->Mvctx); CHKERRQ(ierr);
878     }
879   }
880   else if (flag & SOR_BACKWARD_SWEEP){
881     if (flag & SOR_ZERO_INITIAL_GUESS) {
882       VecSet(&zero,mat->lvec);
883       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
884                               mat->Mvctx); CHKERRQ(ierr);
885       for ( i=m-1; i>-1; i-- ) {
886         n    = A->i[i+1] - diag[i] - 1;
887         idx  = A->j + diag[i] + !shift;
888         v    = A->a + diag[i] + !shift;
889         sum  = b[i];
890         SPARSEDENSEMDOT(sum,xs,v,idx,n);
891         d    = fshift + A->a[diag[i]+shift];
892         n    = B->i[i+1] - B->i[i];
893         idx  = B->j + B->i[i] + shift;
894         v    = B->a + B->i[i] + shift;
895         SPARSEDENSEMDOT(sum,ls,v,idx,n);
896         x[i] = omega*(sum/d);
897       }
898       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
899                             mat->Mvctx); CHKERRQ(ierr);
900       its--;
901     }
902     while (its--) {
903       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
904                             mat->Mvctx); CHKERRQ(ierr);
905       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
906                             mat->Mvctx); CHKERRQ(ierr);
907       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
908                               mat->Mvctx); CHKERRQ(ierr);
909       for ( i=m-1; i>-1; i-- ) {
910         n    = A->i[i+1] - A->i[i];
911         idx  = A->j + A->i[i] + shift;
912         v    = A->a + A->i[i] + shift;
913         sum  = b[i];
914         SPARSEDENSEMDOT(sum,xs,v,idx,n);
915         d    = fshift + A->a[diag[i]+shift];
916         n    = B->i[i+1] - B->i[i];
917         idx  = B->j + B->i[i] + shift;
918         v    = B->a + B->i[i] + shift;
919         SPARSEDENSEMDOT(sum,ls,v,idx,n);
920         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
921       }
922       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
923                             mat->Mvctx); CHKERRQ(ierr);
924     }
925   }
926   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
927     if (flag & SOR_ZERO_INITIAL_GUESS) {
928       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
929     }
930     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
931     CHKERRQ(ierr);
932     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
933     CHKERRQ(ierr);
934     while (its--) {
935       /* go down through the rows */
936       for ( i=0; i<m; i++ ) {
937         n    = A->i[i+1] - A->i[i];
938         idx  = A->j + A->i[i] + shift;
939         v    = A->a + A->i[i] + shift;
940         sum  = b[i];
941         SPARSEDENSEMDOT(sum,xs,v,idx,n);
942         d    = fshift + A->a[diag[i]+shift];
943         n    = B->i[i+1] - B->i[i];
944         idx  = B->j + B->i[i] + shift;
945         v    = B->a + B->i[i] + shift;
946         SPARSEDENSEMDOT(sum,ls,v,idx,n);
947         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
948       }
949       /* come up through the rows */
950       for ( i=m-1; i>-1; i-- ) {
951         n    = A->i[i+1] - A->i[i];
952         idx  = A->j + A->i[i] + shift;
953         v    = A->a + A->i[i] + shift;
954         sum  = b[i];
955         SPARSEDENSEMDOT(sum,xs,v,idx,n);
956         d    = fshift + A->a[diag[i]+shift];
957         n    = B->i[i+1] - B->i[i];
958         idx  = B->j + B->i[i] + shift;
959         v    = B->a + B->i[i] + shift;
960         SPARSEDENSEMDOT(sum,ls,v,idx,n);
961         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
962       }
963     }
964   }
965   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
966     if (flag & SOR_ZERO_INITIAL_GUESS) {
967       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
968     }
969     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
970     CHKERRQ(ierr);
971     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
972     CHKERRQ(ierr);
973     while (its--) {
974       for ( i=0; i<m; i++ ) {
975         n    = A->i[i+1] - A->i[i];
976         idx  = A->j + A->i[i] + shift;
977         v    = A->a + A->i[i] + shift;
978         sum  = b[i];
979         SPARSEDENSEMDOT(sum,xs,v,idx,n);
980         d    = fshift + A->a[diag[i]+shift];
981         n    = B->i[i+1] - B->i[i];
982         idx  = B->j + B->i[i] + shift;
983         v    = B->a + B->i[i] + shift;
984         SPARSEDENSEMDOT(sum,ls,v,idx,n);
985         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
986       }
987     }
988   }
989   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
990     if (flag & SOR_ZERO_INITIAL_GUESS) {
991       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
992     }
993     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
994                             mat->Mvctx); CHKERRQ(ierr);
995     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
996                             mat->Mvctx); CHKERRQ(ierr);
997     while (its--) {
998       for ( i=m-1; i>-1; i-- ) {
999         n    = A->i[i+1] - A->i[i];
1000         idx  = A->j + A->i[i] + shift;
1001         v    = A->a + A->i[i] + shift;
1002         sum  = b[i];
1003         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1004         d    = fshift + A->a[diag[i]+shift];
1005         n    = B->i[i+1] - B->i[i];
1006         idx  = B->j + B->i[i] + shift;
1007         v    = B->a + B->i[i] + shift;
1008         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1009         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
1010       }
1011     }
1012   }
1013   return 0;
1014 }
1015 
1016 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
1017                              int *nzalloc,int *mem)
1018 {
1019   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1020   Mat        A = mat->A, B = mat->B;
1021   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1022 
1023   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1024   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1025   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1026   if (flag == MAT_LOCAL) {
1027     if (nz)       *nz      = isend[0];
1028     if (nzalloc)  *nzalloc = isend[1];
1029     if (mem)      *mem     = isend[2];
1030   } else if (flag == MAT_GLOBAL_MAX) {
1031     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1032     if (nz)      *nz      = irecv[0];
1033     if (nzalloc) *nzalloc = irecv[1];
1034     if (mem)     *mem     = irecv[2];
1035   } else if (flag == MAT_GLOBAL_SUM) {
1036     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1037     if (nz)      *nz      = irecv[0];
1038     if (nzalloc) *nzalloc = irecv[1];
1039     if (mem)     *mem     = irecv[2];
1040   }
1041   return 0;
1042 }
1043 
1044 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1045 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1046 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1047 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1048 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1049 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1050 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1051 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1052 
1053 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1054 {
1055   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1056 
1057   if (op == NO_NEW_NONZERO_LOCATIONS ||
1058       op == YES_NEW_NONZERO_LOCATIONS ||
1059       op == COLUMNS_SORTED ||
1060       op == ROW_ORIENTED) {
1061         MatSetOption(a->A,op);
1062         MatSetOption(a->B,op);
1063   }
1064   else if (op == ROWS_SORTED ||
1065            op == SYMMETRIC_MATRIX ||
1066            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1067            op == YES_NEW_DIAGONALS)
1068     PLogInfo(A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1069   else if (op == COLUMN_ORIENTED) {
1070     a->roworiented = 0;
1071     MatSetOption(a->A,op);
1072     MatSetOption(a->B,op);
1073   }
1074   else if (op == NO_NEW_DIAGONALS)
1075     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1076   else
1077     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1078   return 0;
1079 }
1080 
1081 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1082 {
1083   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1084   *m = mat->M; *n = mat->N;
1085   return 0;
1086 }
1087 
1088 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1089 {
1090   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1091   *m = mat->m; *n = mat->n;
1092   return 0;
1093 }
1094 
1095 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1096 {
1097   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1098   *m = mat->rstart; *n = mat->rend;
1099   return 0;
1100 }
1101 
1102 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1103 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1104 
1105 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1106 {
1107   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1108   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1109   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1110   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1111   int        *cmap, *idx_p;
1112 
1113   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,"MatGetRow_MPIAIJ:Already active");
1114   mat->getrowactive = PETSC_TRUE;
1115 
1116   if (!mat->rowvalues && (idx || v)) {
1117     /*
1118         allocate enough space to hold information from the longest row.
1119     */
1120     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1121     int     max = 1,n = mat->n,tmp;
1122     for ( i=0; i<n; i++ ) {
1123       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1124       if (max < tmp) { max = tmp; }
1125     }
1126     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1127     CHKPTRQ(mat->rowvalues);
1128     mat->rowindices = (int *) (mat->rowvalues + max);
1129   }
1130 
1131 
1132   if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows")
1133   lrow = row - rstart;
1134 
1135   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1136   if (!v)   {pvA = 0; pvB = 0;}
1137   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1138   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1139   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1140   nztot = nzA + nzB;
1141 
1142   cmap  = mat->garray;
1143   if (v  || idx) {
1144     if (nztot) {
1145       /* Sort by increasing column numbers, assuming A and B already sorted */
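      /*
         Added commentary (illustrative example): if cstart = 10, A holds local
         columns {0,2} (global {10,12}) and B references global columns {3,15},
         the merged row comes out in the order 3, 10, 12, 15: the imark
         B entries that map below cstart come first, then all of A shifted by
         cstart, then the remaining B entries.
      */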
1146       int imark = -1;
1147       if (v) {
1148         *v = v_p = mat->rowvalues;
1149         for ( i=0; i<nzB; i++ ) {
1150           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1151           else break;
1152         }
1153         imark = i;
1154         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1155         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1156       }
1157       if (idx) {
1158         *idx = idx_p = mat->rowindices;
1159         if (imark > -1) {
1160           for ( i=0; i<imark; i++ ) {
1161             idx_p[i] = cmap[cworkB[i]];
1162           }
1163         } else {
1164           for ( i=0; i<nzB; i++ ) {
1165             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1166             else break;
1167           }
1168           imark = i;
1169         }
1170         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1171         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1172       }
1173     }
1174     else {*idx = 0; *v=0;}
1175   }
1176   *nz = nztot;
1177   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1178   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1179   return 0;
1180 }
1181 
1182 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1183 {
1184   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1185   if (aij->getrowactive == PETSC_FALSE) {
1186     SETERRQ(1,"MatRestoreRow_MPIAIJ:MatGetRow not called");
1187   }
1188   aij->getrowactive = PETSC_FALSE;
1189   return 0;
1190 }
1191 
1192 static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1193 {
1194   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1195   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1196   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1197   double     sum = 0.0;
1198   Scalar     *v;
1199 
1200   if (aij->size == 1) {
1201     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1202   } else {
1203     if (type == NORM_FROBENIUS) {
1204       v = amat->a;
1205       for (i=0; i<amat->nz; i++ ) {
1206 #if defined(PETSC_COMPLEX)
1207         sum += real(conj(*v)*(*v)); v++;
1208 #else
1209         sum += (*v)*(*v); v++;
1210 #endif
1211       }
1212       v = bmat->a;
1213       for (i=0; i<bmat->nz; i++ ) {
1214 #if defined(PETSC_COMPLEX)
1215         sum += real(conj(*v)*(*v)); v++;
1216 #else
1217         sum += (*v)*(*v); v++;
1218 #endif
1219       }
1220       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1221       *norm = sqrt(*norm);
1222     }
1223     else if (type == NORM_1) { /* max column norm */
1224       double *tmp, *tmp2;
1225       int    *jj, *garray = aij->garray;
1226       tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1227       tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1228       PetscMemzero(tmp,aij->N*sizeof(double));
1229       *norm = 0.0;
1230       v = amat->a; jj = amat->j;
1231       for ( j=0; j<amat->nz; j++ ) {
1232         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1233       }
1234       v = bmat->a; jj = bmat->j;
1235       for ( j=0; j<bmat->nz; j++ ) {
1236         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1237       }
1238       MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1239       for ( j=0; j<aij->N; j++ ) {
1240         if (tmp2[j] > *norm) *norm = tmp2[j];
1241       }
1242       PetscFree(tmp); PetscFree(tmp2);
1243     }
1244     else if (type == NORM_INFINITY) { /* max row norm */
1245       double ntemp = 0.0;
1246       for ( j=0; j<amat->m; j++ ) {
1247         v = amat->a + amat->i[j] + shift;
1248         sum = 0.0;
1249         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1250           sum += PetscAbsScalar(*v); v++;
1251         }
1252         v = bmat->a + bmat->i[j] + shift;
1253         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1254           sum += PetscAbsScalar(*v); v++;
1255         }
1256         if (sum > ntemp) ntemp = sum;
1257       }
1258       MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1259     }
1260     else {
1261       SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
1262     }
1263   }
1264   return 0;
1265 }
1266 
1267 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1268 {
1269   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1270   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1271   int        ierr,shift = Aloc->indexshift;
1272   Mat        B;
1273   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1274   Scalar     *array;
1275 
1276   if (matout == PETSC_NULL && M != N)
1277     SETERRQ(1,"MatTranspose_MPIAIJ:Square matrix only for in-place");
1278   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,
1279          PETSC_NULL,&B); CHKERRQ(ierr);
1280 
1281   /* copy over the A part */
1282   Aloc = (Mat_SeqAIJ*) a->A->data;
1283   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1284   row = a->rstart;
1285   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1286   for ( i=0; i<m; i++ ) {
1287     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1288     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1289   }
1290   aj = Aloc->j;
1291   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1292 
1293   /* copy over the B part */
1294   Aloc = (Mat_SeqAIJ*) a->B->data;
1295   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1296   row = a->rstart;
1297   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1298   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1299   for ( i=0; i<m; i++ ) {
1300     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1301     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1302   }
1303   PetscFree(ct);
1304   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1305   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1306   if (matout != PETSC_NULL) {
1307     *matout = B;
1308   } else {
1309     /* This isn't really an in-place transpose, but we free the data structures from a */
1310     PetscFree(a->rowners);
1311     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1312     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1313     if (a->colmap) PetscFree(a->colmap);
1314     if (a->garray) PetscFree(a->garray);
1315     if (a->lvec) VecDestroy(a->lvec);
1316     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1317     PetscFree(a);
1318     PetscMemcpy(A,B,sizeof(struct _Mat));
1319     PetscHeaderDestroy(B);
1320   }
1321   return 0;
1322 }
1323 
1324 extern int MatPrintHelp_SeqAIJ(Mat);
1325 static int MatPrintHelp_MPIAIJ(Mat A)
1326 {
1327   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1328 
1329   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1330   else return 0;
1331 }
1332 
1333 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1334 static int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1335 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1336 int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1337 /* -------------------------------------------------------------------*/
1338 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1339        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1340        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1341        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1342        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1343        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1344        MatLUFactor_MPIAIJ,0,
1345        MatRelax_MPIAIJ,
1346        MatTranspose_MPIAIJ,
1347        MatGetInfo_MPIAIJ,0,
1348        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1349        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1350        0,
1351        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1352        MatGetReordering_MPIAIJ,
1353        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1354        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1355        MatILUFactorSymbolic_MPIAIJ,0,
1356        0,0,MatConvert_MPIAIJ,0,0,MatConvertSameType_MPIAIJ,0,0,
1357        0,0,0,
1358        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1359        MatPrintHelp_MPIAIJ,
1360        MatScale_MPIAIJ};
1361 
1362 /*@C
1363    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1364    (the default parallel PETSc format).  For good matrix assembly performance
1365    the user should preallocate the matrix storage by setting the parameters
1366    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1367    performance can be increased by more than a factor of 50.
1368 
1369    Input Parameters:
1370 .  comm - MPI communicator
1371 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1372 .  n - number of local columns (or PETSC_DECIDE to have calculated
1373            if N is given)
1374 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1375 .  N - number of global columns (or PETSC_DECIDE to have calculated
1376            if n is given)
1377 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1378            (same for all local rows)
1379 .  d_nnz - array containing the number of nonzeros in the various rows of the
1380            diagonal portion of local submatrix (possibly different for each row)
1381            or PETSC_NULL. You must leave room for the diagonal entry even if
1382            it is zero.
1383 .  o_nz - number of nonzeros per row in the off-diagonal portion of local
1384            submatrix (same for all local rows).
1385 .  o_nnz - array containing the number of nonzeros in the various rows of the
1386            off-diagonal portion of the local submatrix (possibly different for
1387            each row) or PETSC_NULL.
1388 
1389    Output Parameter:
1390 .  A - the matrix
1391 
1392    Notes:
1393    The AIJ format (also called the Yale sparse matrix format or
1394    compressed row storage), is fully compatible with standard Fortran 77
1395    storage.  That is, the stored row and column indices can begin at
1396    either one (as in Fortran) or zero.  See the users manual for details.
1397 
1398    The user MUST specify either the local or global matrix dimensions
1399    (possibly both).
1400 
1401    By default, this format uses inodes (identical nodes) when possible.
1402    We search for consecutive rows with the same nonzero structure, thereby
1403    reusing matrix information to achieve increased efficiency.
1404 
1405    Options Database Keys:
1406 $    -mat_aij_no_inode  - Do not use inodes
1407 $    -mat_aij_inode_limit <limit> - Set inode limit.
1408 $        (max limit=5)
1409 $    -mat_aij_oneindex - Internally use indexing starting at 1
1410 $        rather than 0.  Note: When calling MatSetValues(),
1411 $        the user still MUST index entries starting at 0!
1412 
1413    Storage Information:
1414    For a square global matrix we define each processor's diagonal portion
1415    to be its local rows and the corresponding columns (a square submatrix);
1416    each processor's off-diagonal portion encompasses the remainder of the
1417    local matrix (a rectangular submatrix).
1418 
1419    The user can specify preallocated storage for the diagonal part of
1420    the local submatrix with either d_nz or d_nnz (not both).  Set
1421    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1422    memory allocation.  Likewise, specify preallocated storage for the
1423    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1424 
1425    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1426    the figure below we depict these three local rows and all columns (0-11).
1427 
1428 $          0 1 2 3 4 5 6 7 8 9 10 11
1429 $         -------------------
1430 $  row 3  |  o o o d d d o o o o o o
1431 $  row 4  |  o o o d d d o o o o o o
1432 $  row 5  |  o o o d d d o o o o o o
1433 $         -------------------
1434 $
1435 
1436    Thus, any entries in the d locations are stored in the d (diagonal)
1437    submatrix, and any entries in the o locations are stored in the
1438    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1439    stored simply in the MATSEQAIJ format for compressed row storage.
1440 
1441    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1442    and o_nz should indicate the number of nonzeros per row in the o matrix.
1443    In general, for PDE problems in which most nonzeros are near the diagonal,
1444    one expects d_nz >> o_nz. For large problems you MUST preallocate memory
1445    or you will get TERRIBLE performance; see the users' manual chapter on
1446    matrices and the file $(PETSC_DIR)/Performance.
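
   For example, if every location shown in the figure above held a nonzero,
   the appropriate preallocation for this process would be either
$      d_nz = 3, o_nz = 9
   or, using the per-row arrays,
$      d_nnz = {3,3,3}, o_nnz = {9,9,9}
   A brief usage sketch follows this manual page.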
1447 
1448 .keywords: matrix, aij, compressed row, sparse, parallel
1449 
1450 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1451 @*/
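/*
   A minimal usage sketch (the sizes and preallocation values below are only
   illustrative): create a 100 x 100 matrix distributed across MPI_COMM_WORLD,
   letting PETSc choose the local sizes, expecting roughly 5 nonzeros per row
   in the diagonal block and 2 per row in the off-diagonal block.

      Mat A;
      int ierr;
      ierr = MatCreateMPIAIJ(MPI_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,
                             5,PETSC_NULL,2,PETSC_NULL,&A); CHKERRQ(ierr);
      ... calls to MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
      ierr = MatDestroy(A); CHKERRQ(ierr);
*/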
1452 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1453                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1454 {
1455   Mat          B;
1456   Mat_MPIAIJ   *b;
1457   int          ierr, i,sum[2],work[2];
1458 
1459   *A = 0;
1460   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1461   PLogObjectCreate(B);
1462   B->data       = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1463   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1464   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1465   B->destroy    = MatDestroy_MPIAIJ;
1466   B->view       = MatView_MPIAIJ;
1467   B->factor     = 0;
1468   B->assembled  = PETSC_FALSE;
1469 
1470   b->insertmode = NOT_SET_VALUES;
1471   MPI_Comm_rank(comm,&b->rank);
1472   MPI_Comm_size(comm,&b->size);
1473 
1474   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1475     SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1476 
1477   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1478     work[0] = m; work[1] = n;
1479     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1480     if (M == PETSC_DECIDE) M = sum[0];
1481     if (N == PETSC_DECIDE) N = sum[1];
1482   }
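  /* default decomposition: rows (columns) are split as evenly as possible,
     with one extra row (column) going to each of the first M%size (N%size)
     processes */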
1483   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1484   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1485   b->m = m; B->m = m;
1486   b->n = n; B->n = n;
1487   b->N = N; B->N = N;
1488   b->M = M; B->M = M;
1489 
1490   /* build local table of row and column ownerships */
1491   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1492   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1493   b->cowners = b->rowners + b->size + 1;
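  /* after the Allgather and running sums below, rowners[i] (cowners[i]) holds
     the first global row (column) owned by process i, for i = 0..size */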
1494   MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
1495   b->rowners[0] = 0;
1496   for ( i=2; i<=b->size; i++ ) {
1497     b->rowners[i] += b->rowners[i-1];
1498   }
1499   b->rstart = b->rowners[b->rank];
1500   b->rend   = b->rowners[b->rank+1];
1501   MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1502   b->cowners[0] = 0;
1503   for ( i=2; i<=b->size; i++ ) {
1504     b->cowners[i] += b->cowners[i-1];
1505   }
1506   b->cstart = b->cowners[b->rank];
1507   b->cend   = b->cowners[b->rank+1];
1508 
1509   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1510   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1511   PLogObjectParent(B,b->A);
1512   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1513   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1514   PLogObjectParent(B,b->B);
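  /* b->A is the m x n "diagonal" block (columns owned by this process); b->B is
     the off-diagonal block, created here with the full global column count N and
     compressed to only its nonzero columns during assembly */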
1515 
1516   /* build cache (stash) for entries destined for other processors */
1517   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1518   b->colmap      = 0;
1519   b->garray      = 0;
1520   b->roworiented = 1;
1521 
1522   /* stuff used for matrix vector multiply */
1523   b->lvec      = 0;
1524   b->Mvctx     = 0;
1525 
1526   /* stuff for MatGetRow() */
1527   b->rowindices   = 0;
1528   b->rowvalues    = 0;
1529   b->getrowactive = PETSC_FALSE;
1530 
1531   *A = B;
1532   return 0;
1533 }
1534 
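/*
   MatConvertSameType_MPIAIJ - duplicates an MPIAIJ matrix: the parallel layout,
   ownership table, column map, garray, scatter context, and the two local
   sequential blocks are copied from the old matrix.
*/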
1535 static int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1536 {
1537   Mat        mat;
1538   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1539   int        ierr, len=0, flg;
1540 
1541   *newmat       = 0;
1542   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
1543   PLogObjectCreate(mat);
1544   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1545   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1546   mat->destroy    = MatDestroy_MPIAIJ;
1547   mat->view       = MatView_MPIAIJ;
1548   mat->factor     = matin->factor;
1549   mat->assembled  = PETSC_TRUE;
1550 
1551   a->m = mat->m   = oldmat->m;
1552   a->n = mat->n   = oldmat->n;
1553   a->M = mat->M   = oldmat->M;
1554   a->N = mat->N   = oldmat->N;
1555 
1556   a->rstart       = oldmat->rstart;
1557   a->rend         = oldmat->rend;
1558   a->cstart       = oldmat->cstart;
1559   a->cend         = oldmat->cend;
1560   a->size         = oldmat->size;
1561   a->rank         = oldmat->rank;
1562   a->insertmode   = NOT_SET_VALUES;
1563   a->rowvalues    = 0;
1564   a->getrowactive = PETSC_FALSE;
1565 
1566   a->rowners = (int *) PetscMalloc((a->size+1)*sizeof(int)); CHKPTRQ(a->rowners);
1567   PLogObjectMemory(mat,(a->size+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1568   PetscMemcpy(a->rowners,oldmat->rowners,(a->size+1)*sizeof(int));
1569   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1570   if (oldmat->colmap) {
1571     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1572     PLogObjectMemory(mat,(a->N)*sizeof(int));
1573     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1574   } else a->colmap = 0;
1575   if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
1576     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1577     PLogObjectMemory(mat,len*sizeof(int));
1578     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1579   } else a->garray = 0;
1580 
1581   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1582   PLogObjectParent(mat,a->lvec);
1583   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1584   PLogObjectParent(mat,a->Mvctx);
1585   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1586   PLogObjectParent(mat,a->A);
1587   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1588   PLogObjectParent(mat,a->B);
1589   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1590   if (flg) {
1591     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1592   }
1593   *newmat = mat;
1594   return 0;
1595 }
1596 
1597 #include "sys.h"
1598 
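/*
   MatLoad_MPIAIJ - loads a matrix from a PETSc binary file.  Process 0 reads
   the 4-integer header (cookie, M, N, and a fourth entry that is not used
   here), the M row lengths, and then the column indices and numerical values,
   shipping each process the data for its own rows; every process then inserts
   its rows with MatSetValues() and the matrix is assembled.
*/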
1599 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
1600 {
1601   Mat          A;
1602   int          i, nz, ierr, j,rstart, rend, fd;
1603   Scalar       *vals,*svals;
1604   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1605   MPI_Status   status;
1606   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1607   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1608   int          tag = ((PetscObject)viewer)->tag;
1609 
1610   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1611   if (!rank) {
1612     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1613     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1614     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJ:not matrix object");
1615   }
1616 
1617   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1618   M = header[1]; N = header[2];
1619   /* determine ownership of all rows */
1620   m = M/size + ((M % size) > rank);
1621   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1622   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1623   rowners[0] = 0;
1624   for ( i=2; i<=size; i++ ) {
1625     rowners[i] += rowners[i-1];
1626   }
1627   rstart = rowners[rank];
1628   rend   = rowners[rank+1];
1629 
1630   /* distribute row lengths to all processors */
1631   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1632   offlens = ourlens + (rend-rstart);
1633   if (!rank) {
1634     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1635     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1636     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1637     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1638     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1639     PetscFree(sndcounts);
1640   }
1641   else {
1642     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1643   }
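  /* each process now holds the lengths of its own rows in ourlens[] */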
1644 
1645   if (!rank) {
1646     /* calculate the number of nonzeros on each processor */
1647     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1648     PetscMemzero(procsnz,size*sizeof(int));
1649     for ( i=0; i<size; i++ ) {
1650       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1651         procsnz[i] += rowlengths[j];
1652       }
1653     }
1654     PetscFree(rowlengths);
1655 
1656     /* determine max buffer needed and allocate it */
1657     maxnz = 0;
1658     for ( i=0; i<size; i++ ) {
1659       maxnz = PetscMax(maxnz,procsnz[i]);
1660     }
1661     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1662 
1663     /* read in my part of the matrix column indices  */
1664     nz = procsnz[0];
1665     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1666     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1667 
1668     /* read in everyone else's portion and ship it off */
1669     for ( i=1; i<size; i++ ) {
1670       nz = procsnz[i];
1671       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1672       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1673     }
1674     PetscFree(cols);
1675   }
1676   else {
1677     /* determine buffer space needed for message */
1678     nz = 0;
1679     for ( i=0; i<m; i++ ) {
1680       nz += ourlens[i];
1681     }
1682     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1683 
1684     /* receive message of column indices */
1685     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1686     MPI_Get_count(&status,MPI_INT,&maxnz);
1687     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1688   }
1689 
1690   /* loop over local rows, determining number of off-diagonal entries */
1691   PetscMemzero(offlens,m*sizeof(int));
1692   jj = 0;
1693   for ( i=0; i<m; i++ ) {
1694     for ( j=0; j<ourlens[i]; j++ ) {
1695       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1696       jj++;
1697     }
1698   }
1699 
1700   /* create our matrix; ourlens-offlens gives the diagonal preallocation, offlens the off-diagonal */
1701   for ( i=0; i<m; i++ ) {
1702     ourlens[i] -= offlens[i];
1703   }
1704   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1705   A = *newmat;
1706   MatSetOption(A,COLUMNS_SORTED);
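  /* restore the full row lengths (diagonal plus off-diagonal counts) so they
     can be used below to step through mycols and vals one row at a time */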
1707   for ( i=0; i<m; i++ ) {
1708     ourlens[i] += offlens[i];
1709   }
1710 
1711   if (!rank) {
1712     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1713 
1714     /* read in my part of the matrix numerical values  */
1715     nz = procsnz[0];
1716     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1717 
1718     /* insert into matrix */
1719     jj      = rstart;
1720     smycols = mycols;
1721     svals   = vals;
1722     for ( i=0; i<m; i++ ) {
1723       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1724       smycols += ourlens[i];
1725       svals   += ourlens[i];
1726       jj++;
1727     }
1728 
1729     /* read in the other processors' values and ship them out */
1730     for ( i=1; i<size; i++ ) {
1731       nz = procsnz[i];
1732       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1733       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1734     }
1735     PetscFree(procsnz);
1736   }
1737   else {
1738     /* receive numeric values */
1739     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1740 
1741     /* receive message of values */
1742     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1743     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1744     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1745 
1746     /* insert into matrix */
1747     jj      = rstart;
1748     smycols = mycols;
1749     svals   = vals;
1750     for ( i=0; i<m; i++ ) {
1751       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1752       smycols += ourlens[i];
1753       svals   += ourlens[i];
1754       jj++;
1755     }
1756   }
1757   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1758 
1759   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1760   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1761   return 0;
1762 }
1763