xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision bd16c2fe4d4045da961594cb53fb6c4a23bb4ff7)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.148 1996/07/02 02:38:51 bsmith Exp bsmith $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "src/vec/vecimpl.h"
7 #include "src/inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way since the
12 length of colmap equals the global number of columns in the matrix.
13 */
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
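/*
   Illustrative example (not part of the library): suppose the off-diagonal
   block B on this processor has three columns whose global column numbers are
   garray[] = {2,5,9}.  Then colmap has length N (the global number of
   columns) and the loop above sets

       colmap[2] = 0 - shift;  colmap[5] = 1 - shift;  colmap[9] = 2 - shift;

   leaving every other entry zero, so MatSetValues_MPIAIJ() below can map a
   global column number to a local column of B with a single array lookup.
*/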
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatReordering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int        ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
39 static int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
42   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
43   Scalar     value;
44   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
45   int        cstart = aij->cstart, cend = aij->cend,row,col;
46   int        shift = C->indexshift,roworiented = aij->roworiented;
47 
48   if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
49     SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
50   }
51   aij->insertmode = addv;
52   for ( i=0; i<m; i++ ) {
53     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
54     if (im[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
55     if (im[i] >= rstart && im[i] < rend) {
56       row = im[i] - rstart;
57       for ( j=0; j<n; j++ ) {
58         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
59         if (in[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
60         if (in[j] >= cstart && in[j] < cend){
61           col = in[j] - cstart;
62           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
63           ierr = MatSetValues(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
64         }
65         else {
66           if (mat->was_assembled) {
67             if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
68             col = aij->colmap[in[j]] + shift;
69             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
70               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
71               col =  in[j];
72             }
73           }
74           else col = in[j];
75           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
76           ierr = MatSetValues(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
77         }
78       }
79     }
80     else {
81       if (roworiented) {
82         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
83       }
84       else {
85         row = im[i];
86         for ( j=0; j<n; j++ ) {
87           ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
88         }
89       }
90     }
91   }
92   return 0;
93 }
94 
95 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
96 {
97   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
98   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
99   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
100   int        cstart = aij->cstart, cend = aij->cend,row,col;
101   int        shift = C->indexshift;
102 
103   for ( i=0; i<m; i++ ) {
104     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
105     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
106     if (idxm[i] >= rstart && idxm[i] < rend) {
107       row = idxm[i] - rstart;
108       for ( j=0; j<n; j++ ) {
109         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
110         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
111         if (idxn[j] >= cstart && idxn[j] < cend){
112           col = idxn[j] - cstart;
113           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
114         }
115         else {
116           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
117           col = aij->colmap[idxn[j]] + shift;
118           if ( aij->garray[col] != idxn[j] ) *(v+i*n+j) = 0.0;
119           else {
120             ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
121           }
122         }
123       }
124     }
125     else {
126       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
127     }
128   }
129   return 0;
130 }
131 
132 static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
133 {
134   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
135   MPI_Comm    comm = mat->comm;
136   int         size = aij->size, *owners = aij->rowners;
137   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
138   MPI_Request *send_waits,*recv_waits;
139   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
140   InsertMode  addv;
141   Scalar      *rvalues,*svalues;
142 
143   /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
144   MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
145   if (addv == (ADD_VALUES|INSERT_VALUES)) {
146     SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted, others added");
147   }
148   aij->insertmode = addv; /* in case this processor had no cache */
149 
150   /*  first count number of contributors to each processor */
151   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
152   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
153   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
154   for ( i=0; i<aij->stash.n; i++ ) {
155     idx = aij->stash.idx[i];
156     for ( j=0; j<size; j++ ) {
157       if (idx >= owners[j] && idx < owners[j+1]) {
158         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
159       }
160     }
161   }
162   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
163 
164   /* inform other processors of number of messages and max length*/
165   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
166   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
167   nreceives = work[rank];
168   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
169   nmax = work[rank];
170   PetscFree(work);
171 
172   /* post receives:
173        1) each message will consist of ordered triples
174      (global row index, global column index, value); the indices are stored
175      as Scalars to simplify the message passing.
176        2) since we don't know how long each individual message is we
177      allocate the largest needed buffer for each receive. Potentially
178      this is a lot of wasted space.
179 
180 
181        This could be done better.
182   */
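  /* Sketch of the message layout used here: each stashed entry destined for a
     given processor occupies three consecutive Scalars in svalues,
        { (Scalar) global row, (Scalar) global column, value },
     so a message of length 3*nprocs[i] carries nprocs[i] matrix entries; the
     receiver in MatAssemblyEnd_MPIAIJ() divides the received count by 3. */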
183   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
184   CHKPTRQ(rvalues);
185   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
186   CHKPTRQ(recv_waits);
187   for ( i=0; i<nreceives; i++ ) {
188     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
189               comm,recv_waits+i);
190   }
191 
192   /* do sends:
193       1) starts[i] gives the starting index in svalues for stuff going to
194          the ith processor
195   */
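  /* Note: starts[] is a running prefix sum of nprocs[]; it is advanced while
     the stash is packed (starts[owner[i]]++) and then rebuilt below before
     the sends are posted. */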
196   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
197   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
198   CHKPTRQ(send_waits);
199   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
200   starts[0] = 0;
201   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
202   for ( i=0; i<aij->stash.n; i++ ) {
203     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
204     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
205     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
206   }
207   PetscFree(owner);
208   starts[0] = 0;
209   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
210   count = 0;
211   for ( i=0; i<size; i++ ) {
212     if (procs[i]) {
213       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
214                 comm,send_waits+count++);
215     }
216   }
217   PetscFree(starts); PetscFree(nprocs);
218 
219   /* Free cache space */
220   PLogInfo(0,"[%d]MatAssemblyBegin_MPIAIJ:Number of off processor values %d\n",rank,aij->stash.n);
221   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
222 
223   aij->svalues    = svalues;    aij->rvalues    = rvalues;
224   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
225   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
226   aij->rmax       = nmax;
227 
228   return 0;
229 }
230 extern int MatSetUpMultiply_MPIAIJ(Mat);
231 
232 static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
233 {
234   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
235   Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
236   MPI_Status  *send_status,recv_status;
237   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
238   int         row,col,other_disassembled,shift = C->indexshift;
239   Scalar      *values,val;
240   InsertMode  addv = aij->insertmode;
241 
242   /*  wait on receives */
243   while (count) {
244     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
245     /* unpack receives into our local space */
246     values = aij->rvalues + 3*imdex*aij->rmax;
247     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
248     n = n/3;
249     for ( i=0; i<n; i++ ) {
250       row = (int) PetscReal(values[3*i]) - aij->rstart;
251       col = (int) PetscReal(values[3*i+1]);
252       val = values[3*i+2];
253       if (col >= aij->cstart && col < aij->cend) {
254         col -= aij->cstart;
255         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
256       }
257       else {
258         if (mat->was_assembled) {
259           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
260           col = aij->colmap[col] + shift;
261           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
262             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
263             col = (int) PetscReal(values[3*i+1]);
264           }
265         }
266         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
267       }
268     }
269     count--;
270   }
271   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
272 
273   /* wait on sends */
274   if (aij->nsends) {
275     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
276     CHKPTRQ(send_status);
277     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
278     PetscFree(send_status);
279   }
280   PetscFree(aij->send_waits); PetscFree(aij->svalues);
281 
282   aij->insertmode = NOT_SET_VALUES;
283   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
285 
286   /* determine if any processor has disassembled; if so, we must
287      also disassemble ourselves so that we may reassemble. */
288   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
289   if (mat->was_assembled && !other_disassembled) {
290     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
291   }
292 
293   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
294     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
295   }
296   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
297   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
298 
299   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
300   return 0;
301 }
302 
303 static int MatZeroEntries_MPIAIJ(Mat A)
304 {
305   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
306   int        ierr;
307   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
308   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
309   return 0;
310 }
311 
312 /* the code does not do the diagonal entries correctly unless the
313    matrix is square and the column and row ownerships are identical.
314    This is a BUG. The only way to fix it seems to be to access
315    aij->A and aij->B directly and not through the MatZeroRows()
316    routine.
317 */
318 static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
319 {
320   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
321   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
322   int            *procs,*nprocs,j,found,idx,nsends,*work;
323   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
324   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
325   int            *lens,imdex,*lrows,*values;
326   MPI_Comm       comm = A->comm;
327   MPI_Request    *send_waits,*recv_waits;
328   MPI_Status     recv_status,*send_status;
329   IS             istmp;
330 
331   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
332   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
333 
334   /*  first count number of contributors to each processor */
335   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
336   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
337   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
338   for ( i=0; i<N; i++ ) {
339     idx = rows[i];
340     found = 0;
341     for ( j=0; j<size; j++ ) {
342       if (idx >= owners[j] && idx < owners[j+1]) {
343         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
344       }
345     }
346     if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
347   }
348   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
349 
350   /* inform other processors of number of messages and max length*/
351   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
352   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
353   nrecvs = work[rank];
354   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
355   nmax = work[rank];
356   PetscFree(work);
357 
358   /* post receives:   */
359   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
360   CHKPTRQ(rvalues);
361   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
362   CHKPTRQ(recv_waits);
363   for ( i=0; i<nrecvs; i++ ) {
364     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
365   }
366 
367   /* do sends:
368       1) starts[i] gives the starting index in svalues for stuff going to
369          the ith processor
370   */
371   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
372   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
373   CHKPTRQ(send_waits);
374   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
375   starts[0] = 0;
376   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
377   for ( i=0; i<N; i++ ) {
378     svalues[starts[owner[i]]++] = rows[i];
379   }
380   ISRestoreIndices(is,&rows);
381 
382   starts[0] = 0;
383   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
384   count = 0;
385   for ( i=0; i<size; i++ ) {
386     if (procs[i]) {
387       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
388     }
389   }
390   PetscFree(starts);
391 
392   base = owners[rank];
393 
394   /*  wait on receives */
395   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
396   source = lens + nrecvs;
397   count  = nrecvs; slen = 0;
398   while (count) {
399     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
400     /* unpack receives into our local space */
401     MPI_Get_count(&recv_status,MPI_INT,&n);
402     source[imdex]  = recv_status.MPI_SOURCE;
403     lens[imdex]  = n;
404     slen += n;
405     count--;
406   }
407   PetscFree(recv_waits);
408 
409   /* move the data into the send scatter */
410   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
411   count = 0;
412   for ( i=0; i<nrecvs; i++ ) {
413     values = rvalues + i*nmax;
414     for ( j=0; j<lens[i]; j++ ) {
415       lrows[count++] = values[j] - base;
416     }
417   }
418   PetscFree(rvalues); PetscFree(lens);
419   PetscFree(owner); PetscFree(nprocs);
420 
421   /* actually zap the local rows */
422   ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
423   PLogObjectParent(A,istmp);
424   PetscFree(lrows);
425   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
426   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
427   ierr = ISDestroy(istmp); CHKERRQ(ierr);
428 
429   /* wait on sends */
430   if (nsends) {
431     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
432     CHKPTRQ(send_status);
433     MPI_Waitall(nsends,send_waits,send_status);
434     PetscFree(send_status);
435   }
436   PetscFree(send_waits); PetscFree(svalues);
437 
438   return 0;
439 }
440 
441 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
442 {
443   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
444   int        ierr,nt;
445 
446   ierr = VecGetLocalSize(xx,&nt);  CHKERRQ(ierr);
447   if (nt != a->n) {
448     SETERRQ(1,"MatMult_MPIAIJ:Incompatiable parition of A and xx");
449   }
450   ierr = VecGetLocalSize(yy,&nt);  CHKERRQ(ierr);
451   if (nt != a->m) {
452     SETERRQ(1,"MatMult_MPIAIJ:Incompatible partition of A and yy");
453   }
454   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
455   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
456   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
457   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
458   return 0;
459 }
460 
461 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
462 {
463   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
464   int        ierr;
465   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
466   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
467   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
468   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
469   return 0;
470 }
471 
472 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
473 {
474   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
475   int        ierr;
476 
477   /* do nondiagonal part */
478   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
479   /* send it on its way */
480   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
481                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
482   /* do local part */
483   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
484   /* receive remote parts: note this assumes the values are not actually */
485   /* inserted in yy until the next line, which is true for my implementation*/
486   /* but may not always be true. */
487   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
488                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
489   return 0;
490 }
491 
492 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
493 {
494   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
495   int        ierr;
496 
497   /* do nondiagonal part */
498   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
499   /* send it on its way */
500   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
501                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
502   /* do local part */
503   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
504   /* receive remote parts: note this assumes the values are not actually */
505   /* inserted in zz until the next line, which is true for my implementation*/
506   /* but may not always be true. */
507   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
508                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
509   return 0;
510 }
511 
512 /*
513   This only works correctly for square matrices where the subblock A->A is the
514    diagonal block
515 */
516 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
517 {
518   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
519   if (a->M != a->N)
520     SETERRQ(1,"MatGetDiagonal_MPIAIJ:Supports only square matrix where A->A is diag block");
521   return MatGetDiagonal(a->A,v);
522 }
523 
524 static int MatScale_MPIAIJ(Scalar *aa,Mat A)
525 {
526   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
527   int        ierr;
528   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
529   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
530   return 0;
531 }
532 
533 static int MatDestroy_MPIAIJ(PetscObject obj)
534 {
535   Mat        mat = (Mat) obj;
536   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
537   int        ierr;
538 #if defined(PETSC_LOG)
539   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
540 #endif
541   PetscFree(aij->rowners);
542   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
543   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
544   if (aij->colmap) PetscFree(aij->colmap);
545   if (aij->garray) PetscFree(aij->garray);
546   if (aij->lvec)   VecDestroy(aij->lvec);
547   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
548   if (aij->rowvalues) PetscFree(aij->rowvalues);
549   PetscFree(aij);
550   PLogObjectDestroy(mat);
551   PetscHeaderDestroy(mat);
552   return 0;
553 }
554 #include "draw.h"
555 #include "pinclude/pviewer.h"
556 
557 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
558 {
559   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
560   int         ierr;
561 
562   if (aij->size == 1) {
563     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
564   }
565   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
566   return 0;
567 }
568 
569 static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
570 {
571   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
572   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
573   int         ierr, format,shift = C->indexshift,rank;
574   FILE        *fd;
575   ViewerType  vtype;
576 
577   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
578   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
579     ierr = ViewerGetFormat(viewer,&format);
580     if (format == ASCII_FORMAT_INFO_DETAILED) {
581       int nz, nzalloc, mem, flg;
582       MPI_Comm_rank(mat->comm,&rank);
583       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
584       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
585       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
586       PetscSequentialPhaseBegin(mat->comm,1);
587       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
588          rank,aij->m,nz,nzalloc,mem);
589       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
590          rank,aij->m,nz,nzalloc,mem);
591       ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
592       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz);
593       ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
594       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz);
595       fflush(fd);
596       PetscSequentialPhaseEnd(mat->comm,1);
597       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
598       return 0;
599     }
600     else if (format == ASCII_FORMAT_INFO) {
601       return 0;
602     }
603   }
604 
605   if (vtype == DRAW_VIEWER) {
606     Draw       draw;
607     PetscTruth isnull;
608     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
609     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
610   }
611 
612   if (vtype == ASCII_FILE_VIEWER) {
613     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
614     PetscSequentialPhaseBegin(mat->comm,1);
615     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
616            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
617            aij->cend);
618     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
619     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
620     fflush(fd);
621     PetscSequentialPhaseEnd(mat->comm,1);
622   }
623   else {
624     int size = aij->size;
625     rank = aij->rank;
626     if (size == 1) {
627       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
628     }
629     else {
630       /* assemble the entire matrix onto first processor. */
631       Mat         A;
632       Mat_SeqAIJ *Aloc;
633       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
634       Scalar      *a;
635 
636       if (!rank) {
637         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
638                CHKERRQ(ierr);
639       }
640       else {
641         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
642                CHKERRQ(ierr);
643       }
644       PLogObjectParent(mat,A);
645 
646       /* copy over the A part */
647       Aloc = (Mat_SeqAIJ*) aij->A->data;
648       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
649       row = aij->rstart;
650       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
651       for ( i=0; i<m; i++ ) {
652         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
653         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
654       }
655       aj = Aloc->j;
656       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
657 
658       /* copy over the B part */
659       Aloc = (Mat_SeqAIJ*) aij->B->data;
660       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
661       row = aij->rstart;
662       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
663       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
664       for ( i=0; i<m; i++ ) {
665         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
666         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
667       }
668       PetscFree(ct);
669       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
670       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
671       if (!rank) {
672         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
673       }
674       ierr = MatDestroy(A); CHKERRQ(ierr);
675     }
676   }
677   return 0;
678 }
679 
680 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
681 {
682   Mat         mat = (Mat) obj;
683   int         ierr;
684   ViewerType  vtype;
685 
686   if (!viewer) {
687     viewer = STDOUT_VIEWER_SELF;
688   }
689   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
690   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
691       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
692     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
693   }
694   else if (vtype == BINARY_FILE_VIEWER) {
695     return MatView_MPIAIJ_Binary(mat,viewer);
696   }
697   return 0;
698 }
699 
700 /*
701     This has to provide several versions:
702 
703      1) as in the sequential case
704      2) a) use only local smoothing, updating outer values only once
705         b) local smoothing, updating outer values each inner iteration
706      3) coloring, updating outer values between colors.
707 */
708 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
709                            double fshift,int its,Vec xx)
710 {
711   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
712   Mat        AA = mat->A, BB = mat->B;
713   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
714   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
715   int        ierr,*idx, *diag;
716   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
717   Vec        tt;
718 
719   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
720   xs = x + shift; /* shift by one for index start of 1 */
721   ls = ls + shift;
722   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
723   diag = A->diag;
724   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
725     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
726   }
727   if (flag & SOR_EISENSTAT) {
728     /* Let  A = L + D + U, where L is the strictly lower triangular part,
729     U is the strictly upper triangular part, and D is the diagonal. This routine applies
730 
731             (L + E)^{-1} A (U + E)^{-1}
732 
733     to a vector efficiently using Eisenstat's trick. This is for
734     the case of the SSOR preconditioner, so E is D/omega, where omega
735     is the relaxation factor.
736     */
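    /* Why the steps below produce this (a sketch): write
       A = (L+E) + (D-2E) + (U+E).  Then

          (L+E)^{-1} A (U+E)^{-1} b
             = (U+E)^{-1} b  +  (L+E)^{-1} ( b - (2E-D) (U+E)^{-1} b ),

       which is exactly the sequence computed below:
          x = (E+U)^{-1} b;  t = b - (2E-D) x;  t = (E+L)^{-1} t;  x = x + t. */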
737     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
738     PLogObjectParent(matin,tt);
739     VecGetArray(tt,&t);
740     scale = (2.0/omega) - 1.0;
741     /*  x = (E + U)^{-1} b */
742     VecSet(&zero,mat->lvec);
743     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
744                               mat->Mvctx); CHKERRQ(ierr);
745     for ( i=m-1; i>-1; i-- ) {
746       n    = A->i[i+1] - diag[i] - 1;
747       idx  = A->j + diag[i] + !shift;
748       v    = A->a + diag[i] + !shift;
749       sum  = b[i];
750       SPARSEDENSEMDOT(sum,xs,v,idx,n);
751       d    = fshift + A->a[diag[i]+shift];
752       n    = B->i[i+1] - B->i[i];
753       idx  = B->j + B->i[i] + shift;
754       v    = B->a + B->i[i] + shift;
755       SPARSEDENSEMDOT(sum,ls,v,idx,n);
756       x[i] = omega*(sum/d);
757     }
758     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
759                             mat->Mvctx); CHKERRQ(ierr);
760 
761     /*  t = b - (2*E - D)x */
762     v = A->a;
763     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
764 
765     /*  t = (E + L)^{-1}t */
766     ts = t + shift; /* shifted by one for index start of a or mat->j*/
767     diag = A->diag;
768     VecSet(&zero,mat->lvec);
769     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
770                                                  mat->Mvctx); CHKERRQ(ierr);
771     for ( i=0; i<m; i++ ) {
772       n    = diag[i] - A->i[i];
773       idx  = A->j + A->i[i] + shift;
774       v    = A->a + A->i[i] + shift;
775       sum  = t[i];
776       SPARSEDENSEMDOT(sum,ts,v,idx,n);
777       d    = fshift + A->a[diag[i]+shift];
778       n    = B->i[i+1] - B->i[i];
779       idx  = B->j + B->i[i] + shift;
780       v    = B->a + B->i[i] + shift;
781       SPARSEDENSEMDOT(sum,ls,v,idx,n);
782       t[i] = omega*(sum/d);
783     }
784     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
785                                                     mat->Mvctx); CHKERRQ(ierr);
786     /*  x = x + t */
787     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
788     VecDestroy(tt);
789     return 0;
790   }
791 
792 
793   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
794     if (flag & SOR_ZERO_INITIAL_GUESS) {
795       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
796     }
797     else {
798       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
799       CHKERRQ(ierr);
800       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
801       CHKERRQ(ierr);
802     }
803     while (its--) {
804       /* go down through the rows */
805       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
806                               mat->Mvctx); CHKERRQ(ierr);
807       for ( i=0; i<m; i++ ) {
808         n    = A->i[i+1] - A->i[i];
809         idx  = A->j + A->i[i] + shift;
810         v    = A->a + A->i[i] + shift;
811         sum  = b[i];
812         SPARSEDENSEMDOT(sum,xs,v,idx,n);
813         d    = fshift + A->a[diag[i]+shift];
814         n    = B->i[i+1] - B->i[i];
815         idx  = B->j + B->i[i] + shift;
816         v    = B->a + B->i[i] + shift;
817         SPARSEDENSEMDOT(sum,ls,v,idx,n);
818         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
819       }
820       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
821                             mat->Mvctx); CHKERRQ(ierr);
822       /* come up through the rows */
823       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
824                               mat->Mvctx); CHKERRQ(ierr);
825       for ( i=m-1; i>-1; i-- ) {
826         n    = A->i[i+1] - A->i[i];
827         idx  = A->j + A->i[i] + shift;
828         v    = A->a + A->i[i] + shift;
829         sum  = b[i];
830         SPARSEDENSEMDOT(sum,xs,v,idx,n);
831         d    = fshift + A->a[diag[i]+shift];
832         n    = B->i[i+1] - B->i[i];
833         idx  = B->j + B->i[i] + shift;
834         v    = B->a + B->i[i] + shift;
835         SPARSEDENSEMDOT(sum,ls,v,idx,n);
836         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
837       }
838       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
839                             mat->Mvctx); CHKERRQ(ierr);
840     }
841   }
842   else if (flag & SOR_FORWARD_SWEEP){
843     if (flag & SOR_ZERO_INITIAL_GUESS) {
844       VecSet(&zero,mat->lvec);
845       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
846                               mat->Mvctx); CHKERRQ(ierr);
847       for ( i=0; i<m; i++ ) {
848         n    = diag[i] - A->i[i];
849         idx  = A->j + A->i[i] + shift;
850         v    = A->a + A->i[i] + shift;
851         sum  = b[i];
852         SPARSEDENSEMDOT(sum,xs,v,idx,n);
853         d    = fshift + A->a[diag[i]+shift];
854         n    = B->i[i+1] - B->i[i];
855         idx  = B->j + B->i[i] + shift;
856         v    = B->a + B->i[i] + shift;
857         SPARSEDENSEMDOT(sum,ls,v,idx,n);
858         x[i] = omega*(sum/d);
859       }
860       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
861                             mat->Mvctx); CHKERRQ(ierr);
862       its--;
863     }
864     while (its--) {
865       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
866       CHKERRQ(ierr);
867       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
868       CHKERRQ(ierr);
869       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
870                               mat->Mvctx); CHKERRQ(ierr);
871       for ( i=0; i<m; i++ ) {
872         n    = A->i[i+1] - A->i[i];
873         idx  = A->j + A->i[i] + shift;
874         v    = A->a + A->i[i] + shift;
875         sum  = b[i];
876         SPARSEDENSEMDOT(sum,xs,v,idx,n);
877         d    = fshift + A->a[diag[i]+shift];
878         n    = B->i[i+1] - B->i[i];
879         idx  = B->j + B->i[i] + shift;
880         v    = B->a + B->i[i] + shift;
881         SPARSEDENSEMDOT(sum,ls,v,idx,n);
882         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
883       }
884       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
885                             mat->Mvctx); CHKERRQ(ierr);
886     }
887   }
888   else if (flag & SOR_BACKWARD_SWEEP){
889     if (flag & SOR_ZERO_INITIAL_GUESS) {
890       VecSet(&zero,mat->lvec);
891       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
892                               mat->Mvctx); CHKERRQ(ierr);
893       for ( i=m-1; i>-1; i-- ) {
894         n    = A->i[i+1] - diag[i] - 1;
895         idx  = A->j + diag[i] + !shift;
896         v    = A->a + diag[i] + !shift;
897         sum  = b[i];
898         SPARSEDENSEMDOT(sum,xs,v,idx,n);
899         d    = fshift + A->a[diag[i]+shift];
900         n    = B->i[i+1] - B->i[i];
901         idx  = B->j + B->i[i] + shift;
902         v    = B->a + B->i[i] + shift;
903         SPARSEDENSEMDOT(sum,ls,v,idx,n);
904         x[i] = omega*(sum/d);
905       }
906       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
907                             mat->Mvctx); CHKERRQ(ierr);
908       its--;
909     }
910     while (its--) {
911       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
912                             mat->Mvctx); CHKERRQ(ierr);
913       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
914                             mat->Mvctx); CHKERRQ(ierr);
915       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
916                               mat->Mvctx); CHKERRQ(ierr);
917       for ( i=m-1; i>-1; i-- ) {
918         n    = A->i[i+1] - A->i[i];
919         idx  = A->j + A->i[i] + shift;
920         v    = A->a + A->i[i] + shift;
921         sum  = b[i];
922         SPARSEDENSEMDOT(sum,xs,v,idx,n);
923         d    = fshift + A->a[diag[i]+shift];
924         n    = B->i[i+1] - B->i[i];
925         idx  = B->j + B->i[i] + shift;
926         v    = B->a + B->i[i] + shift;
927         SPARSEDENSEMDOT(sum,ls,v,idx,n);
928         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
929       }
930       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
931                             mat->Mvctx); CHKERRQ(ierr);
932     }
933   }
934   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
935     if (flag & SOR_ZERO_INITIAL_GUESS) {
936       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
937     }
938     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
939     CHKERRQ(ierr);
940     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
941     CHKERRQ(ierr);
942     while (its--) {
943       /* go down through the rows */
944       for ( i=0; i<m; i++ ) {
945         n    = A->i[i+1] - A->i[i];
946         idx  = A->j + A->i[i] + shift;
947         v    = A->a + A->i[i] + shift;
948         sum  = b[i];
949         SPARSEDENSEMDOT(sum,xs,v,idx,n);
950         d    = fshift + A->a[diag[i]+shift];
951         n    = B->i[i+1] - B->i[i];
952         idx  = B->j + B->i[i] + shift;
953         v    = B->a + B->i[i] + shift;
954         SPARSEDENSEMDOT(sum,ls,v,idx,n);
955         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
956       }
957       /* come up through the rows */
958       for ( i=m-1; i>-1; i-- ) {
959         n    = A->i[i+1] - A->i[i];
960         idx  = A->j + A->i[i] + shift;
961         v    = A->a + A->i[i] + shift;
962         sum  = b[i];
963         SPARSEDENSEMDOT(sum,xs,v,idx,n);
964         d    = fshift + A->a[diag[i]+shift];
965         n    = B->i[i+1] - B->i[i];
966         idx  = B->j + B->i[i] + shift;
967         v    = B->a + B->i[i] + shift;
968         SPARSEDENSEMDOT(sum,ls,v,idx,n);
969         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
970       }
971     }
972   }
973   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
974     if (flag & SOR_ZERO_INITIAL_GUESS) {
975       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
976     }
977     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
978     CHKERRQ(ierr);
979     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
980     CHKERRQ(ierr);
981     while (its--) {
982       for ( i=0; i<m; i++ ) {
983         n    = A->i[i+1] - A->i[i];
984         idx  = A->j + A->i[i] + shift;
985         v    = A->a + A->i[i] + shift;
986         sum  = b[i];
987         SPARSEDENSEMDOT(sum,xs,v,idx,n);
988         d    = fshift + A->a[diag[i]+shift];
989         n    = B->i[i+1] - B->i[i];
990         idx  = B->j + B->i[i] + shift;
991         v    = B->a + B->i[i] + shift;
992         SPARSEDENSEMDOT(sum,ls,v,idx,n);
993         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
994       }
995     }
996   }
997   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
998     if (flag & SOR_ZERO_INITIAL_GUESS) {
999       return (*mat->A->ops.relax)(mat->A,bb,omega,flag,fshift,its,xx);
1000     }
1001     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
1002                             mat->Mvctx); CHKERRQ(ierr);
1003     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
1004                             mat->Mvctx); CHKERRQ(ierr);
1005     while (its--) {
1006       for ( i=m-1; i>-1; i-- ) {
1007         n    = A->i[i+1] - A->i[i];
1008         idx  = A->j + A->i[i] + shift;
1009         v    = A->a + A->i[i] + shift;
1010         sum  = b[i];
1011         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1012         d    = fshift + A->a[diag[i]+shift];
1013         n    = B->i[i+1] - B->i[i];
1014         idx  = B->j + B->i[i] + shift;
1015         v    = B->a + B->i[i] + shift;
1016         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1017         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
1018       }
1019     }
1020   }
1021   return 0;
1022 }
1023 
1024 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
1025                              int *nzalloc,int *mem)
1026 {
1027   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1028   Mat        A = mat->A, B = mat->B;
1029   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1030 
1031   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1032   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1033   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1034   if (flag == MAT_LOCAL) {
1035     if (nz)       *nz      = isend[0];
1036     if (nzalloc)  *nzalloc = isend[1];
1037     if (mem)      *mem     = isend[2];
1038   } else if (flag == MAT_GLOBAL_MAX) {
1039     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1040     if (nz)      *nz      = irecv[0];
1041     if (nzalloc) *nzalloc = irecv[1];
1042     if (mem)     *mem     = irecv[2];
1043   } else if (flag == MAT_GLOBAL_SUM) {
1044     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1045     if (nz)      *nz      = irecv[0];
1046     if (nzalloc) *nzalloc = irecv[1];
1047     if (mem)     *mem     = irecv[2];
1048   }
1049   return 0;
1050 }
1051 
1052 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1053 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1054 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1055 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1056 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1057 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1058 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1059 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1060 
1061 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1062 {
1063   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1064 
1065   if (op == NO_NEW_NONZERO_LOCATIONS ||
1066       op == YES_NEW_NONZERO_LOCATIONS ||
1067       op == COLUMNS_SORTED ||
1068       op == ROW_ORIENTED) {
1069         MatSetOption(a->A,op);
1070         MatSetOption(a->B,op);
1071   }
1072   else if (op == ROWS_SORTED ||
1073            op == SYMMETRIC_MATRIX ||
1074            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1075            op == YES_NEW_DIAGONALS)
1076     PLogInfo(A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1077   else if (op == COLUMN_ORIENTED) {
1078     a->roworiented = 0;
1079     MatSetOption(a->A,op);
1080     MatSetOption(a->B,op);
1081   }
1082   else if (op == NO_NEW_DIAGONALS)
1083     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1084   else
1085     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1086   return 0;
1087 }
1088 
1089 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1090 {
1091   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1092   *m = mat->M; *n = mat->N;
1093   return 0;
1094 }
1095 
1096 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1097 {
1098   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1099   *m = mat->m; *n = mat->n;
1100   return 0;
1101 }
1102 
1103 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1104 {
1105   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1106   *m = mat->rstart; *n = mat->rend;
1107   return 0;
1108 }
1109 
1110 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1111 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1112 
1113 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1114 {
1115   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1116   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1117   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1118   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1119   int        *cmap, *idx_p;
1120 
1121   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,"MatGetRow_MPIAIJ:Already active");
1122   mat->getrowactive = PETSC_TRUE;
1123 
1124   if (!mat->rowvalues && (idx || v)) {
1125     /*
1126         allocate enough space to hold information from the longest row.
1127     */
1128     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1129     int     max = 1,m = mat->m,tmp;
1130     for ( i=0; i<m; i++ ) {
1131       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1132       if (max < tmp) { max = tmp; }
1133     }
1134     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1135     CHKPTRQ(mat->rowvalues);
1136     mat->rowindices = (int *) (mat->rowvalues + max);
1137   }
1138 
1139 
1140   if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows");
1141   lrow = row - rstart;
1142 
1143   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1144   if (!v)   {pvA = 0; pvB = 0;}
1145   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1146   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1147   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1148   nztot = nzA + nzB;
1149 
1150   cmap  = mat->garray;
1151   if (v  || idx) {
1152     if (nztot) {
1153       /* Sort by increasing column numbers, assuming A and B already sorted */
1154       int imark = -1;
1155       if (v) {
1156         *v = v_p = mat->rowvalues;
1157         for ( i=0; i<nzB; i++ ) {
1158           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1159           else break;
1160         }
1161         imark = i;
1162         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1163         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1164       }
1165       if (idx) {
1166         *idx = idx_p = mat->rowindices;
1167         if (imark > -1) {
1168           for ( i=0; i<imark; i++ ) {
1169             idx_p[i] = cmap[cworkB[i]];
1170           }
1171         } else {
1172           for ( i=0; i<nzB; i++ ) {
1173             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1174             else break;
1175           }
1176           imark = i;
1177         }
1178         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1179         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1180       }
1181     }
1182     else {if (idx) *idx = 0; if (v) *v = 0;}
1183   }
1184   *nz = nztot;
1185   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1186   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1187   return 0;
1188 }
1189 
1190 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1191 {
1192   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1193   if (aij->getrowactive == PETSC_FALSE) {
1194     SETERRQ(1,"MatRestoreRow_MPIAIJ:MatGetRow not called");
1195   }
1196   aij->getrowactive = PETSC_FALSE;
1197   return 0;
1198 }
1199 
1200 static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1201 {
1202   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1203   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1204   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1205   double     sum = 0.0;
1206   Scalar     *v;
1207 
1208   if (aij->size == 1) {
1209     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1210   } else {
1211     if (type == NORM_FROBENIUS) {
1212       v = amat->a;
1213       for (i=0; i<amat->nz; i++ ) {
1214 #if defined(PETSC_COMPLEX)
1215         sum += real(conj(*v)*(*v)); v++;
1216 #else
1217         sum += (*v)*(*v); v++;
1218 #endif
1219       }
1220       v = bmat->a;
1221       for (i=0; i<bmat->nz; i++ ) {
1222 #if defined(PETSC_COMPLEX)
1223         sum += real(conj(*v)*(*v)); v++;
1224 #else
1225         sum += (*v)*(*v); v++;
1226 #endif
1227       }
1228       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1229       *norm = sqrt(*norm);
1230     }
1231     else if (type == NORM_1) { /* max column norm */
1232       double *tmp, *tmp2;
1233       int    *jj, *garray = aij->garray;
1234       tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1235       tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1236       PetscMemzero(tmp,aij->N*sizeof(double));
1237       *norm = 0.0;
1238       v = amat->a; jj = amat->j;
1239       for ( j=0; j<amat->nz; j++ ) {
1240         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1241       }
1242       v = bmat->a; jj = bmat->j;
1243       for ( j=0; j<bmat->nz; j++ ) {
1244         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1245       }
1246       MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1247       for ( j=0; j<aij->N; j++ ) {
1248         if (tmp2[j] > *norm) *norm = tmp2[j];
1249       }
1250       PetscFree(tmp); PetscFree(tmp2);
1251     }
1252     else if (type == NORM_INFINITY) { /* max row norm */
1253       double ntemp = 0.0;
1254       for ( j=0; j<amat->m; j++ ) {
1255         v = amat->a + amat->i[j] + shift;
1256         sum = 0.0;
1257         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1258           sum += PetscAbsScalar(*v); v++;
1259         }
1260         v = bmat->a + bmat->i[j] + shift;
1261         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1262           sum += PetscAbsScalar(*v); v++;
1263         }
1264         if (sum > ntemp) ntemp = sum;
1265       }
1266       MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1267     }
1268     else {
1269       SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
1270     }
1271   }
1272   return 0;
1273 }
1274 
1275 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1276 {
1277   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1278   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1279   int        ierr,shift = Aloc->indexshift;
1280   Mat        B;
1281   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1282   Scalar     *array;
1283 
1284   if (matout == PETSC_NULL && M != N)
1285     SETERRQ(1,"MatTranspose_MPIAIJ:Square matrix only for in-place");
1286   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,
1287          PETSC_NULL,&B); CHKERRQ(ierr);
1288 
1289   /* copy over the A part */
1290   Aloc = (Mat_SeqAIJ*) a->A->data;
1291   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1292   row = a->rstart;
1293   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1294   for ( i=0; i<m; i++ ) {
1295     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1296     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1297   }
1298   aj = Aloc->j;
1299   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1300 
1301   /* copy over the B part */
1302   Aloc = (Mat_SeqAIJ*) a->B->data;
1303   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1304   row = a->rstart;
1305   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1306   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1307   for ( i=0; i<m; i++ ) {
1308     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1309     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1310   }
1311   PetscFree(ct);
1312   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1313   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1314   if (matout != PETSC_NULL) {
1315     *matout = B;
1316   } else {
1317     /* This isn't really an in-place transpose, but we free the data structures of a */
1318     PetscFree(a->rowners);
1319     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1320     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1321     if (a->colmap) PetscFree(a->colmap);
1322     if (a->garray) PetscFree(a->garray);
1323     if (a->lvec) VecDestroy(a->lvec);
1324     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1325     PetscFree(a);
1326     PetscMemcpy(A,B,sizeof(struct _Mat));
1327     PetscHeaderDestroy(B);
1328   }
1329   return 0;
1330 }
1331 
1332 extern int MatPrintHelp_SeqAIJ(Mat);
1333 static int MatPrintHelp_MPIAIJ(Mat A)
1334 {
1335   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1336 
1337   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1338   else return 0;
1339 }
1340 
1341 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1342 static int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1343 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1344 int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1345 /* -------------------------------------------------------------------*/
1346 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1347        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1348        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1349        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1350        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1351        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1352        MatLUFactor_MPIAIJ,0,
1353        MatRelax_MPIAIJ,
1354        MatTranspose_MPIAIJ,
1355        MatGetInfo_MPIAIJ,0,
1356        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1357        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1358        0,
1359        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1360        MatGetReordering_MPIAIJ,
1361        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1362        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1363        MatILUFactorSymbolic_MPIAIJ,0,
1364        0,0,MatConvert_MPIAIJ,0,0,MatConvertSameType_MPIAIJ,0,0,
1365        0,0,0,
1366        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1367        MatPrintHelp_MPIAIJ,
1368        MatScale_MPIAIJ};
1369 
1370 /*@C
1371    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1372    (the default parallel PETSc format).  For good matrix assembly performance
1373    the user should preallocate the matrix storage by setting the parameters
1374    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1375    performance can be increased by more than a factor of 50.
1376 
1377    Input Parameters:
1378 .  comm - MPI communicator
1379 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1380 .      This should be the same as the local size used in creating the
1381 .      y vector in y = Ax
1382 .  n - number of local columns (or PETSC_DECIDE to have calculated
1383            if N is given)
1384 .      This should be the same as the local size used in creating the
1385 .      x vector in y = Ax
1386 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1387 .  N - number of global columns (or PETSC_DECIDE to have calculated
1388            if n is given)
1389 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1390            (same for all local rows)
1391 .  d_nnz - array containing the number of nonzeros in the various rows of the
1392            diagonal portion of local submatrix (possibly different for each row)
1393            or PETSC_NULL. You must leave room for the diagonal entry even if
1394            it is zero.
1395 .  o_nz - number of nonzeros per row in the off-diagonal portion of local
1396            submatrix (same for all local rows).
1397 .  o_nzz - array containing the number of nonzeros in the various rows of the
1398            off-diagonal portion of the local submatrix (possibly different for
1399            each row) or PETSC_NULL.
1400 
1401    Output Parameter:
1402 .  A - the matrix
1403 
1404    Notes:
1405    The AIJ format (also called the Yale sparse matrix format or
1406    compressed row storage), is fully compatible with standard Fortran 77
1407    storage.  That is, the stored row and column indices can begin at
1408    either one (as in Fortran) or zero.  See the users manual for details.
1409 
1410    The user MUST specify either the local or global matrix dimensions
1411    (possibly both).
1412 
1413    By default, this format uses inodes (identical nodes) when possible.
1414    We search for consecutive rows with the same nonzero structure, thereby
1415    reusing matrix information to achieve increased efficiency.
1416 
1417    Options Database Keys:
1418 $    -mat_aij_no_inode  - Do not use inodes
1419 $    -mat_aij_inode_limit <limit> - Set inode limit.
1420 $        (max limit=5)
1421 $    -mat_aij_oneindex - Internally use indexing starting at 1
1422 $        rather than 0.  Note: When calling MatSetValues(),
1423 $        the user still MUST index entries starting at 0!
1424 
1425    Storage Information:
1426    For a square global matrix we define each processor's diagonal portion
1427    to be its local rows and the corresponding columns (a square submatrix);
1428    each processor's off-diagonal portion encompasses the remainder of the
1429    local matrix (a rectangular submatrix).
1430 
1431    The user can specify preallocated storage for the diagonal part of
1432    the local submatrix with either d_nz or d_nnz (not both).  Set
1433    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1434    memory allocation.  Likewise, specify preallocated storage for the
1435    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1436 
1437    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1438    the figure below we depict these three local rows and all columns (0-11).
1439 
1440 $          0 1 2 3 4 5 6 7 8 9 10 11
1441 $         --------------------------
1442 $  row 3  |  o o o d d d o o o o o o
1443 $  row 4  |  o o o d d d o o o o o o
1444 $  row 5  |  o o o d d d o o o o o o
1445 $         --------------------------
1446 $
1447 
1448    Thus, any entries in the d locations are stored in the d (diagonal)
1449    submatrix, and any entries in the o locations are stored in the
1450    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1451    stored simply in the MATSEQAIJ format for compressed row storage.
1452 
1453    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1454    and o_nz should indicate the number of nonzeros per row in the o matrix.
1455    In general, for PDE problems in which most nonzeros are near the diagonal,
1456    one expects d_nz >> o_nz. For large problems you MUST preallocate memory
1457    or you will get TERRIBLE performance; see the users manual chapter on
1458    matrices and the file $(PETSC_DIR)/Performance.
1459 
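   Example:
   A minimal sketch (the communicator, sizes, and nonzero counts are purely
   illustrative, chosen to match the three-row figure above; error checking
   is omitted):
$      Mat A;
$      int M = 12, N = 12, ierr;
$      ierr = MatCreateMPIAIJ(MPI_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,
$                             3,PETSC_NULL,9,PETSC_NULL,&A);
$      /* ... insert entries with MatSetValues() ... */
$      ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY);
$      ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY);
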
1460 .keywords: matrix, aij, compressed row, sparse, parallel
1461 
1462 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1463 @*/
1464 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1465                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1466 {
1467   Mat          B;
1468   Mat_MPIAIJ   *b;
1469   int          ierr, i,sum[2],work[2];
1470 
1471   *A = 0;
1472   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1473   PLogObjectCreate(B);
1474   B->data       = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1475   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1476   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1477   B->destroy    = MatDestroy_MPIAIJ;
1478   B->view       = MatView_MPIAIJ;
1479   B->factor     = 0;
1480   B->assembled  = PETSC_FALSE;
1481 
1482   b->insertmode = NOT_SET_VALUES;
1483   MPI_Comm_rank(comm,&b->rank);
1484   MPI_Comm_size(comm,&b->size);
1485 
1486   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1487     SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1488 
1489   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1490     work[0] = m; work[1] = n;
1491     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1492     if (M == PETSC_DECIDE) M = sum[0];
1493     if (N == PETSC_DECIDE) N = sum[1];
1494   }
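  /* if the local sizes were left as PETSC_DECIDE, split the rows (columns) as
     evenly as possible: the first M % size (N % size) processors get one extra */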
1495   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1496   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1497   b->m = m; B->m = m;
1498   b->n = n; B->n = n;
1499   b->N = N; B->N = N;
1500   b->M = M; B->M = M;
1501 
1502   /* build local table of row and column ownerships */
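  /* rowners[i] (cowners[i]) gives the first global row (column) owned by
     processor i: gather each processor's local count, then form running sums */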
1503   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1504   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1505   b->cowners = b->rowners + b->size + 1;
1506   MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
1507   b->rowners[0] = 0;
1508   for ( i=2; i<=b->size; i++ ) {
1509     b->rowners[i] += b->rowners[i-1];
1510   }
1511   b->rstart = b->rowners[b->rank];
1512   b->rend   = b->rowners[b->rank+1];
1513   MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1514   b->cowners[0] = 0;
1515   for ( i=2; i<=b->size; i++ ) {
1516     b->cowners[i] += b->cowners[i-1];
1517   }
1518   b->cstart = b->cowners[b->rank];
1519   b->cend   = b->cowners[b->rank+1];
1520 
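  /* create the two sequential blocks: A holds the locally owned rows restricted
     to the locally owned columns (the m x n "diagonal" block), while B holds the
     remaining local entries and is created with the full global column width N */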
1521   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1522   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1523   PLogObjectParent(B,b->A);
1524   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1525   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1526   PLogObjectParent(B,b->B);
1527 
1528   /* build cache (stash) for entries destined for rows owned by other processors */
1529   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1530   b->colmap      = 0;
1531   b->garray      = 0;
1532   b->roworiented = 1;
1533 
1534   /* stuff used for matrix vector multiply */
1535   b->lvec      = 0;
1536   b->Mvctx     = 0;
1537 
1538   /* stuff for MatGetRow() */
1539   b->rowindices   = 0;
1540   b->rowvalues    = 0;
1541   b->getrowactive = PETSC_FALSE;
1542 
1543   *A = B;
1544   return 0;
1545 }
1546 
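/*
    Duplicates a MATMPIAIJ matrix: the new matrix gets the same parallel layout
    (ownership ranges and column map) plus duplicates of the local A (diagonal)
    and B (off-diagonal) blocks, the local work vector, and the scatter context.
*/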
1547 static int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1548 {
1549   Mat        mat;
1550   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1551   int        ierr, len=0, flg;
1552 
1553   *newmat       = 0;
1554   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
1555   PLogObjectCreate(mat);
1556   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1557   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1558   mat->destroy    = MatDestroy_MPIAIJ;
1559   mat->view       = MatView_MPIAIJ;
1560   mat->factor     = matin->factor;
1561   mat->assembled  = PETSC_TRUE;
1562 
1563   a->m = mat->m   = oldmat->m;
1564   a->n = mat->n   = oldmat->n;
1565   a->M = mat->M   = oldmat->M;
1566   a->N = mat->N   = oldmat->N;
1567 
1568   a->rstart       = oldmat->rstart;
1569   a->rend         = oldmat->rend;
1570   a->cstart       = oldmat->cstart;
1571   a->cend         = oldmat->cend;
1572   a->size         = oldmat->size;
1573   a->rank         = oldmat->rank;
1574   a->insertmode   = NOT_SET_VALUES;
1575   a->rowvalues    = 0;
1576   a->getrowactive = PETSC_FALSE;
1577 
1578   a->rowners = (int *) PetscMalloc((a->size+1)*sizeof(int)); CHKPTRQ(a->rowners);
1579   PLogObjectMemory(mat,(a->size+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1580   PetscMemcpy(a->rowners,oldmat->rowners,(a->size+1)*sizeof(int));
1581   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1582   if (oldmat->colmap) {
1583     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1584     PLogObjectMemory(mat,(a->N)*sizeof(int));
1585     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1586   } else a->colmap = 0;
1587   if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
1588     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1589     PLogObjectMemory(mat,len*sizeof(int));
1590     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1591   } else a->garray = 0;
1592 
1593   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1594   PLogObjectParent(mat,a->lvec);
1595   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1596   PLogObjectParent(mat,a->Mvctx);
1597   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1598   PLogObjectParent(mat,a->A);
1599   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1600   PLogObjectParent(mat,a->B);
1601   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1602   if (flg) {
1603     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1604   }
1605   *newmat = mat;
1606   return 0;
1607 }
1608 
1609 #include "sys.h"
1610 
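/*
    Loads a parallel AIJ matrix from a binary viewer.  Process 0 reads the
    header, the row lengths, and then the column indices and numerical values,
    shipping each processor the data for the rows it owns; every processor
    then inserts its own rows and the matrix is assembled.
*/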
1611 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
1612 {
1613   Mat          A;
1614   int          i, nz, ierr, j,rstart, rend, fd;
1615   Scalar       *vals,*svals;
1616   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1617   MPI_Status   status;
1618   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1619   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1620   int          tag = ((PetscObject)viewer)->tag;
1621 
1622   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1623   if (!rank) {
1624     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1625     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1626     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJ:not matrix object");
1627   }
1628 
1629   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1630   M = header[1]; N = header[2];
1631   /* determine ownership of all rows */
1632   m = M/size + ((M % size) > rank);
1633   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1634   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1635   rowners[0] = 0;
1636   for ( i=2; i<=size; i++ ) {
1637     rowners[i] += rowners[i-1];
1638   }
1639   rstart = rowners[rank];
1640   rend   = rowners[rank+1];
1641 
1642   /* distribute row lengths to all processors */
1643   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1644   offlens = ourlens + (rend-rstart);
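  /* ourlens and offlens share a single allocation: ourlens[] first receives the
     full row lengths, offlens[] is filled below with the off-diagonal count per row */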
1645   if (!rank) {
1646     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1647     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1648     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1649     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1650     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1651     PetscFree(sndcounts);
1652   }
1653   else {
1654     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1655   }
1656 
1657   if (!rank) {
1658     /* calculate the number of nonzeros on each processor */
1659     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1660     PetscMemzero(procsnz,size*sizeof(int));
1661     for ( i=0; i<size; i++ ) {
1662       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1663         procsnz[i] += rowlengths[j];
1664       }
1665     }
1666     PetscFree(rowlengths);
1667 
1668     /* determine max buffer needed and allocate it */
1669     maxnz = 0;
1670     for ( i=0; i<size; i++ ) {
1671       maxnz = PetscMax(maxnz,procsnz[i]);
1672     }
1673     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1674 
1675     /* read in my part of the matrix column indices  */
1676     nz = procsnz[0];
1677     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1678     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1679 
1680     /* read in everyone else's portion and ship it off */
1681     for ( i=1; i<size; i++ ) {
1682       nz = procsnz[i];
1683       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1684       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1685     }
1686     PetscFree(cols);
1687   }
1688   else {
1689     /* determine buffer space needed for message */
1690     nz = 0;
1691     for ( i=0; i<m; i++ ) {
1692       nz += ourlens[i];
1693     }
1694     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1695 
1696     /* receive message of column indices */
1697     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1698     MPI_Get_count(&status,MPI_INT,&maxnz);
1699     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1700   }
1701 
1702   /* loop over local rows, determining number of off-diagonal entries */
1703   PetscMemzero(offlens,m*sizeof(int));
1704   jj = 0;
1705   for ( i=0; i<m; i++ ) {
1706     for ( j=0; j<ourlens[i]; j++ ) {
1707       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1708       jj++;
1709     }
1710   }
1711 
1712   /* create our matrix */
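  /* ourlens[] currently holds the total entries per row; temporarily subtract the
     off-diagonal counts so ourlens/offlens can be passed as the d_nnz/o_nnz
     preallocation, then restore the full lengths for the insertion loops below */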
1713   for ( i=0; i<m; i++ ) {
1714     ourlens[i] -= offlens[i];
1715   }
1716   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1717   A = *newmat;
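  /* inform the matrix that each row's column indices will be supplied in sorted
     order (as stored in the file), which speeds up insertion */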
1718   MatSetOption(A,COLUMNS_SORTED);
1719   for ( i=0; i<m; i++ ) {
1720     ourlens[i] += offlens[i];
1721   }
1722 
1723   if (!rank) {
1724     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1725 
1726     /* read in my part of the matrix numerical values  */
1727     nz = procsnz[0];
1728     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1729 
1730     /* insert into matrix */
1731     jj      = rstart;
1732     smycols = mycols;
1733     svals   = vals;
1734     for ( i=0; i<m; i++ ) {
1735       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1736       smycols += ourlens[i];
1737       svals   += ourlens[i];
1738       jj++;
1739     }
1740 
1741     /* read in the other processors' parts and ship them out */
1742     for ( i=1; i<size; i++ ) {
1743       nz = procsnz[i];
1744       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1745       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1746     }
1747     PetscFree(procsnz);
1748   }
1749   else {
1750     /* receive numeric values */
1751     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1752 
1753     /* receive message of values */
1754     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1755     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1756     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1757 
1758     /* insert into matrix */
1759     jj      = rstart;
1760     smycols = mycols;
1761     svals   = vals;
1762     for ( i=0; i<m; i++ ) {
1763       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1764       smycols += ourlens[i];
1765       svals   += ourlens[i];
1766       jj++;
1767     }
1768   }
1769   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1770 
1771   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1772   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1773   return 0;
1774 }
1775