xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 8d1cd36bbae09094f8343e481f39717c4ade5555)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiaij.c,v 1.137 1996/04/03 14:39:22 curfman Exp curfman $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "vec/vecimpl.h"
7 #include "inline/spops.h"
8 
9 /* local utility routine that creates a mapping from the global column
10 number to the local number in the off-diagonal part of the local
11 storage of the matrix.  This is done in a non-scalable way, since the
12 length of colmap equals the global number of columns in the matrix.
13 */
14 static int CreateColmap_Private(Mat mat)
15 {
16   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
17   Mat_SeqAIJ *B = (Mat_SeqAIJ*) aij->B->data;
18   int        n = B->n,i,shift = B->indexshift;
19 
20   aij->colmap = (int *) PetscMalloc(aij->N*sizeof(int));CHKPTRQ(aij->colmap);
21   PLogObjectMemory(mat,aij->N*sizeof(int));
22   PetscMemzero(aij->colmap,aij->N*sizeof(int));
23   for ( i=0; i<n; i++ ) aij->colmap[aij->garray[i]] = i-shift;
24   return 0;
25 }
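/*
   (Added note) The map is consulted as, e.g.,
        local_col = aij->colmap[global_col] + shift;
   see MatSetValues_MPIAIJ() and MatAssemblyEnd_MPIAIJ() below.
*/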
26 
27 extern int DisAssemble_MPIAIJ(Mat);
28 
29 static int MatGetReordering_MPIAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
30 {
31   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
32   int        ierr;
33   if (aij->size == 1) {
34     ierr = MatGetReordering(aij->A,type,rperm,cperm); CHKERRQ(ierr);
35   } else SETERRQ(1,"MatGetReordering_MPIAIJ:not supported in parallel");
36   return 0;
37 }
38 
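/*
   (Added note) Values destined for locally owned rows are inserted directly
   into the diagonal block (aij->A) or the off-diagonal block (aij->B);
   values for rows owned by other processors are stashed locally and
   communicated during MatAssemblyBegin/End_MPIAIJ().
*/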
39 static int MatSetValues_MPIAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
42   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
43   Scalar     value;
44   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
45   int        cstart = aij->cstart, cend = aij->cend,row,col;
46   int        shift = C->indexshift,roworiented = aij->roworiented;
47 
48   if (aij->insertmode != NOT_SET_VALUES && aij->insertmode != addv) {
49     SETERRQ(1,"MatSetValues_MPIAIJ:Cannot mix inserts and adds");
50   }
51   aij->insertmode = addv;
52   for ( i=0; i<m; i++ ) {
53     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative row");
54     if (im[i] >= aij->M) SETERRQ(1,"MatSetValues_MPIAIJ:Row too large");
55     if (im[i] >= rstart && im[i] < rend) {
56       row = im[i] - rstart;
57       for ( j=0; j<n; j++ ) {
58         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIAIJ:Negative column");
59         if (in[j] >= aij->N) SETERRQ(1,"MatSetValues_MPIAIJ:Col too large");
60         if (in[j] >= cstart && in[j] < cend){
61           col = in[j] - cstart;
62           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
63           ierr = MatSetValues(aij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
64         }
65         else {
66           if (mat->was_assembled) {
67             if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
68             col = aij->colmap[in[j]] + shift;
69             if (col < 0 && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
70               ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
71               col =  in[j];
72             }
73           }
74           else col = in[j];
75           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
76           ierr = MatSetValues(aij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
77         }
78       }
79     }
80     else {
81       if (roworiented) {
82         ierr = StashValues_Private(&aij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
83       }
84       else {
85         row = im[i];
86         for ( j=0; j<n; j++ ) {
87           ierr = StashValues_Private(&aij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
88         }
89       }
90     }
91   }
92   return 0;
93 }
94 
95 static int MatGetValues_MPIAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
96 {
97   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
98   Mat_SeqAIJ *C = (Mat_SeqAIJ*) aij->A->data;
99   int        ierr,i,j, rstart = aij->rstart, rend = aij->rend;
100   int        cstart = aij->cstart, cend = aij->cend,row,col;
101   int        shift = C->indexshift;
102 
103   for ( i=0; i<m; i++ ) {
104     if (idxm[i] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative row");
105     if (idxm[i] >= aij->M) SETERRQ(1,"MatGetValues_MPIAIJ:Row too large");
106     if (idxm[i] >= rstart && idxm[i] < rend) {
107       row = idxm[i] - rstart;
108       for ( j=0; j<n; j++ ) {
109         if (idxn[j] < 0) SETERRQ(1,"MatGetValues_MPIAIJ:Negative column");
110         if (idxn[j] >= aij->N) SETERRQ(1,"MatGetValues_MPIAIJ:Col too large");
111         if (idxn[j] >= cstart && idxn[j] < cend){
112           col = idxn[j] - cstart;
113           ierr = MatGetValues(aij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
114         }
115         else {
116           col = aij->colmap[idxn[j]] + shift;
117           ierr = MatGetValues(aij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
118         }
119       }
120     }
121     else {
122       SETERRQ(1,"MatGetValues_MPIAIJ:Only local values currently supported");
123     }
124   }
125   return 0;
126 }
127 
128 static int MatAssemblyBegin_MPIAIJ(Mat mat,MatAssemblyType mode)
129 {
130   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
131   MPI_Comm    comm = mat->comm;
132   int         size = aij->size, *owners = aij->rowners;
133   int         rank = aij->rank,tag = mat->tag, *owner,*starts,count,ierr;
134   MPI_Request *send_waits,*recv_waits;
135   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
136   InsertMode  addv;
137   Scalar      *rvalues,*svalues;
138 
139   /* make sure all processors are using the same mode (INSERT_VALUES or ADD_VALUES) */
140   MPI_Allreduce(&aij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
141   if (addv == (ADD_VALUES|INSERT_VALUES)) {
142     SETERRQ(1,"MatAssemblyBegin_MPIAIJ:Some processors inserted others added");
143   }
144   aij->insertmode = addv; /* in case this processor had no cache */
145 
146   /*  first count number of contributors to each processor */
147   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
148   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
149   owner = (int *) PetscMalloc( (aij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
150   for ( i=0; i<aij->stash.n; i++ ) {
151     idx = aij->stash.idx[i];
152     for ( j=0; j<size; j++ ) {
153       if (idx >= owners[j] && idx < owners[j+1]) {
154         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
155       }
156     }
157   }
158   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
159 
160   /* inform other processors of number of messages and max length */
161   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
162   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
163   nreceives = work[rank];
164   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
165   nmax = work[rank];
166   PetscFree(work);
167 
168   /* post receives:
169        1) each message will consist of ordered triples
170      (global row index, global column index, value); the indices are
171      stored as Scalars to simplify the message passing.
172        2) since we don't know how long each individual message is we
173      allocate the largest needed buffer for each receive. Potentially
174      this is a lot of wasted space.
175 
176 
177        This could be done better.
178   */
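  /*
     (Added note) Each received message is therefore laid out as
          [ row_0, col_0, value_0, row_1, col_1, value_1, ... ]
     with every entry, including the indices, stored as a Scalar.
  */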
179   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
180   CHKPTRQ(rvalues);
181   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
182   CHKPTRQ(recv_waits);
183   for ( i=0; i<nreceives; i++ ) {
184     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
185               comm,recv_waits+i);
186   }
187 
188   /* do sends:
189       1) starts[i] gives the starting index in svalues for stuff going to
190          the ith processor
191   */
192   svalues = (Scalar *) PetscMalloc(3*(aij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
193   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
194   CHKPTRQ(send_waits);
195   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
196   starts[0] = 0;
197   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
198   for ( i=0; i<aij->stash.n; i++ ) {
199     svalues[3*starts[owner[i]]]       = (Scalar)  aij->stash.idx[i];
200     svalues[3*starts[owner[i]]+1]     = (Scalar)  aij->stash.idy[i];
201     svalues[3*(starts[owner[i]]++)+2] =  aij->stash.array[i];
202   }
203   PetscFree(owner);
204   starts[0] = 0;
205   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
206   count = 0;
207   for ( i=0; i<size; i++ ) {
208     if (procs[i]) {
209       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
210                 comm,send_waits+count++);
211     }
212   }
213   PetscFree(starts); PetscFree(nprocs);
214 
215   /* Free cache space */
216   PLogInfo(0,"[%d]MatAssemblyBegin_MPIAIJ:Number of off processor values %d\n",rank,aij->stash.n);
217   ierr = StashDestroy_Private(&aij->stash); CHKERRQ(ierr);
218 
219   aij->svalues    = svalues;    aij->rvalues    = rvalues;
220   aij->nsends     = nsends;     aij->nrecvs     = nreceives;
221   aij->send_waits = send_waits; aij->recv_waits = recv_waits;
222   aij->rmax       = nmax;
223 
224   return 0;
225 }
226 extern int MatSetUpMultiply_MPIAIJ(Mat);
227 
228 static int MatAssemblyEnd_MPIAIJ(Mat mat,MatAssemblyType mode)
229 {
230   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
231   Mat_SeqAIJ *C = (Mat_SeqAIJ *) aij->A->data;
232   MPI_Status  *send_status,recv_status;
233   int         imdex,nrecvs = aij->nrecvs, count = nrecvs, i, n, ierr;
234   int         row,col,other_disassembled,shift = C->indexshift;
235   Scalar      *values,val;
236   InsertMode  addv = aij->insertmode;
237 
238   /*  wait on receives */
239   while (count) {
240     MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
241     /* unpack receives into our local space */
242     values = aij->rvalues + 3*imdex*aij->rmax;
243     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
244     n = n/3;
245     for ( i=0; i<n; i++ ) {
246       row = (int) PetscReal(values[3*i]) - aij->rstart;
247       col = (int) PetscReal(values[3*i+1]);
248       val = values[3*i+2];
249       if (col >= aij->cstart && col < aij->cend) {
250         col -= aij->cstart;
251         MatSetValues(aij->A,1,&row,1,&col,&val,addv);
252       }
253       else {
254         if (mat->was_assembled) {
255           if (!aij->colmap) {ierr = CreateColmap_Private(mat);CHKERRQ(ierr);}
256           col = aij->colmap[col] + shift;
257           if (col < 0  && !((Mat_SeqAIJ*)(aij->A->data))->nonew) {
258             ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
259             col = (int) PetscReal(values[3*i+1]);
260           }
261         }
262         MatSetValues(aij->B,1,&row,1,&col,&val,addv);
263       }
264     }
265     count--;
266   }
267   PetscFree(aij->recv_waits); PetscFree(aij->rvalues);
268 
269   /* wait on sends */
270   if (aij->nsends) {
271     send_status = (MPI_Status *) PetscMalloc(aij->nsends*sizeof(MPI_Status));
272     CHKPTRQ(send_status);
273     MPI_Waitall(aij->nsends,aij->send_waits,send_status);
274     PetscFree(send_status);
275   }
276   PetscFree(aij->send_waits); PetscFree(aij->svalues);
277 
278   aij->insertmode = NOT_SET_VALUES;
279   ierr = MatAssemblyBegin(aij->A,mode); CHKERRQ(ierr);
280   ierr = MatAssemblyEnd(aij->A,mode); CHKERRQ(ierr);
281 
282   /* determine if any processor has disassembled; if so, we must
283      also disassemble ourselves, so that we may reassemble. */
284   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
285   if (mat->was_assembled && !other_disassembled) {
286     ierr = DisAssemble_MPIAIJ(mat); CHKERRQ(ierr);
287   }
288 
289   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
290     ierr = MatSetUpMultiply_MPIAIJ(mat); CHKERRQ(ierr);
291   }
292   ierr = MatAssemblyBegin(aij->B,mode); CHKERRQ(ierr);
293   ierr = MatAssemblyEnd(aij->B,mode); CHKERRQ(ierr);
294 
295   if (aij->rowvalues) {PetscFree(aij->rowvalues); aij->rowvalues = 0;}
296   return 0;
297 }
298 
299 static int MatZeroEntries_MPIAIJ(Mat A)
300 {
301   Mat_MPIAIJ *l = (Mat_MPIAIJ *) A->data;
302   int        ierr;
303   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
304   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
305   return 0;
306 }
307 
308 /* the code does not do the diagonal entries correctly unless the
309    matrix is square and the column and row ownerships are identical.
310    This is a BUG. The only way to fix it seems to be to access
311    aij->A and aij->B directly and not through the MatZeroRows()
312    routine.
313 */
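/*
   (Added note) Each processor may list any global rows in the index set;
   the row numbers are forwarded to their owning processors (using the same
   count/post-receive/send pattern as MatAssemblyBegin_MPIAIJ()), and each
   processor then zeros its local rows in both l->A and l->B.
*/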
314 static int MatZeroRows_MPIAIJ(Mat A,IS is,Scalar *diag)
315 {
316   Mat_MPIAIJ     *l = (Mat_MPIAIJ *) A->data;
317   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
318   int            *procs,*nprocs,j,found,idx,nsends,*work;
319   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
320   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
321   int            *lens,imdex,*lrows,*values;
322   MPI_Comm       comm = A->comm;
323   MPI_Request    *send_waits,*recv_waits;
324   MPI_Status     recv_status,*send_status;
325   IS             istmp;
326 
327   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
328   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
329 
330   /*  first count number of contributors to each processor */
331   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
332   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
333   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
334   for ( i=0; i<N; i++ ) {
335     idx = rows[i];
336     found = 0;
337     for ( j=0; j<size; j++ ) {
338       if (idx >= owners[j] && idx < owners[j+1]) {
339         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
340       }
341     }
342     if (!found) SETERRQ(1,"MatZeroRows_MPIAIJ:Index out of range");
343   }
344   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
345 
346   /* inform other processors of number of messages and max length */
347   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
348   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
349   nrecvs = work[rank];
350   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
351   nmax = work[rank];
352   PetscFree(work);
353 
354   /* post receives:   */
355   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
356   CHKPTRQ(rvalues);
357   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
358   CHKPTRQ(recv_waits);
359   for ( i=0; i<nrecvs; i++ ) {
360     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
361   }
362 
363   /* do sends:
364       1) starts[i] gives the starting index in svalues for stuff going to
365          the ith processor
366   */
367   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
368   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
369   CHKPTRQ(send_waits);
370   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
371   starts[0] = 0;
372   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
373   for ( i=0; i<N; i++ ) {
374     svalues[starts[owner[i]]++] = rows[i];
375   }
376   ISRestoreIndices(is,&rows);
377 
378   starts[0] = 0;
379   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
380   count = 0;
381   for ( i=0; i<size; i++ ) {
382     if (procs[i]) {
383       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
384     }
385   }
386   PetscFree(starts);
387 
388   base = owners[rank];
389 
390   /*  wait on receives */
391   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
392   source = lens + nrecvs;
393   count  = nrecvs; slen = 0;
394   while (count) {
395     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
396     /* unpack receives into our local space */
397     MPI_Get_count(&recv_status,MPI_INT,&n);
398     source[imdex]  = recv_status.MPI_SOURCE;
399     lens[imdex]  = n;
400     slen += n;
401     count--;
402   }
403   PetscFree(recv_waits);
404 
405   /* move the data into the send scatter */
406   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
407   count = 0;
408   for ( i=0; i<nrecvs; i++ ) {
409     values = rvalues + i*nmax;
410     for ( j=0; j<lens[i]; j++ ) {
411       lrows[count++] = values[j] - base;
412     }
413   }
414   PetscFree(rvalues); PetscFree(lens);
415   PetscFree(owner); PetscFree(nprocs);
416 
417   /* actually zap the local rows */
418   ierr = ISCreateSeq(MPI_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
419   PLogObjectParent(A,istmp);
420   PetscFree(lrows);
421   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
422   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
423   ierr = ISDestroy(istmp); CHKERRQ(ierr);
424 
425   /* wait on sends */
426   if (nsends) {
427     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
428     CHKPTRQ(send_status);
429     MPI_Waitall(nsends,send_waits,send_status);
430     PetscFree(send_status);
431   }
432   PetscFree(send_waits); PetscFree(svalues);
433 
434   return 0;
435 }
436 
437 extern int MatMult_SeqAIJ(Mat A,Vec,Vec);
438 extern int MatMultAdd_SeqAIJ(Mat A,Vec,Vec,Vec);
439 extern int MatMultTrans_SeqAIJ(Mat A,Vec,Vec);
440 extern int MatMultTransAdd_SeqAIJ(Mat A,Vec,Vec,Vec);
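/*
   (Added note) With x distributed to match the columns, y = A*x is formed as
        y = a->A * x  +  a->B * a->lvec,
   where a->A holds the diagonal block, a->B the off-diagonal block (in
   local numbering), and a->lvec the ghost values of x; the scatter that
   fills a->lvec is overlapped with the diagonal-block multiply.
*/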
441 static int MatMult_MPIAIJ(Mat A,Vec xx,Vec yy)
442 {
443   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
444   int        ierr;
445 
446   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
447   ierr = MatMult_SeqAIJ(a->A,xx,yy); CHKERRQ(ierr);
448   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
449   ierr = MatMultAdd_SeqAIJ(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
450   return 0;
451 }
452 
453 static int MatMultAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
454 {
455   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
456   int        ierr;
457   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
458   ierr = MatMultAdd_SeqAIJ(a->A,xx,yy,zz); CHKERRQ(ierr);
459   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
460   ierr = MatMultAdd_SeqAIJ(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
461   return 0;
462 }
463 
464 static int MatMultTrans_MPIAIJ(Mat A,Vec xx,Vec yy)
465 {
466   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
467   int        ierr;
468 
469   /* do nondiagonal part */
470   ierr = MatMultTrans_SeqAIJ(a->B,xx,a->lvec); CHKERRQ(ierr);
471   /* send it on its way */
472   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
473                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
474   /* do local part */
475   ierr = MatMultTrans_SeqAIJ(a->A,xx,yy); CHKERRQ(ierr);
476   /* receive remote parts: note this assumes the values are not actually */
477   /* inserted in yy until the next line, which is true for my implementation*/
478   /* but may not always be true. */
479   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
480                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
481   return 0;
482 }
483 
484 static int MatMultTransAdd_MPIAIJ(Mat A,Vec xx,Vec yy,Vec zz)
485 {
486   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
487   int        ierr;
488 
489   /* do nondiagonal part */
490   ierr = MatMultTrans_SeqAIJ(a->B,xx,a->lvec); CHKERRQ(ierr);
491   /* send it on its way */
492   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
493                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
494   /* do local part */
495   ierr = MatMultTransAdd_SeqAIJ(a->A,xx,yy,zz); CHKERRQ(ierr);
496   /* receive remote parts: note this assumes the values are not actually */
497   /* inserted in zz until the next line, which is true for my implementation */
498   /* but may not always be true. */
499   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
500                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
501   return 0;
502 }
503 
504 /*
505   This only works correctly for square matrices where the subblock A->A is the
506    diagonal block
507 */
508 static int MatGetDiagonal_MPIAIJ(Mat A,Vec v)
509 {
510   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
511   if (a->M != a->N)
512     SETERRQ(1,"MatGetDiagonal_MPIAIJ:Supports only square matrix where A->A is diag block");
513   return MatGetDiagonal(a->A,v);
514 }
515 
516 static int MatScale_MPIAIJ(Scalar *aa,Mat A)
517 {
518   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
519   int        ierr;
520   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
521   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
522   return 0;
523 }
524 
525 static int MatDestroy_MPIAIJ(PetscObject obj)
526 {
527   Mat        mat = (Mat) obj;
528   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
529   int        ierr;
530 #if defined(PETSC_LOG)
531   PLogObjectState(obj,"Rows=%d, Cols=%d",aij->M,aij->N);
532 #endif
533   PetscFree(aij->rowners);
534   ierr = MatDestroy(aij->A); CHKERRQ(ierr);
535   ierr = MatDestroy(aij->B); CHKERRQ(ierr);
536   if (aij->colmap) PetscFree(aij->colmap);
537   if (aij->garray) PetscFree(aij->garray);
538   if (aij->lvec)   VecDestroy(aij->lvec);
539   if (aij->Mvctx)  VecScatterDestroy(aij->Mvctx);
540   if (aij->rowvalues) PetscFree(aij->rowvalues);
541   PetscFree(aij);
542   PLogObjectDestroy(mat);
543   PetscHeaderDestroy(mat);
544   return 0;
545 }
546 #include "draw.h"
547 #include "pinclude/pviewer.h"
548 
549 static int MatView_MPIAIJ_Binary(Mat mat,Viewer viewer)
550 {
551   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
552   int         ierr;
553 
554   if (aij->size == 1) {
555     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
556   }
557   else SETERRQ(1,"MatView_MPIAIJ_Binary:Only uniprocessor output supported");
558   return 0;
559 }
560 
561 static int MatView_MPIAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
562 {
563   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *) mat->data;
564   Mat_SeqAIJ* C = (Mat_SeqAIJ*)aij->A->data;
565   int         ierr, format,shift = C->indexshift,rank;
566   FILE        *fd;
567   ViewerType  vtype;
568 
569   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
570   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
571     ierr = ViewerGetFormat(viewer,&format);
572     if (format == ASCII_FORMAT_INFO_DETAILED) {
573       int nz, nzalloc, mem, flg;
574       MPI_Comm_rank(mat->comm,&rank);
575       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
576       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
577       ierr = OptionsHasName(PETSC_NULL,"-mat_aij_no_inode",&flg); CHKERRQ(ierr);
578       PetscSequentialPhaseBegin(mat->comm,1);
579       if (flg) fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, not using I-node routines\n",
580          rank,aij->m,nz,nzalloc,mem);
581       else fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d mem %d, using I-node routines\n",
582          rank,aij->m,nz,nzalloc,mem);
583       ierr = MatGetInfo(aij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
584       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz);
585       ierr = MatGetInfo(aij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
586       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz);
587       fflush(fd);
588       PetscSequentialPhaseEnd(mat->comm,1);
589       ierr = VecScatterView(aij->Mvctx,viewer); CHKERRQ(ierr);
590       return 0;
591     }
592     else if (format == ASCII_FORMAT_INFO) {
593       return 0;
594     }
595   }
596 
597   if (vtype == DRAW_VIEWER) {
598     Draw       draw;
599     PetscTruth isnull;
600     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
601     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
602   }
603 
604   if (vtype == ASCII_FILE_VIEWER) {
605     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
606     PetscSequentialPhaseBegin(mat->comm,1);
607     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
608            aij->rank,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart,
609            aij->cend);
610     ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
611     ierr = MatView(aij->B,viewer); CHKERRQ(ierr);
612     fflush(fd);
613     PetscSequentialPhaseEnd(mat->comm,1);
614   }
615   else {
616     int size = aij->size;
617     rank = aij->rank;
618     if (size == 1) {
619       ierr = MatView(aij->A,viewer); CHKERRQ(ierr);
620     }
621     else {
622       /* assemble the entire matrix onto first processor. */
623       Mat         A;
624       Mat_SeqAIJ *Aloc;
625       int         M = aij->M, N = aij->N,m,*ai,*aj,row,*cols,i,*ct;
626       Scalar      *a;
627 
628       if (!rank) {
629         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
630                CHKERRQ(ierr);
631       }
632       else {
633         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
634                CHKERRQ(ierr);
635       }
636       PLogObjectParent(mat,A);
637 
638       /* copy over the A part */
639       Aloc = (Mat_SeqAIJ*) aij->A->data;
640       m = Aloc->m; ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
641       row = aij->rstart;
642       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += aij->cstart + shift;}
643       for ( i=0; i<m; i++ ) {
644         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],aj,a,INSERT_VALUES);CHKERRQ(ierr);
645         row++; a += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
646       }
647       aj = Aloc->j;
648       for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= aij->cstart + shift;}
649 
650       /* copy over the B part */
651       Aloc = (Mat_SeqAIJ*) aij->B->data;
652       m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
653       row = aij->rstart;
654       ct = cols = (int *) PetscMalloc( (ai[m]+1)*sizeof(int) ); CHKPTRQ(cols);
655       for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = aij->garray[aj[i]+shift];}
656       for ( i=0; i<m; i++ ) {
657         ierr = MatSetValues(A,1,&row,ai[i+1]-ai[i],cols,a,INSERT_VALUES);CHKERRQ(ierr);
658         row++; a += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
659       }
660       PetscFree(ct);
661       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
662       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
663       if (!rank) {
664         ierr = MatView(((Mat_MPIAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
665       }
666       ierr = MatDestroy(A); CHKERRQ(ierr);
667     }
668   }
669   return 0;
670 }
671 
672 static int MatView_MPIAIJ(PetscObject obj,Viewer viewer)
673 {
674   Mat         mat = (Mat) obj;
675   int         ierr;
676   ViewerType  vtype;
677 
678   if (!viewer) {
679     viewer = STDOUT_VIEWER_SELF;
680   }
681   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
682   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
683       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
684     ierr = MatView_MPIAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
685   }
686   else if (vtype == BINARY_FILE_VIEWER) {
687     return MatView_MPIAIJ_Binary(mat,viewer);
688   }
689   return 0;
690 }
691 
692 extern int MatMarkDiag_SeqAIJ(Mat);
693 extern int MatRelax_SeqAIJ(Mat,Vec,double,MatSORType,double,int,Vec);
694 /*
695     This has to provide several versions.
696 
697      1) the same as the sequential version
698      2) a) use only local smoothing, updating outer values only once.
699         b) local smoothing, updating outer values each inner iteration.
700      3) coloring, updating outer values between colors.
701 */
702 static int MatRelax_MPIAIJ(Mat matin,Vec bb,double omega,MatSORType flag,
703                            double fshift,int its,Vec xx)
704 {
705   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
706   Mat        AA = mat->A, BB = mat->B;
707   Mat_SeqAIJ *A = (Mat_SeqAIJ *) AA->data, *B = (Mat_SeqAIJ *)BB->data;
708   Scalar     zero = 0.0,*b,*x,*xs,*ls,d,*v,sum,scale,*t,*ts;
709   int        ierr,*idx, *diag;
710   int        n = mat->n, m = mat->m, i,shift = A->indexshift;
711   Vec        tt;
712 
713   VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
714   xs = x + shift; /* shift by one for index start of 1 */
715   ls = ls + shift;
716   if (!A->diag) {if ((ierr = MatMarkDiag_SeqAIJ(AA))) return ierr;}
717   diag = A->diag;
718   if (flag == SOR_APPLY_UPPER || flag == SOR_APPLY_LOWER) {
719     SETERRQ(1,"MatRelax_MPIAIJ:Option not supported");
720   }
721   if (flag & SOR_EISENSTAT) {
722     /* Let  A = L + U + D, where L is lower triangular,
723     U is upper triangular, and D is diagonal. This routine applies
724 
725             (L + E)^{-1} A (U + E)^{-1}
726 
727     to a vector efficiently using Eisenstat's trick. This is for
728     the case of the SSOR preconditioner, so E is D/omega, where omega
729     is the relaxation factor.
730     */
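    /*
       (Added reasoning) With E = D/omega, write x = (E + U)^{-1} b, so that
       (U + E) x = b.  Then

          (L + E)^{-1} A (U + E)^{-1} b
              = (L + E)^{-1} ( (L + E) + (U + E) + (D - 2E) ) x
              = x + (L + E)^{-1} ( b - (2E - D) x ),

       which is exactly the sequence computed below:
       x = (E+U)^{-1} b,  t = b - (2E - D) x,  t = (E+L)^{-1} t,  x = x + t.
    */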
731     ierr = VecDuplicate(xx,&tt); CHKERRQ(ierr);
732     PLogObjectParent(matin,tt);
733     VecGetArray(tt,&t);
734     scale = (2.0/omega) - 1.0;
735     /*  x = (E + U)^{-1} b */
736     VecSet(&zero,mat->lvec);
737     ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
738                               mat->Mvctx); CHKERRQ(ierr);
739     for ( i=m-1; i>-1; i-- ) {
740       n    = A->i[i+1] - diag[i] - 1;
741       idx  = A->j + diag[i] + !shift;
742       v    = A->a + diag[i] + !shift;
743       sum  = b[i];
744       SPARSEDENSEMDOT(sum,xs,v,idx,n);
745       d    = fshift + A->a[diag[i]+shift];
746       n    = B->i[i+1] - B->i[i];
747       idx  = B->j + B->i[i] + shift;
748       v    = B->a + B->i[i] + shift;
749       SPARSEDENSEMDOT(sum,ls,v,idx,n);
750       x[i] = omega*(sum/d);
751     }
752     ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
753                             mat->Mvctx); CHKERRQ(ierr);
754 
755     /*  t = b - (2*E - D)x */
756     v = A->a;
757     for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ + shift])*x[i]; }
758 
759     /*  t = (E + L)^{-1}t */
760     ts = t + shift; /* shift by one for index start of 1 */
761     diag = A->diag;
762     VecSet(&zero,mat->lvec);
763     ierr = VecPipelineBegin(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
764                                                  mat->Mvctx); CHKERRQ(ierr);
765     for ( i=0; i<m; i++ ) {
766       n    = diag[i] - A->i[i];
767       idx  = A->j + A->i[i] + shift;
768       v    = A->a + A->i[i] + shift;
769       sum  = t[i];
770       SPARSEDENSEMDOT(sum,ts,v,idx,n);
771       d    = fshift + A->a[diag[i]+shift];
772       n    = B->i[i+1] - B->i[i];
773       idx  = B->j + B->i[i] + shift;
774       v    = B->a + B->i[i] + shift;
775       SPARSEDENSEMDOT(sum,ls,v,idx,n);
776       t[i] = omega*(sum/d);
777     }
778     ierr = VecPipelineEnd(tt,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
779                                                     mat->Mvctx); CHKERRQ(ierr);
780     /*  x = x + t */
781     for ( i=0; i<m; i++ ) { x[i] += t[i]; }
782     VecDestroy(tt);
783     return 0;
784   }
785 
786 
787   if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
788     if (flag & SOR_ZERO_INITIAL_GUESS) {
789       VecSet(&zero,mat->lvec); VecSet(&zero,xx);
790     }
791     else {
792       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
793       CHKERRQ(ierr);
794       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
795       CHKERRQ(ierr);
796     }
797     while (its--) {
798       /* go down through the rows */
799       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
800                               mat->Mvctx); CHKERRQ(ierr);
801       for ( i=0; i<m; i++ ) {
802         n    = A->i[i+1] - A->i[i];
803         idx  = A->j + A->i[i] + shift;
804         v    = A->a + A->i[i] + shift;
805         sum  = b[i];
806         SPARSEDENSEMDOT(sum,xs,v,idx,n);
807         d    = fshift + A->a[diag[i]+shift];
808         n    = B->i[i+1] - B->i[i];
809         idx  = B->j + B->i[i] + shift;
810         v    = B->a + B->i[i] + shift;
811         SPARSEDENSEMDOT(sum,ls,v,idx,n);
812         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
813       }
814       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
815                             mat->Mvctx); CHKERRQ(ierr);
816       /* come up through the rows */
817       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
818                               mat->Mvctx); CHKERRQ(ierr);
819       for ( i=m-1; i>-1; i-- ) {
820         n    = A->i[i+1] - A->i[i];
821         idx  = A->j + A->i[i] + shift;
822         v    = A->a + A->i[i] + shift;
823         sum  = b[i];
824         SPARSEDENSEMDOT(sum,xs,v,idx,n);
825         d    = fshift + A->a[diag[i]+shift];
826         n    = B->i[i+1] - B->i[i];
827         idx  = B->j + B->i[i] + shift;
828         v    = B->a + B->i[i] + shift;
829         SPARSEDENSEMDOT(sum,ls,v,idx,n);
830         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
831       }
832       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
833                             mat->Mvctx); CHKERRQ(ierr);
834     }
835   }
836   else if (flag & SOR_FORWARD_SWEEP){
837     if (flag & SOR_ZERO_INITIAL_GUESS) {
838       VecSet(&zero,mat->lvec);
839       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
840                               mat->Mvctx); CHKERRQ(ierr);
841       for ( i=0; i<m; i++ ) {
842         n    = diag[i] - A->i[i];
843         idx  = A->j + A->i[i] + shift;
844         v    = A->a + A->i[i] + shift;
845         sum  = b[i];
846         SPARSEDENSEMDOT(sum,xs,v,idx,n);
847         d    = fshift + A->a[diag[i]+shift];
848         n    = B->i[i+1] - B->i[i];
849         idx  = B->j + B->i[i] + shift;
850         v    = B->a + B->i[i] + shift;
851         SPARSEDENSEMDOT(sum,ls,v,idx,n);
852         x[i] = omega*(sum/d);
853       }
854       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
855                             mat->Mvctx); CHKERRQ(ierr);
856       its--;
857     }
858     while (its--) {
859       ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
860       CHKERRQ(ierr);
861       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_UP,mat->Mvctx);
862       CHKERRQ(ierr);
863       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
864                               mat->Mvctx); CHKERRQ(ierr);
865       for ( i=0; i<m; i++ ) {
866         n    = A->i[i+1] - A->i[i];
867         idx  = A->j + A->i[i] + shift;
868         v    = A->a + A->i[i] + shift;
869         sum  = b[i];
870         SPARSEDENSEMDOT(sum,xs,v,idx,n);
871         d    = fshift + A->a[diag[i]+shift];
872         n    = B->i[i+1] - B->i[i];
873         idx  = B->j + B->i[i] + shift;
874         v    = B->a + B->i[i] + shift;
875         SPARSEDENSEMDOT(sum,ls,v,idx,n);
876         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
877       }
878       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_DOWN,
879                             mat->Mvctx); CHKERRQ(ierr);
880     }
881   }
882   else if (flag & SOR_BACKWARD_SWEEP){
883     if (flag & SOR_ZERO_INITIAL_GUESS) {
884       VecSet(&zero,mat->lvec);
885       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
886                               mat->Mvctx); CHKERRQ(ierr);
887       for ( i=m-1; i>-1; i-- ) {
888         n    = A->i[i+1] - diag[i] - 1;
889         idx  = A->j + diag[i] + !shift;
890         v    = A->a + diag[i] + !shift;
891         sum  = b[i];
892         SPARSEDENSEMDOT(sum,xs,v,idx,n);
893         d    = fshift + A->a[diag[i]+shift];
894         n    = B->i[i+1] - B->i[i];
895         idx  = B->j + B->i[i] + shift;
896         v    = B->a + B->i[i] + shift;
897         SPARSEDENSEMDOT(sum,ls,v,idx,n);
898         x[i] = omega*(sum/d);
899       }
900       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
901                             mat->Mvctx); CHKERRQ(ierr);
902       its--;
903     }
904     while (its--) {
905       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
906                             mat->Mvctx); CHKERRQ(ierr);
907       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_DOWN,
908                             mat->Mvctx); CHKERRQ(ierr);
909       ierr = VecPipelineBegin(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
910                               mat->Mvctx); CHKERRQ(ierr);
911       for ( i=m-1; i>-1; i-- ) {
912         n    = A->i[i+1] - A->i[i];
913         idx  = A->j + A->i[i] + shift;
914         v    = A->a + A->i[i] + shift;
915         sum  = b[i];
916         SPARSEDENSEMDOT(sum,xs,v,idx,n);
917         d    = fshift + A->a[diag[i]+shift];
918         n    = B->i[i+1] - B->i[i];
919         idx  = B->j + B->i[i] + shift;
920         v    = B->a + B->i[i] + shift;
921         SPARSEDENSEMDOT(sum,ls,v,idx,n);
922         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
923       }
924       ierr = VecPipelineEnd(xx,mat->lvec,INSERT_VALUES,PIPELINE_UP,
925                             mat->Mvctx); CHKERRQ(ierr);
926     }
927   }
928   else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
929     if (flag & SOR_ZERO_INITIAL_GUESS) {
930       return MatRelax_SeqAIJ(mat->A,bb,omega,flag,fshift,its,xx);
931     }
932     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
933     CHKERRQ(ierr);
934     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
935     CHKERRQ(ierr);
936     while (its--) {
937       /* go down through the rows */
938       for ( i=0; i<m; i++ ) {
939         n    = A->i[i+1] - A->i[i];
940         idx  = A->j + A->i[i] + shift;
941         v    = A->a + A->i[i] + shift;
942         sum  = b[i];
943         SPARSEDENSEMDOT(sum,xs,v,idx,n);
944         d    = fshift + A->a[diag[i]+shift];
945         n    = B->i[i+1] - B->i[i];
946         idx  = B->j + B->i[i] + shift;
947         v    = B->a + B->i[i] + shift;
948         SPARSEDENSEMDOT(sum,ls,v,idx,n);
949         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
950       }
951       /* come up through the rows */
952       for ( i=m-1; i>-1; i-- ) {
953         n    = A->i[i+1] - A->i[i];
954         idx  = A->j + A->i[i] + shift;
955         v    = A->a + A->i[i] + shift;
956         sum  = b[i];
957         SPARSEDENSEMDOT(sum,xs,v,idx,n);
958         d    = fshift + A->a[diag[i]+shift];
959         n    = B->i[i+1] - B->i[i];
960         idx  = B->j + B->i[i] + shift;
961         v    = B->a + B->i[i] + shift;
962         SPARSEDENSEMDOT(sum,ls,v,idx,n);
963         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
964       }
965     }
966   }
967   else if (flag & SOR_LOCAL_FORWARD_SWEEP){
968     if (flag & SOR_ZERO_INITIAL_GUESS) {
969       return MatRelax_SeqAIJ(mat->A,bb,omega,flag,fshift,its,xx);
970     }
971     ierr=VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
972     CHKERRQ(ierr);
973     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,mat->Mvctx);
974     CHKERRQ(ierr);
975     while (its--) {
976       for ( i=0; i<m; i++ ) {
977         n    = A->i[i+1] - A->i[i];
978         idx  = A->j + A->i[i] + shift;
979         v    = A->a + A->i[i] + shift;
980         sum  = b[i];
981         SPARSEDENSEMDOT(sum,xs,v,idx,n);
982         d    = fshift + A->a[diag[i]+shift];
983         n    = B->i[i+1] - B->i[i];
984         idx  = B->j + B->i[i] + shift;
985         v    = B->a + B->i[i] + shift;
986         SPARSEDENSEMDOT(sum,ls,v,idx,n);
987         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
988       }
989     }
990   }
991   else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
992     if (flag & SOR_ZERO_INITIAL_GUESS) {
993       return MatRelax_SeqAIJ(mat->A,bb,omega,flag,fshift,its,xx);
994     }
995     ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
996                             mat->Mvctx); CHKERRQ(ierr);
997     ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_ALL,
998                             mat->Mvctx); CHKERRQ(ierr);
999     while (its--) {
1000       for ( i=m-1; i>-1; i-- ) {
1001         n    = A->i[i+1] - A->i[i];
1002         idx  = A->j + A->i[i] + shift;
1003         v    = A->a + A->i[i] + shift;
1004         sum  = b[i];
1005         SPARSEDENSEMDOT(sum,xs,v,idx,n);
1006         d    = fshift + A->a[diag[i]+shift];
1007         n    = B->i[i+1] - B->i[i];
1008         idx  = B->j + B->i[i] + shift;
1009         v    = B->a + B->i[i] + shift;
1010         SPARSEDENSEMDOT(sum,ls,v,idx,n);
1011         x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
1012       }
1013     }
1014   }
1015   return 0;
1016 }
1017 
1018 static int MatGetInfo_MPIAIJ(Mat matin,MatInfoType flag,int *nz,
1019                              int *nzalloc,int *mem)
1020 {
1021   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1022   Mat        A = mat->A, B = mat->B;
1023   int        ierr, isend[3], irecv[3], nzA, nzallocA, memA;
1024 
1025   ierr = MatGetInfo(A,MAT_LOCAL,&nzA,&nzallocA,&memA); CHKERRQ(ierr);
1026   ierr = MatGetInfo(B,MAT_LOCAL,&isend[0],&isend[1],&isend[2]); CHKERRQ(ierr);
1027   isend[0] += nzA; isend[1] += nzallocA; isend[2] += memA;
1028   if (flag == MAT_LOCAL) {
1029     if (nz)       *nz      = isend[0];
1030     if (nzalloc)  *nzalloc = isend[1];
1031     if (mem)      *mem     = isend[2];
1032   } else if (flag == MAT_GLOBAL_MAX) {
1033     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_MAX,matin->comm);
1034     if (nz)      *nz      = irecv[0];
1035     if (nzalloc) *nzalloc = irecv[1];
1036     if (mem)     *mem     = irecv[2];
1037   } else if (flag == MAT_GLOBAL_SUM) {
1038     MPI_Allreduce( isend, irecv,3,MPI_INT,MPI_SUM,matin->comm);
1039     if (nz)      *nz      = irecv[0];
1040     if (nzalloc) *nzalloc = irecv[1];
1041     if (mem)     *mem     = irecv[2];
1042   }
1043   return 0;
1044 }
1045 
1046 extern int MatLUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,Mat*);
1047 extern int MatLUFactorNumeric_MPIAIJ(Mat,Mat*);
1048 extern int MatLUFactor_MPIAIJ(Mat,IS,IS,double);
1049 extern int MatILUFactorSymbolic_MPIAIJ(Mat,IS,IS,double,int,Mat *);
1050 extern int MatSolve_MPIAIJ(Mat,Vec,Vec);
1051 extern int MatSolveAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1052 extern int MatSolveTrans_MPIAIJ(Mat,Vec,Vec);
1053 extern int MatSolveTransAdd_MPIAIJ(Mat,Vec,Vec,Vec);
1054 
1055 static int MatSetOption_MPIAIJ(Mat A,MatOption op)
1056 {
1057   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1058 
1059   if (op == NO_NEW_NONZERO_LOCATIONS ||
1060       op == YES_NEW_NONZERO_LOCATIONS ||
1061       op == COLUMNS_SORTED ||
1062       op == ROW_ORIENTED) {
1063         MatSetOption(a->A,op);
1064         MatSetOption(a->B,op);
1065   }
1066   else if (op == ROWS_SORTED ||
1067            op == SYMMETRIC_MATRIX ||
1068            op == STRUCTURALLY_SYMMETRIC_MATRIX ||
1069            op == YES_NEW_DIAGONALS)
1070     PLogInfo(A,"Info:MatSetOption_MPIAIJ:Option ignored\n");
1071   else if (op == COLUMN_ORIENTED) {
1072     a->roworiented = 0;
1073     MatSetOption(a->A,op);
1074     MatSetOption(a->B,op);
1075   }
1076   else if (op == NO_NEW_DIAGONALS)
1077     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:NO_NEW_DIAGONALS");}
1078   else
1079     {SETERRQ(PETSC_ERR_SUP,"MatSetOption_MPIAIJ:unknown option");}
1080   return 0;
1081 }
1082 
1083 static int MatGetSize_MPIAIJ(Mat matin,int *m,int *n)
1084 {
1085   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1086   *m = mat->M; *n = mat->N;
1087   return 0;
1088 }
1089 
1090 static int MatGetLocalSize_MPIAIJ(Mat matin,int *m,int *n)
1091 {
1092   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1093   *m = mat->m; *n = mat->n;
1094   return 0;
1095 }
1096 
1097 static int MatGetOwnershipRange_MPIAIJ(Mat matin,int *m,int *n)
1098 {
1099   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1100   *m = mat->rstart; *n = mat->rend;
1101   return 0;
1102 }
1103 
1104 extern int MatGetRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1105 extern int MatRestoreRow_SeqAIJ(Mat,int,int*,int**,Scalar**);
1106 
1107 int MatGetRow_MPIAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
1108 {
1109   Mat_MPIAIJ *mat = (Mat_MPIAIJ *) matin->data;
1110   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
1111   int        i, ierr, *cworkA, *cworkB, **pcA, **pcB, cstart = mat->cstart;
1112   int        nztot, nzA, nzB, lrow, rstart = mat->rstart, rend = mat->rend;
1113   int        *cmap, *idx_p;
1114 
1115   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,"MatGetRow_MPIAIJ:Already active");
1116   mat->getrowactive = PETSC_TRUE;
1117 
1118   if (!mat->rowvalues && (idx || v)) {
1119     /*
1120         allocate enough space to hold information from the longest row.
1121     */
1122     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *) mat->A->data,*Ba = (Mat_SeqAIJ *) mat->B->data;
1123     int     max = 1,n = mat->n,tmp;
1124     for ( i=0; i<n; i++ ) {
1125       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
1126       if (max < tmp) { max = tmp; }
1127     }
1128     mat->rowvalues = (Scalar *) PetscMalloc( max*(sizeof(int)+sizeof(Scalar)));
1129     CHKPTRQ(mat->rowvalues);
1130     mat->rowindices = (int *) (mat->rowvalues + max);
1131   }
1132 
1133 
1134   if (row < rstart || row >= rend) SETERRQ(1,"MatGetRow_MPIAIJ:Only local rows");
1135   lrow = row - rstart;
1136 
1137   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1138   if (!v)   {pvA = 0; pvB = 0;}
1139   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1140   ierr = MatGetRow_SeqAIJ(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1141   ierr = MatGetRow_SeqAIJ(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1142   nztot = nzA + nzB;
1143 
1144   cmap  = mat->garray;
1145   if (v  || idx) {
1146     if (nztot) {
1147       /* Sort by increasing column numbers, assuming A and B already sorted */
1148       int imark = -1;
1149       if (v) {
1150         *v = v_p = mat->rowvalues;
1151         for ( i=0; i<nzB; i++ ) {
1152           if (cmap[cworkB[i]] < cstart)   v_p[i] = vworkB[i];
1153           else break;
1154         }
1155         imark = i;
1156         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1157         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1158       }
1159       if (idx) {
1160         *idx = idx_p = mat->rowindices;
1161         if (imark > -1) {
1162           for ( i=0; i<imark; i++ ) {
1163             idx_p[i] = cmap[cworkB[i]];
1164           }
1165         } else {
1166           for ( i=0; i<nzB; i++ ) {
1167             if (cmap[cworkB[i]] < cstart)   idx_p[i] = cmap[cworkB[i]];
1168             else break;
1169           }
1170           imark = i;
1171         }
1172         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart + cworkA[i];
1173         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]];
1174       }
1175     }
1176     else {if (idx) *idx = 0; if (v) *v = 0;}
1177   }
1178   *nz = nztot;
1179   ierr = MatRestoreRow_SeqAIJ(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1180   ierr = MatRestoreRow_SeqAIJ(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1181   return 0;
1182 }
1183 
1184 int MatRestoreRow_MPIAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1185 {
1186   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1187   if (aij->getrowactive == PETSC_FALSE) {
1188     SETERRQ(1,"MatRestoreRow_MPIAIJ:MatGetRow not called");
1189   }
1190   aij->getrowactive = PETSC_FALSE;
1191   return 0;
1192 }
1193 
1194 static int MatNorm_MPIAIJ(Mat mat,NormType type,double *norm)
1195 {
1196   Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
1197   Mat_SeqAIJ *amat = (Mat_SeqAIJ*) aij->A->data, *bmat = (Mat_SeqAIJ*) aij->B->data;
1198   int        ierr, i, j, cstart = aij->cstart,shift = amat->indexshift;
1199   double     sum = 0.0;
1200   Scalar     *v;
1201 
1202   if (aij->size == 1) {
1203     ierr =  MatNorm(aij->A,type,norm); CHKERRQ(ierr);
1204   } else {
1205     if (type == NORM_FROBENIUS) {
1206       v = amat->a;
1207       for (i=0; i<amat->nz; i++ ) {
1208 #if defined(PETSC_COMPLEX)
1209         sum += real(conj(*v)*(*v)); v++;
1210 #else
1211         sum += (*v)*(*v); v++;
1212 #endif
1213       }
1214       v = bmat->a;
1215       for (i=0; i<bmat->nz; i++ ) {
1216 #if defined(PETSC_COMPLEX)
1217         sum += real(conj(*v)*(*v)); v++;
1218 #else
1219         sum += (*v)*(*v); v++;
1220 #endif
1221       }
1222       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
1223       *norm = sqrt(*norm);
1224     }
1225     else if (type == NORM_1) { /* max column norm */
1226       double *tmp, *tmp2;
1227       int    *jj, *garray = aij->garray;
1228       tmp  = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp);
1229       tmp2 = (double *) PetscMalloc( aij->N*sizeof(double) ); CHKPTRQ(tmp2);
1230       PetscMemzero(tmp,aij->N*sizeof(double));
1231       *norm = 0.0;
1232       v = amat->a; jj = amat->j;
1233       for ( j=0; j<amat->nz; j++ ) {
1234         tmp[cstart + *jj++ + shift] += PetscAbsScalar(*v);  v++;
1235       }
1236       v = bmat->a; jj = bmat->j;
1237       for ( j=0; j<bmat->nz; j++ ) {
1238         tmp[garray[*jj++ + shift]] += PetscAbsScalar(*v); v++;
1239       }
1240       MPI_Allreduce(tmp,tmp2,aij->N,MPI_DOUBLE,MPI_SUM,mat->comm);
1241       for ( j=0; j<aij->N; j++ ) {
1242         if (tmp2[j] > *norm) *norm = tmp2[j];
1243       }
1244       PetscFree(tmp); PetscFree(tmp2);
1245     }
1246     else if (type == NORM_INFINITY) { /* max row norm */
1247       double ntemp = 0.0;
1248       for ( j=0; j<amat->m; j++ ) {
1249         v = amat->a + amat->i[j] + shift;
1250         sum = 0.0;
1251         for ( i=0; i<amat->i[j+1]-amat->i[j]; i++ ) {
1252           sum += PetscAbsScalar(*v); v++;
1253         }
1254         v = bmat->a + bmat->i[j] + shift;
1255         for ( i=0; i<bmat->i[j+1]-bmat->i[j]; i++ ) {
1256           sum += PetscAbsScalar(*v); v++;
1257         }
1258         if (sum > ntemp) ntemp = sum;
1259       }
1260       MPI_Allreduce(&ntemp,norm,1,MPI_DOUBLE,MPI_MAX,mat->comm);
1261     }
1262     else {
1263       SETERRQ(1,"MatNorm_MPIAIJ:No support for two norm");
1264     }
1265   }
1266   return 0;
1267 }
1268 
1269 static int MatTranspose_MPIAIJ(Mat A,Mat *matout)
1270 {
1271   Mat_MPIAIJ *a = (Mat_MPIAIJ *) A->data;
1272   Mat_SeqAIJ *Aloc = (Mat_SeqAIJ *) a->A->data;
1273   int        ierr,shift = Aloc->indexshift;
1274   Mat        B;
1275   int        M = a->M, N = a->N,m,*ai,*aj,row,*cols,i,*ct;
1276   Scalar     *array;
1277 
1278   if (matout == PETSC_NULL && M != N)
1279     SETERRQ(1,"MatTranspose_MPIAIJ:Square matrix only for in-place");
1280   ierr = MatCreateMPIAIJ(A->comm,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,
1281          PETSC_NULL,&B); CHKERRQ(ierr);
1282 
1283   /* copy over the A part */
1284   Aloc = (Mat_SeqAIJ*) a->A->data;
1285   m = Aloc->m; ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1286   row = a->rstart;
1287   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] += a->cstart + shift;}
1288   for ( i=0; i<m; i++ ) {
1289     ierr = MatSetValues(B,ai[i+1]-ai[i],aj,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1290     row++; array += ai[i+1]-ai[i]; aj += ai[i+1]-ai[i];
1291   }
1292   aj = Aloc->j;
1293   for ( i=0; i<ai[m]+shift; i++ ) {aj[i] -= a->cstart + shift;}
1294 
1295   /* copy over the B part */
1296   Aloc = (Mat_SeqAIJ*) a->B->data;
1297   m = Aloc->m;  ai = Aloc->i; aj = Aloc->j; array = Aloc->a;
1298   row = a->rstart;
1299   ct = cols = (int *) PetscMalloc( (1+ai[m]-shift)*sizeof(int) ); CHKPTRQ(cols);
1300   for ( i=0; i<ai[m]+shift; i++ ) {cols[i] = a->garray[aj[i]+shift];}
1301   for ( i=0; i<m; i++ ) {
1302     ierr = MatSetValues(B,ai[i+1]-ai[i],cols,1,&row,array,INSERT_VALUES);CHKERRQ(ierr);
1303     row++; array += ai[i+1]-ai[i]; cols += ai[i+1]-ai[i];
1304   }
1305   PetscFree(ct);
1306   ierr = MatAssemblyBegin(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1307   ierr = MatAssemblyEnd(B,FINAL_ASSEMBLY); CHKERRQ(ierr);
1308   if (matout != PETSC_NULL) {
1309     *matout = B;
1310   } else {
1311     /* This isn't really an in-place transpose, but we free the data structures of a */
1312     PetscFree(a->rowners);
1313     ierr = MatDestroy(a->A); CHKERRQ(ierr);
1314     ierr = MatDestroy(a->B); CHKERRQ(ierr);
1315     if (a->colmap) PetscFree(a->colmap);
1316     if (a->garray) PetscFree(a->garray);
1317     if (a->lvec) VecDestroy(a->lvec);
1318     if (a->Mvctx) VecScatterDestroy(a->Mvctx);
1319     PetscFree(a);
1320     PetscMemcpy(A,B,sizeof(struct _Mat));
1321     PetscHeaderDestroy(B);
1322   }
1323   return 0;
1324 }
1325 
1326 extern int MatPrintHelp_SeqAIJ(Mat);
1327 static int MatPrintHelp_MPIAIJ(Mat A)
1328 {
1329   Mat_MPIAIJ *a   = (Mat_MPIAIJ*) A->data;
1330 
1331   if (!a->rank) return MatPrintHelp_SeqAIJ(a->A);
1332   else return 0;
1333 }
1334 
1335 extern int MatConvert_MPIAIJ(Mat,MatType,Mat *);
1336 static int MatConvertSameType_MPIAIJ(Mat,Mat *,int);
1337 extern int MatIncreaseOverlap_MPIAIJ(Mat , int, IS *, int);
1338 int MatGetSubMatrices_MPIAIJ (Mat ,int , IS *,IS *,MatGetSubMatrixCall,Mat **);
1339 /* -------------------------------------------------------------------*/
1340 static struct _MatOps MatOps = {MatSetValues_MPIAIJ,
1341        MatGetRow_MPIAIJ,MatRestoreRow_MPIAIJ,
1342        MatMult_MPIAIJ,MatMultAdd_MPIAIJ,
1343        MatMultTrans_MPIAIJ,MatMultTransAdd_MPIAIJ,
1344        MatSolve_MPIAIJ,MatSolveAdd_MPIAIJ,
1345        MatSolveTrans_MPIAIJ,MatSolveTransAdd_MPIAIJ,
1346        MatLUFactor_MPIAIJ,0,
1347        MatRelax_MPIAIJ,
1348        MatTranspose_MPIAIJ,
1349        MatGetInfo_MPIAIJ,0,
1350        MatGetDiagonal_MPIAIJ,0,MatNorm_MPIAIJ,
1351        MatAssemblyBegin_MPIAIJ,MatAssemblyEnd_MPIAIJ,
1352        0,
1353        MatSetOption_MPIAIJ,MatZeroEntries_MPIAIJ,MatZeroRows_MPIAIJ,
1354        MatGetReordering_MPIAIJ,
1355        MatLUFactorSymbolic_MPIAIJ,MatLUFactorNumeric_MPIAIJ,0,0,
1356        MatGetSize_MPIAIJ,MatGetLocalSize_MPIAIJ,MatGetOwnershipRange_MPIAIJ,
1357        MatILUFactorSymbolic_MPIAIJ,0,
1358        0,0,MatConvert_MPIAIJ,0,0,MatConvertSameType_MPIAIJ,0,0,
1359        0,0,0,
1360        MatGetSubMatrices_MPIAIJ,MatIncreaseOverlap_MPIAIJ,MatGetValues_MPIAIJ,0,
1361        MatPrintHelp_MPIAIJ,
1362        MatScale_MPIAIJ};
1363 
1364 /*@C
1365    MatCreateMPIAIJ - Creates a sparse parallel matrix in AIJ format
1366    (the default parallel PETSc format).  For good matrix assembly performance
1367    the user should preallocate the matrix storage by setting the parameters
1368    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1369    performance can be increased by more than a factor of 50.
1370 
1371    Input Parameters:
1372 .  comm - MPI communicator
1373 .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
1374 .  n - number of local columns (or PETSC_DECIDE to have it calculated
1375            if N is given)
1376 .  M - number of global rows (or PETSC_DECIDE to have it calculated if m is given)
1377 .  N - number of global columns (or PETSC_DECIDE to have it calculated
1378            if n is given)
1379 .  d_nz - number of nonzeros per row in diagonal portion of local submatrix
1380            (same for all local rows)
1381 .  d_nnz - number of nonzeros per row in diagonal portion of local submatrix
1382            or null (possibly different for each row).  You must leave room
1383            for the diagonal entry even if it is zero.
1384 .  o_nz - number of nonzeros per row in off-diagonal portion of local
1385            submatrix (same for all local rows).
1386 .  o_nnz - number of nonzeros per row in off-diagonal portion of local
1387            submatrix or null (possibly different for each row).
1388 
1389    Output Parameter:
1390 .  A - the matrix
1391 
1392    Notes:
1393    The AIJ format (also called the Yale sparse matrix format or
1394    compressed row storage) is fully compatible with standard Fortran 77
1395    storage.  That is, the stored row and column indices can begin at
1396    either one (as in Fortran) or zero.  See the users manual for details.
1397 
1398    The user MUST specify either the local or global matrix dimensions
1399    (possibly both).
1400 
1401    By default, this format uses inodes (identical nodes) when possible.
1402    We search for consecutive rows with the same nonzero structure, thereby
1403    reusing matrix information to achieve increased efficiency.
1404 
1405    Options Database Keys:
1406 $    -mat_aij_no_inode  - Do not use inodes
1407 $    -mat_aij_inode_limit <limit> - Set inode limit.
1408 $        (max limit=5)
1409 $    -mat_aij_oneindex - Internally use indexing starting at 1
1410 $        rather than 0.  Note: When calling MatSetValues(),
1411 $        the user still MUST index entries starting at 0!
1412 
1413    Storage Information:
1414    For a square global matrix we define each processor's diagonal portion
1415    to be its local rows and the corresponding columns (a square submatrix);
1416    each processor's off-diagonal portion encompasses the remainder of the
1417    local matrix (a rectangular submatrix).
1418 
1419    The user can specify preallocated storage for the diagonal part of
1420    the local submatrix with either d_nz or d_nnz (not both).  Set
1421    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1422    memory allocation.  Likewise, specify preallocated storage for the
1423    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
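
   For example (purely as an illustration; the variable names are arbitrary),
   to let PETSc control all preallocation dynamically one could call
$      ierr = MatCreateMPIAIJ(comm,m,n,M,N,PETSC_DEFAULT,PETSC_NULL,
$                             PETSC_DEFAULT,PETSC_NULL,&A);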
1424 
1425    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1426    the figure below we depict these three local rows and all columns (0-11).
1427 
1428 $            0 1 2 3 4 5 6 7 8 9 10 11
1429 $         ----------------------------
1430 $  row 3  |  o o o d d d o o o o o o
1431 $  row 4  |  o o o d d d o o o o o o
1432 $  row 5  |  o o o d d d o o o o o o
1433 $         ----------------------------
1434 $
1435 
1436    Thus, any entries in the d locations are stored in the d (diagonal)
1437    submatrix, and any entries in the o locations are stored in the
1438    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1439    stored simply in the MATSEQAIJ format for compressed row storage.
1440 
1441    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1442    and o_nz should indicate the number of nonzeros per row in the o matrix.
1443    In general, for PDE problems in which most nonzeros are near the diagonal,
1444    one expects d_nz >> o_nz.   For additional details, see the users manual
1445    chapter on matrices and the file $(PETSC_DIR)/Performance.
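
   As an illustration only: if every location marked d or o above were actually
   nonzero, and four processes each owned three rows and three columns of a
   12-by-12 matrix, the process owning rows 3-5 could preallocate with
$      ierr = MatCreateMPIAIJ(comm,3,3,12,12,3,PETSC_NULL,9,PETSC_NULL,&A);
   since each of its rows then has 3 entries in the d block and 9 in the o block.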
1446 
1447 .keywords: matrix, aij, compressed row, sparse, parallel
1448 
1449 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
1450 @*/
1451 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N,
1452                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1453 {
1454   Mat          B;
1455   Mat_MPIAIJ   *b;
1456   int          ierr, i,sum[2],work[2];
1457 
1458   *A = 0;
1459   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIAIJ,comm);
1460   PLogObjectCreate(B);
1461   B->data       = (void *) (b = PetscNew(Mat_MPIAIJ)); CHKPTRQ(b);
1462   PetscMemzero(b,sizeof(Mat_MPIAIJ));
1463   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1464   B->destroy    = MatDestroy_MPIAIJ;
1465   B->view       = MatView_MPIAIJ;
1466   B->factor     = 0;
1467   B->assembled  = PETSC_FALSE;
1468 
1469   b->insertmode = NOT_SET_VALUES;
1470   MPI_Comm_rank(comm,&b->rank);
1471   MPI_Comm_size(comm,&b->size);
1472 
1473   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1474     SETERRQ(1,"MatCreateMPIAIJ:Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1475 
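  /* If the global sizes are not given, they are taken as the sum of the local
     sizes over the communicator; if the local sizes are not given, the global
     size is split as evenly as possible, with the first M % size (or N % size)
     processes receiving one extra row (or column). */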
1476   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1477     work[0] = m; work[1] = n;
1478     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1479     if (M == PETSC_DECIDE) M = sum[0];
1480     if (N == PETSC_DECIDE) N = sum[1];
1481   }
1482   if (m == PETSC_DECIDE) {m = M/b->size + ((M % b->size) > b->rank);}
1483   if (n == PETSC_DECIDE) {n = N/b->size + ((N % b->size) > b->rank);}
1484   b->m = m; B->m = m;
1485   b->n = n; B->n = n;
1486   b->N = N; B->N = N;
1487   b->M = M; B->M = M;
1488 
1489   /* build local table of row and column ownerships */
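  /* Each process contributes its local row (column) count via MPI_Allgather;
     the running sums below convert these counts into offsets, so that process i
     owns global rows rowners[i] through rowners[i+1]-1 (and similarly for the
     column ownership stored in cowners). */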
1490   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1491   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1492   b->cowners = b->rowners + b->size + 1;
1493   MPI_Allgather(&m,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
1494   b->rowners[0] = 0;
1495   for ( i=2; i<=b->size; i++ ) {
1496     b->rowners[i] += b->rowners[i-1];
1497   }
1498   b->rstart = b->rowners[b->rank];
1499   b->rend   = b->rowners[b->rank+1];
1500   MPI_Allgather(&n,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1501   b->cowners[0] = 0;
1502   for ( i=2; i<=b->size; i++ ) {
1503     b->cowners[i] += b->cowners[i-1];
1504   }
1505   b->cstart = b->cowners[b->rank];
1506   b->cend   = b->cowners[b->rank+1];
1507 
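  /* The local part of the matrix is stored as two sequential AIJ matrices:
     b->A holds the diagonal block (m by n) and b->B holds the off-diagonal
     block, created here with the full global column width N. */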
1508   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1509   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1510   PLogObjectParent(B,b->A);
1511   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1512   ierr = MatCreateSeqAIJ(MPI_COMM_SELF,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1513   PLogObjectParent(B,b->B);
1514 
1515   /* build cache for off array entries formed */
1516   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1517   b->colmap      = 0;
1518   b->garray      = 0;
1519   b->roworiented = 1;
1520 
1521   /* stuff used for matrix vector multiply */
1522   b->lvec      = 0;
1523   b->Mvctx     = 0;
1524 
1525   /* stuff for MatGetRow() */
1526   b->rowindices   = 0;
1527   b->rowvalues    = 0;
1528   b->getrowactive = PETSC_FALSE;
1529 
1530   *A = B;
1531   return 0;
1532 }
1533 
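/* Duplicates an assembled MPIAIJ matrix: the ownership information, the colmap
   and garray (when present), the scatter context used for matrix-vector
   products, and the sequential diagonal and off-diagonal blocks are all copied
   into the new matrix. */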
1534 static int MatConvertSameType_MPIAIJ(Mat matin,Mat *newmat,int cpvalues)
1535 {
1536   Mat        mat;
1537   Mat_MPIAIJ *a,*oldmat = (Mat_MPIAIJ *) matin->data;
1538   int        ierr, len,flg;
1539 
1540   *newmat       = 0;
1541   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIAIJ,matin->comm);
1542   PLogObjectCreate(mat);
1543   mat->data       = (void *) (a = PetscNew(Mat_MPIAIJ)); CHKPTRQ(a);
1544   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1545   mat->destroy    = MatDestroy_MPIAIJ;
1546   mat->view       = MatView_MPIAIJ;
1547   mat->factor     = matin->factor;
1548   mat->assembled  = PETSC_TRUE;
1549 
1550   a->m = mat->m   = oldmat->m;
1551   a->n = mat->n   = oldmat->n;
1552   a->M = mat->M   = oldmat->M;
1553   a->N = mat->N   = oldmat->N;
1554 
1555   a->rstart       = oldmat->rstart;
1556   a->rend         = oldmat->rend;
1557   a->cstart       = oldmat->cstart;
1558   a->cend         = oldmat->cend;
1559   a->size         = oldmat->size;
1560   a->rank         = oldmat->rank;
1561   a->insertmode   = NOT_SET_VALUES;
1562   a->rowvalues    = 0;  a->rowindices   = 0;
1563   a->roworiented  = oldmat->roworiented;  a->getrowactive = PETSC_FALSE;
1564 
1565   a->rowners = (int *) PetscMalloc((a->size+1)*sizeof(int)); CHKPTRQ(a->rowners);
1566   PLogObjectMemory(mat,(a->size+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIAIJ));
1567   PetscMemcpy(a->rowners,oldmat->rowners,(a->size+1)*sizeof(int));
1568   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1569   if (oldmat->colmap) {
1570     a->colmap = (int *) PetscMalloc((a->N)*sizeof(int));CHKPTRQ(a->colmap);
1571     PLogObjectMemory(mat,(a->N)*sizeof(int));
1572     PetscMemcpy(a->colmap,oldmat->colmap,(a->N)*sizeof(int));
1573   } else a->colmap = 0;
1574   if (oldmat->garray && (len = ((Mat_SeqAIJ *) (oldmat->B->data))->n)) {
1575     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1576     PLogObjectMemory(mat,len*sizeof(int));
1577     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1578   } else a->garray = 0;
1579 
1580   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1581   PLogObjectParent(mat,a->lvec);
1582   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1583   PLogObjectParent(mat,a->Mvctx);
1584   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1585   PLogObjectParent(mat,a->A);
1586   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1587   PLogObjectParent(mat,a->B);
1588   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1589   if (flg) {
1590     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1591   }
1592   *newmat = mat;
1593   return 0;
1594 }
1595 
1596 #include "sys.h"
1597 
1598 int MatLoad_MPIAIJ(Viewer viewer,MatType type,Mat *newmat)
1599 {
1600   Mat          A;
1601   int          i, nz, ierr, j,rstart, rend, fd;
1602   Scalar       *vals,*svals;
1603   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1604   MPI_Status   status;
1605   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
1606   int          *ourlens,*sndcounts = 0,*procsnz = 0, *offlens,jj,*mycols,*smycols;
1607   int          tag = ((PetscObject)viewer)->tag;
1608 
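  /* Process 0 reads the binary file and distributes its contents: the row
     lengths are scattered to all processes, then each process's column indices
     and numerical values are read and sent in turn; finally every process
     inserts its own rows with MatSetValues(). */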
1609   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1610   if (!rank) {
1611     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1612     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1613     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIAIJ:not matrix object");
1614   }
1615 
1616   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1617   M = header[1]; N = header[2];
1618   /* determine ownership of all rows */
1619   m = M/size + ((M % size) > rank);
1620   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
1621   MPI_Allgather(&m,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1622   rowners[0] = 0;
1623   for ( i=2; i<=size; i++ ) {
1624     rowners[i] += rowners[i-1];
1625   }
1626   rstart = rowners[rank];
1627   rend   = rowners[rank+1];
1628 
1629   /* distribute row lengths to all processors */
1630   ourlens = (int*) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(ourlens);
1631   offlens = ourlens + (rend-rstart);
1632   if (!rank) {
1633     rowlengths = (int*) PetscMalloc( M*sizeof(int) ); CHKPTRQ(rowlengths);
1634     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1635     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1636     for ( i=0; i<size; i++ ) sndcounts[i] = rowners[i+1] - rowners[i];
1637     MPI_Scatterv(rowlengths,sndcounts,rowners,MPI_INT,ourlens,rend-rstart,MPI_INT,0,comm);
1638     PetscFree(sndcounts);
1639   }
1640   else {
1641     MPI_Scatterv(0,0,0,MPI_INT,ourlens,rend-rstart,MPI_INT, 0,comm);
1642   }
1643 
1644   if (!rank) {
1645     /* calculate the number of nonzeros on each processor */
1646     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1647     PetscMemzero(procsnz,size*sizeof(int));
1648     for ( i=0; i<size; i++ ) {
1649       for ( j=rowners[i]; j< rowners[i+1]; j++ ) {
1650         procsnz[i] += rowlengths[j];
1651       }
1652     }
1653     PetscFree(rowlengths);
1654 
1655     /* determine max buffer needed and allocate it */
1656     maxnz = 0;
1657     for ( i=0; i<size; i++ ) {
1658       maxnz = PetscMax(maxnz,procsnz[i]);
1659     }
1660     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1661 
1662     /* read in my part of the matrix column indices  */
1663     nz = procsnz[0];
1664     mycols = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1665     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1666 
1667     /* read in everyone else's and ship it off */
1668     for ( i=1; i<size; i++ ) {
1669       nz = procsnz[i];
1670       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1671       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1672     }
1673     PetscFree(cols);
1674   }
1675   else {
1676     /* determine buffer space needed for message */
1677     nz = 0;
1678     for ( i=0; i<m; i++ ) {
1679       nz += ourlens[i];
1680     }
1681     mycols = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(mycols);
1682 
1683     /* receive message of column indices */
1684     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1685     MPI_Get_count(&status,MPI_INT,&maxnz);
1686     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1687   }
1688 
1689   /* loop over local rows, determining number of off-diagonal entries */
1690   PetscMemzero(offlens,m*sizeof(int));
1691   jj = 0;
1692   for ( i=0; i<m; i++ ) {
1693     for ( j=0; j<ourlens[i]; j++ ) {
1694       if (mycols[jj] < rstart || mycols[jj] >= rend) offlens[i]++;
1695       jj++;
1696     }
1697   }
1698 
1699   /* create our matrix */
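  /* MatCreateMPIAIJ() expects the diagonal-block row lengths in d_nnz, so the
     off-diagonal counts are subtracted here and added back afterwards, because
     ourlens is reused below as the full row lengths when inserting values. */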
1700   for ( i=0; i<m; i++ ) {
1701     ourlens[i] -= offlens[i];
1702   }
1703   ierr = MatCreateMPIAIJ(comm,m,PETSC_DECIDE,M,N,0,ourlens,0,offlens,newmat);CHKERRQ(ierr);
1704   A = *newmat;
1705   MatSetOption(A,COLUMNS_SORTED);
1706   for ( i=0; i<m; i++ ) {
1707     ourlens[i] += offlens[i];
1708   }
1709 
1710   if (!rank) {
1711     vals = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(vals);
1712 
1713     /* read in my part of the matrix numerical values  */
1714     nz = procsnz[0];
1715     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1716 
1717     /* insert into matrix */
1718     jj      = rstart;
1719     smycols = mycols;
1720     svals   = vals;
1721     for ( i=0; i<m; i++ ) {
1722       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1723       smycols += ourlens[i];
1724       svals   += ourlens[i];
1725       jj++;
1726     }
1727 
1728     /* read in other processors and ship out */
1729     for ( i=1; i<size; i++ ) {
1730       nz = procsnz[i];
1731       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1732       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1733     }
1734     PetscFree(procsnz);
1735   }
1736   else {
1737     /* receive numeric values */
1738     vals = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(vals);
1739 
1740     /* receive message of values*/
1741     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1742     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1743     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
1744 
1745     /* insert into matrix */
1746     jj      = rstart;
1747     smycols = mycols;
1748     svals   = vals;
1749     for ( i=0; i<m; i++ ) {
1750       ierr = MatSetValues(A,1,&jj,ourlens[i],smycols,svals,INSERT_VALUES);CHKERRQ(ierr);
1751       smycols += ourlens[i];
1752       svals   += ourlens[i];
1753       jj++;
1754     }
1755   }
1756   PetscFree(ourlens); PetscFree(vals); PetscFree(mycols); PetscFree(rowners);
1757 
1758   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1759   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
1760   return 0;
1761 }
1762