xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 11ebbc714b45ef4372941d536196407bce829dd6)
1 #ifndef lint
2 static char vcid[] = "$Id: mpibaij.c,v 1.68 1997/05/03 22:43:28 curfman Exp curfman $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 extern int MatLUFactorSymbolic_MPIBAIJ(Mat,IS,IS,double,Mat *);
15 extern int MatLUFactorNumeric_MPIBAIJ(Mat,Mat *);
16 extern int MatLUFactor_MPIBAIJ(Mat,IS,IS,double);
17 extern int MatSolve_MPIBAIJ(Mat,Vec,Vec);
18 extern int MatSolveAdd_MPIBAIJ(Mat,Vec,Vec,Vec);
19 extern int MatSolveTrans_MPIBAIJ(Mat,Vec,Vec);
20 extern int MatSolveTransAdd_MPIBAIJ(Mat,Vec,Vec,Vec);
21 extern int MatILUFactorSymbolic_MPIBAIJ(Mat,IS,IS,double,int,Mat *);
22 
23 
24 /*
25      Local utility routine that creates a mapping from the global column
26    number to the local number in the off-diagonal part of the local
27    storage of the matrix.  This is done in a non scable way since the
28    length of colmap equals the global matrix length.
29 */
30 #undef __FUNC__
31 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
32 static int CreateColmap_MPIBAIJ_Private(Mat mat)
33 {
34   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
35   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
36   int         nbs = B->nbs,i,bs=B->bs;;
37 
38   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
39   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
40   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
41   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
42   return 0;
43 }
44 
45 #undef __FUNC__
46 #define __FUNC__ "MatGetRowIJ_MPIBAIJ("
47 static int MatGetRowIJ_MPIBAIJ(Mat mat,int shift,PetscTruth symmetric,int *n,int **ia,int **ja,
48                             PetscTruth *done)
49 {
50   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *) mat->data;
51   int         ierr;
52   if (aij->size == 1) {
53     ierr = MatGetRowIJ(aij->A,shift,symmetric,n,ia,ja,done); CHKERRQ(ierr);
54   } else SETERRQ(1,0,"not supported in parallel");
55   return 0;
56 }
57 
58 #undef __FUNC__
59 #define __FUNC__ "MatRestoreRowIJ_MPIBAIJ"
60 static int MatRestoreRowIJ_MPIBAIJ(Mat mat,int shift,PetscTruth symmetric,int *n,int **ia,int **ja,
61                                 PetscTruth *done)
62 {
63   Mat_MPIBAIJ *aij = (Mat_MPIBAIJ *) mat->data;
64   int        ierr;
65   if (aij->size == 1) {
66     ierr = MatRestoreRowIJ(aij->A,shift,symmetric,n,ia,ja,done); CHKERRQ(ierr);
67   } else SETERRQ(1,0,"not supported in parallel");
68   return 0;
69 }
70 #define CHUNKSIZE  10
71 
72 #define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
73 { \
74  \
75     brow = row/bs;  \
76     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
77     rmax = aimax[brow]; nrow = ailen[brow]; \
78       bcol = col/bs; \
79       ridx = row % bs; cidx = col % bs; \
80       low = 0; high = nrow; \
81       while (high-low > 3) { \
82         t = (low+high)/2; \
83         if (rp[t] > bcol) high = t; \
84         else              low  = t; \
85       } \
86       for ( _i=low; _i<high; _i++ ) { \
87         if (rp[_i] > bcol) break; \
88         if (rp[_i] == bcol) { \
89           bap  = ap +  bs2*_i + bs*cidx + ridx; \
90           if (addv == ADD_VALUES) *bap += value;  \
91           else                    *bap  = value;  \
92           goto a_noinsert; \
93         } \
94       } \
95       if (a->nonew == 1) goto a_noinsert; \
96       else if (a->nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
97       if (nrow >= rmax) { \
98         /* there is no extra room in row, therefore enlarge */ \
99         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
100         Scalar *new_a; \
101  \
102         if (a->nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
103  \
104         /* malloc new storage space */ \
105         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
106         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
107         new_j   = (int *) (new_a + bs2*new_nz); \
108         new_i   = new_j + new_nz; \
109  \
110         /* copy over old data into new slots */ \
111         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
112         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
113         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
114         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
115         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
116                                                            len*sizeof(int)); \
117         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
118         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
119         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
120                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
121         /* free up old matrix storage */ \
122         PetscFree(a->a);  \
123         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
124         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
125         a->singlemalloc = 1; \
126  \
127         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
128         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
129         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
130         a->maxnz += bs2*CHUNKSIZE; \
131         a->reallocs++; \
132         a->nz++; \
133       } \
134       N = nrow++ - 1;  \
135       /* shift up all the later entries in this row */ \
136       for ( ii=N; ii>=_i; ii-- ) { \
137         rp[ii+1] = rp[ii]; \
138         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
139       } \
140       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
141       rp[_i]                      = bcol;  \
142       ap[bs2*_i + bs*cidx + ridx] = value;  \
143       a_noinsert:; \
144     ailen[brow] = nrow; \
145 }
146 
147 #define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
148 { \
149  \
150     brow = row/bs;  \
151     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
152     rmax = bimax[brow]; nrow = bilen[brow]; \
153       bcol = col/bs; \
154       ridx = row % bs; cidx = col % bs; \
155       low = 0; high = nrow; \
156       while (high-low > 3) { \
157         t = (low+high)/2; \
158         if (rp[t] > bcol) high = t; \
159         else              low  = t; \
160       } \
161       for ( _i=low; _i<high; _i++ ) { \
162         if (rp[_i] > bcol) break; \
163         if (rp[_i] == bcol) { \
164           bap  = ap +  bs2*_i + bs*cidx + ridx; \
165           if (addv == ADD_VALUES) *bap += value;  \
166           else                    *bap  = value;  \
167           goto b_noinsert; \
168         } \
169       } \
170       if (b->nonew == 1) goto b_noinsert; \
171       else if (b->nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
172       if (nrow >= rmax) { \
173         /* there is no extra room in row, therefore enlarge */ \
174         int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
175         Scalar *new_a; \
176  \
177         if (b->nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix"); \
178  \
179         /* malloc new storage space */ \
180         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
181         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
182         new_j   = (int *) (new_a + bs2*new_nz); \
183         new_i   = new_j + new_nz; \
184  \
185         /* copy over old data into new slots */ \
186         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
187         for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
188         PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
189         len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
190         PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
191                                                            len*sizeof(int)); \
192         PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
193         PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
194         PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
195                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
196         /* free up old matrix storage */ \
197         PetscFree(b->a);  \
198         if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
199         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
200         b->singlemalloc = 1; \
201  \
202         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
203         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
204         PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
205         b->maxnz += bs2*CHUNKSIZE; \
206         b->reallocs++; \
207         b->nz++; \
208       } \
209       N = nrow++ - 1;  \
210       /* shift up all the later entries in this row */ \
211       for ( ii=N; ii>=_i; ii-- ) { \
212         rp[ii+1] = rp[ii]; \
213         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
214       } \
215       if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
216       rp[_i]                      = bcol;  \
217       ap[bs2*_i + bs*cidx + ridx] = value;  \
218       b_noinsert:; \
219     bilen[brow] = nrow; \
220 }
221 
222 #undef __FUNC__
223 #define __FUNC__ "MatSetValues_MPIBAIJ"
224 int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
225 {
226   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
227   Scalar      value;
228   int         ierr,i,j,row,col;
229   int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
230   int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
231   int         cend_orig=baij->cend_bs,bs=baij->bs;
232 
233   /* Some Variables required in the macro */
234   Mat         A = baij->A;
235   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
236   int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
237   Scalar      *aa=a->a;
238 
239   Mat         B = baij->B;
240   Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
241   int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
242   Scalar      *ba=b->a;
243 
244   int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
245   int         low,high,t,ridx,cidx,bs2=a->bs2;
246   Scalar      *ap,*bap;
247 
248   for ( i=0; i<m; i++ ) {
249 #if defined(PETSC_BOPT_g)
250     if (im[i] < 0) SETERRQ(1,0,"Negative row");
251     if (im[i] >= baij->M) SETERRQ(1,0,"Row too large");
252 #endif
253     if (im[i] >= rstart_orig && im[i] < rend_orig) {
254       row = im[i] - rstart_orig;
255       for ( j=0; j<n; j++ ) {
256         if (in[j] >= cstart_orig && in[j] < cend_orig){
257           col = in[j] - cstart_orig;
258           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
259           MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
260           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
261         }
262 #if defined(PETSC_BOPT_g)
263         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
264         else if (in[j] >= baij->N) {SETERRQ(1,0,"Col too large");}
265 #endif
266         else {
267           if (mat->was_assembled) {
268             if (!baij->colmap) {
269               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
270             }
271             col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
272             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
273               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
274               col =  in[j];
275             }
276           }
277           else col = in[j];
278           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
279           MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
280           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
281         }
282       }
283     }
284     else {
285       if (roworiented && !baij->donotstash) {
286         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
287       }
288       else {
289         if (!baij->donotstash) {
290           row = im[i];
291 	  for ( j=0; j<n; j++ ) {
292 	    ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
293           }
294         }
295       }
296     }
297   }
298   return 0;
299 }
300 
301 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
302 #undef __FUNC__
303 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
304 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
305 {
306   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
307   Scalar      *value,*barray=baij->barray;
308   int         ierr,i,j,ii,jj,row,col,k,l;
309   int         roworiented = baij->roworiented,rstart=baij->rstart ;
310   int         rend=baij->rend,cstart=baij->cstart,stepval;
311   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
312 
313 
314   if(!barray) {
315     barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
316     baij->barray = barray;
317   }
318 
319   if (roworiented) {
320     stepval = (n-1)*bs;
321   } else {
322     stepval = (m-1)*bs;
323   }
324   for ( i=0; i<m; i++ ) {
325 #if defined(PETSC_BOPT_g)
326     if (im[i] < 0) SETERRQ(1,0,"Negative row");
327     if (im[i] >= baij->Mbs) SETERRQ(1,0,"Row too large");
328 #endif
329     if (im[i] >= rstart && im[i] < rend) {
330       row = im[i] - rstart;
331       for ( j=0; j<n; j++ ) {
332         /* If NumCol = 1 then a copy is not required */
333         if ((roworiented) && (n == 1)) {
334           barray = v + i*bs2;
335         } else if((!roworiented) && (m == 1)) {
336           barray = v + j*bs2;
337         } else { /* Here a copy is required */
338           if (roworiented) {
339             value = v + i*(stepval+bs)*bs + j*bs;
340           } else {
341             value = v + j*(stepval+bs)*bs + i*bs;
342           }
343           for ( ii=0; ii<bs; ii++,value+=stepval )
344             for (jj=0; jj<bs; jj++ )
345               *barray++  = *value++;
346           barray -=bs2;
347         }
348 
349         if (in[j] >= cstart && in[j] < cend){
350           col = in[j] - cstart;
351           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
352         }
353 #if defined(PETSC_BOPT_g)
354         else if (in[j] < 0) {SETERRQ(1,0,"Negative column");}
355         else if (in[j] >= baij->Nbs) {SETERRQ(1,0,"Col too large");}
356 #endif
357         else {
358           if (mat->was_assembled) {
359             if (!baij->colmap) {
360               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
361             }
362             col = baij->colmap[in[j]] - 1;
363             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
364               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
365               col =  in[j];
366             }
367           }
368           else col = in[j];
369           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
370         }
371       }
372     }
373     else {
374       if (!baij->donotstash) {
375         if (roworiented ) {
376           row   = im[i]*bs;
377           value = v + i*(stepval+bs)*bs;
378           for ( j=0; j<bs; j++,row++ ) {
379             for ( k=0; k<n; k++ ) {
380               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
381                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
382               }
383             }
384           }
385         }
386         else {
387           for ( j=0; j<n; j++ ) {
388             value = v + j*(stepval+bs)*bs + i*bs;
389             col   = in[j]*bs;
390             for ( k=0; k<bs; k++,col++,value+=stepval) {
391               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
392                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
393               }
394             }
395           }
396 
397         }
398       }
399     }
400   }
401   return 0;
402 }
403 
404 #undef __FUNC__
405 #define __FUNC__ "MatGetValues_MPIBAIJ"
406 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
407 {
408   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
409   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
410   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
411 
412   for ( i=0; i<m; i++ ) {
413     if (idxm[i] < 0) SETERRQ(1,0,"Negative row");
414     if (idxm[i] >= baij->M) SETERRQ(1,0,"Row too large");
415     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
416       row = idxm[i] - bsrstart;
417       for ( j=0; j<n; j++ ) {
418         if (idxn[j] < 0) SETERRQ(1,0,"Negative column");
419         if (idxn[j] >= baij->N) SETERRQ(1,0,"Col too large");
420         if (idxn[j] >= bscstart && idxn[j] < bscend){
421           col = idxn[j] - bscstart;
422           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
423         }
424         else {
425           if (!baij->colmap) {
426             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
427           }
428           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
429              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
430           else {
431             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
432             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
433           }
434         }
435       }
436     }
437     else {
438       SETERRQ(1,0,"Only local values currently supported");
439     }
440   }
441   return 0;
442 }
443 
444 #undef __FUNC__
445 #define __FUNC__ "MatNorm_MPIBAIJ"
446 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
447 {
448   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
449   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
450   int        ierr, i,bs2=baij->bs2;
451   double     sum = 0.0;
452   Scalar     *v;
453 
454   if (baij->size == 1) {
455     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
456   } else {
457     if (type == NORM_FROBENIUS) {
458       v = amat->a;
459       for (i=0; i<amat->nz*bs2; i++ ) {
460 #if defined(PETSC_COMPLEX)
461         sum += real(conj(*v)*(*v)); v++;
462 #else
463         sum += (*v)*(*v); v++;
464 #endif
465       }
466       v = bmat->a;
467       for (i=0; i<bmat->nz*bs2; i++ ) {
468 #if defined(PETSC_COMPLEX)
469         sum += real(conj(*v)*(*v)); v++;
470 #else
471         sum += (*v)*(*v); v++;
472 #endif
473       }
474       MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);
475       *norm = sqrt(*norm);
476     }
477     else
478       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
479   }
480   return 0;
481 }
482 
483 #undef __FUNC__
484 #define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
485 int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
486 {
487   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
488   MPI_Comm    comm = mat->comm;
489   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
490   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
491   MPI_Request *send_waits,*recv_waits;
492   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
493   InsertMode  addv;
494   Scalar      *rvalues,*svalues;
495 
496   /* make sure all processors are either in INSERTMODE or ADDMODE */
497   MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
498   if (addv == (ADD_VALUES|INSERT_VALUES)) {
499     SETERRQ(1,0,"Some processors inserted others added");
500   }
501   mat->insertmode = addv; /* in case this processor had no cache */
502 
503   /*  first count number of contributors to each processor */
504   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
505   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
506   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
507   for ( i=0; i<baij->stash.n; i++ ) {
508     idx = baij->stash.idx[i];
509     for ( j=0; j<size; j++ ) {
510       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
511         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
512       }
513     }
514   }
515   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
516 
517   /* inform other processors of number of messages and max length*/
518   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
519   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
520   nreceives = work[rank];
521   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
522   nmax = work[rank];
523   PetscFree(work);
524 
525   /* post receives:
526        1) each message will consist of ordered pairs
527      (global index,value) we store the global index as a double
528      to simplify the message passing.
529        2) since we don't know how long each individual message is we
530      allocate the largest needed buffer for each receive. Potentially
531      this is a lot of wasted space.
532 
533 
534        This could be done better.
535   */
536   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
537   CHKPTRQ(rvalues);
538   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
539   CHKPTRQ(recv_waits);
540   for ( i=0; i<nreceives; i++ ) {
541     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
542               comm,recv_waits+i);
543   }
544 
545   /* do sends:
546       1) starts[i] gives the starting index in svalues for stuff going to
547          the ith processor
548   */
549   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
550   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
551   CHKPTRQ(send_waits);
552   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
553   starts[0] = 0;
554   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
555   for ( i=0; i<baij->stash.n; i++ ) {
556     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
557     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
558     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
559   }
560   PetscFree(owner);
561   starts[0] = 0;
562   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
563   count = 0;
564   for ( i=0; i<size; i++ ) {
565     if (procs[i]) {
566       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
567                 comm,send_waits+count++);
568     }
569   }
570   PetscFree(starts); PetscFree(nprocs);
571 
572   /* Free cache space */
573   PLogInfo(mat,"MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",baij->stash.n);
574   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
575 
576   baij->svalues    = svalues;    baij->rvalues    = rvalues;
577   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
578   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
579   baij->rmax       = nmax;
580 
581   return 0;
582 }
583 
584 
585 #undef __FUNC__
586 #define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
587 int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
588 {
589   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
590   MPI_Status  *send_status,recv_status;
591   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
592   int         bs=baij->bs,row,col,other_disassembled;
593   Scalar      *values,val;
594   InsertMode  addv = mat->insertmode;
595 
596   /*  wait on receives */
597   while (count) {
598     MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);
599     /* unpack receives into our local space */
600     values = baij->rvalues + 3*imdex*baij->rmax;
601     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
602     n = n/3;
603     for ( i=0; i<n; i++ ) {
604       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
605       col = (int) PetscReal(values[3*i+1]);
606       val = values[3*i+2];
607       if (col >= baij->cstart*bs && col < baij->cend*bs) {
608         col -= baij->cstart*bs;
609         MatSetValues(baij->A,1,&row,1,&col,&val,addv);
610       }
611       else {
612         if (mat->was_assembled) {
613           if (!baij->colmap) {
614             ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
615           }
616           col = (baij->colmap[col/bs]-1)*bs + col%bs;
617           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
618             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
619             col = (int) PetscReal(values[3*i+1]);
620           }
621         }
622         MatSetValues(baij->B,1,&row,1,&col,&val,addv);
623       }
624     }
625     count--;
626   }
627   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
628 
629   /* wait on sends */
630   if (baij->nsends) {
631     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));
632     CHKPTRQ(send_status);
633     MPI_Waitall(baij->nsends,baij->send_waits,send_status);
634     PetscFree(send_status);
635   }
636   PetscFree(baij->send_waits); PetscFree(baij->svalues);
637 
638   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
639   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
640 
641   /* determine if any processor has disassembled, if so we must
642      also disassemble ourselfs, in order that we may reassemble. */
643   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
644   if (mat->was_assembled && !other_disassembled) {
645     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
646   }
647 
648   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
649     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
650   }
651   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
652   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
653 
654   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
655   return 0;
656 }
657 
658 #undef __FUNC__
659 #define __FUNC__ "MatView_MPIBAIJ_Binary"
660 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
661 {
662   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
663   int          ierr;
664 
665   if (baij->size == 1) {
666     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
667   }
668   else SETERRQ(1,0,"Only uniprocessor output supported");
669   return 0;
670 }
671 
672 #undef __FUNC__
673 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
674 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
675 {
676   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
677   int          ierr, format,rank,bs = baij->bs;
678   FILE         *fd;
679   ViewerType   vtype;
680 
681   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
682   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
683     ierr = ViewerGetFormat(viewer,&format);
684     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
685       MatInfo info;
686       MPI_Comm_rank(mat->comm,&rank);
687       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
688       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
689       PetscSequentialPhaseBegin(mat->comm,1);
690       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
691               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
692               baij->bs,(int)info.memory);
693       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
694       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
695       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
696       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
697       fflush(fd);
698       PetscSequentialPhaseEnd(mat->comm,1);
699       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
700       return 0;
701     }
702     else if (format == VIEWER_FORMAT_ASCII_INFO) {
703       PetscPrintf(mat->comm,"  block size is %d\n",bs);
704       return 0;
705     }
706   }
707 
708   if (vtype == DRAW_VIEWER) {
709     Draw       draw;
710     PetscTruth isnull;
711     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
712     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
713   }
714 
715   if (vtype == ASCII_FILE_VIEWER) {
716     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
717     PetscSequentialPhaseBegin(mat->comm,1);
718     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
719            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
720             baij->cstart*bs,baij->cend*bs);
721     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
722     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
723     fflush(fd);
724     PetscSequentialPhaseEnd(mat->comm,1);
725   }
726   else {
727     int size = baij->size;
728     rank = baij->rank;
729     if (size == 1) {
730       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
731     }
732     else {
733       /* assemble the entire matrix onto first processor. */
734       Mat         A;
735       Mat_SeqBAIJ *Aloc;
736       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
737       int         mbs=baij->mbs;
738       Scalar      *a;
739 
740       if (!rank) {
741         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
742         CHKERRQ(ierr);
743       }
744       else {
745         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
746         CHKERRQ(ierr);
747       }
748       PLogObjectParent(mat,A);
749 
750       /* copy over the A part */
751       Aloc = (Mat_SeqBAIJ*) baij->A->data;
752       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
753       row = baij->rstart;
754       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
755 
756       for ( i=0; i<mbs; i++ ) {
757         rvals[0] = bs*(baij->rstart + i);
758         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
759         for ( j=ai[i]; j<ai[i+1]; j++ ) {
760           col = (baij->cstart+aj[j])*bs;
761           for (k=0; k<bs; k++ ) {
762             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
763             col++; a += bs;
764           }
765         }
766       }
767       /* copy over the B part */
768       Aloc = (Mat_SeqBAIJ*) baij->B->data;
769       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
770       row = baij->rstart*bs;
771       for ( i=0; i<mbs; i++ ) {
772         rvals[0] = bs*(baij->rstart + i);
773         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
774         for ( j=ai[i]; j<ai[i+1]; j++ ) {
775           col = baij->garray[aj[j]]*bs;
776           for (k=0; k<bs; k++ ) {
777             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
778             col++; a += bs;
779           }
780         }
781       }
782       PetscFree(rvals);
783       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
784       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
785       if (!rank) {
786         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
787       }
788       ierr = MatDestroy(A); CHKERRQ(ierr);
789     }
790   }
791   return 0;
792 }
793 
794 
795 
796 #undef __FUNC__
797 #define __FUNC__ "MatView_MPIBAIJ"
798 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
799 {
800   Mat         mat = (Mat) obj;
801   int         ierr;
802   ViewerType  vtype;
803 
804   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
805   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
806       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
807     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
808   }
809   else if (vtype == BINARY_FILE_VIEWER) {
810     return MatView_MPIBAIJ_Binary(mat,viewer);
811   }
812   return 0;
813 }
814 
815 #undef __FUNC__
816 #define __FUNC__ "MatDestroy_MPIBAIJ"
817 int MatDestroy_MPIBAIJ(PetscObject obj)
818 {
819   Mat         mat = (Mat) obj;
820   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
821   int         ierr;
822 
823 #if defined(PETSC_LOG)
824   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
825 #endif
826 
827   PetscFree(baij->rowners);
828   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
829   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
830   if (baij->colmap) PetscFree(baij->colmap);
831   if (baij->garray) PetscFree(baij->garray);
832   if (baij->lvec)   VecDestroy(baij->lvec);
833   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
834   if (baij->rowvalues) PetscFree(baij->rowvalues);
835   if (baij->barray) PetscFree(baij->barray);
836   PetscFree(baij);
837   if (mat->mapping) {
838     ierr = ISLocalToGlobalMappingDestroy(mat->mapping); CHKERRQ(ierr);
839   }
840   PLogObjectDestroy(mat);
841   PetscHeaderDestroy(mat);
842   return 0;
843 }
844 
845 #undef __FUNC__
846 #define __FUNC__ "MatMult_MPIBAIJ"
847 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
848 {
849   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
850   int         ierr, nt;
851 
852   VecGetLocalSize_Fast(xx,nt);
853   if (nt != a->n) {
854     SETERRQ(1,0,"Incompatible partition of A and xx");
855   }
856   VecGetLocalSize_Fast(yy,nt);
857   if (nt != a->m) {
858     SETERRQ(1,0,"Incompatible parition of A and yy");
859   }
860   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
861   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
862   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
863   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
864   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
865   return 0;
866 }
867 
868 #undef __FUNC__
869 #define __FUNC__ "MatMultAdd_MPIBAIJ"
870 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
871 {
872   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
873   int        ierr;
874   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
875   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
876   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
877   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
878   return 0;
879 }
880 
881 #undef __FUNC__
882 #define __FUNC__ "MatMultTrans_MPIBAIJ"
883 int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
884 {
885   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
886   int        ierr;
887 
888   /* do nondiagonal part */
889   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
890   /* send it on its way */
891   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
892   /* do local part */
893   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
894   /* receive remote parts: note this assumes the values are not actually */
895   /* inserted in yy until the next line, which is true for my implementation*/
896   /* but is not perhaps always true. */
897   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
898   return 0;
899 }
900 
901 #undef __FUNC__
902 #define __FUNC__ "MatMultTransAdd_MPIBAIJ"
903 int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
904 {
905   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
906   int        ierr;
907 
908   /* do nondiagonal part */
909   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
910   /* send it on its way */
911   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
912   /* do local part */
913   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
914   /* receive remote parts: note this assumes the values are not actually */
915   /* inserted in yy until the next line, which is true for my implementation*/
916   /* but is not perhaps always true. */
917   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
918   return 0;
919 }
920 
921 /*
922   This only works correctly for square matrices where the subblock A->A is the
923    diagonal block
924 */
925 #undef __FUNC__
926 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
927 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
928 {
929   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
930   if (a->M != a->N)
931     SETERRQ(1,0,"Supports only square matrix where A->A is diag block");
932   return MatGetDiagonal(a->A,v);
933 }
934 
935 #undef __FUNC__
936 #define __FUNC__ "MatScale_MPIBAIJ"
937 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
938 {
939   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
940   int        ierr;
941   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
942   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
943   return 0;
944 }
945 
946 #undef __FUNC__
947 #define __FUNC__ "MatGetSize_MPIBAIJ"
948 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
949 {
950   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
951   *m = mat->M; *n = mat->N;
952   return 0;
953 }
954 
955 #undef __FUNC__
956 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
957 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
958 {
959   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
960   *m = mat->m; *n = mat->N;
961   return 0;
962 }
963 
964 #undef __FUNC__
965 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
966 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
967 {
968   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
969   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
970   return 0;
971 }
972 
973 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
974 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
975 
976 #undef __FUNC__
977 #define __FUNC__ "MatGetRow_MPIBAIJ"
978 int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
979 {
980   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
981   Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
982   int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
983   int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
984   int        *cmap, *idx_p,cstart = mat->cstart;
985 
986   if (mat->getrowactive == PETSC_TRUE) SETERRQ(1,0,"Already active");
987   mat->getrowactive = PETSC_TRUE;
988 
989   if (!mat->rowvalues && (idx || v)) {
990     /*
991         allocate enough space to hold information from the longest row.
992     */
993     Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
994     int     max = 1,mbs = mat->mbs,tmp;
995     for ( i=0; i<mbs; i++ ) {
996       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
997       if (max < tmp) { max = tmp; }
998     }
999     mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
1000     CHKPTRQ(mat->rowvalues);
1001     mat->rowindices = (int *) (mat->rowvalues + max*bs2);
1002   }
1003 
1004 
1005   if (row < brstart || row >= brend) SETERRQ(1,0,"Only local rows")
1006   lrow = row - brstart;
1007 
1008   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1009   if (!v)   {pvA = 0; pvB = 0;}
1010   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1011   ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1012   ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1013   nztot = nzA + nzB;
1014 
1015   cmap  = mat->garray;
1016   if (v  || idx) {
1017     if (nztot) {
1018       /* Sort by increasing column numbers, assuming A and B already sorted */
1019       int imark = -1;
1020       if (v) {
1021         *v = v_p = mat->rowvalues;
1022         for ( i=0; i<nzB; i++ ) {
1023           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1024           else break;
1025         }
1026         imark = i;
1027         for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
1028         for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
1029       }
1030       if (idx) {
1031         *idx = idx_p = mat->rowindices;
1032         if (imark > -1) {
1033           for ( i=0; i<imark; i++ ) {
1034             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1035           }
1036         } else {
1037           for ( i=0; i<nzB; i++ ) {
1038             if (cmap[cworkB[i]/bs] < cstart)
1039               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1040             else break;
1041           }
1042           imark = i;
1043         }
1044         for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
1045         for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1046       }
1047     }
1048     else {
1049       if (idx) *idx = 0;
1050       if (v)   *v   = 0;
1051     }
1052   }
1053   *nz = nztot;
1054   ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
1055   ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
1056   return 0;
1057 }
1058 
1059 #undef __FUNC__
1060 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1061 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1062 {
1063   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1064   if (baij->getrowactive == PETSC_FALSE) {
1065     SETERRQ(1,0,"MatGetRow not called");
1066   }
1067   baij->getrowactive = PETSC_FALSE;
1068   return 0;
1069 }
1070 
1071 #undef __FUNC__
1072 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1073 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1074 {
1075   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1076   *bs = baij->bs;
1077   return 0;
1078 }
1079 
1080 #undef __FUNC__
1081 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1082 int MatZeroEntries_MPIBAIJ(Mat A)
1083 {
1084   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1085   int         ierr;
1086   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1087   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1088   return 0;
1089 }
1090 
1091 #undef __FUNC__
1092 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1093 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1094 {
1095   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1096   Mat         A = a->A, B = a->B;
1097   int         ierr;
1098   double      isend[5], irecv[5];
1099 
1100   info->rows_global    = (double)a->M;
1101   info->columns_global = (double)a->N;
1102   info->rows_local     = (double)a->m;
1103   info->columns_local  = (double)a->N;
1104   info->block_size     = (double)a->bs;
1105   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1106   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1107   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1108   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1109   if (flag == MAT_LOCAL) {
1110     info->nz_used      = isend[0];
1111     info->nz_allocated = isend[1];
1112     info->nz_unneeded  = isend[2];
1113     info->memory       = isend[3];
1114     info->mallocs      = isend[4];
1115   } else if (flag == MAT_GLOBAL_MAX) {
1116     MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);
1117     info->nz_used      = irecv[0];
1118     info->nz_allocated = irecv[1];
1119     info->nz_unneeded  = irecv[2];
1120     info->memory       = irecv[3];
1121     info->mallocs      = irecv[4];
1122   } else if (flag == MAT_GLOBAL_SUM) {
1123     MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);
1124     info->nz_used      = irecv[0];
1125     info->nz_allocated = irecv[1];
1126     info->nz_unneeded  = irecv[2];
1127     info->memory       = irecv[3];
1128     info->mallocs      = irecv[4];
1129   }
1130   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1131   info->fill_ratio_needed = 0;
1132   info->factor_mallocs    = 0;
1133   return 0;
1134 }
1135 
1136 #undef __FUNC__
1137 #define __FUNC__ "MatSetOption_MPIBAIJ"
1138 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1139 {
1140   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1141 
1142   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1143       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1144       op == MAT_COLUMNS_UNSORTED ||
1145       op == MAT_COLUMNS_SORTED ||
1146       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1147       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1148         MatSetOption(a->A,op);
1149         MatSetOption(a->B,op);
1150   } else if (op == MAT_ROW_ORIENTED) {
1151         a->roworiented = 1;
1152         MatSetOption(a->A,op);
1153         MatSetOption(a->B,op);
1154   } else if (op == MAT_ROWS_SORTED ||
1155              op == MAT_ROWS_UNSORTED ||
1156              op == MAT_SYMMETRIC ||
1157              op == MAT_STRUCTURALLY_SYMMETRIC ||
1158              op == MAT_YES_NEW_DIAGONALS)
1159     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1160   else if (op == MAT_COLUMN_ORIENTED) {
1161     a->roworiented = 0;
1162     MatSetOption(a->A,op);
1163     MatSetOption(a->B,op);
1164   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1165     a->donotstash = 1;
1166   } else if (op == MAT_NO_NEW_DIAGONALS)
1167     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
1168   else
1169     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
1170   return 0;
1171 }
1172 
1173 #undef __FUNC__
1174 #define __FUNC__ "MatTranspose_MPIBAIJ("
1175 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1176 {
1177   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1178   Mat_SeqBAIJ *Aloc;
1179   Mat        B;
1180   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1181   int        bs=baij->bs,mbs=baij->mbs;
1182   Scalar     *a;
1183 
1184   if (matout == PETSC_NULL && M != N)
1185     SETERRQ(1,0,"Square matrix only for in-place");
1186   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1187   CHKERRQ(ierr);
1188 
1189   /* copy over the A part */
1190   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1191   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1192   row = baij->rstart;
1193   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1194 
1195   for ( i=0; i<mbs; i++ ) {
1196     rvals[0] = bs*(baij->rstart + i);
1197     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1198     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1199       col = (baij->cstart+aj[j])*bs;
1200       for (k=0; k<bs; k++ ) {
1201         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1202         col++; a += bs;
1203       }
1204     }
1205   }
1206   /* copy over the B part */
1207   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1208   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1209   row = baij->rstart*bs;
1210   for ( i=0; i<mbs; i++ ) {
1211     rvals[0] = bs*(baij->rstart + i);
1212     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1213     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1214       col = baij->garray[aj[j]]*bs;
1215       for (k=0; k<bs; k++ ) {
1216         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1217         col++; a += bs;
1218       }
1219     }
1220   }
1221   PetscFree(rvals);
1222   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1223   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1224 
1225   if (matout != PETSC_NULL) {
1226     *matout = B;
1227   } else {
1228     /* This isn't really an in-place transpose .... but free data structures from baij */
1229     PetscFree(baij->rowners);
1230     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1231     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1232     if (baij->colmap) PetscFree(baij->colmap);
1233     if (baij->garray) PetscFree(baij->garray);
1234     if (baij->lvec) VecDestroy(baij->lvec);
1235     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1236     PetscFree(baij);
1237     PetscMemcpy(A,B,sizeof(struct _Mat));
1238     PetscHeaderDestroy(B);
1239   }
1240   return 0;
1241 }
1242 
1243 #undef __FUNC__
1244 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1245 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1246 {
1247   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1248   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1249   int ierr,s1,s2,s3;
1250 
1251   if (ll)  {
1252     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1253     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1254     if (s1!=s2) SETERRQ(1,0,"non-conforming local sizes");
1255     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1256     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1257   }
1258   if (rr) SETERRQ(1,0,"not supported for right vector");
1259   return 0;
1260 }
1261 
1262 /* the code does not do the diagonal entries correctly unless the
1263    matrix is square and the column and row owerships are identical.
1264    This is a BUG. The only way to fix it seems to be to access
1265    baij->A and baij->B directly and not through the MatZeroRows()
1266    routine.
1267 */
1268 #undef __FUNC__
1269 #define __FUNC__ "MatZeroRows_MPIBAIJ"
1270 int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
1271 {
1272   Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
1273   int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
1274   int            *procs,*nprocs,j,found,idx,nsends,*work;
1275   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1276   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1277   int            *lens,imdex,*lrows,*values,bs=l->bs;
1278   MPI_Comm       comm = A->comm;
1279   MPI_Request    *send_waits,*recv_waits;
1280   MPI_Status     recv_status,*send_status;
1281   IS             istmp;
1282 
1283   ierr = ISGetSize(is,&N); CHKERRQ(ierr);
1284   ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);
1285 
1286   /*  first count number of contributors to each processor */
1287   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
1288   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
1289   owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
1290   for ( i=0; i<N; i++ ) {
1291     idx = rows[i];
1292     found = 0;
1293     for ( j=0; j<size; j++ ) {
1294       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1295         nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
1296       }
1297     }
1298     if (!found) SETERRQ(1,0,"Index out of range");
1299   }
1300   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
1301 
1302   /* inform other processors of number of messages and max length*/
1303   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
1304   MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
1305   nrecvs = work[rank];
1306   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
1307   nmax = work[rank];
1308   PetscFree(work);
1309 
1310   /* post receives:   */
1311   rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); /*see note */
1312   CHKPTRQ(rvalues);
1313   recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
1314   CHKPTRQ(recv_waits);
1315   for ( i=0; i<nrecvs; i++ ) {
1316     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1317   }
1318 
1319   /* do sends:
1320       1) starts[i] gives the starting index in svalues for stuff going to
1321          the ith processor
1322   */
1323   svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
1324   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
1325   CHKPTRQ(send_waits);
1326   starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
1327   starts[0] = 0;
1328   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1329   for ( i=0; i<N; i++ ) {
1330     svalues[starts[owner[i]]++] = rows[i];
1331   }
1332   ISRestoreIndices(is,&rows);
1333 
1334   starts[0] = 0;
1335   for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
1336   count = 0;
1337   for ( i=0; i<size; i++ ) {
1338     if (procs[i]) {
1339       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
1340     }
1341   }
1342   PetscFree(starts);
1343 
1344   base = owners[rank]*bs;
1345 
1346   /*  wait on receives */
1347   lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
1348   source = lens + nrecvs;
1349   count  = nrecvs; slen = 0;
1350   while (count) {
1351     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1352     /* unpack receives into our local space */
1353     MPI_Get_count(&recv_status,MPI_INT,&n);
1354     source[imdex]  = recv_status.MPI_SOURCE;
1355     lens[imdex]  = n;
1356     slen += n;
1357     count--;
1358   }
1359   PetscFree(recv_waits);
1360 
1361   /* move the data into the send scatter */
1362   lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
1363   count = 0;
1364   for ( i=0; i<nrecvs; i++ ) {
1365     values = rvalues + i*nmax;
1366     for ( j=0; j<lens[i]; j++ ) {
1367       lrows[count++] = values[j] - base;
1368     }
1369   }
1370   PetscFree(rvalues); PetscFree(lens);
1371   PetscFree(owner); PetscFree(nprocs);
1372 
1373   /* actually zap the local rows */
1374   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1375   PLogObjectParent(A,istmp);
1376   PetscFree(lrows);
1377   ierr = MatZeroRows(l->A,istmp,diag); CHKERRQ(ierr);
1378   ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
1379   ierr = ISDestroy(istmp); CHKERRQ(ierr);
1380 
1381   /* wait on sends */
1382   if (nsends) {
1383     send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));
1384     CHKPTRQ(send_status);
1385     MPI_Waitall(nsends,send_waits,send_status);
1386     PetscFree(send_status);
1387   }
1388   PetscFree(send_waits); PetscFree(svalues);
1389 
1390   return 0;
1391 }
1392 extern int MatPrintHelp_SeqBAIJ(Mat);
1393 #undef __FUNC__
1394 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1395 int MatPrintHelp_MPIBAIJ(Mat A)
1396 {
1397   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1398 
1399   if (!a->rank) return MatPrintHelp_SeqBAIJ(a->A);
1400   else return 0;
1401 }
1402 
1403 #undef __FUNC__
1404 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1405 int MatSetUnfactored_MPIBAIJ(Mat A)
1406 {
1407   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1408   int         ierr;
1409   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1410   return 0;
1411 }
1412 
1413 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1414 
1415 /* -------------------------------------------------------------------*/
1416 static struct _MatOps MatOps = {
1417   MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
1418   MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
1419   0,0,0,0,
1420   0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
1421   0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
1422   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
1423   MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
1424   0,0,0,MatGetSize_MPIBAIJ,
1425   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
1426   0,0,MatConvertSameType_MPIBAIJ,0,0,
1427   0,0,0,MatGetSubMatrices_MPIBAIJ,
1428   MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
1429   MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
1430   0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1431 
1432 
1433 #undef __FUNC__
1434 #define __FUNC__ "MatCreateMPIBAIJ"
1435 /*@C
1436    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
1437    (block compressed row).  For good matrix assembly performance
1438    the user should preallocate the matrix storage by setting the parameters
1439    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1440    performance can be increased by more than a factor of 50.
1441 
1442    Input Parameters:
1443 .  comm - MPI communicator
1444 .  bs   - size of blockk
1445 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1446            This value should be the same as the local size used in creating the
1447            y vector for the matrix-vector product y = Ax.
1448 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1449            This value should be the same as the local size used in creating the
1450            x vector for the matrix-vector product y = Ax.
1451 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
1452 .  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
1453 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1454            submatrix  (same for all local rows)
1455 .  d_nzz - array containing the number of block nonzeros in the various block rows
1456            of the in diagonal portion of the local (possibly different for each block
1457            row) or PETSC_NULL.  You must leave room for the diagonal entry even if
1458            it is zero.
1459 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1460            submatrix (same for all local rows).
1461 .  o_nzz - array containing the number of nonzeros in the various block rows of the
1462            off-diagonal portion of the local submatrix (possibly different for
1463            each block row) or PETSC_NULL.
1464 
1465    Output Parameter:
1466 .  A - the matrix
1467 
1468    Notes:
1469    The user MUST specify either the local or global matrix dimensions
1470    (possibly both).
1471 
1472    Storage Information:
1473    For a square global matrix we define each processor's diagonal portion
1474    to be its local rows and the corresponding columns (a square submatrix);
1475    each processor's off-diagonal portion encompasses the remainder of the
1476    local matrix (a rectangular submatrix).
1477 
1478    The user can specify preallocated storage for the diagonal part of
1479    the local submatrix with either d_nz or d_nnz (not both).  Set
1480    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1481    memory allocation.  Likewise, specify preallocated storage for the
1482    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1483 
1484    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1485    the figure below we depict these three local rows and all columns (0-11).
1486 
1487 $          0 1 2 3 4 5 6 7 8 9 10 11
1488 $         -------------------
1489 $  row 3  |  o o o d d d o o o o o o
1490 $  row 4  |  o o o d d d o o o o o o
1491 $  row 5  |  o o o d d d o o o o o o
1492 $         -------------------
1493 $
1494 
1495    Thus, any entries in the d locations are stored in the d (diagonal)
1496    submatrix, and any entries in the o locations are stored in the
1497    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
1498    stored simply in the MATSEQBAIJ format for compressed row storage.
1499 
1500    Now d_nz should indicate the number of nonzeros per row in the d matrix,
1501    and o_nz should indicate the number of nonzeros per row in the o matrix.
1502    In general, for PDE problems in which most nonzeros are near the diagonal,
1503    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1504    or you will get TERRIBLE performance; see the users' manual chapter on
1505    matrices.
1506 
1507 .keywords: matrix, block, aij, compressed row, sparse, parallel
1508 
1509 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
1510 @*/
1511 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
1512                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1513 {
1514   Mat          B;
1515   Mat_MPIBAIJ  *b;
1516   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;
1517 
1518   if (bs < 1) SETERRQ(1,0,"invalid block size specified");
1519   *A = 0;
1520   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIBAIJ,comm);
1521   PLogObjectCreate(B);
1522   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
1523   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
1524   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
1525   MPI_Comm_size(comm,&size);
1526   if (size == 1) {
1527     B->ops.getrowij          = MatGetRowIJ_MPIBAIJ;
1528     B->ops.restorerowij      = MatRestoreRowIJ_MPIBAIJ;
1529     B->ops.lufactorsymbolic  = MatLUFactorSymbolic_MPIBAIJ;
1530     B->ops.lufactornumeric   = MatLUFactorNumeric_MPIBAIJ;
1531     B->ops.lufactor          = MatLUFactor_MPIBAIJ;
1532     B->ops.solve             = MatSolve_MPIBAIJ;
1533     B->ops.solveadd          = MatSolveAdd_MPIBAIJ;
1534     B->ops.solvetrans        = MatSolveTrans_MPIBAIJ;
1535     B->ops.solvetransadd     = MatSolveTransAdd_MPIBAIJ;
1536     B->ops.ilufactorsymbolic = MatILUFactorSymbolic_MPIBAIJ;
1537   }
1538 
1539   B->destroy    = MatDestroy_MPIBAIJ;
1540   B->view       = MatView_MPIBAIJ;
1541   B->mapping    = 0;
1542   B->factor     = 0;
1543   B->assembled  = PETSC_FALSE;
1544 
1545   B->insertmode = NOT_SET_VALUES;
1546   MPI_Comm_rank(comm,&b->rank);
1547   MPI_Comm_size(comm,&b->size);
1548 
1549   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
1550     SETERRQ(1,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
1551   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) SETERRQ(1,0,"either M or m should be specified");
1552   if ( M == PETSC_DECIDE && n == PETSC_DECIDE)SETERRQ(1,0,"either N or n should be specified");
1553   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
1554   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
1555 
1556   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
1557     work[0] = m; work[1] = n;
1558     mbs = m/bs; nbs = n/bs;
1559     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
1560     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
1561     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
1562   }
1563   if (m == PETSC_DECIDE) {
1564     Mbs = M/bs;
1565     if (Mbs*bs != M) SETERRQ(1,0,"No of global rows must be divisible by blocksize");
1566     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
1567     m   = mbs*bs;
1568   }
1569   if (n == PETSC_DECIDE) {
1570     Nbs = N/bs;
1571     if (Nbs*bs != N) SETERRQ(1,0,"No of global cols must be divisible by blocksize");
1572     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
1573     n   = nbs*bs;
1574   }
1575   if (mbs*bs != m || nbs*bs != n) SETERRQ(1,0,"No of local rows, cols must be divisible by blocksize");
1576 
1577   b->m = m; B->m = m;
1578   b->n = n; B->n = n;
1579   b->N = N; B->N = N;
1580   b->M = M; B->M = M;
1581   b->bs  = bs;
1582   b->bs2 = bs*bs;
1583   b->mbs = mbs;
1584   b->nbs = nbs;
1585   b->Mbs = Mbs;
1586   b->Nbs = Nbs;
1587 
1588   /* build local table of row and column ownerships */
1589   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
1590   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
1591   b->cowners = b->rowners + b->size + 2;
1592   MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
1593   b->rowners[0] = 0;
1594   for ( i=2; i<=b->size; i++ ) {
1595     b->rowners[i] += b->rowners[i-1];
1596   }
1597   b->rstart    = b->rowners[b->rank];
1598   b->rend      = b->rowners[b->rank+1];
1599   b->rstart_bs = b->rstart * bs;
1600   b->rend_bs   = b->rend * bs;
1601 
1602   MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
1603   b->cowners[0] = 0;
1604   for ( i=2; i<=b->size; i++ ) {
1605     b->cowners[i] += b->cowners[i-1];
1606   }
1607   b->cstart    = b->cowners[b->rank];
1608   b->cend      = b->cowners[b->rank+1];
1609   b->cstart_bs = b->cstart * bs;
1610   b->cend_bs   = b->cend * bs;
1611 
1612   if (d_nz == PETSC_DEFAULT) d_nz = 5;
1613   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
1614   PLogObjectParent(B,b->A);
1615   if (o_nz == PETSC_DEFAULT) o_nz = 0;
1616   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
1617   PLogObjectParent(B,b->B);
1618 
1619   /* build cache for off array entries formed */
1620   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
1621   b->donotstash  = 0;
1622   b->colmap      = 0;
1623   b->garray      = 0;
1624   b->roworiented = 1;
1625 
1626   /* stuff used in block assembly */
1627   b->barray      = 0;
1628 
1629   /* stuff used for matrix vector multiply */
1630   b->lvec        = 0;
1631   b->Mvctx       = 0;
1632 
1633   /* stuff for MatGetRow() */
1634   b->rowindices   = 0;
1635   b->rowvalues    = 0;
1636   b->getrowactive = PETSC_FALSE;
1637 
1638   *A = B;
1639   return 0;
1640 }
1641 
1642 #undef __FUNC__
1643 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1644 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1645 {
1646   Mat         mat;
1647   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1648   int         ierr, len=0, flg;
1649 
1650   *newmat       = 0;
1651   PetscHeaderCreate(mat,_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm);
1652   PLogObjectCreate(mat);
1653   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1654   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1655   mat->destroy    = MatDestroy_MPIBAIJ;
1656   mat->view       = MatView_MPIBAIJ;
1657   mat->factor     = matin->factor;
1658   mat->assembled  = PETSC_TRUE;
1659 
1660   a->m = mat->m   = oldmat->m;
1661   a->n = mat->n   = oldmat->n;
1662   a->M = mat->M   = oldmat->M;
1663   a->N = mat->N   = oldmat->N;
1664 
1665   a->bs  = oldmat->bs;
1666   a->bs2 = oldmat->bs2;
1667   a->mbs = oldmat->mbs;
1668   a->nbs = oldmat->nbs;
1669   a->Mbs = oldmat->Mbs;
1670   a->Nbs = oldmat->Nbs;
1671 
1672   a->rstart       = oldmat->rstart;
1673   a->rend         = oldmat->rend;
1674   a->cstart       = oldmat->cstart;
1675   a->cend         = oldmat->cend;
1676   a->size         = oldmat->size;
1677   a->rank         = oldmat->rank;
1678   mat->insertmode = NOT_SET_VALUES;
1679   a->rowvalues    = 0;
1680   a->getrowactive = PETSC_FALSE;
1681   a->barray       = 0;
1682 
1683   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1684   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
1685   a->cowners = a->rowners + a->size + 2;
1686   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1687   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1688   if (oldmat->colmap) {
1689     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1690     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1691     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1692   } else a->colmap = 0;
1693   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1694     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1695     PLogObjectMemory(mat,len*sizeof(int));
1696     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1697   } else a->garray = 0;
1698 
1699   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1700   PLogObjectParent(mat,a->lvec);
1701   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1702   PLogObjectParent(mat,a->Mvctx);
1703   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1704   PLogObjectParent(mat,a->A);
1705   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1706   PLogObjectParent(mat,a->B);
1707   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1708   if (flg) {
1709     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1710   }
1711   *newmat = mat;
1712   return 0;
1713 }
1714 
1715 #include "sys.h"
1716 
1717 #undef __FUNC__
1718 #define __FUNC__ "MatLoad_MPIBAIJ"
1719 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
1720 {
1721   Mat          A;
1722   int          i, nz, ierr, j,rstart, rend, fd;
1723   Scalar       *vals,*buf;
1724   MPI_Comm     comm = ((PetscObject)viewer)->comm;
1725   MPI_Status   status;
1726   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
1727   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
1728   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
1729   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
1730   int          dcount,kmax,k,nzcount,tmp;
1731 
1732 
1733   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
1734   bs2  = bs*bs;
1735 
1736   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
1737   if (!rank) {
1738     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
1739     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
1740     if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not matrix object");
1741   }
1742 
1743   MPI_Bcast(header+1,3,MPI_INT,0,comm);
1744   M = header[1]; N = header[2];
1745 
1746   if (M != N) SETERRQ(1,0,"Can only do square matrices");
1747 
1748   /*
1749      This code adds extra rows to make sure the number of rows is
1750      divisible by the blocksize
1751   */
1752   Mbs        = M/bs;
1753   extra_rows = bs - M + bs*(Mbs);
1754   if (extra_rows == bs) extra_rows = 0;
1755   else                  Mbs++;
1756   if (extra_rows &&!rank) {
1757     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
1758   }
1759 
1760   /* determine ownership of all rows */
1761   mbs = Mbs/size + ((Mbs % size) > rank);
1762   m   = mbs * bs;
1763   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
1764   browners = rowners + size + 1;
1765   MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);
1766   rowners[0] = 0;
1767   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
1768   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
1769   rstart = rowners[rank];
1770   rend   = rowners[rank+1];
1771 
1772   /* distribute row lengths to all processors */
1773   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
1774   if (!rank) {
1775     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
1776     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
1777     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
1778     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
1779     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
1780     MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);
1781     PetscFree(sndcounts);
1782   }
1783   else {
1784     MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);
1785   }
1786 
1787   if (!rank) {
1788     /* calculate the number of nonzeros on each processor */
1789     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
1790     PetscMemzero(procsnz,size*sizeof(int));
1791     for ( i=0; i<size; i++ ) {
1792       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
1793         procsnz[i] += rowlengths[j];
1794       }
1795     }
1796     PetscFree(rowlengths);
1797 
1798     /* determine max buffer needed and allocate it */
1799     maxnz = 0;
1800     for ( i=0; i<size; i++ ) {
1801       maxnz = PetscMax(maxnz,procsnz[i]);
1802     }
1803     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
1804 
1805     /* read in my part of the matrix column indices  */
1806     nz = procsnz[0];
1807     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
1808     mycols = ibuf;
1809     if (size == 1)  nz -= extra_rows;
1810     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
1811     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
1812 
1813     /* read in every ones (except the last) and ship off */
1814     for ( i=1; i<size-1; i++ ) {
1815       nz = procsnz[i];
1816       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1817       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
1818     }
1819     /* read in the stuff for the last proc */
1820     if ( size != 1 ) {
1821       nz = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
1822       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
1823       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
1824       MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);
1825     }
1826     PetscFree(cols);
1827   }
1828   else {
1829     /* determine buffer space needed for message */
1830     nz = 0;
1831     for ( i=0; i<m; i++ ) {
1832       nz += locrowlens[i];
1833     }
1834     ibuf = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
1835     mycols = ibuf;
1836     /* receive message of column indices*/
1837     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
1838     MPI_Get_count(&status,MPI_INT,&maxnz);
1839     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1840   }
1841 
1842   /* loop over local rows, determining number of off diagonal entries */
1843   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
1844   odlens = dlens + (rend-rstart);
1845   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
1846   PetscMemzero(mask,3*Mbs*sizeof(int));
1847   masked1 = mask    + Mbs;
1848   masked2 = masked1 + Mbs;
1849   rowcount = 0; nzcount = 0;
1850   for ( i=0; i<mbs; i++ ) {
1851     dcount  = 0;
1852     odcount = 0;
1853     for ( j=0; j<bs; j++ ) {
1854       kmax = locrowlens[rowcount];
1855       for ( k=0; k<kmax; k++ ) {
1856         tmp = mycols[nzcount++]/bs;
1857         if (!mask[tmp]) {
1858           mask[tmp] = 1;
1859           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
1860           else masked1[dcount++] = tmp;
1861         }
1862       }
1863       rowcount++;
1864     }
1865 
1866     dlens[i]  = dcount;
1867     odlens[i] = odcount;
1868 
1869     /* zero out the mask elements we set */
1870     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
1871     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
1872   }
1873 
1874   /* create our matrix */
1875   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
1876          CHKERRQ(ierr);
1877   A = *newmat;
1878   MatSetOption(A,MAT_COLUMNS_SORTED);
1879 
1880   if (!rank) {
1881     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
1882     /* read in my part of the matrix numerical values  */
1883     nz = procsnz[0];
1884     vals = buf;
1885     mycols = ibuf;
1886     if (size == 1)  nz -= extra_rows;
1887     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1888     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
1889 
1890     /* insert into matrix */
1891     jj      = rstart*bs;
1892     for ( i=0; i<m; i++ ) {
1893       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
1894       mycols += locrowlens[i];
1895       vals   += locrowlens[i];
1896       jj++;
1897     }
1898     /* read in other processors (except the last one) and ship out */
1899     for ( i=1; i<size-1; i++ ) {
1900       nz = procsnz[i];
1901       vals = buf;
1902       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1903       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
1904     }
1905     /* the last proc */
1906     if ( size != 1 ){
1907       nz = procsnz[i] - extra_rows;
1908       vals = buf;
1909       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
1910       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
1911       MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);
1912     }
1913     PetscFree(procsnz);
1914   }
1915   else {
1916     /* receive numeric values */
1917     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
1918 
1919     /* receive message of values*/
1920     vals = buf;
1921     mycols = ibuf;
1922     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
1923     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
1924     if (maxnz != nz) SETERRQ(1,0,"something is wrong with file");
1925 
1926     /* insert into matrix */
1927     jj      = rstart*bs;
1928     for ( i=0; i<m; i++ ) {
1929       ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
1930       mycols += locrowlens[i];
1931       vals   += locrowlens[i];
1932       jj++;
1933     }
1934   }
1935   PetscFree(locrowlens);
1936   PetscFree(buf);
1937   PetscFree(ibuf);
1938   PetscFree(rowners);
1939   PetscFree(dlens);
1940   PetscFree(mask);
1941   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1942   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1943   return 0;
1944 }
1945 
1946 
1947