xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 39b95ed191f95c36f0dab8969f3a8ca8da8a1f2c)
1 #ifndef lint
2 static char vcid[] = "$Id: mpibaij.c,v 1.3 1996/06/19 23:03:32 balay Exp balay $";
3 #endif
4 
5 #include "mpibaij.h"
6 
7 
8 #include "draw.h"
9 #include "pinclude/pviewer.h"
10 
11 extern int MatSetUpMultiply_MPIBAIJ(Mat);
12 extern int DisAssemble_MPIBAIJ(Mat);
13 
14 /* local utility routine that creates a mapping from the global column
15 number to the local number in the off-diagonal part of the local
16 storage of the matrix.  This is done in a non-scalable way since the
17 length of colmap equals the number of global block columns.
18 */
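/*
   Illustration (hypothetical values, not taken from this file): if the
   off-diagonal part B references the global block columns garray = {2,5,9},
   the routine below sets colmap[2] = 0, colmap[5] = 1, colmap[9] = 2, and
   every other entry of colmap stays at the 0 it was zeroed to.
*/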
19 static int CreateColmap_MPIBAIJ_Private(Mat mat)
20 {
21   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
22   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
23   int        nbs = B->nbs,i;
24 
25   baij->colmap = (int *) PetscMalloc(baij->Nbs*sizeof(int));CHKPTRQ(baij->colmap);
26   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
27   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
28   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i;
29   return 0;
30 }
31 
32 
33 static int MatGetReordering_MPIBAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
34 {
35   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
36   int         ierr;
37   if (baij->size == 1) {
38     ierr = MatGetReordering(baij->A,type,rperm,cperm); CHKERRQ(ierr);
39   } else SETERRQ(1,"MatGetReordering_MPIBAIJ:not supported in parallel");
40   return 0;
41 }
42 
43 static int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
44 {
45   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
46   Scalar      value;
47   int         ierr,i,j, rstart = baij->rstart, rend = baij->rend;
48   int         cstart = baij->cstart, cend = baij->cend,row,col;
49   int         roworiented = baij->roworiented,rstart_orig,rend_orig;
50   int         cstart_orig,cend_orig,bs=baij->bs;
51 
52   if (baij->insertmode != NOT_SET_VALUES && baij->insertmode != addv) {
53     SETERRQ(1,"MatSetValues_MPIBAIJ:Cannot mix inserts and adds");
54   }
55   baij->insertmode = addv;
56   rstart_orig = rstart*bs;
57   rend_orig   = rend*bs;
58   cstart_orig = cstart*bs;
59   cend_orig   = cend*bs;
60   for ( i=0; i<m; i++ ) {
61     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIBAIJ:Negative row");
62     if (im[i] >= baij->M) SETERRQ(1,"MatSetValues_MPIBAIJ:Row too large");
63     if (im[i] >= rstart_orig && im[i] < rend_orig) {
64       row = im[i] - rstart_orig;
65       for ( j=0; j<n; j++ ) {
66         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIBAIJ:Negative column");
67         if (in[j] >= baij->N) SETERRQ(1,"MatSetValues_MPIBAIJ:Col too large");
68         if (in[j] >= cstart_orig && in[j] < cend_orig){
69           col = in[j] - cstart_orig;
70           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
71           ierr = MatSetValues(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
72         }
73         else {
74           if (mat->was_assembled) {
75             if (!baij->colmap) {ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);}
76             col = baij->colmap[in[j]/bs]*bs + in[j]%bs;
77             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
78               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
79               col =  in[j];
80             }
81           }
82           else col = in[j];
83           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
84           ierr = MatSetValues(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
85         }
86       }
87     }
88     else {
89       if (roworiented) {
90         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
91       }
92       else {
93         row = im[i];
94         for ( j=0; j<n; j++ ) {
95           ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
96         }
97       }
98     }
99   }
100   return 0;
101 }
102 
103 
104 static int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
105 {
106   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
107   MPI_Comm    comm = mat->comm;
108   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
109   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
110   MPI_Request *send_waits,*recv_waits;
111   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
112   InsertMode  addv;
113   Scalar      *rvalues,*svalues;
114 
115   /* make sure all processors are using either INSERT_VALUES or ADD_VALUES */
116   MPI_Allreduce(&baij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
117   if (addv == (ADD_VALUES|INSERT_VALUES)) {
118     SETERRQ(1,"MatAssemblyBegin_MPIBAIJ:Some processors inserted others added");
119   }
120   baij->insertmode = addv; /* in case this processor had no cache */
121 
122   /*  first count number of contributors to each processor */
123   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
124   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
125   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
126   for ( i=0; i<baij->stash.n; i++ ) {
127     idx = baij->stash.idx[i];
128     for ( j=0; j<size; j++ ) {
129       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
130         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
131       }
132     }
133   }
134   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
135 
136   /* inform other processors of number of messages and max length*/
137   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
138   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
139   nreceives = work[rank];
140   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
141   nmax = work[rank];
142   PetscFree(work);
143 
144   /* post receives:
145        1) each message will consist of ordered triples
146      (global row index,global column index,value); the indices are stored
147      as Scalars to simplify the message passing.
148        2) since we don't know how long each individual message is we
149      allocate the largest needed buffer for each receive. Potentially
150      this is a lot of wasted space.
151 
152 
153        This could be done better.
154   */
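  /*
     A sketch of the message layout (entries hypothetical): a stash holding
     (row=7,col=2,val=3.5) and (row=7,col=11,val=-1.0) bound for the same
     processor is packed further below as
         svalues = { 7.0, 2.0, 3.5, 7.0, 11.0, -1.0 }
     i.e. three Scalars per stashed entry, with the indices cast to Scalar.
  */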
155   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
156   CHKPTRQ(rvalues);
157   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
158   CHKPTRQ(recv_waits);
159   for ( i=0; i<nreceives; i++ ) {
160     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
161               comm,recv_waits+i);
162   }
163 
164   /* do sends:
165       1) starts[i] gives the starting index in svalues for stuff going to
166          the ith processor
167   */
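  /*
     A small illustration of the offsets (hypothetical counts): with size = 3
     and nprocs = {2,0,3}, the prefix sum gives starts = {0,2,2}, so the two
     entries bound for processor 0 occupy svalues[0..5] and the three bound
     for processor 2 occupy svalues[6..14] (three Scalars per entry).
  */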
168   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
169   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
170   CHKPTRQ(send_waits);
171   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
172   starts[0] = 0;
173   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
174   for ( i=0; i<baij->stash.n; i++ ) {
175     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
176     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
177     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
178   }
179   PetscFree(owner);
180   starts[0] = 0;
181   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
182   count = 0;
183   for ( i=0; i<size; i++ ) {
184     if (procs[i]) {
185       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
186                 comm,send_waits+count++);
187     }
188   }
189   PetscFree(starts); PetscFree(nprocs);
190 
191   /* Free cache space */
192   PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off processor values %d\n",rank,baij->stash.n);
193   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
194 
195   baij->svalues    = svalues;    baij->rvalues    = rvalues;
196   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
197   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
198   baij->rmax       = nmax;
199 
200   return 0;
201 }
202 
203 
204 static int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
205 {
206   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
207   MPI_Status  *send_status,recv_status;
208   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
209   int         bs=baij->bs,row,col,other_disassembled;
210   Scalar      *values,val;
211   InsertMode  addv = baij->insertmode;
212 
213   /*  wait on receives */
214   while (count) {
215     MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);
216     /* unpack receives into our local space */
217     values = baij->rvalues + 3*imdex*baij->rmax;
218     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
219     n = n/3;
220     for ( i=0; i<n; i++ ) {
221       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
222       col = (int) PetscReal(values[3*i+1]);
223       val = values[3*i+2];
224       if (col >= baij->cstart*bs && col < baij->cend*bs) {
225         col -= baij->cstart*bs;
226         MatSetValues(baij->A,1,&row,1,&col,&val,addv);
227       }
228       else {
229         if (mat->was_assembled) {
230           if (!baij->colmap) {ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);}
231           col = baij->colmap[col/bs]*bs + col%bs;
232           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
233             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
234             col = (int) PetscReal(values[3*i+1]);
235           }
236         }
237         MatSetValues(baij->B,1,&row,1,&col,&val,addv);
238       }
239     }
240     count--;
241   }
242   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
243 
244   /* wait on sends */
245   if (baij->nsends) {
246     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));
247     CHKPTRQ(send_status);
248     MPI_Waitall(baij->nsends,baij->send_waits,send_status);
249     PetscFree(send_status);
250   }
251   PetscFree(baij->send_waits); PetscFree(baij->svalues);
252 
253   baij->insertmode = NOT_SET_VALUES;
254   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
255   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
256 
257   /* determine if any processor has disassembled; if so, we must
258      also disassemble ourselves so that we may reassemble. */
259   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
260   if (mat->was_assembled && !other_disassembled) {
261     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
262   }
263 
264   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
265     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
266   }
267   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
268   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
269 
270   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
271   return 0;
272 }
273 
274 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
275 {
276   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
277   int          ierr;
278 
279   if (baij->size == 1) {
280     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
281   }
282   else SETERRQ(1,"MatView_MPIBAIJ_Binary:Only uniprocessor output supported");
283   return 0;
284 }
285 
286 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
287 {
288   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
289   int          ierr, format,rank,bs=baij->bs;
290   FILE         *fd;
291   ViewerType   vtype;
292 
293   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
294   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
295     ierr = ViewerGetFormat(viewer,&format);
296     if (format == ASCII_FORMAT_INFO_DETAILED) {
297       int nz, nzalloc, mem;
298       MPI_Comm_rank(mat->comm,&rank);
299       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
300       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
301       PetscSequentialPhaseBegin(mat->comm,1);
302       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
303               rank,baij->m,nz*bs,nzalloc*bs,baij->bs,mem);
304       ierr = MatGetInfo(baij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
305       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz*bs);
306       ierr = MatGetInfo(baij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
307       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz*bs);
308       fflush(fd);
309       PetscSequentialPhaseEnd(mat->comm,1);
310       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
311       return 0;
312     }
313     else if (format == ASCII_FORMAT_INFO) {
314       return 0;
315     }
316   }
317 
318   if (vtype == DRAW_VIEWER) {
319     Draw       draw;
320     PetscTruth isnull;
321     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
322     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
323   }
324 
325   if (vtype == ASCII_FILE_VIEWER) {
326     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
327     PetscSequentialPhaseBegin(mat->comm,1);
328     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
329            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
330             baij->cstart*bs,baij->cend*bs);
331     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
332     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
333     fflush(fd);
334     PetscSequentialPhaseEnd(mat->comm,1);
335   }
336   else {
337     int size = baij->size;
338     rank = baij->rank;
339     if (size == 1) {
340       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
341     }
342     else {
343       /* assemble the entire matrix onto the first processor */
344       Mat         A;
345       Mat_SeqBAIJ *Aloc;
346       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
347       int         mbs=baij->mbs;
348       Scalar      *a;
349 
350       if (!rank) {
351         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
352         CHKERRQ(ierr);
353       }
354       else {
355         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
356         CHKERRQ(ierr);
357       }
358       PLogObjectParent(mat,A);
359 
360       /* copy over the A part */
361       Aloc = (Mat_SeqBAIJ*) baij->A->data;
362       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
363       row = baij->rstart;
364       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
365 
366       for ( i=0; i<mbs; i++ ) {
367         rvals[0] = bs*(baij->rstart + i);
368         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
369         for ( j=ai[i]; j<ai[i+1]; j++ ) {
370           col = (baij->cstart+aj[j])*bs;
371           for (k=0; k<bs; k++ ) {
372             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
373             col++; a += bs;
374           }
375         }
376       }
377       /* copy over the B part */
378       Aloc = (Mat_SeqBAIJ*) baij->B->data;
379       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
380       row = baij->rstart*bs;
381       for ( i=0; i<mbs; i++ ) {
382         rvals[0] = bs*(baij->rstart + i);
383         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
384         for ( j=ai[i]; j<ai[i+1]; j++ ) {
385           col = baij->garray[aj[j]]*bs;
386           for (k=0; k<bs; k++ ) {
387             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
388             col++; a += bs;
389           }
390         }
391       }
392       PetscFree(rvals);
393       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
394       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
395       if (!rank) {
396         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
397       }
398       ierr = MatDestroy(A); CHKERRQ(ierr);
399     }
400   }
401   return 0;
402 }
403 
404 
405 
406 static int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
407 {
408   Mat         mat = (Mat) obj;
409   int         ierr;
410   ViewerType  vtype;
411 
412   if (!viewer) {
413     viewer = STDOUT_VIEWER_SELF;
414   }
415   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
416   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
417       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
418     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
419   }
420   else if (vtype == BINARY_FILE_VIEWER) {
421     return MatView_MPIBAIJ_Binary(mat,viewer);
422   }
423   return 0;
424 }
425 
426 static int MatDestroy_MPIBAIJ(PetscObject obj)
427 {
428   Mat         mat = (Mat) obj;
429   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
430   int         ierr;
431 
432 #if defined(PETSC_LOG)
433   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
434 #endif
435 
436   PetscFree(baij->rowners);
437   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
438   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
439   if (baij->colmap) PetscFree(baij->colmap);
440   if (baij->garray) PetscFree(baij->garray);
441   if (baij->lvec)   VecDestroy(baij->lvec);
442   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
443   if (baij->rowvalues) PetscFree(baij->rowvalues);
444   PetscFree(baij);
445   PLogObjectDestroy(mat);
446   PetscHeaderDestroy(mat);
447   return 0;
448 }
449 
450 static int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
451 {
452   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
453   int        ierr;
454 
455   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
456   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
457   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx); CHKERRQ(ierr);
458   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
459   return 0;
460 }
461 
462 static int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
463 {
464   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
465   int        ierr;
466   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
467   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
468   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_ALL,a->Mvctx);CHKERRQ(ierr);
469   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
470   return 0;
471 }
472 
473 static int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
474 {
475   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
476   int        ierr;
477 
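  /*
     Sketch of the scheme (a description of the existing logic, not new
     behavior): each process forms B^T x_local in lvec and scatter-adds it
     into the owning processes' parts of yy (reverse scatter with ADD_VALUES),
     while the locally owned part of yy is first set to A_local^T x_local;
     the remote contributions are then accumulated on top of it by
     VecScatterEnd().
  */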
478   /* do nondiagonal part */
479   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
480   /* send it on its way */
481   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,
482                 (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
483   /* do local part */
484   ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
485   /* receive remote parts: note this assumes the values are not actually */
486   /* inserted in yy until the VecScatterEnd() below, which is true for   */
487   /* this implementation but may not always be the case. */
488   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,
489                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
490   return 0;
491 }
492 
493 static int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
494 {
495   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
496   int        ierr;
497 
498   /* do nondiagonal part */
499   ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
500   /* send it on its way */
501   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,
502                  (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
503   /* do local part */
504   ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
505   /* receive remote parts: note this assumes the values are not actually */
506   /* inserted in zz until the VecScatterEnd() below, which is true for   */
507   /* this implementation but may not always be the case. */
508   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,
509                   (ScatterMode)(SCATTER_ALL|SCATTER_REVERSE),a->Mvctx); CHKERRQ(ierr);
510   return 0;
511 }
512 
513 /*
514   This only works correctly for square matrices where the subblock A->A is the
515    diagonal block
516 */
517 static int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
518 {
519   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
520   if (a->M != a->N)
521     SETERRQ(1,"MatGetDiagonal_MPIBAIJ:Supports only square matrix where A->A is diag block");
522   return MatGetDiagonal(a->A,v);
523 }
524 
525 static int MatScale_MPIBAIJ(Scalar *aa,Mat A)
526 {
527   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
528   int        ierr;
529   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
530   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
531   return 0;
532 }
533 static int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
534 {
535   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
536   *m = mat->M; *n = mat->N;
537   return 0;
538 }
539 
540 static int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
541 {
542   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
543   *m = mat->m; *n = mat->n;
544   return 0;
545 }
546 
547 static int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
548 {
549   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
550   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
551   return 0;
552 }
553 
554 /* -------------------------------------------------------------------*/
555 static struct _MatOps MatOps = {
556   MatSetValues_MPIBAIJ,0,0,MatMult_MPIBAIJ,
557   MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
558   0,0,0,0,
559   0,0,0,0,
560   0,MatGetDiagonal_MPIBAIJ,0,0,
561   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,0,
562   0,0,MatGetReordering_MPIBAIJ,0,
563   0,0,0,MatGetSize_MPIBAIJ,
564   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
565   0,0,0,0,
566   0,0,0,0,
567   0,0,0,0,
568   0,0,0,0,
569   MatScale_MPIBAIJ,0,0};
570 
571 
572 /*@C
573    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
574    (block compressed row).  For good matrix assembly performance
575    the user should preallocate the matrix storage by setting the parameters
576    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
577    performance can be increased by more than a factor of 50.
578 
579    Input Parameters:
580 .  comm - MPI communicator
581 .  bs   - size of each block
582 .  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
583 .  n - number of local columns (or PETSC_DECIDE to have it calculated
584            if N is given)
585 .  M - number of global rows (or PETSC_DECIDE to have it calculated if m is given)
586 .  N - number of global columns (or PETSC_DECIDE to have it calculated
587            if n is given)
588 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
589            submatrix  (same for all local rows)
590 .  d_nnz - number of block nonzeros per block row in diagonal portion of local
591            submatrix or PETSC_NULL (possibly different for each block row).  You
592            must leave room for the diagonal entry even if it is zero.
593 .  o_nz  - number of block nonzeros per block row in off-diagonal portion of local
594            submatrix (same for all local rows).
595 .  o_nnz - number of block nonzeros per block row in off-diagonal portion of local
596            submatrix or PETSC_NULL (possibly different for each block row).
597 
598    Output Parameter:
599 .  A - the matrix
600 
601    Notes:
602    The user MUST specify either the local or global matrix dimensions
603    (possibly both).
604 
605    Storage Information:
606    For a square global matrix we define each processor's diagonal portion
607    to be its local rows and the corresponding columns (a square submatrix);
608    each processor's off-diagonal portion encompasses the remainder of the
609    local matrix (a rectangular submatrix).
610 
611    The user can specify preallocated storage for the diagonal part of
612    the local submatrix with either d_nz or d_nnz (not both).  Set
613    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
614    memory allocation.  Likewise, specify preallocated storage for the
615    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
616 
617    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
618    the figure below we depict these three local rows and all columns (0-11).
619 
620 $          0 1 2 3 4 5 6 7 8 9 10 11
621 $         -------------------
622 $  row 3  |  o o o d d d o o o o o o
623 $  row 4  |  o o o d d d o o o o o o
624 $  row 5  |  o o o d d d o o o o o o
625 $         -------------------
626 $
627 
628    Thus, any entries in the d locations are stored in the d (diagonal)
629    submatrix, and any entries in the o locations are stored in the
630    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
631    stored simply in the MATSEQBAIJ format for compressed row storage.
632 
633    Now d_nz should indicate the number of block nonzeros per block row in the d matrix,
634    and o_nz should indicate the number of block nonzeros per block row in the o matrix.
635    In general, for PDE problems in which most nonzeros are near the diagonal,
636    one expects d_nz >> o_nz.   For additional details, see the users manual
637    chapter on matrices and the file $(PETSC_DIR)/Performance.
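
   Example (a sketch only; the communicator, local sizes, block size, and
   preallocation values shown are illustrative, not prescribed):
$     Mat A;
$     int bs = 2, m = 6, n = 6;
$     ierr = MatCreateMPIBAIJ(MPI_COMM_WORLD,bs,m,n,PETSC_DECIDE,PETSC_DECIDE,
$                             5,PETSC_NULL,2,PETSC_NULL,&A); CHKERRQ(ierr);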
638 
639 .keywords: matrix, aij, compressed row, sparse, parallel
640 
641 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
642 @*/
643 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
644                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
645 {
646   Mat          B;
647   Mat_MPIBAIJ  *b;
648   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE;
649 
650   if (bs < 1) SETERRQ(1,"MatCreateMPIBAIJ: invalid block size specified");
651   *A = 0;
652   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIBAIJ,comm);
653   PLogObjectCreate(B);
654   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
655   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
656   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
657   B->destroy    = MatDestroy_MPIBAIJ;
658   B->view       = MatView_MPIBAIJ;
659 
660   B->factor     = 0;
661   B->assembled  = PETSC_FALSE;
662 
663   b->insertmode = NOT_SET_VALUES;
664   MPI_Comm_rank(comm,&b->rank);
665   MPI_Comm_size(comm,&b->size);
666 
667   if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
668     SETERRQ(1,"MatCreateMPIBAIJ:Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
669   if ( M == PETSC_DECIDE && m == PETSC_DECIDE) SETERRQ(1,"MatCreateMPIBAIJ: either M or m should be specified");
669   if ( N == PETSC_DECIDE && n == PETSC_DECIDE) SETERRQ(1,"MatCreateMPIBAIJ: either N or n should be specified");
671   if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
672   if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;
673 
674   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
675     work[0] = m; work[1] = n;
676     mbs = m/bs; nbs = n/bs;
677     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
678     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
679     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
680   }
681   if (m == PETSC_DECIDE) {
682     Mbs = M/bs;
683     if (Mbs*bs != M) SETERRQ(1,"MatCreateMPIBAIJ: No of global rows must be divisible by blocksize");
684     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
685     m   = mbs*bs;
686   }
687   if (n == PETSC_DECIDE) {
688     Nbs = N/bs;
689     if (Nbs*bs != N) SETERRQ(1,"MatCreateMPIBAIJ: No of global cols must be divisible by blocksize");
690     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
691     n   = nbs*bs;
692   }
693   if (mbs*bs != m || nbs*bs != n) SETERRQ(1,"MatCreateMPIBAIJ: No of local rows, cols must be divisible by blocksize");
694 
695   b->m = m; B->m = m;
696   b->n = n; B->n = n;
697   b->N = N; B->N = N;
698   b->M = M; B->M = M;
699   b->bs  = bs;
700   b->bs2 = bs*bs;
701   b->mbs = mbs;
702   b->nbs = nbs;
703   b->Mbs = Mbs;
704   b->Nbs = Nbs;
705 
706   /* build local table of row and column ownerships */
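  /*
     Illustration (hypothetical layout): with 3 processes owning 2, 3 and 2
     block rows respectively, the Allgather and prefix sum below produce
     rowners = {0,2,5,7}; process 1 then has rstart = 2 and rend = 5, i.e. it
     owns block rows 2,3,4.  cowners is built the same way for block columns.
  */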
707   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
708   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
709   b->cowners = b->rowners + b->size + 1;
710   MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
711   b->rowners[0] = 0;
712   for ( i=2; i<=b->size; i++ ) {
713     b->rowners[i] += b->rowners[i-1];
714   }
715   b->rstart = b->rowners[b->rank];
716   b->rend   = b->rowners[b->rank+1];
717   MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
718   b->cowners[0] = 0;
719   for ( i=2; i<=b->size; i++ ) {
720     b->cowners[i] += b->cowners[i-1];
721   }
722   b->cstart = b->cowners[b->rank];
723   b->cend   = b->cowners[b->rank+1];
724 
725   if (d_nz == PETSC_DEFAULT) d_nz = 5;
726   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
727   PLogObjectParent(B,b->A);
728   if (o_nz == PETSC_DEFAULT) o_nz = 0;
729   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
730   PLogObjectParent(B,b->B);
731 
732   /* build cache for off array entries formed */
733   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
734   b->colmap      = 0;
735   b->garray      = 0;
736   b->roworiented = 1;
737 
738   /* stuff used for matrix vector multiply */
739   b->lvec      = 0;
740   b->Mvctx     = 0;
741 
742   /* stuff for MatGetRow() */
743   b->rowindices   = 0;
744   b->rowvalues    = 0;
745   b->getrowactive = PETSC_FALSE;
746 
747   *A = B;
748   return 0;
749 }
750 
751 #include "sys.h"
752 
753 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
754 {
755   Mat          A;
756   int          i, nz, ierr, j,rstart, rend, fd;
757   Scalar       *vals,*buf;
758   MPI_Comm     comm = ((PetscObject)viewer)->comm;
759   MPI_Status   status;
760   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
761   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
762   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
763   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
764   int          dcount,kmax,k,nzcount,tmp;
765 
766 
767   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
768   bs2  = bs*bs;
769 
770   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
771   if (!rank) {
772     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
773     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
774     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIBAIJ:not matrix object");
775   }
776 
777   MPI_Bcast(header+1,3,MPI_INT,0,comm);
778   M = header[1]; N = header[2];
779 
780 
781   if (M != N) SETERRQ(1,"MatLoad_MPIBAIJ:Can only do square matrices");
782 
783   /*
784      This code adds extra rows to make sure the number of rows is
785      divisible by the blocksize
786   */
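  /*
     Worked example (hypothetical sizes, not from any particular file): with
     M = 10 and bs = 4, Mbs = 10/4 = 2 and extra_rows = 4 - 10 + 4*2 = 2, so
     Mbs is bumped to 3 and two padded rows are appended; each padded row
     later gets length 1 and a 1.0 on its diagonal.
  */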
787   Mbs        = M/bs;
788   extra_rows = bs - M + bs*(Mbs);
789   if (extra_rows == bs) extra_rows = 0;
790   else                  Mbs++;
791   if (extra_rows && !rank) {
792     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
793   }
794   /* determine ownership of all rows */
795   mbs = Mbs/size + ((Mbs % size) > rank);
796   m   = mbs * bs;
797   rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
798   browners = rowners + size + 1;
799   MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);
800   rowners[0] = 0;
801   for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
802   for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
803   rstart = rowners[rank];
804   rend   = rowners[rank+1];
805 
806   /* distribute row lengths to all processors */
807   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
808   if (!rank) {
809     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
810     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
811     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
812     sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
813     for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
814     MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);
815     PetscFree(sndcounts);
816   }
817   else {
818     MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);
819   }
820 
821   if (!rank) {
822     /* calculate the number of nonzeros on each processor */
823     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
824     PetscMemzero(procsnz,size*sizeof(int));
825     for ( i=0; i<size; i++ ) {
826       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
827         procsnz[i] += rowlengths[j];
828       }
829     }
830     PetscFree(rowlengths);
831 
832     /* determine max buffer needed and allocate it */
833     maxnz = 0;
834     for ( i=0; i<size; i++ ) {
835       maxnz = PetscMax(maxnz,procsnz[i]);
836     }
837     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
838 
839     /* read in my part of the matrix column indices  */
840     nz = procsnz[0];
841     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
842     mycols = ibuf;
843     if (size == 1)  nz -= extra_rows;
844     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
845     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
846 
847     /* read in everyone else's (except the last processor's) and ship them off */
848     for ( i=1; i<size-1; i++ ) {
849       nz = procsnz[i];
850       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
851       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
852     }
853     /* read in the stuff for the last proc */
854     if ( size != 1 ) {
855       nz = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
856       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
857       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
858       MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);
859     }
860     PetscFree(cols);
861   }
862   else {
863     /* determine buffer space needed for message */
864     nz = 0;
865     for ( i=0; i<m; i++ ) {
866       nz += locrowlens[i];
867     }
868     ibuf = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
869     mycols = ibuf;
870     /* receive message of column indices*/
871     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
872     MPI_Get_count(&status,MPI_INT,&maxnz);
873     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIBAIJ:something is wrong with the file");
874   }
875 
876   /* loop over local block rows, determining the number of diagonal and off-diagonal block entries */
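  /* A small illustration (hypothetical data): with bs = 2 and this process
     owning block rows [2,5), a block row whose point rows reference the point
     columns {4,5,11} touches the distinct block columns {2,5}; block column 2
     lies in the diagonal range so dlens gets 1, block column 5 lies outside
     so odlens gets 1.  The mask arrays below count each block column once. */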
877   dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
878   odlens = dlens + (rend-rstart);
879   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
880   PetscMemzero(mask,3*Mbs*sizeof(int));
881   masked1 = mask    + Mbs;
882   masked2 = masked1 + Mbs;
883   rowcount = 0; nzcount = 0;
884   for ( i=0; i<mbs; i++ ) {
885     dcount  = 0;
886     odcount = 0;
887     for ( j=0; j<bs; j++ ) {
888       kmax = locrowlens[rowcount];
889       for ( k=0; k<kmax; k++ ) {
890         tmp = mycols[nzcount++]/bs;
891         if (!mask[tmp]) {
892           mask[tmp] = 1;
893           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
894           else masked1[dcount++] = tmp;
895         }
896       }
897       rowcount++;
898     }
899 
900     dlens[i]  = dcount;
901     odlens[i] = odcount;
902 
903     /* zero out the mask elements we set */
904     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
905     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
906   }
907 
908   /* create our matrix */
909   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);CHKERRQ(ierr);
910   A = *newmat;
911   MatSetOption(A,COLUMNS_SORTED);
912 
913   if (!rank) {
914     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
915     /* read in my part of the matrix numerical values  */
916     nz = procsnz[0];
917     vals = buf;
918     mycols = ibuf;
919     if (size == 1)  nz -= extra_rows;
920     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
921     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
922     /* insert into matrix */
923     jj      = rstart*bs;
924     for ( i=0; i<m; i++ ) {
925       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
926       mycols += locrowlens[i];
927       vals   += locrowlens[i];
928       jj++;
929     }
930     /* read in the other processors' values (except the last one) and ship them out */
931     for ( i=1; i<size-1; i++ ) {
932       nz = procsnz[i];
933       vals = buf;
934       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
935       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
936     }
937     /* the last proc */
938     if ( size != 1 ) {
939       nz = procsnz[size-1] - extra_rows;
940       vals = buf;
941       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
942       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
943       MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);
944     }
945     PetscFree(procsnz);
946   }
947   else {
948     /* receive numeric values */
949     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
950 
951     /* receive message of values*/
952     vals = buf;
953     mycols = ibuf;
954     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
955     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
956     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIBAIJ:something is wrong with the file");
957 
958     /* insert into matrix */
959     jj      = rstart*bs;
960     for ( i=0; i<m; i++ ) {
961       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
962       mycols += locrowlens[i];
963       vals   += locrowlens[i];
964       jj++;
965     }
966   }
967   PetscFree(locrowlens);
968   PetscFree(buf);
969   PetscFree(ibuf);
970   PetscFree(rowners);
971   PetscFree(dlens);
972   PetscFree(mask);
973   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
974   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
975   return 0;
976 }
977 
978 
979