xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 57b952d6c22e98ea0d8a99e8978d81f652deb08e)
1 #ifndef lint
2 static char vcid[] = "$Id: mpibaij.c,v 1.2 1996/06/04 21:40:25 balay Exp balay $";
3 #endif
4 
5 #include "mpibaij.h"
6 
7 
8 #include "draw.h"
9 #include "pinclude/pviewer.h"
10 
11 extern int MatSetUpMultiply_MPIBAIJ(Mat);
12 extern int DisAssemble_MPIBAIJ(Mat);
13 
14 /* local utility routine that creates a mapping from the global block column
15 number to the local block column number in the off-diagonal part of the
16 local storage of the matrix.  This is done in a non-scalable way since the
17 length of colmap equals the global number of block columns.
18 */
19 static int CreateColmap_MPIBAIJ_Private(Mat mat)
20 {
21   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
22   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
23   int        nbs = B->nbs,i;
24 
25   baij->colmap = (int *) PetscMalloc(baij->Nbs*sizeof(int));CHKPTRQ(baij->colmap);
26   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
27   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
28   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i+1; /* 0 marks a block column not in B */
29   return 0;
30 }
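
/*
   Illustrative sketch (not part of the library): with the convention above,
   colmap[] maps a global block column to (local block column in B) + 1, and a
   zero entry means the block column does not occur in the off-diagonal part.
   A lookup therefore has the shape

       lbcol = baij->colmap[gbcol] - 1;      (gbcol = global block column)
       if (lbcol < 0)  ...block column not present locally...

   The point-indexed lookups in MatSetValues_MPIBAIJ() and
   MatAssemblyEnd_MPIBAIJ() below scale this by the block size bs.
*/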
31 
32 
33 static int MatGetReordering_MPIBAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
34 {
35   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
36   int         ierr;
37   if (baij->size == 1) {
38     ierr = MatGetReordering(baij->A,type,rperm,cperm); CHKERRQ(ierr);
39   } else SETERRQ(1,"MatGetReordering_MPIBAIJ:not supported in parallel");
40   return 0;
41 }
42 
43 static int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
44 {
45   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
46   Scalar      value;
47   int         ierr,i,j, rstart = baij->rstart, rend = baij->rend;
48   int         cstart = baij->cstart, cend = baij->cend,row,col;
49   int         roworiented = baij->roworiented,rstart_orig,rend_orig;
50   int         cstart_orig,cend_orig,bs=baij->bs;
51 
52   if (baij->insertmode != NOT_SET_VALUES && baij->insertmode != addv) {
53     SETERRQ(1,"MatSetValues_MPIBAIJ:Cannot mix inserts and adds");
54   }
55   baij->insertmode = addv;
56   rstart_orig = rstart*bs;
57   rend_orig   = rend*bs;
58   cstart_orig = cstart*bs;
59   cend_orig   = cend*bs;
60   for ( i=0; i<m; i++ ) {
61     if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIBAIJ:Negative row");
62     if (im[i] >= baij->M) SETERRQ(1,"MatSetValues_MPIBAIJ:Row too large");
63     if (im[i] >= rstart_orig && im[i] < rend_orig) {
64       row = im[i] - rstart_orig;
65       for ( j=0; j<n; j++ ) {
66         if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIBAIJ:Negative column");
67         if (in[j] >= baij->N) SETERRQ(1,"MatSetValues_MPIBAIJ:Col too large");
68         if (in[j] >= cstart_orig && in[j] < cend_orig){
69           col = in[j] - cstart_orig;
70           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
71           ierr = MatSetValues(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
72         }
73         else {
74           if (mat->was_assembled) {
75             if (!baij->colmap) {ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);}
76             col = (baij->colmap[in[j]/bs] - 1)*bs + in[j]%bs;
77             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
78               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
79               col =  in[j];
80             }
81           }
82           else col = in[j];
83           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
84           ierr = MatSetValues(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
85         }
86       }
87     }
88     else {
89       if (roworiented) {
90         ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
91       }
92       else {
93         row = im[i];
94         for ( j=0; j<n; j++ ) {
95           ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
96         }
97       }
98     }
99   }
100   return 0;
101 }
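
/*
   Worked example (illustrative): suppose bs = 2 and this process owns block
   rows [3,6), i.e. point rows 6..11 and, for a square matrix, point columns
   6..11.  Setting entry (7,9) lands in the diagonal part baij->A with local
   indices (1,3); setting entry (7,20) lands in the off-diagonal part baij->B;
   and any entry whose row is not locally owned is stashed and communicated
   during assembly.
*/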
102 
103 
104 static int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
105 {
106   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
107   MPI_Comm    comm = mat->comm;
108   int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
109   int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
110   MPI_Request *send_waits,*recv_waits;
111   int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
112   InsertMode  addv;
113   Scalar      *rvalues,*svalues;
114 
115   /* make sure all processors are in the same insert mode; the bitwise OR below equals ADD_VALUES|INSERT_VALUES only if some processors inserted while others added */
116   MPI_Allreduce(&baij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
117   if (addv == (ADD_VALUES|INSERT_VALUES)) {
118     SETERRQ(1,"MatAssemblyBegin_MPIBAIJ:Some processors inserted others added");
119   }
120   baij->insertmode = addv; /* in case this processor had no cache */
121 
122   /*  first count number of contributors to each processor */
123   nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
124   PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
125   owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
126   for ( i=0; i<baij->stash.n; i++ ) {
127     idx = baij->stash.idx[i];
128     for ( j=0; j<size; j++ ) {
129       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
130         nprocs[j]++; procs[j] = 1; owner[i] = j; break;
131       }
132     }
133   }
134   nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}
135 
136   /* inform other processors of number of messages and max length*/
137   work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
138   MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
139   nreceives = work[rank];
140   MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
141   nmax = work[rank];
142   PetscFree(work);
143 
144   /* post receives:
145        1) each message will consist of ordered triples
146      (global row index,global column index,value); the indices are stored
147      as Scalars to simplify the message passing.
148        2) since we don't know how long each individual message is we
149      allocate the largest needed buffer for each receive. Potentially
150      this is a lot of wasted space.
151 
152 
153        This could be done better.
154   */
155   rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
156   CHKPTRQ(rvalues);
157   recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
158   CHKPTRQ(recv_waits);
159   for ( i=0; i<nreceives; i++ ) {
160     MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
161               comm,recv_waits+i);
162   }
163 
164   /* do sends:
165       1) starts[i] gives the starting index in svalues for stuff going to
166          the ith processor
167   */
168   svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
169   send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
170   CHKPTRQ(send_waits);
171   starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
172   starts[0] = 0;
173   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
174   for ( i=0; i<baij->stash.n; i++ ) {
175     svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
176     svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
177     svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
178   }
179   PetscFree(owner);
180   starts[0] = 0;
181   for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
182   count = 0;
183   for ( i=0; i<size; i++ ) {
184     if (procs[i]) {
185       MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
186                 comm,send_waits+count++);
187     }
188   }
189   PetscFree(starts); PetscFree(nprocs);
190 
191   /* Free cache space */
192   PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off processor values %d\n",rank,baij->stash.n);
193   ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
194 
195   baij->svalues    = svalues;    baij->rvalues    = rvalues;
196   baij->nsends     = nsends;     baij->nrecvs     = nreceives;
197   baij->send_waits = send_waits; baij->recv_waits = recv_waits;
198   baij->rmax       = nmax;
199 
200   return 0;
201 }
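
/*
   Assembly sketch (descriptive only): MatAssemblyBegin_MPIBAIJ above drains
   the stash of off-processor (row,col,value) triples, posts worst-case sized
   MPI_Irecv's, and fires one MPI_Isend per destination process.
   MatAssemblyEnd_MPIBAIJ below waits on the receives, feeds each triple to
   MatSetValues() on either the diagonal (A) or off-diagonal (B) local matrix,
   and then assembles A and B themselves.
*/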
202 
203 
204 static int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
205 {
206   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
207   MPI_Status  *send_status,recv_status;
208   int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
209   int         bs=baij->bs,row,col,other_disassembled;
210   Scalar      *values,val;
211   InsertMode  addv = baij->insertmode;
212 
213   /*  wait on receives */
214   while (count) {
215     MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);
216     /* unpack receives into our local space */
217     values = baij->rvalues + 3*imdex*baij->rmax;
218     MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
219     n = n/3;
220     for ( i=0; i<n; i++ ) {
221       row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
222       col = (int) PetscReal(values[3*i+1]);
223       val = values[3*i+2];
224       if (col >= baij->cstart*bs && col < baij->cend*bs) {
225         col -= baij->cstart*bs;
226         MatSetValues(baij->A,1,&row,1,&col,&val,addv);
227       }
228       else {
229         if (mat->was_assembled) {
230           if (!baij->colmap) {ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);}
231           col = (baij->colmap[col/bs] - 1)*bs + col%bs;
232           if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
233             ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
234             col = (int) PetscReal(values[3*i+1]);
235           }
236         }
237         MatSetValues(baij->B,1,&row,1,&col,&val,addv);
238       }
239     }
240     count--;
241   }
242   PetscFree(baij->recv_waits); PetscFree(baij->rvalues);
243 
244   /* wait on sends */
245   if (baij->nsends) {
246     send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));
247     CHKPTRQ(send_status);
248     MPI_Waitall(baij->nsends,baij->send_waits,send_status);
249     PetscFree(send_status);
250   }
251   PetscFree(baij->send_waits); PetscFree(baij->svalues);
252 
253   baij->insertmode = NOT_SET_VALUES;
254   ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
255   ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);
256 
257   /* determine if any processor has disassembled; if so, we must
258      also disassemble ourselves, in order that we may reassemble. */
259   MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
260   if (mat->was_assembled && !other_disassembled) {
261     ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
262   }
263 
264   if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
265     ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
266   }
267   ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
268   ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);
269 
270   if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
271   return 0;
272 }
273 
274 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
275 {
276   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
277   int          ierr;
278 
279   if (baij->size == 1) {
280     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
281   }
282   else SETERRQ(1,"MatView_MPIBAIJ_Binary:Only uniprocessor output supported");
283   return 0;
284 }
285 
286 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
287 {
288   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
289   int          ierr, format,rank,bs=baij->bs,bs2=baij->bs2;
290   FILE         *fd;
291   ViewerType   vtype;
292 
293   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
294   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
295     ierr = ViewerGetFormat(viewer,&format);
296     if (format == ASCII_FORMAT_INFO_DETAILED) {
297       int nz, nzalloc, mem;
298       MPI_Comm_rank(mat->comm,&rank);
299       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
300       ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem);
301       PetscSequentialPhaseBegin(mat->comm,1);
302       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
303               rank,baij->m,nz*bs,nzalloc*bs,baij->bs,mem);
304       ierr = MatGetInfo(baij->A,MAT_LOCAL,&nz,&nzalloc,&mem);
305       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz*bs);
306       ierr = MatGetInfo(baij->B,MAT_LOCAL,&nz,&nzalloc,&mem);
307       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz*bs);
308       fflush(fd);
309       PetscSequentialPhaseEnd(mat->comm,1);
310       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
311       return 0;
312     }
313     else if (format == ASCII_FORMAT_INFO) {
314       return 0;
315     }
316   }
317 
318   if (vtype == DRAW_VIEWER) {
319     Draw       draw;
320     PetscTruth isnull;
321     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
322     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
323   }
324 
325   if (vtype == ASCII_FILE_VIEWER) {
326     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
327     PetscSequentialPhaseBegin(mat->comm,1);
328     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
329            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
330             baij->cstart*bs,baij->cend*bs);
331     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
332     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
333     fflush(fd);
334     PetscSequentialPhaseEnd(mat->comm,1);
335   }
336   else {
337     int size = baij->size;
338     rank = baij->rank;
339     if (size == 1) {
340       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
341     }
342     else {
343       /* assemble the entire matrix onto first processor. */
344       Mat         A;
345       Mat_SeqBAIJ *Aloc;
346       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
347       int         mbs=baij->mbs;
348       Scalar      *a;
349 
350       if (!rank) {
351         ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
352                CHKERRQ(ierr);
353       }
354       else {
355         ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
356                CHKERRQ(ierr);
357       }
358       PLogObjectParent(mat,A);
359 
360       /* copy over the A part */
361       Aloc = (Mat_SeqBAIJ*) baij->A->data;
362       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
363       row = baij->rstart;
364       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
365 
366       for ( i=0; i<mbs; i++ ) {
367         rvals[0] = bs*(baij->rstart + i);
368         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
369         for ( j=ai[i]; j<ai[i+1]; j++ ) {
370           col = (baij->cstart+aj[j])*bs;
371           for (k=0; k<bs; k++ ) {
372             ierr = MatSetValues(A,bs,rvals,1,&col,a+j*bs2+k*bs,INSERT_VALUES);CHKERRQ(ierr);
373             col++;
374           }
375         }
376       }
377       /* copy over the B part */
378       Aloc = (Mat_SeqBAIJ*) baij->B->data;
379       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
380       row = baij->rstart*bs;
381       for ( i=0; i<mbs; i++ ) {
382         rvals[0] = bs*(baij->rstart + i);
383         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
384         for ( j=ai[i]; j<ai[i+1]; j++ ) {
385           col = baij->garray[aj[j]]*bs;
386           for (k=0; k<bs; k++ ) {
387             ierr = MatSetValues(A,bs,rvals,1,&col,a+j*bs2+k*bs,INSERT_VALUES);CHKERRQ(ierr);
388             col++;
389           }
390         }
391       }
392       PetscFree(rvals);
393       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
394       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
395       if (!rank) {
396         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
397       }
398       ierr = MatDestroy(A); CHKERRQ(ierr);
399     }
400   }
401   return 0;
402 }
403 
404 
405 
406 static int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
407 {
408   Mat         mat = (Mat) obj;
409   int         ierr;
410   ViewerType  vtype;
411 
412   if (!viewer) {
413     viewer = STDOUT_VIEWER_SELF;
414   }
415   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
416   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
417       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
418     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
419   }
420   else if (vtype == BINARY_FILE_VIEWER) {
421     return MatView_MPIBAIJ_Binary(mat,viewer);
422   }
423   return 0;
424 }
425 
426 static int MatDestroy_MPIBAIJ(PetscObject obj)
427 {
428   Mat         mat = (Mat) obj;
429   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
430   int         ierr;
431 
432 #if defined(PETSC_LOG)
433   PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
434 #endif
435 
436   PetscFree(baij->rowners);
437   ierr = MatDestroy(baij->A); CHKERRQ(ierr);
438   ierr = MatDestroy(baij->B); CHKERRQ(ierr);
439   if (baij->colmap) PetscFree(baij->colmap);
440   if (baij->garray) PetscFree(baij->garray);
441   if (baij->lvec)   VecDestroy(baij->lvec);
442   if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
443   if (baij->rowvalues) PetscFree(baij->rowvalues);
444   PetscFree(baij);
445   PLogObjectDestroy(mat);
446   PetscHeaderDestroy(mat);
447   return 0;
448 }
449 
450 static int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
451 {
452   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
453   *m = mat->M; *n = mat->N;
454   return 0;
455 }
456 
457 static int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
458 {
459   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
460   *m = mat->m; *n = mat->n;
461   return 0;
462 }
463 
464 static int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
465 {
466   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
467   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
468   return 0;
469 }
470 
471 /* -------------------------------------------------------------------*/
472 static struct _MatOps MatOps = {
473   MatSetValues_MPIBAIJ,0,0,0,
474   0,0,0,0,
475   0,0,0,0,
476   0,0,0,0,
477   0,0,0,0,
478   MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,0,
479   0,0,MatGetReordering_MPIBAIJ,0,
480   0,0,0,MatGetSize_MPIBAIJ,
481   MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
482   0,0,0,0,
483   0,0,0,0,
484   0,0,0,0,
485   0,0,0,0,
486   0};
487 
488 
489 /*@C
490    MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
491    (block compressed row).  For good matrix assembly performance
492    the user should preallocate the matrix storage by setting the parameters
493    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
494    performance can be increased by more than a factor of 50.
495 
496    Input Parameters:
497 .  comm - MPI communicator
498 .  bs   - size of block
499 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
500 .  n - number of local columns (or PETSC_DECIDE to have calculated
501            if N is given)
502 .  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
503 .  N - number of global columns (or PETSC_DECIDE to have calculated
504            if n is given)
505 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
506            submatrix  (same for all local rows)
507 .  d_nnz - number of block nonzeros per block row in diagonal portion of local
508            submatrix or null (possibly different for each row).  You must leave
509            room for the diagonal entry even if it is zero.
510 .  o_nz  - number of block nonzeros per block row in off-diagonal portion of local
511            submatrix (same for all local rows).
512 .  o_nnz - number of block nonzeros per block row in off-diagonal portion of local
513            submatrix or null (possibly different for each row).
514 
515    Output Parameter:
516 .  A - the matrix
517 
518    Notes:
519    The user MUST specify either the local or global matrix dimensions
520    (possibly both).
521 
522    Storage Information:
523    For a square global matrix we define each processor's diagonal portion
524    to be its local rows and the corresponding columns (a square submatrix);
525    each processor's off-diagonal portion encompasses the remainder of the
526    local matrix (a rectangular submatrix).
527 
528    The user can specify preallocated storage for the diagonal part of
529    the local submatrix with either d_nz or d_nnz (not both).  Set
530    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
531    memory allocation.  Likewise, specify preallocated storage for the
532    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
533 
534    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
535    the figure below we depict these three local rows and all columns (0-11).
536 
537 $          0 1 2 3 4 5 6 7 8 9 10 11
538 $         -------------------
539 $  row 3  |  o o o d d d o o o o o o
540 $  row 4  |  o o o d d d o o o o o o
541 $  row 5  |  o o o d d d o o o o o o
542 $         -------------------
543 $
544 
545    Thus, any entries in the d locations are stored in the d (diagonal)
546    submatrix, and any entries in the o locations are stored in the
547    o (off-diagonal) submatrix.  Note that the d and the o submatrices are
548    stored simply in the MATSEQBAIJ format for compressed row storage.
549 
550    Now d_nz should indicate the number of nonzeros per row in the d matrix,
551    and o_nz should indicate the number of nonzeros per row in the o matrix.
552    In general, for PDE problems in which most nonzeros are near the diagonal,
553    one expects d_nz >> o_nz.   For additional details, see the users manual
554    chapter on matrices and the file $(PETSC_DIR)/Performance.  An example call is sketched right after this header.
555 
556 .keywords: matrix, aij, compressed row, sparse, parallel
557 
558 .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
559 @*/
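/*
   Example (illustrative only; most error checking omitted).  Each process
   below contributes 6 local rows and columns, i.e. 3 blocks of size bs = 2,
   preallocating roughly 5 diagonal and 2 off-diagonal nonzero blocks per
   block row:

       Mat A;
       int ierr, bs = 2, m = 6, n = 6;
       ierr = MatCreateMPIBAIJ(MPI_COMM_WORLD,bs,m,n,PETSC_DECIDE,PETSC_DECIDE,
                               5,PETSC_NULL,2,PETSC_NULL,&A); CHKERRQ(ierr);
       ....  MatSetValues() calls  ....
       ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
       ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
       ....
       ierr = MatDestroy(A); CHKERRQ(ierr);
*/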
560 int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
561                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
562 {
563   Mat          B;
564   Mat_MPIBAIJ  *b;
565   int          ierr, i,sum[2],work[2],mbs,nbs,Mbs,Nbs;
566 
567   if (bs < 1) SETERRQ(1,"MatCreateMPIBAIJ: invalid block size specified");
568   *A = 0;
569   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIBAIJ,comm);
570   PLogObjectCreate(B);
571   B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
572   PetscMemzero(b,sizeof(Mat_MPIBAIJ));
573   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
574   B->destroy    = MatDestroy_MPIBAIJ;
575   B->view       = MatView_MPIBAIJ;
576 
577   B->factor     = 0;
578   B->assembled  = PETSC_FALSE;
579 
580   b->insertmode = NOT_SET_VALUES;
581   MPI_Comm_rank(comm,&b->rank);
582   MPI_Comm_size(comm,&b->size);
583 
584   if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
585     SETERRQ(1,"MatCreateMPIBAIJ:Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
586 
587   if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
588     work[0] = m; work[1] = n;
589     mbs = m/bs; nbs = n/bs;
590     if (mbs*bs != m || nbs*bs != n) SETERRQ(1,"MatCreateMPIBAIJ: No of local rows, cols must be divisible by blocksize");
591     MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
592     if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
593     if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
594   }
595   Mbs = M/bs; Nbs = N/bs;
596   if (Mbs*bs != M || Nbs*bs != N) SETERRQ(1,"MatCreateMPIBAIJ: No of global rows, cols must be divisible by blocksize");
597   if (m == PETSC_DECIDE) {
598     mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
599     m   = mbs*bs;
600   }
601   else mbs = m/bs;
602   if (n == PETSC_DECIDE) {
603     nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
604     n   = nbs*bs;
605   }
606   else nbs = n/bs;
607 
608   b->m = m; B->m = m;
609   b->n = n; B->n = n;
610   b->N = N; B->N = N;
611   b->M = M; B->M = M;
612   b->bs  = bs;
613   b->bs2 = bs*bs;
614   b->mbs = mbs;
615   b->nbs = nbs;
616   b->Mbs = Mbs;
617   b->Nbs = Nbs;
618 
619   /* build local table of row and column ownerships */
620   b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
621   PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
622   b->cowners = b->rowners + b->size + 1;
623   MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
624   b->rowners[0] = 0;
625   for ( i=2; i<=b->size; i++ ) {
626     b->rowners[i] += b->rowners[i-1];
627   }
628   b->rstart = b->rowners[b->rank];
629   b->rend   = b->rowners[b->rank+1];
630   MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
631   b->cowners[0] = 0;
632   for ( i=2; i<=b->size; i++ ) {
633     b->cowners[i] += b->cowners[i-1];
634   }
635   b->cstart = b->cowners[b->rank];
636   b->cend   = b->cowners[b->rank+1];
637 
638   if (d_nz == PETSC_DEFAULT) d_nz = 5;
639   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
640   PLogObjectParent(B,b->A);
641   if (o_nz == PETSC_DEFAULT) o_nz = 0;
642   ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
643   PLogObjectParent(B,b->B);
644 
645   /* build cache for off array entries formed */
646   ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
647   b->colmap      = 0;
648   b->garray      = 0;
649   b->roworiented = 1;
650 
651   /* stuff used for matrix vector multiply */
652   b->lvec      = 0;
653   b->Mvctx     = 0;
654 
655   /* stuff for MatGetRow() */
656   b->rowindices   = 0;
657   b->rowvalues    = 0;
658   b->getrowactive = PETSC_FALSE;
659 
660   *A = B;
661   return 0;
662 }
663 
664 #include "sys.h"
665 
666 int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
667 {
668   Mat          A;
669   int          i, nz, ierr, j,rstart, rend, fd;
670   Scalar       *vals,*buf;
671   MPI_Comm     comm = ((PetscObject)viewer)->comm;
672   MPI_Status   status;
673   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
674   int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
675   int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
676   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
677   int          dcount,kmax,k,nzcount,tmp;
678 
679 
680   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
681   bs2  = bs*bs;
682 
683   MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
684   if (!rank) {
685     ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
686     ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
687     if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIBAIJ:not matrix object");
688   }
689 
690   MPI_Bcast(header+1,3,MPI_INT,0,comm);
691   M = header[1]; N = header[2];
692 
693 
694   if (M != N) SETERRQ(1,"MatLoad_MPIBAIJ:Can only do square matrices");
695 
696   /*
697      This code adds extra rows to make sure the number of rows is
698      divisible by the blocksize
699   */
700   Mbs        = M/bs;
701   extra_rows = bs - M + bs*(Mbs);
702   if (extra_rows == bs) extra_rows = 0;
703   else                  Mbs++;
704   if (extra_rows && !rank) {
705     PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
706   }
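  /* Worked example of the padding above (illustrative): with M = 10 and bs = 3,
     Mbs = 10/3 = 3 and extra_rows = 3 - 10 + 9 = 2, so Mbs becomes 4 and two
     padding rows (given a 1.0 on the diagonal further below) are appended,
     bringing the row count to 12 = 4*3. */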
707   /* determine ownership of all rows */
708   mbs = Mbs/size + ((Mbs % size) > rank);
709   m   = mbs * bs;
710   rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
711   MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);
712   rowners[0] = 0;
713   for ( i=2; i<=size; i++ ) {
714     rowners[i] += rowners[i-1];
715   }
716   rstart = rowners[rank];
717   rend   = rowners[rank+1];
718 
719   /* distribute row lengths to all processors */
720   locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
721   if (!rank) {
722     rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
723     ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
724     for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
725     sndcounts = (int*) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(sndcounts); browners = sndcounts + size;
726     for ( i=0; i<size; i++ ) {sndcounts[i] = (rowners[i+1]-rowners[i])*bs; browners[i] = rowners[i]*bs;} /* counts/displs in point rows */
727     MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);
728     PetscFree(sndcounts);
729   }
730   else {
731     MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);
732   }
733 
734   if (!rank) {
735     /* calculate the number of nonzeros on each processor */
736     procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
737     PetscMemzero(procsnz,size*sizeof(int));
738     for ( i=0; i<size; i++ ) {
739       for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
740         procsnz[i] += rowlengths[j];
741       }
742     }
743     PetscFree(rowlengths);
744 
745     /* determine max buffer needed and allocate it */
746     maxnz = 0;
747     for ( i=0; i<size; i++ ) {
748       maxnz = PetscMax(maxnz,procsnz[i]);
749     }
750     cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);
751 
752     /* read in my part of the matrix column indices  */
753     nz = procsnz[0];
754     ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
755     mycols = ibuf;
756     ierr = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);
757     /* read in every ones (except the last) and ship off */
758     for ( i=1; i<size-1; i++ ) {
759       nz = procsnz[i];
760       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
761       MPI_Send(cols,nz,MPI_INT,i,tag,comm);
762     }
763     /* read in the stuff for the last proc */
764     if ( size != 1 ) {
765       nz = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
766       ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
767       for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
768       MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm); /* i was clobbered by the padding loop above */
769     }
770     PetscFree(cols);
771   }
772   else {
773     /* determine buffer space needed for message */
774     nz = 0;
775     for ( i=0; i<m; i++ ) {
776       nz += locrowlens[i];
777     }
778     ibuf = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
779     mycols = ibuf;
780     /* receive message of column indices*/
781     MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
782     MPI_Get_count(&status,MPI_INT,&maxnz);
783     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
784   }
785 
786   /* loop over local rows, determining number of off diagonal entries */
787   dlens  = (int *) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(dlens);
788   odlens = dlens + (rend-rstart);
789   mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
790   PetscMemzero(mask,Mbs*sizeof(int));
791   masked1 = mask    + Mbs;
792   masked2 = masked1 + Mbs;
793   rowcount = 0; nzcount = 0;
794   for ( i=0; i<mbs; i++ ) {
795     dcount  = 0;
796     odcount = 0;
797     for ( j=0; j<bs; j++ ) {
798       kmax = locrowlens[rowcount];
799       for ( k=0; k<kmax; k++ ) {
800         tmp = mycols[nzcount++]/bs;
801         if (!mask[tmp]) {
802           mask[tmp] = 1;
803           if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
804           else masked1[dcount++] = tmp;
805         }
806       }
807       rowcount++;
808     }
809     dlens[i]  = dcount;
810     odlens[i] = odcount;
811     /* zero out the mask elements we set */
812     for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
813     for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
814   }
815   /* create our matrix */
816   ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);CHKERRQ(ierr);
817   A = *newmat;
818   MatSetOption(A,COLUMNS_SORTED);
819 
820   if (!rank) {
821     buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
822 
823     /* read in my part of the matrix numerical values  */
824     nz = procsnz[0];
825     vals = buf;
826     ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
827 
828     /* insert into matrix */
829     jj      = rstart*bs;
830     for ( i=0; i<m; i++ ) {
831       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
832       mycols += locrowlens[i];
833       vals   += locrowlens[i];
834       jj++;
835     }
836 
837     /* read in other processors( except the last one) and ship out */
838     for ( i=1; i<size-1; i++ ) {
839       nz = procsnz[i];
840       vals = buf;
841       ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
842       MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
843     }
844     /* the last proc */
845     if ( size != 1 ){
846       nz = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
847       vals = buf;  ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
848       for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
849       MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);
850     }
851     PetscFree(procsnz);
852   }
853   else {
854     /* receive numeric values */
855     buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);
856 
857     /* receive message of values*/
858     vals = buf;
859     MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
860     MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
861     if (maxnz != nz) SETERRQ(1,"MatLoad_MPIAIJ:something is wrong with file");
862 
863     /* insert into matrix */
864     jj      = rstart*bs;
865     for ( i=0; i<m; i++ ) {
866       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
867       mycols += locrowlens[i];
868       vals   += locrowlens[i];
869       jj++;
870     }
871   }
872   PetscFree(locrowlens);
873   PetscFree(buf);
874   PetscFree(ibuf);
875   PetscFree(rowners);
876   PetscFree(dlens); PetscFree(mask);
877   ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
878   ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
879   return 0;
880 }
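
/*
   Usage note (illustrative): MatLoad_MPIBAIJ reads a matrix stored in the
   PETSc binary format (as written by MatView() with a binary viewer); the
   block size is taken from the command line, e.g.

       mpirun -np 4 program -matload_block_size 3

   and the loaded matrix is padded (see the padding logic above) when its
   global size is not a multiple of that block size.
*/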
881 
882 
883