xref: /petsc/src/mat/impls/baij/seq/baij.c (revision d252947ab307443ec0f8cabd384072ff75afeb3e)
1 #ifndef lint
2 static char vcid[] = "$Id: baij.c,v 1.87 1997/02/03 20:15:40 bsmith Exp balay $";
3 #endif
4 
5 /*
6     Defines the basic matrix operations for the BAIJ (compressed row)
7   matrix storage format.
8 */
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/spops.h"
12 #include "petsc.h"
13 
14 
15 /*
16      Adds diagonal pointers to sparse matrix structure.
17 */
18 
19 #undef __FUNC__
20 #define __FUNC__ "MatMarkDiag_SeqBAIJ"
21 int MatMarkDiag_SeqBAIJ(Mat A)
22 {
23   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
24   int         i,j, *diag, m = a->mbs;
25 
26   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
27   PLogObjectMemory(A,(m+1)*sizeof(int));
28   for ( i=0; i<m; i++ ) {
29     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
30       if (a->j[j] == i) {
31         diag[i] = j;
32         break;
33       }
34     }
35   }
36   a->diag = diag;
37   return 0;
38 }
39 
40 #include "draw.h"
41 #include "pinclude/pviewer.h"
42 #include "sys.h"
43 
44 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
45 
46 #undef __FUNC__
47 #define __FUNC__ "MatGetRowIJ_SeqBAIJ"
48 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
49                             PetscTruth *done)
50 {
51   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
52   int         ierr, n = a->mbs,i;
53 
54   *nn = n;
55   if (!ia) return 0;
56   if (symmetric) {
57     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
58   } else if (oshift == 1) {
59     /* temporarily add 1 to i and j indices */
60     int nz = a->i[n] + 1;
61     for ( i=0; i<nz; i++ ) a->j[i]++;
62     for ( i=0; i<n+1; i++ ) a->i[i]++;
63     *ia = a->i; *ja = a->j;
64   } else {
65     *ia = a->i; *ja = a->j;
66   }
67 
68   return 0;
69 }
70 
71 #undef __FUNC__
72 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ"
73 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
74                                 PetscTruth *done)
75 {
76   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
77   int         i,n = a->mbs;
78 
79   if (!ia) return 0;
80   if (symmetric) {
81     PetscFree(*ia);
82     PetscFree(*ja);
83   } else if (oshift == 1) {
84     int nz = a->i[n];
85     for ( i=0; i<nz; i++ ) a->j[i]--;
86     for ( i=0; i<n+1; i++ ) a->i[i]--;
87   }
88   return 0;
89 }
90 
91 
92 #undef __FUNC__
93 #define __FUNC__ "MatView_SeqBAIJ_Binary"
94 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
95 {
96   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
97   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
98   Scalar      *aa;
99 
100   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
101   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
102   col_lens[0] = MAT_COOKIE;
103 
104   col_lens[1] = a->m;
105   col_lens[2] = a->n;
106   col_lens[3] = a->nz*bs2;
107 
108   /* store lengths of each row and write (including header) to file */
109   count = 0;
110   for ( i=0; i<a->mbs; i++ ) {
111     for ( j=0; j<bs; j++ ) {
112       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
113     }
114   }
115   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
116   PetscFree(col_lens);
117 
118   /* store column indices (zero start index) */
119   jj = (int *) PetscMalloc( a->nz*bs2*sizeof(int) ); CHKPTRQ(jj);
120   count = 0;
121   for ( i=0; i<a->mbs; i++ ) {
122     for ( j=0; j<bs; j++ ) {
123       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
124         for ( l=0; l<bs; l++ ) {
125           jj[count++] = bs*a->j[k] + l;
126         }
127       }
128     }
129   }
130   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
131   PetscFree(jj);
132 
133   /* store nonzero values */
134   aa = (Scalar *) PetscMalloc(a->nz*bs2*sizeof(Scalar)); CHKPTRQ(aa);
135   count = 0;
136   for ( i=0; i<a->mbs; i++ ) {
137     for ( j=0; j<bs; j++ ) {
138       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
139         for ( l=0; l<bs; l++ ) {
140           aa[count++] = a->a[bs2*k + l*bs + j];
141         }
142       }
143     }
144   }
145   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
146   PetscFree(aa);
147   return 0;
148 }
149 
150 #undef __FUNC__
151 #define __FUNC__ "MatView_SeqBAIJ_ASCII"
152 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
153 {
154   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
155   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
156   FILE        *fd;
157   char        *outputname;
158 
159   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
160   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
161   ierr = ViewerGetFormat(viewer,&format);
162   if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
163     fprintf(fd,"  block size is %d\n",bs);
164   }
165   else if (format == VIEWER_FORMAT_ASCII_MATLAB) {
166     SETERRQ(1,0,"Matlab format not supported");
167   }
168   else if (format == VIEWER_FORMAT_ASCII_COMMON) {
169     for ( i=0; i<a->mbs; i++ ) {
170       for ( j=0; j<bs; j++ ) {
171         fprintf(fd,"row %d:",i*bs+j);
172         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
173           for ( l=0; l<bs; l++ ) {
174 #if defined(PETSC_COMPLEX)
175           if (imag(a->a[bs2*k + l*bs + j]) != 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
176             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
177               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
178           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
179             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
180 #else
181           if (a->a[bs2*k + l*bs + j] != 0.0)
182             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
183 #endif
184           }
185         }
186         fprintf(fd,"\n");
187       }
188     }
189   }
190   else {
191     for ( i=0; i<a->mbs; i++ ) {
192       for ( j=0; j<bs; j++ ) {
193         fprintf(fd,"row %d:",i*bs+j);
194         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
195           for ( l=0; l<bs; l++ ) {
196 #if defined(PETSC_COMPLEX)
197           if (imag(a->a[bs2*k + l*bs + j]) != 0.0) {
198             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
199               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
200           }
201           else {
202             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
203           }
204 #else
205             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
206 #endif
207           }
208         }
209         fprintf(fd,"\n");
210       }
211     }
212   }
213   fflush(fd);
214   return 0;
215 }
216 
217 #undef __FUNC__
218 #define __FUNC__ "MatView_SeqBAIJ_Draw"
219 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer)
220 {
221   Mat_SeqBAIJ  *a=(Mat_SeqBAIJ *) A->data;
222   int          row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2;
223   double       xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r;
224   Scalar       *aa;
225   Draw         draw;
226   DrawButton   button;
227   PetscTruth   isnull;
228 
229   ViewerDrawGetDraw(viewer,&draw);
230   ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
231 
232   xr  = a->n; yr = a->m; h = yr/10.0; w = xr/10.0;
233   xr += w;    yr += h;  xl = -w;     yl = -h;
234   ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
235   /* loop over matrix elements drawing boxes */
236   color = DRAW_BLUE;
237   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
238     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
239       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
240       x_l = a->j[j]*bs; x_r = x_l + 1.0;
241       aa = a->a + j*bs2;
242       for ( k=0; k<bs; k++ ) {
243         for ( l=0; l<bs; l++ ) {
244           if (PetscReal(*aa++) >=  0.) continue;
245           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
246         }
247       }
248     }
249   }
250   color = DRAW_CYAN;
251   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
252     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
253       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
254       x_l = a->j[j]*bs; x_r = x_l + 1.0;
255       aa = a->a + j*bs2;
256       for ( k=0; k<bs; k++ ) {
257         for ( l=0; l<bs; l++ ) {
258           if (PetscReal(*aa++) != 0.) continue;
259           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
260         }
261       }
262     }
263   }
264 
265   color = DRAW_RED;
266   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
267     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
268       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
269       x_l = a->j[j]*bs; x_r = x_l + 1.0;
270       aa = a->a + j*bs2;
271       for ( k=0; k<bs; k++ ) {
272         for ( l=0; l<bs; l++ ) {
273           if (PetscReal(*aa++) <= 0.) continue;
274           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
275         }
276       }
277     }
278   }
279 
280   DrawFlush(draw);
281   DrawGetPause(draw,&pause);
282   if (pause >= 0) { PetscSleep(pause); return 0;}
283 
284   /* allow the matrix to zoom or shrink */
285   ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
286   while (button != BUTTON_RIGHT) {
287     DrawClear(draw);
288     if (button == BUTTON_LEFT) scale = .5;
289     else if (button == BUTTON_CENTER) scale = 2.;
290     xl = scale*(xl + w - xc) + xc - w*scale;
291     xr = scale*(xr - w - xc) + xc + w*scale;
292     yl = scale*(yl + h - yc) + yc - h*scale;
293     yr = scale*(yr - h - yc) + yc + h*scale;
294     w *= scale; h *= scale;
295     ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
296     color = DRAW_BLUE;
297     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
298       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
299         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
300         x_l = a->j[j]*bs; x_r = x_l + 1.0;
301         aa = a->a + j*bs2;
302         for ( k=0; k<bs; k++ ) {
303           for ( l=0; l<bs; l++ ) {
304             if (PetscReal(*aa++) >=  0.) continue;
305             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
306           }
307         }
308       }
309     }
310     color = DRAW_CYAN;
311     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
312       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
313         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
314         x_l = a->j[j]*bs; x_r = x_l + 1.0;
315         aa = a->a + j*bs2;
316         for ( k=0; k<bs; k++ ) {
317           for ( l=0; l<bs; l++ ) {
318           if (PetscReal(*aa++) != 0.) continue;
319           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
320           }
321         }
322       }
323     }
324 
325     color = DRAW_RED;
326     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
327       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
328         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
329         x_l = a->j[j]*bs; x_r = x_l + 1.0;
330         aa = a->a + j*bs2;
331         for ( k=0; k<bs; k++ ) {
332           for ( l=0; l<bs; l++ ) {
333             if (PetscReal(*aa++) <= 0.) continue;
334             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
335           }
336         }
337       }
338     }
339     ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
340   }
341   return 0;
342 }
343 
344 #undef __FUNC__
345 #define __FUNC__ "MatView_SeqBAIJ"
346 static int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
347 {
348   Mat         A = (Mat) obj;
349   ViewerType  vtype;
350   int         ierr;
351 
352   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
353   if (vtype == MATLAB_VIEWER) {
354     SETERRQ(1,0,"Matlab viewer not supported");
355   }
356   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
357     return MatView_SeqBAIJ_ASCII(A,viewer);
358   }
359   else if (vtype == BINARY_FILE_VIEWER) {
360     return MatView_SeqBAIJ_Binary(A,viewer);
361   }
362   else if (vtype == DRAW_VIEWER) {
363     return MatView_SeqBAIJ_Draw(A,viewer);
364   }
365   return 0;
366 }
367 
368 #define CHUNKSIZE  10
369 
370 #undef __FUNC__
371 #define __FUNC__ "MatSetValues_SeqBAIJ"
372 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
373 {
374   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
375   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
376   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
377   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
378   int          ridx,cidx,bs2=a->bs2;
379   Scalar      *ap,value,*aa=a->a,*bap;
380 
381   for ( k=0; k<m; k++ ) { /* loop over added rows */
382     row  = im[k]; brow = row/bs;
383 #if defined(PETSC_BOPT_g)
384     if (row < 0) SETERRQ(1,0,"Negative row");
385     if (row >= a->m) SETERRQ(1,0,"Row too large");
386 #endif
387     rp   = aj + ai[brow];
388     ap   = aa + bs2*ai[brow];
389     rmax = imax[brow];
390     nrow = ailen[brow];
391     low  = 0;
392     for ( l=0; l<n; l++ ) { /* loop over added columns */
393 #if defined(PETSC_BOPT_g)
394       if (in[l] < 0) SETERRQ(1,0,"Negative column");
395       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
396 #endif
397       col = in[l]; bcol = col/bs;
398       ridx = row % bs; cidx = col % bs;
399       if (roworiented) {
400         value = *v++;
401       }
402       else {
403         value = v[k + l*m];
404       }
405       if (!sorted) low = 0; high = nrow;
406       while (high-low > 7) {
407         t = (low+high)/2;
408         if (rp[t] > bcol) high = t;
409         else              low  = t;
410       }
411       for ( i=low; i<high; i++ ) {
412         if (rp[i] > bcol) break;
413         if (rp[i] == bcol) {
414           bap  = ap +  bs2*i + bs*cidx + ridx;
415           if (is == ADD_VALUES) *bap += value;
416           else                  *bap  = value;
417           goto noinsert;
418         }
419       }
420       if (nonew) goto noinsert;
421       if (nrow >= rmax) {
422         /* there is no extra room in row, therefore enlarge */
423         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
424         Scalar *new_a;
425 
426         /* malloc new storage space */
427         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
428         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
429         new_j   = (int *) (new_a + bs2*new_nz);
430         new_i   = new_j + new_nz;
431 
432         /* copy over old data into new slots */
433         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
434         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
435         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
436         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
437         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
438                                                            len*sizeof(int));
439         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
440         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
441         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
442                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
443         /* free up old matrix storage */
444         PetscFree(a->a);
445         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
446         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
447         a->singlemalloc = 1;
448 
449         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
450         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
451         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
452         a->maxnz += bs2*CHUNKSIZE;
453         a->reallocs++;
454         a->nz++;
455       }
456       N = nrow++ - 1;
457       /* shift up all the later entries in this row */
458       for ( ii=N; ii>=i; ii-- ) {
459         rp[ii+1] = rp[ii];
460         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
461       }
462       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
463       rp[i]                      = bcol;
464       ap[bs2*i + bs*cidx + ridx] = value;
465       noinsert:;
466       low = i;
467     }
468     ailen[brow] = nrow;
469   }
470   return 0;
471 }
472 
473 #undef __FUNC__
474 #define __FUNC__ "MatSetValues_SeqBAIJ"
475 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
476 {
477   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
478   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
479   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
480   int         *aj=a->j,nonew=a->nonew,bs=a->bs;
481   int          ridx,cidx,bs2=a->bs2;
482   Scalar      *ap,value,*aa=a->a,*bap;
483 
484   for ( k=0; k<m; k++ ) { /* loop over added rows */
485     row  = im[k];
486 #if defined(PETSC_BOPT_g)
487     if (row < 0) SETERRQ(1,0,"Negative row");
488     if (row >= a->m) SETERRQ(1,0,"Row too large");
489 #endif
490     rp   = aj + ai[row];
491     ap   = aa + bs2*ai[row];
492     rmax = imax[row];
493     nrow = ailen[row];
494     low  = 0;
495     for ( l=0; l<n; l++ ) { /* loop over added columns */
496 #if defined(PETSC_BOPT_g)
497       if (in[l] < 0) SETERRQ(1,0,"Negative column");
498       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
499 #endif
500       col = in[l];
501       ridx = row % bs; cidx = col % bs;
502       if (roworiented) {
503         value = *v++;
504       }
505       else {
506         value = v[k + l*m];
507       }
508       if (!sorted) low = 0; high = nrow;
509       while (high-low > 7) {
510         t = (low+high)/2;
511         if (rp[t] > col) high = t;
512         else             low  = t;
513       }
514       for ( i=low; i<high; i++ ) {
515         if (rp[i] > col) break;
516         if (rp[i] == col) {
517           bap  = ap +  bs2*i + bs*cidx + ridx;
518           if (is == ADD_VALUES) *bap += value;
519           else                  *bap  = value;
520           goto noinsert;
521         }
522       }
523       if (nonew) goto noinsert;
524       if (nrow >= rmax) {
525         /* there is no extra room in row, therefore enlarge */
526         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
527         Scalar *new_a;
528 
529         /* malloc new storage space */
530         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
531         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
532         new_j   = (int *) (new_a + bs2*new_nz);
533         new_i   = new_j + new_nz;
534 
535         /* copy over old data into new slots */
536         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];}
537         for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
538         PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int));
539         len = (new_nz - CHUNKSIZE - ai[row] - nrow);
540         PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,
541                                                            len*sizeof(int));
542         PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar));
543         PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
544         PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),
545                     aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar));
546         /* free up old matrix storage */
547         PetscFree(a->a);
548         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
549         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
550         a->singlemalloc = 1;
551 
552         rp   = aj + ai[row]; ap = aa + bs2*ai[row];
553         rmax = imax[row] = imax[row] + CHUNKSIZE;
554         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
555         a->maxnz += bs2*CHUNKSIZE;
556         a->reallocs++;
557         a->nz++;
558       }
559       N = nrow++ - 1;
560       /* shift up all the later entries in this row */
561       for ( ii=N; ii>=i; ii-- ) {
562         rp[ii+1] = rp[ii];
563         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
564       }
565       if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
566       rp[i]                      = col;
567       ap[bs2*i + bs*cidx + ridx] = value;
568       noinsert:;
569       low = i;
570     }
571     ailen[row] = nrow;
572   }
573   return 0;
574 }
575 
576 #undef __FUNC__
577 #define __FUNC__ "MatGetSize_SeqBAIJ"
578 static int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
579 {
580   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
581   *m = a->m; *n = a->n;
582   return 0;
583 }
584 
585 #undef __FUNC__
586 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ"
587 static int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
588 {
589   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
590   *m = 0; *n = a->m;
591   return 0;
592 }
593 
594 #undef __FUNC__
595 #define __FUNC__ "MatGetRow_SeqBAIJ"
596 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
597 {
598   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
599   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
600   Scalar      *aa,*v_i,*aa_i;
601 
602   bs  = a->bs;
603   ai  = a->i;
604   aj  = a->j;
605   aa  = a->a;
606   bs2 = a->bs2;
607 
608   if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range");
609 
610   bn  = row/bs;   /* Block number */
611   bp  = row % bs; /* Block Position */
612   M   = ai[bn+1] - ai[bn];
613   *nz = bs*M;
614 
615   if (v) {
616     *v = 0;
617     if (*nz) {
618       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
619       for ( i=0; i<M; i++ ) { /* for each block in the block row */
620         v_i  = *v + i*bs;
621         aa_i = aa + bs2*(ai[bn] + i);
622         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
623       }
624     }
625   }
626 
627   if (idx) {
628     *idx = 0;
629     if (*nz) {
630       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
631       for ( i=0; i<M; i++ ) { /* for each block in the block row */
632         idx_i = *idx + i*bs;
633         itmp  = bs*aj[ai[bn] + i];
634         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
635       }
636     }
637   }
638   return 0;
639 }
640 
641 #undef __FUNC__
642 #define __FUNC__ "MatRestoreRow_SeqBAIJ"
643 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
644 {
645   if (idx) {if (*idx) PetscFree(*idx);}
646   if (v)   {if (*v)   PetscFree(*v);}
647   return 0;
648 }
649 
650 #undef __FUNC__
651 #define __FUNC__ "MatTranspose_SeqBAIJ"
652 static int MatTranspose_SeqBAIJ(Mat A,Mat *B)
653 {
654   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
655   Mat         C;
656   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
657   int         *rows,*cols,bs2=a->bs2;
658   Scalar      *array=a->a;
659 
660   if (B==PETSC_NULL && mbs!=nbs)
661     SETERRQ(1,0,"Square matrix only for in-place");
662   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
663   PetscMemzero(col,(1+nbs)*sizeof(int));
664 
665   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
666   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
667   PetscFree(col);
668   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
669   cols = rows + bs;
670   for ( i=0; i<mbs; i++ ) {
671     cols[0] = i*bs;
672     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
673     len = ai[i+1] - ai[i];
674     for ( j=0; j<len; j++ ) {
675       rows[0] = (*aj++)*bs;
676       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
677       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
678       array += bs2;
679     }
680   }
681   PetscFree(rows);
682 
683   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
684   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
685 
686   if (B != PETSC_NULL) {
687     *B = C;
688   } else {
689     /* This isn't really an in-place transpose */
690     PetscFree(a->a);
691     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
692     if (a->diag) PetscFree(a->diag);
693     if (a->ilen) PetscFree(a->ilen);
694     if (a->imax) PetscFree(a->imax);
695     if (a->solve_work) PetscFree(a->solve_work);
696     PetscFree(a);
697     PetscMemcpy(A,C,sizeof(struct _Mat));
698     PetscHeaderDestroy(C);
699   }
700   return 0;
701 }
702 
703 
704 #undef __FUNC__
705 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ"
706 static int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
707 {
708   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
709   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
710   int        m = a->m,*ip, N, *ailen = a->ilen;
711   int        mbs = a->mbs, bs2 = a->bs2,rmax;
712   Scalar     *aa = a->a, *ap;
713 
714   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
715 
716   rmax = ailen[0];
717   for ( i=1; i<mbs; i++ ) {
718     /* move each row back by the amount of empty slots (fshift) before it*/
719     fshift += imax[i-1] - ailen[i-1];
720     rmax   = PetscMax(rmax,ailen[i]);
721     if (fshift) {
722       ip = aj + ai[i]; ap = aa + bs2*ai[i];
723       N = ailen[i];
724       for ( j=0; j<N; j++ ) {
725         ip[j-fshift] = ip[j];
726         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
727       }
728     }
729     ai[i] = ai[i-1] + ailen[i-1];
730   }
731   if (mbs) {
732     fshift += imax[mbs-1] - ailen[mbs-1];
733     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
734   }
735   /* reset ilen and imax for each row */
736   for ( i=0; i<mbs; i++ ) {
737     ailen[i] = imax[i] = ai[i+1] - ai[i];
738   }
739   a->nz = ai[mbs];
740 
741   /* diagonals may have moved, so kill the diagonal pointers */
742   if (fshift && a->diag) {
743     PetscFree(a->diag);
744     PLogObjectMemory(A,-(m+1)*sizeof(int));
745     a->diag = 0;
746   }
747   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n",
748            m,a->n,a->bs,fshift*bs2,a->nz*bs2);
749   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n",
750            a->reallocs);
751   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax);
752   a->reallocs          = 0;
753   A->info.nz_unneeded  = (double)fshift*bs2;
754 
755   return 0;
756 }
757 
758 #undef __FUNC__
759 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
760 static int MatZeroEntries_SeqBAIJ(Mat A)
761 {
762   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
763   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
764   return 0;
765 }
766 
767 #undef __FUNC__
768 #define __FUNC__ "MatDestroy_SeqBAIJ"
769 int MatDestroy_SeqBAIJ(PetscObject obj)
770 {
771   Mat         A  = (Mat) obj;
772   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
773   int         ierr;
774 
775 #if defined(PETSC_LOG)
776   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
777 #endif
778   PetscFree(a->a);
779   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
780   if (a->diag) PetscFree(a->diag);
781   if (a->ilen) PetscFree(a->ilen);
782   if (a->imax) PetscFree(a->imax);
783   if (a->solve_work) PetscFree(a->solve_work);
784   if (a->mult_work) PetscFree(a->mult_work);
785   PetscFree(a);
786   if (A->mapping) {
787     ierr = ISLocalToGlobalMappingDestroy(A->mapping); CHKERRQ(ierr);
788   }
789   PLogObjectDestroy(A);
790   PetscHeaderDestroy(A);
791   return 0;
792 }
793 
794 #undef __FUNC__
795 #define __FUNC__ "MatSetOption_SeqBAIJ"
796 static int MatSetOption_SeqBAIJ(Mat A,MatOption op)
797 {
798   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
799   if      (op == MAT_ROW_ORIENTED)              a->roworiented = 1;
800   else if (op == MAT_COLUMN_ORIENTED)           a->roworiented = 0;
801   else if (op == MAT_COLUMNS_SORTED)            a->sorted      = 1;
802   else if (op == MAT_COLUMNS_UNSORTED)          a->sorted      = 0;
803   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)  a->nonew       = 1;
804   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew       = 0;
805   else if (op == MAT_ROWS_SORTED ||
806            op == MAT_ROWS_UNSORTED ||
807            op == MAT_SYMMETRIC ||
808            op == MAT_STRUCTURALLY_SYMMETRIC ||
809            op == MAT_YES_NEW_DIAGONALS ||
810            op == MAT_IGNORE_OFF_PROCESSOR_ENTRIES)
811     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
812   else if (op == MAT_NO_NEW_DIAGONALS)
813     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
814   else
815     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
816   return 0;
817 }
818 
819 
820 /* -------------------------------------------------------*/
821 /* Should check that shapes of vectors and matrices match */
822 /* -------------------------------------------------------*/
823 #include "pinclude/plapack.h"
824 
825 #undef __FUNC__
826 #define __FUNC__ "MatMult_SeqBAIJ_1"
827 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
828 {
829   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
830   register Scalar *x,*z,*v,sum;
831   int             mbs=a->mbs,i,*idx,*ii,n;
832 
833   VecGetArray_Fast(xx,x);
834   VecGetArray_Fast(zz,z);
835 
836   idx   = a->j;
837   v     = a->a;
838   ii    = a->i;
839 
840   for ( i=0; i<mbs; i++ ) {
841     n    = ii[1] - ii[0]; ii++;
842     sum  = 0.0;
843     while (n--) sum += *v++ * x[*idx++];
844     z[i] = sum;
845   }
846   VecRestoreArray_Fast(xx,x);
847   VecRestoreArray_Fast(zz,z);
848   PLogFlops(2*a->nz - a->m);
849   return 0;
850 }
851 
852 #undef __FUNC__
853 #define __FUNC__ "MatMult_SeqBAIJ_2"
854 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
855 {
856   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
857   register Scalar *x,*z,*v,*xb,sum1,sum2;
858   register Scalar x1,x2;
859   int             mbs=a->mbs,i,*idx,*ii;
860   int             j,n;
861 
862   VecGetArray_Fast(xx,x);
863   VecGetArray_Fast(zz,z);
864 
865   idx   = a->j;
866   v     = a->a;
867   ii    = a->i;
868 
869   for ( i=0; i<mbs; i++ ) {
870     n  = ii[1] - ii[0]; ii++;
871     sum1 = 0.0; sum2 = 0.0;
872     for ( j=0; j<n; j++ ) {
873       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
874       sum1 += v[0]*x1 + v[2]*x2;
875       sum2 += v[1]*x1 + v[3]*x2;
876       v += 4;
877     }
878     z[0] = sum1; z[1] = sum2;
879     z += 2;
880   }
881   VecRestoreArray_Fast(xx,x);
882   VecRestoreArray_Fast(zz,z);
883   PLogFlops(4*a->nz - a->m);
884   return 0;
885 }
886 
887 #undef __FUNC__
888 #define __FUNC__ "MatMult_SeqBAIJ_3"
889 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
890 {
891   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
892   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
893   int             mbs=a->mbs,i,*idx,*ii,j,n;
894 
895   VecGetArray_Fast(xx,x);
896   VecGetArray_Fast(zz,z);
897 
898   idx   = a->j;
899   v     = a->a;
900   ii    = a->i;
901 
902   for ( i=0; i<mbs; i++ ) {
903     n  = ii[1] - ii[0]; ii++;
904     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
905     for ( j=0; j<n; j++ ) {
906       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
907       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
908       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
909       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
910       v += 9;
911     }
912     z[0] = sum1; z[1] = sum2; z[2] = sum3;
913     z += 3;
914   }
915   VecRestoreArray_Fast(xx,x);
916   VecRestoreArray_Fast(zz,z);
917   PLogFlops(18*a->nz - a->m);
918   return 0;
919 }
920 
921 #undef __FUNC__
922 #define __FUNC__ "MatMult_SeqBAIJ_4"
923 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
924 {
925   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
926   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
927   register Scalar x1,x2,x3,x4;
928   int             mbs=a->mbs,i,*idx,*ii;
929   int             j,n;
930 
931   VecGetArray_Fast(xx,x);
932   VecGetArray_Fast(zz,z);
933 
934   idx   = a->j;
935   v     = a->a;
936   ii    = a->i;
937 
938   for ( i=0; i<mbs; i++ ) {
939     n  = ii[1] - ii[0]; ii++;
940     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
941     for ( j=0; j<n; j++ ) {
942       xb = x + 4*(*idx++);
943       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
944       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
945       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
946       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
947       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
948       v += 16;
949     }
950     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
951     z += 4;
952   }
953   VecRestoreArray_Fast(xx,x);
954   VecRestoreArray_Fast(zz,z);
955   PLogFlops(32*a->nz - a->m);
956   return 0;
957 }
958 
959 #undef __FUNC__
960 #define __FUNC__ "MatMult_SeqBAIJ_5"
961 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
962 {
963   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
964   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
965   register Scalar x1,x2,x3,x4,x5;
966   int             mbs=a->mbs,i,*idx,*ii,j,n;
967 
968   VecGetArray_Fast(xx,x);
969   VecGetArray_Fast(zz,z);
970 
971   idx   = a->j;
972   v     = a->a;
973   ii    = a->i;
974 
975   for ( i=0; i<mbs; i++ ) {
976     n  = ii[1] - ii[0]; ii++;
977     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
978     for ( j=0; j<n; j++ ) {
979       xb = x + 5*(*idx++);
980       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
981       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
982       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
983       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
984       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
985       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
986       v += 25;
987     }
988     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
989     z += 5;
990   }
991   VecRestoreArray_Fast(xx,x);
992   VecRestoreArray_Fast(zz,z);
993   PLogFlops(50*a->nz - a->m);
994   return 0;
995 }
996 
997 #undef __FUNC__
998 #define __FUNC__ "MatMult_SeqBAIJ_7"
999 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1000 {
1001   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1002   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1003   register Scalar x1,x2,x3,x4,x5,x6,x7;
1004   int             mbs=a->mbs,i,*idx,*ii,j,n;
1005 
1006   VecGetArray_Fast(xx,x);
1007   VecGetArray_Fast(zz,z);
1008 
1009   idx   = a->j;
1010   v     = a->a;
1011   ii    = a->i;
1012 
1013   for ( i=0; i<mbs; i++ ) {
1014     n  = ii[1] - ii[0]; ii++;
1015     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1016     for ( j=0; j<n; j++ ) {
1017       xb = x + 7*(*idx++);
1018       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1019       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1020       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1021       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1022       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1023       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1024       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1025       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1026       v += 49;
1027     }
1028     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1029     z += 7;
1030   }
1031 
1032   VecRestoreArray_Fast(xx,x);
1033   VecRestoreArray_Fast(zz,z);
1034   PLogFlops(98*a->nz - a->m);
1035   return 0;
1036 }
1037 
1038 #undef __FUNC__
1039 #define __FUNC__ "MatMult_SeqBAIJ_N"
1040 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1041 {
1042   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1043   register Scalar *x,*z,*v,*xb;
1044   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
1045   int             _One = 1,ncols,k;
1046   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
1047 
1048   VecGetArray_Fast(xx,x);
1049   VecGetArray_Fast(zz,z);
1050 
1051   idx   = a->j;
1052   v     = a->a;
1053   ii    = a->i;
1054 
1055 
1056   if (!a->mult_work) {
1057     k = PetscMax(a->m,a->n);
1058     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1059   }
1060   work = a->mult_work;
1061   for ( i=0; i<mbs; i++ ) {
1062     n     = ii[1] - ii[0]; ii++;
1063     ncols = n*bs;
1064     workt = work;
1065     for ( j=0; j<n; j++ ) {
1066       xb = x + bs*(*idx++);
1067       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1068       workt += bs;
1069     }
1070     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
1071     v += n*bs2;
1072     z += bs;
1073   }
1074   VecRestoreArray_Fast(xx,x);
1075   VecRestoreArray_Fast(zz,z);
1076   PLogFlops(2*a->nz*bs2 - a->m);
1077   return 0;
1078 }
1079 
1080 #undef __FUNC__
1081 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
1082 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1083 {
1084   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1085   register Scalar *x,*y,*z,*v,sum;
1086   int             mbs=a->mbs,i,*idx,*ii,n;
1087 
1088   VecGetArray_Fast(xx,x);
1089   VecGetArray_Fast(yy,y);
1090   VecGetArray_Fast(zz,z);
1091 
1092   idx   = a->j;
1093   v     = a->a;
1094   ii    = a->i;
1095 
1096   for ( i=0; i<mbs; i++ ) {
1097     n    = ii[1] - ii[0]; ii++;
1098     sum  = y[i];
1099     while (n--) sum += *v++ * x[*idx++];
1100     z[i] = sum;
1101   }
1102   VecRestoreArray_Fast(xx,x);
1103   VecRestoreArray_Fast(yy,y);
1104   VecRestoreArray_Fast(zz,z);
1105   PLogFlops(2*a->nz);
1106   return 0;
1107 }
1108 
1109 #undef __FUNC__
1110 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
1111 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1112 {
1113   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1114   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
1115   register Scalar x1,x2;
1116   int             mbs=a->mbs,i,*idx,*ii;
1117   int             j,n;
1118 
1119   VecGetArray_Fast(xx,x);
1120   VecGetArray_Fast(yy,y);
1121   VecGetArray_Fast(zz,z);
1122 
1123   idx   = a->j;
1124   v     = a->a;
1125   ii    = a->i;
1126 
1127   for ( i=0; i<mbs; i++ ) {
1128     n  = ii[1] - ii[0]; ii++;
1129     sum1 = y[0]; sum2 = y[1];
1130     for ( j=0; j<n; j++ ) {
1131       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1132       sum1 += v[0]*x1 + v[2]*x2;
1133       sum2 += v[1]*x1 + v[3]*x2;
1134       v += 4;
1135     }
1136     z[0] = sum1; z[1] = sum2;
1137     z += 2; y += 2;
1138   }
1139   VecRestoreArray_Fast(xx,x);
1140   VecRestoreArray_Fast(yy,y);
1141   VecRestoreArray_Fast(zz,z);
1142   PLogFlops(4*a->nz);
1143   return 0;
1144 }
1145 
1146 #undef __FUNC__
1147 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
1148 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1149 {
1150   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1151   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
1152   int             mbs=a->mbs,i,*idx,*ii,j,n;
1153 
1154   VecGetArray_Fast(xx,x);
1155   VecGetArray_Fast(yy,y);
1156   VecGetArray_Fast(zz,z);
1157 
1158   idx   = a->j;
1159   v     = a->a;
1160   ii    = a->i;
1161 
1162   for ( i=0; i<mbs; i++ ) {
1163     n  = ii[1] - ii[0]; ii++;
1164     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1165     for ( j=0; j<n; j++ ) {
1166       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1167       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1168       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1169       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1170       v += 9;
1171     }
1172     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1173     z += 3; y += 3;
1174   }
1175   VecRestoreArray_Fast(xx,x);
1176   VecRestoreArray_Fast(yy,y);
1177   VecRestoreArray_Fast(zz,z);
1178   PLogFlops(18*a->nz);
1179   return 0;
1180 }
1181 
1182 #undef __FUNC__
1183 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
1184 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1185 {
1186   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1187   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
1188   register Scalar x1,x2,x3,x4;
1189   int             mbs=a->mbs,i,*idx,*ii;
1190   int             j,n;
1191 
1192   VecGetArray_Fast(xx,x);
1193   VecGetArray_Fast(yy,y);
1194   VecGetArray_Fast(zz,z);
1195 
1196   idx   = a->j;
1197   v     = a->a;
1198   ii    = a->i;
1199 
1200   for ( i=0; i<mbs; i++ ) {
1201     n  = ii[1] - ii[0]; ii++;
1202     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1203     for ( j=0; j<n; j++ ) {
1204       xb = x + 4*(*idx++);
1205       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1206       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1207       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1208       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1209       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1210       v += 16;
1211     }
1212     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1213     z += 4; y += 4;
1214   }
1215   VecRestoreArray_Fast(xx,x);
1216   VecRestoreArray_Fast(yy,y);
1217   VecRestoreArray_Fast(zz,z);
1218   PLogFlops(32*a->nz);
1219   return 0;
1220 }
1221 
1222 #undef __FUNC__
1223 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
1224 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1225 {
1226   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1227   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1228   register Scalar x1,x2,x3,x4,x5;
1229   int             mbs=a->mbs,i,*idx,*ii,j,n;
1230 
1231   VecGetArray_Fast(xx,x);
1232   VecGetArray_Fast(yy,y);
1233   VecGetArray_Fast(zz,z);
1234 
1235   idx   = a->j;
1236   v     = a->a;
1237   ii    = a->i;
1238 
1239   for ( i=0; i<mbs; i++ ) {
1240     n  = ii[1] - ii[0]; ii++;
1241     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1242     for ( j=0; j<n; j++ ) {
1243       xb = x + 5*(*idx++);
1244       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1245       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1246       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1247       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1248       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1249       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1250       v += 25;
1251     }
1252     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1253     z += 5; y += 5;
1254   }
1255   VecRestoreArray_Fast(xx,x);
1256   VecRestoreArray_Fast(yy,y);
1257   VecRestoreArray_Fast(zz,z);
1258   PLogFlops(50*a->nz);
1259   return 0;
1260 }
1261 
1262 #undef __FUNC__
1263 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
1264 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1265 {
1266   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1267   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1268   register Scalar x1,x2,x3,x4,x5,x6,x7;
1269   int             mbs=a->mbs,i,*idx,*ii,j,n;
1270 
1271   VecGetArray_Fast(xx,x);
1272   VecGetArray_Fast(yy,y);
1273   VecGetArray_Fast(zz,z);
1274 
1275   idx   = a->j;
1276   v     = a->a;
1277   ii    = a->i;
1278 
1279   for ( i=0; i<mbs; i++ ) {
1280     n  = ii[1] - ii[0]; ii++;
1281     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1282     for ( j=0; j<n; j++ ) {
1283       xb = x + 7*(*idx++);
1284       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1285       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1286       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1287       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1288       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1289       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1290       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1291       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1292       v += 49;
1293     }
1294     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1295     z += 7; y += 7;
1296   }
1297   VecRestoreArray_Fast(xx,x);
1298   VecRestoreArray_Fast(yy,y);
1299   VecRestoreArray_Fast(zz,z);
1300   PLogFlops(98*a->nz);
1301   return 0;
1302 }
1303 
1304 
1305 #undef __FUNC__
1306 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
1307 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1308 {
1309   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1310   register Scalar *x,*z,*v,*xb;
1311   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
1312   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1313 
1314   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1315 
1316   VecGetArray_Fast(xx,x);
1317   VecGetArray_Fast(zz,z);
1318 
1319   idx   = a->j;
1320   v     = a->a;
1321   ii    = a->i;
1322 
1323 
1324   if (!a->mult_work) {
1325     k = PetscMax(a->m,a->n);
1326     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
1327   }
1328   work = a->mult_work;
1329   for ( i=0; i<mbs; i++ ) {
1330     n     = ii[1] - ii[0]; ii++;
1331     ncols = n*bs;
1332     workt = work;
1333     for ( j=0; j<n; j++ ) {
1334       xb = x + bs*(*idx++);
1335       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1336       workt += bs;
1337     }
1338     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
1339     v += n*bs2;
1340     z += bs;
1341   }
1342   VecRestoreArray_Fast(xx,x);
1343   VecRestoreArray_Fast(zz,z);
1344   PLogFlops(2*a->nz*bs2 );
1345   return 0;
1346 }
1347 
1348 #undef __FUNC__
1349 #define __FUNC__ "MatMultTrans_SeqBAIJ"
1350 static int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
1351 {
1352   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1353   Scalar          *xg,*zg,*zb;
1354   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1355   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1356   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1357 
1358 
1359   VecGetArray_Fast(xx,xg); x = xg;
1360   VecGetArray_Fast(zz,zg); z = zg;
1361   PetscMemzero(z,N*sizeof(Scalar));
1362 
1363   idx   = a->j;
1364   v     = a->a;
1365   ii    = a->i;
1366 
1367   switch (bs) {
1368   case 1:
1369     for ( i=0; i<mbs; i++ ) {
1370       n  = ii[1] - ii[0]; ii++;
1371       xb = x + i; x1 = xb[0];
1372       ib = idx + ai[i];
1373       for ( j=0; j<n; j++ ) {
1374         rval    = ib[j];
1375         z[rval] += *v++ * x1;
1376       }
1377     }
1378     break;
1379   case 2:
1380     for ( i=0; i<mbs; i++ ) {
1381       n  = ii[1] - ii[0]; ii++;
1382       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1383       ib = idx + ai[i];
1384       for ( j=0; j<n; j++ ) {
1385         rval      = ib[j]*2;
1386         z[rval++] += v[0]*x1 + v[1]*x2;
1387         z[rval++] += v[2]*x1 + v[3]*x2;
1388         v += 4;
1389       }
1390     }
1391     break;
1392   case 3:
1393     for ( i=0; i<mbs; i++ ) {
1394       n  = ii[1] - ii[0]; ii++;
1395       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1396       ib = idx + ai[i];
1397       for ( j=0; j<n; j++ ) {
1398         rval      = ib[j]*3;
1399         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1400         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1401         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1402         v += 9;
1403       }
1404     }
1405     break;
1406   case 4:
1407     for ( i=0; i<mbs; i++ ) {
1408       n  = ii[1] - ii[0]; ii++;
1409       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1410       ib = idx + ai[i];
1411       for ( j=0; j<n; j++ ) {
1412         rval      = ib[j]*4;
1413         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1414         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1415         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1416         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1417         v += 16;
1418       }
1419     }
1420     break;
1421   case 5:
1422     for ( i=0; i<mbs; i++ ) {
1423       n  = ii[1] - ii[0]; ii++;
1424       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1425       x4 = xb[3];   x5 = xb[4];
1426       ib = idx + ai[i];
1427       for ( j=0; j<n; j++ ) {
1428         rval      = ib[j]*5;
1429         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1430         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1431         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1432         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1433         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1434         v += 25;
1435       }
1436     }
1437     break;
1438       /* block sizes larger then 5 by 5 are handled by BLAS */
1439     default: {
1440       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1441       if (!a->mult_work) {
1442         k = PetscMax(a->m,a->n);
1443         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1444         CHKPTRQ(a->mult_work);
1445       }
1446       work = a->mult_work;
1447       for ( i=0; i<mbs; i++ ) {
1448         n     = ii[1] - ii[0]; ii++;
1449         ncols = n*bs;
1450         PetscMemzero(work,ncols*sizeof(Scalar));
1451         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1452         v += n*bs2;
1453         x += bs;
1454         workt = work;
1455         for ( j=0; j<n; j++ ) {
1456           zb = z + bs*(*idx++);
1457           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1458           workt += bs;
1459         }
1460       }
1461     }
1462   }
1463   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1464   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1465   PLogFlops(2*a->nz*a->bs2 - a->n);
1466   return 0;
1467 }
1468 
1469 #undef __FUNC__
1470 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1471 static int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1472 {
1473   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1474   Scalar          *xg,*zg,*zb;
1475   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1476   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1477   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1478 
1479 
1480 
1481   VecGetArray_Fast(xx,xg); x = xg;
1482   VecGetArray_Fast(zz,zg); z = zg;
1483 
1484   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1485   else PetscMemzero(z,N*sizeof(Scalar));
1486 
1487 
1488   idx   = a->j;
1489   v     = a->a;
1490   ii    = a->i;
1491 
1492   switch (bs) {
1493   case 1:
1494     for ( i=0; i<mbs; i++ ) {
1495       n  = ii[1] - ii[0]; ii++;
1496       xb = x + i; x1 = xb[0];
1497       ib = idx + ai[i];
1498       for ( j=0; j<n; j++ ) {
1499         rval    = ib[j];
1500         z[rval] += *v++ * x1;
1501       }
1502     }
1503     break;
1504   case 2:
1505     for ( i=0; i<mbs; i++ ) {
1506       n  = ii[1] - ii[0]; ii++;
1507       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1508       ib = idx + ai[i];
1509       for ( j=0; j<n; j++ ) {
1510         rval      = ib[j]*2;
1511         z[rval++] += v[0]*x1 + v[1]*x2;
1512         z[rval++] += v[2]*x1 + v[3]*x2;
1513         v += 4;
1514       }
1515     }
1516     break;
1517   case 3:
1518     for ( i=0; i<mbs; i++ ) {
1519       n  = ii[1] - ii[0]; ii++;
1520       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1521       ib = idx + ai[i];
1522       for ( j=0; j<n; j++ ) {
1523         rval      = ib[j]*3;
1524         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1525         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1526         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1527         v += 9;
1528       }
1529     }
1530     break;
1531   case 4:
1532     for ( i=0; i<mbs; i++ ) {
1533       n  = ii[1] - ii[0]; ii++;
1534       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1535       ib = idx + ai[i];
1536       for ( j=0; j<n; j++ ) {
1537         rval      = ib[j]*4;
1538         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1539         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1540         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1541         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1542         v += 16;
1543       }
1544     }
1545     break;
1546   case 5:
1547     for ( i=0; i<mbs; i++ ) {
1548       n  = ii[1] - ii[0]; ii++;
1549       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1550       x4 = xb[3];   x5 = xb[4];
1551       ib = idx + ai[i];
1552       for ( j=0; j<n; j++ ) {
1553         rval      = ib[j]*5;
1554         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1555         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1556         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1557         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1558         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1559         v += 25;
1560       }
1561     }
1562     break;
1563       /* block sizes larger then 5 by 5 are handled by BLAS */
1564     default: {
1565       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1566       if (!a->mult_work) {
1567         k = PetscMax(a->m,a->n);
1568         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1569         CHKPTRQ(a->mult_work);
1570       }
1571       work = a->mult_work;
1572       for ( i=0; i<mbs; i++ ) {
1573         n     = ii[1] - ii[0]; ii++;
1574         ncols = n*bs;
1575         PetscMemzero(work,ncols*sizeof(Scalar));
1576         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1577         v += n*bs2;
1578         x += bs;
1579         workt = work;
1580         for ( j=0; j<n; j++ ) {
1581           zb = z + bs*(*idx++);
1582           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1583           workt += bs;
1584         }
1585       }
1586     }
1587   }
1588   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1589   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1590   PLogFlops(2*a->nz*a->bs2);
1591   return 0;
1592 }
1593 
1594 #undef __FUNC__
1595 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1596 static int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1597 {
1598   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1599 
1600   info->rows_global    = (double)a->m;
1601   info->columns_global = (double)a->n;
1602   info->rows_local     = (double)a->m;
1603   info->columns_local  = (double)a->n;
1604   info->block_size     = a->bs2;
1605   info->nz_allocated   = a->maxnz;
1606   info->nz_used        = a->bs2*a->nz;
1607   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1608   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1609     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1610   info->assemblies   = A->num_ass;
1611   info->mallocs      = a->reallocs;
1612   info->memory       = A->mem;
1613   if (A->factor) {
1614     info->fill_ratio_given  = A->info.fill_ratio_given;
1615     info->fill_ratio_needed = A->info.fill_ratio_needed;
1616     info->factor_mallocs    = A->info.factor_mallocs;
1617   } else {
1618     info->fill_ratio_given  = 0;
1619     info->fill_ratio_needed = 0;
1620     info->factor_mallocs    = 0;
1621   }
1622   return 0;
1623 }
1624 
1625 #undef __FUNC__
1626 #define __FUNC__ "MatEqual_SeqBAIJ"
1627 static int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1628 {
1629   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1630 
1631   if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type");
1632 
1633   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1634   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1635       (a->nz != b->nz)) {
1636     *flg = PETSC_FALSE; return 0;
1637   }
1638 
1639   /* if the a->i are the same */
1640   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1641     *flg = PETSC_FALSE; return 0;
1642   }
1643 
1644   /* if a->j are the same */
1645   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1646     *flg = PETSC_FALSE; return 0;
1647   }
1648 
1649   /* if a->a are the same */
1650   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1651     *flg = PETSC_FALSE; return 0;
1652   }
1653   *flg = PETSC_TRUE;
1654   return 0;
1655 
1656 }
1657 
1658 #undef __FUNC__
1659 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1660 static int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1661 {
1662   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1663   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1664   Scalar      *x, zero = 0.0,*aa,*aa_j;
1665 
1666   if (A->factor) SETERRQ(1,0,"Not for factored matrix");
1667   bs   = a->bs;
1668   aa   = a->a;
1669   ai   = a->i;
1670   aj   = a->j;
1671   ambs = a->mbs;
1672   bs2  = a->bs2;
1673 
1674   VecSet(&zero,v);
1675   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1676   if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector");
1677   for ( i=0; i<ambs; i++ ) {
1678     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1679       if (aj[j] == i) {
1680         row  = i*bs;
1681         aa_j = aa+j*bs2;
1682         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1683         break;
1684       }
1685     }
1686   }
1687   return 0;
1688 }
1689 
1690 #undef __FUNC__
1691 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1692 static int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1693 {
1694   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1695   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1696   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1697 
1698   ai  = a->i;
1699   aj  = a->j;
1700   aa  = a->a;
1701   m   = a->m;
1702   n   = a->n;
1703   bs  = a->bs;
1704   mbs = a->mbs;
1705   bs2 = a->bs2;
1706   if (ll) {
1707     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1708     if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length");
1709     for ( i=0; i<mbs; i++ ) { /* for each block row */
1710       M  = ai[i+1] - ai[i];
1711       li = l + i*bs;
1712       v  = aa + bs2*ai[i];
1713       for ( j=0; j<M; j++ ) { /* for each block */
1714         for ( k=0; k<bs2; k++ ) {
1715           (*v++) *= li[k%bs];
1716         }
1717       }
1718     }
1719   }
1720 
1721   if (rr) {
1722     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1723     if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length");
1724     for ( i=0; i<mbs; i++ ) { /* for each block row */
1725       M  = ai[i+1] - ai[i];
1726       v  = aa + bs2*ai[i];
1727       for ( j=0; j<M; j++ ) { /* for each block */
1728         ri = r + bs*aj[ai[i]+j];
1729         for ( k=0; k<bs; k++ ) {
1730           x = ri[k];
1731           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1732         }
1733       }
1734     }
1735   }
1736   return 0;
1737 }
1738 
1739 
1740 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1741 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1742 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1743 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1744 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1745 
1746 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1747 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1748 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1749 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1750 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1751 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1752 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec);
1753 
1754 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1755 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1756 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1757 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1758 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1759 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1760 
1761 #undef __FUNC__
1762 #define __FUNC__ "MatNorm_SeqBAIJ"
1763 static int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1764 {
1765   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1766   Scalar      *v = a->a;
1767   double      sum = 0.0;
1768   int         i,nz=a->nz,bs2=a->bs2;
1769 
1770   if (type == NORM_FROBENIUS) {
1771     for (i=0; i< bs2*nz; i++ ) {
1772 #if defined(PETSC_COMPLEX)
1773       sum += real(conj(*v)*(*v)); v++;
1774 #else
1775       sum += (*v)*(*v); v++;
1776 #endif
1777     }
1778     *norm = sqrt(sum);
1779   }
1780   else {
1781     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1782   }
1783   return 0;
1784 }
1785 
1786 /*
1787      note: This can only work for identity for row and col. It would
1788    be good to check this and otherwise generate an error.
1789 */
1790 #undef __FUNC__
1791 #define __FUNC__ "MatILUFactor_SeqBAIJ"
1792 static int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1793 {
1794   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1795   Mat         outA;
1796   int         ierr;
1797 
1798   if (fill != 0) SETERRQ(1,0,"Only fill=0 supported");
1799 
1800   outA          = inA;
1801   inA->factor   = FACTOR_LU;
1802   a->row        = row;
1803   a->col        = col;
1804 
1805   a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1806 
1807   if (!a->diag) {
1808     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1809   }
1810   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1811   return 0;
1812 }
1813 
1814 #undef __FUNC__
1815 #define __FUNC__ "MatScale_SeqBAIJ"
1816 static int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1817 {
1818   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1819   int         one = 1, totalnz = a->bs2*a->nz;
1820   BLscal_( &totalnz, alpha, a->a, &one );
1821   PLogFlops(totalnz);
1822   return 0;
1823 }
1824 
1825 #undef __FUNC__
1826 #define __FUNC__ "MatGetValues_SeqBAIJ"
1827 static int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1828 {
1829   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1830   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1831   int        *ai = a->i, *ailen = a->ilen;
1832   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1833   Scalar     *ap, *aa = a->a, zero = 0.0;
1834 
1835   for ( k=0; k<m; k++ ) { /* loop over rows */
1836     row  = im[k]; brow = row/bs;
1837     if (row < 0) SETERRQ(1,0,"Negative row");
1838     if (row >= a->m) SETERRQ(1,0,"Row too large");
1839     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1840     nrow = ailen[brow];
1841     for ( l=0; l<n; l++ ) { /* loop over columns */
1842       if (in[l] < 0) SETERRQ(1,0,"Negative column");
1843       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
1844       col  = in[l] ;
1845       bcol = col/bs;
1846       cidx = col%bs;
1847       ridx = row%bs;
1848       high = nrow;
1849       low  = 0; /* assume unsorted */
1850       while (high-low > 5) {
1851         t = (low+high)/2;
1852         if (rp[t] > bcol) high = t;
1853         else             low  = t;
1854       }
1855       for ( i=low; i<high; i++ ) {
1856         if (rp[i] > bcol) break;
1857         if (rp[i] == bcol) {
1858           *v++ = ap[bs2*i+bs*cidx+ridx];
1859           goto finished;
1860         }
1861       }
1862       *v++ = zero;
1863       finished:;
1864     }
1865   }
1866   return 0;
1867 }
1868 
1869 #undef __FUNC__
1870 #define __FUNC__ "MatGetBlockSize_SeqBAIJ"
1871 static int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1872 {
1873   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1874   *bs = baij->bs;
1875   return 0;
1876 }
1877 
1878 /* idx should be of length atlease bs */
1879 #undef __FUNC__
1880 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block"
1881 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1882 {
1883   int i,row;
1884   row = idx[0];
1885   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1886 
1887   for ( i=1; i<bs; i++ ) {
1888     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1889   }
1890   *flg = PETSC_TRUE;
1891   return 0;
1892 }
1893 
1894 #undef __FUNC__
1895 #define __FUNC__ "MatZeroRows_SeqBAIJ"
1896 static int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1897 {
1898   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1899   IS          is_local;
1900   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1901   PetscTruth  flg;
1902   Scalar      zero = 0.0,*aa;
1903 
1904   /* Make a copy of the IS and  sort it */
1905   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1906   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1907   ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1908   ierr = ISSort(is_local); CHKERRQ(ierr);
1909   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1910 
1911   i = 0;
1912   while (i < is_n) {
1913     if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range");
1914     flg = PETSC_FALSE;
1915     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1916     if (flg) { /* There exists a block of rows to be Zerowed */
1917       baij->ilen[rows[i]/bs] = 0;
1918       i += bs;
1919     } else { /* Zero out only the requested row */
1920       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1921       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1922       for ( j=0; j<count; j++ ) {
1923         aa[0] = zero;
1924         aa+=bs;
1925       }
1926       i++;
1927     }
1928   }
1929   if (diag) {
1930     for ( j=0; j<is_n; j++ ) {
1931       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1932     }
1933   }
1934   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
1935   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
1936   ierr = ISDestroy(is_local); CHKERRQ(ierr);
1937   ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1938 
1939   return 0;
1940 }
1941 
1942 #undef __FUNC__
1943 #define __FUNC__ "MatPrintHelp_SeqBAIJ"
1944 int MatPrintHelp_SeqBAIJ(Mat A)
1945 {
1946   static int called = 0;
1947   MPI_Comm   comm = A->comm;
1948 
1949   if (called) return 0; else called = 1;
1950   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
1951   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
1952   return 0;
1953 }
1954 
1955 /* -------------------------------------------------------------------*/
1956 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
1957        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
1958        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
1959        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
1960        MatSolve_SeqBAIJ_N,0,
1961        0,0,
1962        MatLUFactor_SeqBAIJ,0,
1963        0,
1964        MatTranspose_SeqBAIJ,
1965        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
1966        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
1967        0,MatAssemblyEnd_SeqBAIJ,
1968        0,
1969        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
1970        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
1971        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
1972        MatILUFactorSymbolic_SeqBAIJ,0,
1973        0,0,
1974        MatConvertSameType_SeqBAIJ,0,0,
1975        MatILUFactor_SeqBAIJ,0,0,
1976        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
1977        MatGetValues_SeqBAIJ,0,
1978        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
1979        0,0,0,MatGetBlockSize_SeqBAIJ,
1980        MatGetRowIJ_SeqBAIJ,
1981        MatRestoreRowIJ_SeqBAIJ,
1982        0,0,0,0,0,0,
1983        MatSetValuesBlocked_SeqBAIJ};
1984 
1985 #undef __FUNC__
1986 #define __FUNC__ "MatCreateSeqBAIJ"
1987 /*@C
1988    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
1989    compressed row) format.  For good matrix assembly performance the
1990    user should preallocate the matrix storage by setting the parameter nz
1991    (or the array nzz).  By setting these parameters accurately, performance
1992    during matrix assembly can be increased by more than a factor of 50.
1993 
1994    Input Parameters:
1995 .  comm - MPI communicator, set to MPI_COMM_SELF
1996 .  bs - size of block
1997 .  m - number of rows
1998 .  n - number of columns
1999 .  nz - number of block nonzeros per block row (same for all rows)
2000 .  nzz - array containing the number of block nonzeros in the various block rows
2001          (possibly different for each block row) or PETSC_NULL
2002 
2003    Output Parameter:
2004 .  A - the matrix
2005 
2006    Options Database Keys:
2007 $    -mat_no_unroll - uses code that does not unroll the loops in the
2008 $                     block calculations (much solver)
2009 $    -mat_block_size - size of the blocks to use
2010 
2011    Notes:
2012    The block AIJ format is fully compatible with standard Fortran 77
2013    storage.  That is, the stored row and column indices can begin at
2014    either one (as in Fortran) or zero.  See the users' manual for details.
2015 
2016    Specify the preallocated storage with either nz or nnz (not both).
2017    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
2018    allocation.  For additional details, see the users manual chapter on
2019    matrices.
2020 
2021 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2022 @*/
2023 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
2024 {
2025   Mat         B;
2026   Mat_SeqBAIJ *b;
2027   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size;
2028 
2029   MPI_Comm_size(comm,&size);
2030   if (size > 1) SETERRQ(1,0,"Comm must be of size 1");
2031 
2032   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr);
2033 
2034   if (mbs*bs!=m || nbs*bs!=n)
2035     SETERRQ(1,0,"Number rows, cols must be divisible by blocksize");
2036 
2037   *A = 0;
2038   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
2039   PLogObjectCreate(B);
2040   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
2041   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
2042   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
2043   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
2044   if (!flg) {
2045     switch (bs) {
2046     case 1:
2047       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2048       B->ops.solve           = MatSolve_SeqBAIJ_1;
2049       B->ops.mult            = MatMult_SeqBAIJ_1;
2050       B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
2051       break;
2052     case 2:
2053       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
2054       B->ops.solve           = MatSolve_SeqBAIJ_2;
2055       B->ops.mult            = MatMult_SeqBAIJ_2;
2056       B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
2057       break;
2058     case 3:
2059       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
2060       B->ops.solve           = MatSolve_SeqBAIJ_3;
2061       B->ops.mult            = MatMult_SeqBAIJ_3;
2062       B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
2063       break;
2064     case 4:
2065       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
2066       B->ops.solve           = MatSolve_SeqBAIJ_4;
2067       B->ops.mult            = MatMult_SeqBAIJ_4;
2068       B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
2069       break;
2070     case 5:
2071       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
2072       B->ops.solve           = MatSolve_SeqBAIJ_5;
2073       B->ops.mult            = MatMult_SeqBAIJ_5;
2074       B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
2075       break;
2076     case 7:
2077       B->ops.mult            = MatMult_SeqBAIJ_7;
2078       B->ops.solve           = MatSolve_SeqBAIJ_7;
2079       B->ops.multadd         = MatMultAdd_SeqBAIJ_7;
2080       break;
2081     }
2082   }
2083   B->destroy          = MatDestroy_SeqBAIJ;
2084   B->view             = MatView_SeqBAIJ;
2085   B->factor           = 0;
2086   B->lupivotthreshold = 1.0;
2087   B->mapping          = 0;
2088   b->row              = 0;
2089   b->col              = 0;
2090   b->reallocs         = 0;
2091 
2092   b->m       = m; B->m = m; B->M = m;
2093   b->n       = n; B->n = n; B->N = n;
2094   b->mbs     = mbs;
2095   b->nbs     = nbs;
2096   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
2097   if (nnz == PETSC_NULL) {
2098     if (nz == PETSC_DEFAULT) nz = 5;
2099     else if (nz <= 0)        nz = 1;
2100     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
2101     nz = nz*mbs;
2102   }
2103   else {
2104     nz = 0;
2105     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
2106   }
2107 
2108   /* allocate the matrix space */
2109   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
2110   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
2111   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
2112   b->j  = (int *) (b->a + nz*bs2);
2113   PetscMemzero(b->j,nz*sizeof(int));
2114   b->i  = b->j + nz;
2115   b->singlemalloc = 1;
2116 
2117   b->i[0] = 0;
2118   for (i=1; i<mbs+1; i++) {
2119     b->i[i] = b->i[i-1] + b->imax[i-1];
2120   }
2121 
2122   /* b->ilen will count nonzeros in each block row so far. */
2123   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
2124   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
2125   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
2126 
2127   b->bs               = bs;
2128   b->bs2              = bs2;
2129   b->mbs              = mbs;
2130   b->nz               = 0;
2131   b->maxnz            = nz*bs2;
2132   b->sorted           = 0;
2133   b->roworiented      = 1;
2134   b->nonew            = 0;
2135   b->diag             = 0;
2136   b->solve_work       = 0;
2137   b->mult_work        = 0;
2138   b->spptr            = 0;
2139   B->info.nz_unneeded = (double)b->maxnz;
2140 
2141   *A = B;
2142   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
2143   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
2144   return 0;
2145 }
2146 
2147 #undef __FUNC__
2148 #define __FUNC__ "MatConvertSameType_SeqBAIJ"
2149 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
2150 {
2151   Mat         C;
2152   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
2153   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
2154 
2155   if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix");
2156 
2157   *B = 0;
2158   PetscHeaderCreate(C,_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
2159   PLogObjectCreate(C);
2160   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
2161   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
2162   C->destroy    = MatDestroy_SeqBAIJ;
2163   C->view       = MatView_SeqBAIJ;
2164   C->factor     = A->factor;
2165   c->row        = 0;
2166   c->col        = 0;
2167   C->assembled  = PETSC_TRUE;
2168 
2169   c->m = C->m   = a->m;
2170   c->n = C->n   = a->n;
2171   C->M          = a->m;
2172   C->N          = a->n;
2173 
2174   c->bs         = a->bs;
2175   c->bs2        = a->bs2;
2176   c->mbs        = a->mbs;
2177   c->nbs        = a->nbs;
2178 
2179   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
2180   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
2181   for ( i=0; i<mbs; i++ ) {
2182     c->imax[i] = a->imax[i];
2183     c->ilen[i] = a->ilen[i];
2184   }
2185 
2186   /* allocate the matrix space */
2187   c->singlemalloc = 1;
2188   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
2189   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
2190   c->j  = (int *) (c->a + nz*bs2);
2191   c->i  = c->j + nz;
2192   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
2193   if (mbs > 0) {
2194     PetscMemcpy(c->j,a->j,nz*sizeof(int));
2195     if (cpvalues == COPY_VALUES) {
2196       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
2197     }
2198   }
2199 
2200   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
2201   c->sorted      = a->sorted;
2202   c->roworiented = a->roworiented;
2203   c->nonew       = a->nonew;
2204 
2205   if (a->diag) {
2206     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
2207     PLogObjectMemory(C,(mbs+1)*sizeof(int));
2208     for ( i=0; i<mbs; i++ ) {
2209       c->diag[i] = a->diag[i];
2210     }
2211   }
2212   else c->diag          = 0;
2213   c->nz                 = a->nz;
2214   c->maxnz              = a->maxnz;
2215   c->solve_work         = 0;
2216   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
2217   c->mult_work          = 0;
2218   *B = C;
2219   return 0;
2220 }
2221 
2222 #undef __FUNC__
2223 #define __FUNC__ "MatLoad_SeqBAIJ"
2224 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
2225 {
2226   Mat_SeqBAIJ  *a;
2227   Mat          B;
2228   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
2229   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
2230   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
2231   int          *masked, nmask,tmp,bs2,ishift;
2232   Scalar       *aa;
2233   MPI_Comm     comm = ((PetscObject) viewer)->comm;
2234 
2235   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2236   bs2  = bs*bs;
2237 
2238   MPI_Comm_size(comm,&size);
2239   if (size > 1) SETERRQ(1,0,"view must have one processor");
2240   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2241   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
2242   if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object");
2243   M = header[1]; N = header[2]; nz = header[3];
2244 
2245   if (M != N) SETERRQ(1,0,"Can only do square matrices");
2246 
2247   /*
2248      This code adds extra rows to make sure the number of rows is
2249     divisible by the blocksize
2250   */
2251   mbs        = M/bs;
2252   extra_rows = bs - M + bs*(mbs);
2253   if (extra_rows == bs) extra_rows = 0;
2254   else                  mbs++;
2255   if (extra_rows) {
2256     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n");
2257   }
2258 
2259   /* read in row lengths */
2260   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
2261   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
2262   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2263 
2264   /* read in column indices */
2265   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
2266   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
2267   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
2268 
2269   /* loop over row lengths determining block row lengths */
2270   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
2271   PetscMemzero(browlengths,mbs*sizeof(int));
2272   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
2273   PetscMemzero(mask,mbs*sizeof(int));
2274   masked = mask + mbs;
2275   rowcount = 0; nzcount = 0;
2276   for ( i=0; i<mbs; i++ ) {
2277     nmask = 0;
2278     for ( j=0; j<bs; j++ ) {
2279       kmax = rowlengths[rowcount];
2280       for ( k=0; k<kmax; k++ ) {
2281         tmp = jj[nzcount++]/bs;
2282         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
2283       }
2284       rowcount++;
2285     }
2286     browlengths[i] += nmask;
2287     /* zero out the mask elements we set */
2288     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2289   }
2290 
2291   /* create our matrix */
2292   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);
2293          CHKERRQ(ierr);
2294   B = *A;
2295   a = (Mat_SeqBAIJ *) B->data;
2296 
2297   /* set matrix "i" values */
2298   a->i[0] = 0;
2299   for ( i=1; i<= mbs; i++ ) {
2300     a->i[i]      = a->i[i-1] + browlengths[i-1];
2301     a->ilen[i-1] = browlengths[i-1];
2302   }
2303   a->nz         = 0;
2304   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
2305 
2306   /* read in nonzero values */
2307   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
2308   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
2309   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
2310 
2311   /* set "a" and "j" values into matrix */
2312   nzcount = 0; jcount = 0;
2313   for ( i=0; i<mbs; i++ ) {
2314     nzcountb = nzcount;
2315     nmask    = 0;
2316     for ( j=0; j<bs; j++ ) {
2317       kmax = rowlengths[i*bs+j];
2318       for ( k=0; k<kmax; k++ ) {
2319         tmp = jj[nzcount++]/bs;
2320 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
2321       }
2322       rowcount++;
2323     }
2324     /* sort the masked values */
2325     PetscSortInt(nmask,masked);
2326 
2327     /* set "j" values into matrix */
2328     maskcount = 1;
2329     for ( j=0; j<nmask; j++ ) {
2330       a->j[jcount++]  = masked[j];
2331       mask[masked[j]] = maskcount++;
2332     }
2333     /* set "a" values into matrix */
2334     ishift = bs2*a->i[i];
2335     for ( j=0; j<bs; j++ ) {
2336       kmax = rowlengths[i*bs+j];
2337       for ( k=0; k<kmax; k++ ) {
2338         tmp    = jj[nzcountb]/bs ;
2339         block  = mask[tmp] - 1;
2340         point  = jj[nzcountb] - bs*tmp;
2341         idx    = ishift + bs2*block + j + bs*point;
2342         a->a[idx] = aa[nzcountb++];
2343       }
2344     }
2345     /* zero out the mask elements we set */
2346     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2347   }
2348   if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix");
2349 
2350   PetscFree(rowlengths);
2351   PetscFree(browlengths);
2352   PetscFree(aa);
2353   PetscFree(jj);
2354   PetscFree(mask);
2355 
2356   B->assembled = PETSC_TRUE;
2357 
2358   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
2359   if (flg) {
2360     Viewer tviewer;
2361     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2362     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO,0);CHKERRQ(ierr);
2363     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2364     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2365   }
2366   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
2367   if (flg) {
2368     Viewer tviewer;
2369     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2370     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO_LONG,0);CHKERRQ(ierr);
2371     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2372     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2373   }
2374   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
2375   if (flg) {
2376     Viewer tviewer;
2377     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2378     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2379     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2380   }
2381   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
2382   if (flg) {
2383     Viewer tviewer;
2384     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2385     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_MATLAB,"M");CHKERRQ(ierr);
2386     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2387     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2388   }
2389   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
2390   if (flg) {
2391     Viewer tviewer;
2392     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
2393     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
2394     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
2395     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2396   }
2397   return 0;
2398 }
2399 
2400 
2401 
2402