xref: /petsc/src/mat/impls/baij/seq/baij.c (revision 3131a8b617862149c3444cb94dfcb3997d6b3bc6)
1 #ifndef lint
2 static char vcid[] = "$Id: baij.c,v 1.88 1997/02/04 01:21:20 balay Exp balay $";
3 #endif
4 
5 /*
6     Defines the basic matrix operations for the BAIJ (compressed row)
7   matrix storage format.
8 */
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/spops.h"
12 #include "petsc.h"
13 
14 
15 /*
16      Adds diagonal pointers to sparse matrix structure.
17 */
18 
19 #undef __FUNC__
20 #define __FUNC__ "MatMarkDiag_SeqBAIJ"
21 int MatMarkDiag_SeqBAIJ(Mat A)
22 {
23   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
24   int         i,j, *diag, m = a->mbs;
25 
26   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
27   PLogObjectMemory(A,(m+1)*sizeof(int));
28   for ( i=0; i<m; i++ ) {
29     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
30       if (a->j[j] == i) {
31         diag[i] = j;
32         break;
33       }
34     }
35   }
36   a->diag = diag;
37   return 0;
38 }
39 
40 #include "draw.h"
41 #include "pinclude/pviewer.h"
42 #include "sys.h"
43 
44 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
45 
46 #undef __FUNC__
47 #define __FUNC__ "MatGetRowIJ_SeqBAIJ"
48 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
49                             PetscTruth *done)
50 {
51   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
52   int         ierr, n = a->mbs,i;
53 
54   *nn = n;
55   if (!ia) return 0;
56   if (symmetric) {
57     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
58   } else if (oshift == 1) {
59     /* temporarily add 1 to i and j indices */
60     int nz = a->i[n] + 1;
61     for ( i=0; i<nz; i++ ) a->j[i]++;
62     for ( i=0; i<n+1; i++ ) a->i[i]++;
63     *ia = a->i; *ja = a->j;
64   } else {
65     *ia = a->i; *ja = a->j;
66   }
67 
68   return 0;
69 }
70 
71 #undef __FUNC__
72 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ"
73 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
74                                 PetscTruth *done)
75 {
76   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
77   int         i,n = a->mbs;
78 
79   if (!ia) return 0;
80   if (symmetric) {
81     PetscFree(*ia);
82     PetscFree(*ja);
83   } else if (oshift == 1) {
84     int nz = a->i[n];
85     for ( i=0; i<nz; i++ ) a->j[i]--;
86     for ( i=0; i<n+1; i++ ) a->i[i]--;
87   }
88   return 0;
89 }
90 
91 
92 #undef __FUNC__
93 #define __FUNC__ "MatView_SeqBAIJ_Binary"
94 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
95 {
96   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
97   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
98   Scalar      *aa;
99 
100   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
101   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
102   col_lens[0] = MAT_COOKIE;
103 
104   col_lens[1] = a->m;
105   col_lens[2] = a->n;
106   col_lens[3] = a->nz*bs2;
107 
108   /* store lengths of each row and write (including header) to file */
109   count = 0;
110   for ( i=0; i<a->mbs; i++ ) {
111     for ( j=0; j<bs; j++ ) {
112       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
113     }
114   }
115   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
116   PetscFree(col_lens);
117 
118   /* store column indices (zero start index) */
119   jj = (int *) PetscMalloc( a->nz*bs2*sizeof(int) ); CHKPTRQ(jj);
120   count = 0;
121   for ( i=0; i<a->mbs; i++ ) {
122     for ( j=0; j<bs; j++ ) {
123       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
124         for ( l=0; l<bs; l++ ) {
125           jj[count++] = bs*a->j[k] + l;
126         }
127       }
128     }
129   }
130   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
131   PetscFree(jj);
132 
133   /* store nonzero values */
134   aa = (Scalar *) PetscMalloc(a->nz*bs2*sizeof(Scalar)); CHKPTRQ(aa);
135   count = 0;
136   for ( i=0; i<a->mbs; i++ ) {
137     for ( j=0; j<bs; j++ ) {
138       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
139         for ( l=0; l<bs; l++ ) {
140           aa[count++] = a->a[bs2*k + l*bs + j];
141         }
142       }
143     }
144   }
145   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
146   PetscFree(aa);
147   return 0;
148 }
149 
150 #undef __FUNC__
151 #define __FUNC__ "MatView_SeqBAIJ_ASCII"
152 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
153 {
154   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
155   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
156   FILE        *fd;
157   char        *outputname;
158 
159   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
160   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
161   ierr = ViewerGetFormat(viewer,&format);
162   if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
163     fprintf(fd,"  block size is %d\n",bs);
164   }
165   else if (format == VIEWER_FORMAT_ASCII_MATLAB) {
166     SETERRQ(1,0,"Matlab format not supported");
167   }
168   else if (format == VIEWER_FORMAT_ASCII_COMMON) {
169     for ( i=0; i<a->mbs; i++ ) {
170       for ( j=0; j<bs; j++ ) {
171         fprintf(fd,"row %d:",i*bs+j);
172         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
173           for ( l=0; l<bs; l++ ) {
174 #if defined(PETSC_COMPLEX)
175           if (imag(a->a[bs2*k + l*bs + j]) != 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
176             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
177               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
178           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
179             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
180 #else
181           if (a->a[bs2*k + l*bs + j] != 0.0)
182             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
183 #endif
184           }
185         }
186         fprintf(fd,"\n");
187       }
188     }
189   }
190   else {
191     for ( i=0; i<a->mbs; i++ ) {
192       for ( j=0; j<bs; j++ ) {
193         fprintf(fd,"row %d:",i*bs+j);
194         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
195           for ( l=0; l<bs; l++ ) {
196 #if defined(PETSC_COMPLEX)
197           if (imag(a->a[bs2*k + l*bs + j]) != 0.0) {
198             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
199               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
200           }
201           else {
202             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
203           }
204 #else
205             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
206 #endif
207           }
208         }
209         fprintf(fd,"\n");
210       }
211     }
212   }
213   fflush(fd);
214   return 0;
215 }
216 
217 #undef __FUNC__
218 #define __FUNC__ "MatView_SeqBAIJ_Draw"
219 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer)
220 {
221   Mat_SeqBAIJ  *a=(Mat_SeqBAIJ *) A->data;
222   int          row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2;
223   double       xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r;
224   Scalar       *aa;
225   Draw         draw;
226   DrawButton   button;
227   PetscTruth   isnull;
228 
229   ViewerDrawGetDraw(viewer,&draw);
230   ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
231 
232   xr  = a->n; yr = a->m; h = yr/10.0; w = xr/10.0;
233   xr += w;    yr += h;  xl = -w;     yl = -h;
234   ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
235   /* loop over matrix elements drawing boxes */
236   color = DRAW_BLUE;
237   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
238     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
239       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
240       x_l = a->j[j]*bs; x_r = x_l + 1.0;
241       aa = a->a + j*bs2;
242       for ( k=0; k<bs; k++ ) {
243         for ( l=0; l<bs; l++ ) {
244           if (PetscReal(*aa++) >=  0.) continue;
245           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
246         }
247       }
248     }
249   }
250   color = DRAW_CYAN;
251   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
252     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
253       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
254       x_l = a->j[j]*bs; x_r = x_l + 1.0;
255       aa = a->a + j*bs2;
256       for ( k=0; k<bs; k++ ) {
257         for ( l=0; l<bs; l++ ) {
258           if (PetscReal(*aa++) != 0.) continue;
259           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
260         }
261       }
262     }
263   }
264 
265   color = DRAW_RED;
266   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
267     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
268       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
269       x_l = a->j[j]*bs; x_r = x_l + 1.0;
270       aa = a->a + j*bs2;
271       for ( k=0; k<bs; k++ ) {
272         for ( l=0; l<bs; l++ ) {
273           if (PetscReal(*aa++) <= 0.) continue;
274           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
275         }
276       }
277     }
278   }
279 
280   DrawFlush(draw);
281   DrawGetPause(draw,&pause);
282   if (pause >= 0) { PetscSleep(pause); return 0;}
283 
284   /* allow the matrix to zoom or shrink */
285   ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
286   while (button != BUTTON_RIGHT) {
287     DrawClear(draw);
288     if (button == BUTTON_LEFT) scale = .5;
289     else if (button == BUTTON_CENTER) scale = 2.;
290     xl = scale*(xl + w - xc) + xc - w*scale;
291     xr = scale*(xr - w - xc) + xc + w*scale;
292     yl = scale*(yl + h - yc) + yc - h*scale;
293     yr = scale*(yr - h - yc) + yc + h*scale;
294     w *= scale; h *= scale;
295     ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
296     color = DRAW_BLUE;
297     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
298       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
299         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
300         x_l = a->j[j]*bs; x_r = x_l + 1.0;
301         aa = a->a + j*bs2;
302         for ( k=0; k<bs; k++ ) {
303           for ( l=0; l<bs; l++ ) {
304             if (PetscReal(*aa++) >=  0.) continue;
305             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
306           }
307         }
308       }
309     }
310     color = DRAW_CYAN;
311     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
312       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
313         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
314         x_l = a->j[j]*bs; x_r = x_l + 1.0;
315         aa = a->a + j*bs2;
316         for ( k=0; k<bs; k++ ) {
317           for ( l=0; l<bs; l++ ) {
318           if (PetscReal(*aa++) != 0.) continue;
319           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
320           }
321         }
322       }
323     }
324 
325     color = DRAW_RED;
326     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
327       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
328         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
329         x_l = a->j[j]*bs; x_r = x_l + 1.0;
330         aa = a->a + j*bs2;
331         for ( k=0; k<bs; k++ ) {
332           for ( l=0; l<bs; l++ ) {
333             if (PetscReal(*aa++) <= 0.) continue;
334             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
335           }
336         }
337       }
338     }
339     ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
340   }
341   return 0;
342 }
343 
344 #undef __FUNC__
345 #define __FUNC__ "MatView_SeqBAIJ"
346 static int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
347 {
348   Mat         A = (Mat) obj;
349   ViewerType  vtype;
350   int         ierr;
351 
352   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
353   if (vtype == MATLAB_VIEWER) {
354     SETERRQ(1,0,"Matlab viewer not supported");
355   }
356   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
357     return MatView_SeqBAIJ_ASCII(A,viewer);
358   }
359   else if (vtype == BINARY_FILE_VIEWER) {
360     return MatView_SeqBAIJ_Binary(A,viewer);
361   }
362   else if (vtype == DRAW_VIEWER) {
363     return MatView_SeqBAIJ_Draw(A,viewer);
364   }
365   return 0;
366 }
367 
368 #define CHUNKSIZE  10
369 
370 #undef __FUNC__
371 #define __FUNC__ "MatSetValues_SeqBAIJ"
372 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
373 {
374   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
375   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
376   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
377   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
378   int          ridx,cidx,bs2=a->bs2;
379   Scalar      *ap,value,*aa=a->a,*bap;
380 
381   for ( k=0; k<m; k++ ) { /* loop over added rows */
382     row  = im[k]; brow = row/bs;
383 #if defined(PETSC_BOPT_g)
384     if (row < 0) SETERRQ(1,0,"Negative row");
385     if (row >= a->m) SETERRQ(1,0,"Row too large");
386 #endif
387     rp   = aj + ai[brow];
388     ap   = aa + bs2*ai[brow];
389     rmax = imax[brow];
390     nrow = ailen[brow];
391     low  = 0;
392     for ( l=0; l<n; l++ ) { /* loop over added columns */
393 #if defined(PETSC_BOPT_g)
394       if (in[l] < 0) SETERRQ(1,0,"Negative column");
395       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
396 #endif
397       col = in[l]; bcol = col/bs;
398       ridx = row % bs; cidx = col % bs;
399       if (roworiented) {
400         value = *v++;
401       }
402       else {
403         value = v[k + l*m];
404       }
405       if (!sorted) low = 0; high = nrow;
406       while (high-low > 7) {
407         t = (low+high)/2;
408         if (rp[t] > bcol) high = t;
409         else              low  = t;
410       }
411       for ( i=low; i<high; i++ ) {
412         if (rp[i] > bcol) break;
413         if (rp[i] == bcol) {
414           bap  = ap +  bs2*i + bs*cidx + ridx;
415           if (is == ADD_VALUES) *bap += value;
416           else                  *bap  = value;
417           goto noinsert;
418         }
419       }
420       if (nonew) goto noinsert;
421       if (nrow >= rmax) {
422         /* there is no extra room in row, therefore enlarge */
423         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
424         Scalar *new_a;
425 
426         /* malloc new storage space */
427         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
428         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
429         new_j   = (int *) (new_a + bs2*new_nz);
430         new_i   = new_j + new_nz;
431 
432         /* copy over old data into new slots */
433         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
434         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
435         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
436         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
437         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
438                                                            len*sizeof(int));
439         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
440         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
441         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
442                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
443         /* free up old matrix storage */
444         PetscFree(a->a);
445         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
446         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
447         a->singlemalloc = 1;
448 
449         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
450         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
451         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
452         a->maxnz += bs2*CHUNKSIZE;
453         a->reallocs++;
454         a->nz++;
455       }
456       N = nrow++ - 1;
457       /* shift up all the later entries in this row */
458       for ( ii=N; ii>=i; ii-- ) {
459         rp[ii+1] = rp[ii];
460         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
461       }
462       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
463       rp[i]                      = bcol;
464       ap[bs2*i + bs*cidx + ridx] = value;
465       noinsert:;
466       low = i;
467     }
468     ailen[brow] = nrow;
469   }
470   return 0;
471 }
472 
473 #undef __FUNC__
474 #define __FUNC__ "MatSetValues_SeqBAIJ"
475 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
476 {
477   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
478   int         *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
479   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
480   int         *aj=a->j,nonew=a->nonew;
481   int          bs2=a->bs2,bs=a->bs;
482   Scalar      *ap,*value,*aa=a->a,*bap;
483 
484   for ( k=0; k<m; k++ ) { /* loop over added rows */
485     row  = im[k];
486 #if defined(PETSC_BOPT_g)
487     if (row < 0) SETERRQ(1,0,"Negative row");
488     if (row >= a->mbs) SETERRQ(1,0,"Row too large");
489 #endif
490     rp   = aj + ai[row];
491     ap   = aa + bs2*ai[row];
492     rmax = imax[row];
493     nrow = ailen[row];
494     low  = 0;
495     for ( l=0; l<n; l++ ) { /* loop over added columns */
496 #if defined(PETSC_BOPT_g)
497       if (in[l] < 0) SETERRQ(1,0,"Negative column");
498       if (in[l] >= a->nbs) SETERRQ(1,0,"Column too large");
499 #endif
500       col = in[l];
501       if (roworiented) {
502         value = v; v+=bs2;
503       }
504       else {
505         value = v+(k + l*m)*bs2;
506       }
507       if (!sorted) low = 0; high = nrow;
508       while (high-low > 7) {
509         t = (low+high)/2;
510         if (rp[t] > col) high = t;
511         else             low  = t;
512       }
513       for ( i=low; i<high; i++ ) {
514         if (rp[i] > col) break;
515         if (rp[i] == col) {
516           bap  = ap +  bs2*i;
517           if (is == ADD_VALUES) {
518             for ( ii=0; ii<bs; ii++)
519               for (jj=ii; jj<bs2; jj+=bs )
520                 bap[jj] += *value++;
521           }
522           else {
523             for ( ii=0; ii<bs; ii++)
524               for (jj=ii; jj<bs2; jj+=bs )
525                 bap[jj] += *value++;
526           }
527           goto noinsert;
528         }
529       }
530       if (nonew) goto noinsert;
531       if (nrow >= rmax) {
532         /* there is no extra room in row, therefore enlarge */
533         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
534         Scalar *new_a;
535 
536         /* malloc new storage space */
537         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
538         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
539         new_j   = (int *) (new_a + bs2*new_nz);
540         new_i   = new_j + new_nz;
541 
542         /* copy over old data into new slots */
543         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];}
544         for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
545         PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int));
546         len = (new_nz - CHUNKSIZE - ai[row] - nrow);
547         PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,
548                                                            len*sizeof(int));
549         PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar));
550         PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
551         PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),
552                     aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar));
553         /* free up old matrix storage */
554         PetscFree(a->a);
555         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
556         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
557         a->singlemalloc = 1;
558 
559         rp   = aj + ai[row]; ap = aa + bs2*ai[row];
560         rmax = imax[row] = imax[row] + CHUNKSIZE;
561         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
562         a->maxnz += bs2*CHUNKSIZE;
563         a->reallocs++;
564         a->nz++;
565       }
566       N = nrow++ - 1;
567       /* shift up all the later entries in this row */
568       for ( ii=N; ii>=i; ii-- ) {
569         rp[ii+1] = rp[ii];
570         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
571       }
572       if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
573       rp[i] = col;
574       bap   = ap +  bs2*i;
575       for ( ii=0; ii<bs; ii++)
576         for (jj=ii; jj<bs2; jj+=bs )
577           bap[jj] += *value++;
578       noinsert:;
579       low = i;
580     }
581     ailen[row] = nrow;
582   }
583   return 0;
584 }
585 
586 #undef __FUNC__
587 #define __FUNC__ "MatGetSize_SeqBAIJ"
588 static int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
589 {
590   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
591   *m = a->m; *n = a->n;
592   return 0;
593 }
594 
595 #undef __FUNC__
596 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ"
597 static int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
598 {
599   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
600   *m = 0; *n = a->m;
601   return 0;
602 }
603 
604 #undef __FUNC__
605 #define __FUNC__ "MatGetRow_SeqBAIJ"
606 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
607 {
608   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
609   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
610   Scalar      *aa,*v_i,*aa_i;
611 
612   bs  = a->bs;
613   ai  = a->i;
614   aj  = a->j;
615   aa  = a->a;
616   bs2 = a->bs2;
617 
618   if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range");
619 
620   bn  = row/bs;   /* Block number */
621   bp  = row % bs; /* Block Position */
622   M   = ai[bn+1] - ai[bn];
623   *nz = bs*M;
624 
625   if (v) {
626     *v = 0;
627     if (*nz) {
628       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
629       for ( i=0; i<M; i++ ) { /* for each block in the block row */
630         v_i  = *v + i*bs;
631         aa_i = aa + bs2*(ai[bn] + i);
632         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
633       }
634     }
635   }
636 
637   if (idx) {
638     *idx = 0;
639     if (*nz) {
640       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
641       for ( i=0; i<M; i++ ) { /* for each block in the block row */
642         idx_i = *idx + i*bs;
643         itmp  = bs*aj[ai[bn] + i];
644         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
645       }
646     }
647   }
648   return 0;
649 }
650 
651 #undef __FUNC__
652 #define __FUNC__ "MatRestoreRow_SeqBAIJ"
653 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
654 {
655   if (idx) {if (*idx) PetscFree(*idx);}
656   if (v)   {if (*v)   PetscFree(*v);}
657   return 0;
658 }
659 
660 #undef __FUNC__
661 #define __FUNC__ "MatTranspose_SeqBAIJ"
662 static int MatTranspose_SeqBAIJ(Mat A,Mat *B)
663 {
664   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
665   Mat         C;
666   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
667   int         *rows,*cols,bs2=a->bs2;
668   Scalar      *array=a->a;
669 
670   if (B==PETSC_NULL && mbs!=nbs)
671     SETERRQ(1,0,"Square matrix only for in-place");
672   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
673   PetscMemzero(col,(1+nbs)*sizeof(int));
674 
675   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
676   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
677   PetscFree(col);
678   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
679   cols = rows + bs;
680   for ( i=0; i<mbs; i++ ) {
681     cols[0] = i*bs;
682     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
683     len = ai[i+1] - ai[i];
684     for ( j=0; j<len; j++ ) {
685       rows[0] = (*aj++)*bs;
686       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
687       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
688       array += bs2;
689     }
690   }
691   PetscFree(rows);
692 
693   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
694   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
695 
696   if (B != PETSC_NULL) {
697     *B = C;
698   } else {
699     /* This isn't really an in-place transpose */
700     PetscFree(a->a);
701     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
702     if (a->diag) PetscFree(a->diag);
703     if (a->ilen) PetscFree(a->ilen);
704     if (a->imax) PetscFree(a->imax);
705     if (a->solve_work) PetscFree(a->solve_work);
706     PetscFree(a);
707     PetscMemcpy(A,C,sizeof(struct _Mat));
708     PetscHeaderDestroy(C);
709   }
710   return 0;
711 }
712 
713 
714 #undef __FUNC__
715 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ"
716 static int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
717 {
718   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
719   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
720   int        m = a->m,*ip, N, *ailen = a->ilen;
721   int        mbs = a->mbs, bs2 = a->bs2,rmax;
722   Scalar     *aa = a->a, *ap;
723 
724   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
725 
726   rmax = ailen[0];
727   for ( i=1; i<mbs; i++ ) {
728     /* move each row back by the amount of empty slots (fshift) before it*/
729     fshift += imax[i-1] - ailen[i-1];
730     rmax   = PetscMax(rmax,ailen[i]);
731     if (fshift) {
732       ip = aj + ai[i]; ap = aa + bs2*ai[i];
733       N = ailen[i];
734       for ( j=0; j<N; j++ ) {
735         ip[j-fshift] = ip[j];
736         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
737       }
738     }
739     ai[i] = ai[i-1] + ailen[i-1];
740   }
741   if (mbs) {
742     fshift += imax[mbs-1] - ailen[mbs-1];
743     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
744   }
745   /* reset ilen and imax for each row */
746   for ( i=0; i<mbs; i++ ) {
747     ailen[i] = imax[i] = ai[i+1] - ai[i];
748   }
749   a->nz = ai[mbs];
750 
751   /* diagonals may have moved, so kill the diagonal pointers */
752   if (fshift && a->diag) {
753     PetscFree(a->diag);
754     PLogObjectMemory(A,-(m+1)*sizeof(int));
755     a->diag = 0;
756   }
757   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n",
758            m,a->n,a->bs,fshift*bs2,a->nz*bs2);
759   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n",
760            a->reallocs);
761   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax);
762   a->reallocs          = 0;
763   A->info.nz_unneeded  = (double)fshift*bs2;
764 
765   return 0;
766 }
767 
768 #undef __FUNC__
769 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
770 static int MatZeroEntries_SeqBAIJ(Mat A)
771 {
772   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
773   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
774   return 0;
775 }
776 
777 #undef __FUNC__
778 #define __FUNC__ "MatDestroy_SeqBAIJ"
779 int MatDestroy_SeqBAIJ(PetscObject obj)
780 {
781   Mat         A  = (Mat) obj;
782   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
783   int         ierr;
784 
785 #if defined(PETSC_LOG)
786   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
787 #endif
788   PetscFree(a->a);
789   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
790   if (a->diag) PetscFree(a->diag);
791   if (a->ilen) PetscFree(a->ilen);
792   if (a->imax) PetscFree(a->imax);
793   if (a->solve_work) PetscFree(a->solve_work);
794   if (a->mult_work) PetscFree(a->mult_work);
795   PetscFree(a);
796   if (A->mapping) {
797     ierr = ISLocalToGlobalMappingDestroy(A->mapping); CHKERRQ(ierr);
798   }
799   PLogObjectDestroy(A);
800   PetscHeaderDestroy(A);
801   return 0;
802 }
803 
804 #undef __FUNC__
805 #define __FUNC__ "MatSetOption_SeqBAIJ"
806 static int MatSetOption_SeqBAIJ(Mat A,MatOption op)
807 {
808   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
809   if      (op == MAT_ROW_ORIENTED)              a->roworiented = 1;
810   else if (op == MAT_COLUMN_ORIENTED)           a->roworiented = 0;
811   else if (op == MAT_COLUMNS_SORTED)            a->sorted      = 1;
812   else if (op == MAT_COLUMNS_UNSORTED)          a->sorted      = 0;
813   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)  a->nonew       = 1;
814   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew       = 0;
815   else if (op == MAT_ROWS_SORTED ||
816            op == MAT_ROWS_UNSORTED ||
817            op == MAT_SYMMETRIC ||
818            op == MAT_STRUCTURALLY_SYMMETRIC ||
819            op == MAT_YES_NEW_DIAGONALS ||
820            op == MAT_IGNORE_OFF_PROCESSOR_ENTRIES)
821     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
822   else if (op == MAT_NO_NEW_DIAGONALS)
823     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
824   else
825     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
826   return 0;
827 }
828 
829 
830 /* -------------------------------------------------------*/
831 /* Should check that shapes of vectors and matrices match */
832 /* -------------------------------------------------------*/
833 #include "pinclude/plapack.h"
834 
835 #undef __FUNC__
836 #define __FUNC__ "MatMult_SeqBAIJ_1"
837 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
838 {
839   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
840   register Scalar *x,*z,*v,sum;
841   int             mbs=a->mbs,i,*idx,*ii,n;
842 
843   VecGetArray_Fast(xx,x);
844   VecGetArray_Fast(zz,z);
845 
846   idx   = a->j;
847   v     = a->a;
848   ii    = a->i;
849 
850   for ( i=0; i<mbs; i++ ) {
851     n    = ii[1] - ii[0]; ii++;
852     sum  = 0.0;
853     while (n--) sum += *v++ * x[*idx++];
854     z[i] = sum;
855   }
856   VecRestoreArray_Fast(xx,x);
857   VecRestoreArray_Fast(zz,z);
858   PLogFlops(2*a->nz - a->m);
859   return 0;
860 }
861 
862 #undef __FUNC__
863 #define __FUNC__ "MatMult_SeqBAIJ_2"
864 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
865 {
866   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
867   register Scalar *x,*z,*v,*xb,sum1,sum2;
868   register Scalar x1,x2;
869   int             mbs=a->mbs,i,*idx,*ii;
870   int             j,n;
871 
872   VecGetArray_Fast(xx,x);
873   VecGetArray_Fast(zz,z);
874 
875   idx   = a->j;
876   v     = a->a;
877   ii    = a->i;
878 
879   for ( i=0; i<mbs; i++ ) {
880     n  = ii[1] - ii[0]; ii++;
881     sum1 = 0.0; sum2 = 0.0;
882     for ( j=0; j<n; j++ ) {
883       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
884       sum1 += v[0]*x1 + v[2]*x2;
885       sum2 += v[1]*x1 + v[3]*x2;
886       v += 4;
887     }
888     z[0] = sum1; z[1] = sum2;
889     z += 2;
890   }
891   VecRestoreArray_Fast(xx,x);
892   VecRestoreArray_Fast(zz,z);
893   PLogFlops(4*a->nz - a->m);
894   return 0;
895 }
896 
897 #undef __FUNC__
898 #define __FUNC__ "MatMult_SeqBAIJ_3"
899 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
900 {
901   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
902   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
903   int             mbs=a->mbs,i,*idx,*ii,j,n;
904 
905   VecGetArray_Fast(xx,x);
906   VecGetArray_Fast(zz,z);
907 
908   idx   = a->j;
909   v     = a->a;
910   ii    = a->i;
911 
912   for ( i=0; i<mbs; i++ ) {
913     n  = ii[1] - ii[0]; ii++;
914     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
915     for ( j=0; j<n; j++ ) {
916       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
917       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
918       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
919       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
920       v += 9;
921     }
922     z[0] = sum1; z[1] = sum2; z[2] = sum3;
923     z += 3;
924   }
925   VecRestoreArray_Fast(xx,x);
926   VecRestoreArray_Fast(zz,z);
927   PLogFlops(18*a->nz - a->m);
928   return 0;
929 }
930 
931 #undef __FUNC__
932 #define __FUNC__ "MatMult_SeqBAIJ_4"
933 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
934 {
935   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
936   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
937   register Scalar x1,x2,x3,x4;
938   int             mbs=a->mbs,i,*idx,*ii;
939   int             j,n;
940 
941   VecGetArray_Fast(xx,x);
942   VecGetArray_Fast(zz,z);
943 
944   idx   = a->j;
945   v     = a->a;
946   ii    = a->i;
947 
948   for ( i=0; i<mbs; i++ ) {
949     n  = ii[1] - ii[0]; ii++;
950     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
951     for ( j=0; j<n; j++ ) {
952       xb = x + 4*(*idx++);
953       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
954       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
955       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
956       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
957       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
958       v += 16;
959     }
960     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
961     z += 4;
962   }
963   VecRestoreArray_Fast(xx,x);
964   VecRestoreArray_Fast(zz,z);
965   PLogFlops(32*a->nz - a->m);
966   return 0;
967 }
968 
969 #undef __FUNC__
970 #define __FUNC__ "MatMult_SeqBAIJ_5"
971 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
972 {
973   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
974   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
975   register Scalar x1,x2,x3,x4,x5;
976   int             mbs=a->mbs,i,*idx,*ii,j,n;
977 
978   VecGetArray_Fast(xx,x);
979   VecGetArray_Fast(zz,z);
980 
981   idx   = a->j;
982   v     = a->a;
983   ii    = a->i;
984 
985   for ( i=0; i<mbs; i++ ) {
986     n  = ii[1] - ii[0]; ii++;
987     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
988     for ( j=0; j<n; j++ ) {
989       xb = x + 5*(*idx++);
990       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
991       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
992       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
993       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
994       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
995       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
996       v += 25;
997     }
998     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
999     z += 5;
1000   }
1001   VecRestoreArray_Fast(xx,x);
1002   VecRestoreArray_Fast(zz,z);
1003   PLogFlops(50*a->nz - a->m);
1004   return 0;
1005 }
1006 
1007 #undef __FUNC__
1008 #define __FUNC__ "MatMult_SeqBAIJ_7"
1009 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1010 {
1011   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1012   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1013   register Scalar x1,x2,x3,x4,x5,x6,x7;
1014   int             mbs=a->mbs,i,*idx,*ii,j,n;
1015 
1016   VecGetArray_Fast(xx,x);
1017   VecGetArray_Fast(zz,z);
1018 
1019   idx   = a->j;
1020   v     = a->a;
1021   ii    = a->i;
1022 
1023   for ( i=0; i<mbs; i++ ) {
1024     n  = ii[1] - ii[0]; ii++;
1025     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1026     for ( j=0; j<n; j++ ) {
1027       xb = x + 7*(*idx++);
1028       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1029       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1030       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1031       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1032       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1033       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1034       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1035       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1036       v += 49;
1037     }
1038     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1039     z += 7;
1040   }
1041 
1042   VecRestoreArray_Fast(xx,x);
1043   VecRestoreArray_Fast(zz,z);
1044   PLogFlops(98*a->nz - a->m);
1045   return 0;
1046 }
1047 
1048 #undef __FUNC__
1049 #define __FUNC__ "MatMult_SeqBAIJ_N"
1050 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1051 {
1052   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1053   register Scalar *x,*z,*v,*xb;
1054   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
1055   int             _One = 1,ncols,k;
1056   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
1057 
1058   VecGetArray_Fast(xx,x);
1059   VecGetArray_Fast(zz,z);
1060 
1061   idx   = a->j;
1062   v     = a->a;
1063   ii    = a->i;
1064 
1065 
1066   if (!a->mult_work) {
1067     k = PetscMax(a->m,a->n);
1068     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1069   }
1070   work = a->mult_work;
1071   for ( i=0; i<mbs; i++ ) {
1072     n     = ii[1] - ii[0]; ii++;
1073     ncols = n*bs;
1074     workt = work;
1075     for ( j=0; j<n; j++ ) {
1076       xb = x + bs*(*idx++);
1077       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1078       workt += bs;
1079     }
1080     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
1081     v += n*bs2;
1082     z += bs;
1083   }
1084   VecRestoreArray_Fast(xx,x);
1085   VecRestoreArray_Fast(zz,z);
1086   PLogFlops(2*a->nz*bs2 - a->m);
1087   return 0;
1088 }
1089 
1090 #undef __FUNC__
1091 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
1092 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1093 {
1094   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1095   register Scalar *x,*y,*z,*v,sum;
1096   int             mbs=a->mbs,i,*idx,*ii,n;
1097 
1098   VecGetArray_Fast(xx,x);
1099   VecGetArray_Fast(yy,y);
1100   VecGetArray_Fast(zz,z);
1101 
1102   idx   = a->j;
1103   v     = a->a;
1104   ii    = a->i;
1105 
1106   for ( i=0; i<mbs; i++ ) {
1107     n    = ii[1] - ii[0]; ii++;
1108     sum  = y[i];
1109     while (n--) sum += *v++ * x[*idx++];
1110     z[i] = sum;
1111   }
1112   VecRestoreArray_Fast(xx,x);
1113   VecRestoreArray_Fast(yy,y);
1114   VecRestoreArray_Fast(zz,z);
1115   PLogFlops(2*a->nz);
1116   return 0;
1117 }
1118 
1119 #undef __FUNC__
1120 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
1121 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1122 {
1123   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1124   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
1125   register Scalar x1,x2;
1126   int             mbs=a->mbs,i,*idx,*ii;
1127   int             j,n;
1128 
1129   VecGetArray_Fast(xx,x);
1130   VecGetArray_Fast(yy,y);
1131   VecGetArray_Fast(zz,z);
1132 
1133   idx   = a->j;
1134   v     = a->a;
1135   ii    = a->i;
1136 
1137   for ( i=0; i<mbs; i++ ) {
1138     n  = ii[1] - ii[0]; ii++;
1139     sum1 = y[0]; sum2 = y[1];
1140     for ( j=0; j<n; j++ ) {
1141       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1142       sum1 += v[0]*x1 + v[2]*x2;
1143       sum2 += v[1]*x1 + v[3]*x2;
1144       v += 4;
1145     }
1146     z[0] = sum1; z[1] = sum2;
1147     z += 2; y += 2;
1148   }
1149   VecRestoreArray_Fast(xx,x);
1150   VecRestoreArray_Fast(yy,y);
1151   VecRestoreArray_Fast(zz,z);
1152   PLogFlops(4*a->nz);
1153   return 0;
1154 }
1155 
1156 #undef __FUNC__
1157 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
1158 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1159 {
1160   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1161   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
1162   int             mbs=a->mbs,i,*idx,*ii,j,n;
1163 
1164   VecGetArray_Fast(xx,x);
1165   VecGetArray_Fast(yy,y);
1166   VecGetArray_Fast(zz,z);
1167 
1168   idx   = a->j;
1169   v     = a->a;
1170   ii    = a->i;
1171 
1172   for ( i=0; i<mbs; i++ ) {
1173     n  = ii[1] - ii[0]; ii++;
1174     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1175     for ( j=0; j<n; j++ ) {
1176       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1177       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1178       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1179       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1180       v += 9;
1181     }
1182     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1183     z += 3; y += 3;
1184   }
1185   VecRestoreArray_Fast(xx,x);
1186   VecRestoreArray_Fast(yy,y);
1187   VecRestoreArray_Fast(zz,z);
1188   PLogFlops(18*a->nz);
1189   return 0;
1190 }
1191 
1192 #undef __FUNC__
1193 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
1194 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1195 {
1196   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1197   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
1198   register Scalar x1,x2,x3,x4;
1199   int             mbs=a->mbs,i,*idx,*ii;
1200   int             j,n;
1201 
1202   VecGetArray_Fast(xx,x);
1203   VecGetArray_Fast(yy,y);
1204   VecGetArray_Fast(zz,z);
1205 
1206   idx   = a->j;
1207   v     = a->a;
1208   ii    = a->i;
1209 
1210   for ( i=0; i<mbs; i++ ) {
1211     n  = ii[1] - ii[0]; ii++;
1212     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1213     for ( j=0; j<n; j++ ) {
1214       xb = x + 4*(*idx++);
1215       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1216       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1217       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1218       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1219       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1220       v += 16;
1221     }
1222     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1223     z += 4; y += 4;
1224   }
1225   VecRestoreArray_Fast(xx,x);
1226   VecRestoreArray_Fast(yy,y);
1227   VecRestoreArray_Fast(zz,z);
1228   PLogFlops(32*a->nz);
1229   return 0;
1230 }
1231 
1232 #undef __FUNC__
1233 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
1234 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1235 {
1236   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1237   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1238   register Scalar x1,x2,x3,x4,x5;
1239   int             mbs=a->mbs,i,*idx,*ii,j,n;
1240 
1241   VecGetArray_Fast(xx,x);
1242   VecGetArray_Fast(yy,y);
1243   VecGetArray_Fast(zz,z);
1244 
1245   idx   = a->j;
1246   v     = a->a;
1247   ii    = a->i;
1248 
1249   for ( i=0; i<mbs; i++ ) {
1250     n  = ii[1] - ii[0]; ii++;
1251     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1252     for ( j=0; j<n; j++ ) {
1253       xb = x + 5*(*idx++);
1254       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1255       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1256       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1257       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1258       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1259       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1260       v += 25;
1261     }
1262     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1263     z += 5; y += 5;
1264   }
1265   VecRestoreArray_Fast(xx,x);
1266   VecRestoreArray_Fast(yy,y);
1267   VecRestoreArray_Fast(zz,z);
1268   PLogFlops(50*a->nz);
1269   return 0;
1270 }
1271 
1272 #undef __FUNC__
1273 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
1274 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1275 {
1276   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1277   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1278   register Scalar x1,x2,x3,x4,x5,x6,x7;
1279   int             mbs=a->mbs,i,*idx,*ii,j,n;
1280 
1281   VecGetArray_Fast(xx,x);
1282   VecGetArray_Fast(yy,y);
1283   VecGetArray_Fast(zz,z);
1284 
1285   idx   = a->j;
1286   v     = a->a;
1287   ii    = a->i;
1288 
1289   for ( i=0; i<mbs; i++ ) {
1290     n  = ii[1] - ii[0]; ii++;
1291     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1292     for ( j=0; j<n; j++ ) {
1293       xb = x + 7*(*idx++);
1294       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1295       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1296       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1297       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1298       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1299       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1300       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1301       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1302       v += 49;
1303     }
1304     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1305     z += 7; y += 7;
1306   }
1307   VecRestoreArray_Fast(xx,x);
1308   VecRestoreArray_Fast(yy,y);
1309   VecRestoreArray_Fast(zz,z);
1310   PLogFlops(98*a->nz);
1311   return 0;
1312 }
1313 
1314 
1315 #undef __FUNC__
1316 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
1317 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1318 {
1319   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1320   register Scalar *x,*z,*v,*xb;
1321   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
1322   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1323 
1324   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1325 
1326   VecGetArray_Fast(xx,x);
1327   VecGetArray_Fast(zz,z);
1328 
1329   idx   = a->j;
1330   v     = a->a;
1331   ii    = a->i;
1332 
1333 
1334   if (!a->mult_work) {
1335     k = PetscMax(a->m,a->n);
1336     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
1337   }
1338   work = a->mult_work;
1339   for ( i=0; i<mbs; i++ ) {
1340     n     = ii[1] - ii[0]; ii++;
1341     ncols = n*bs;
1342     workt = work;
1343     for ( j=0; j<n; j++ ) {
1344       xb = x + bs*(*idx++);
1345       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1346       workt += bs;
1347     }
1348     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
1349     v += n*bs2;
1350     z += bs;
1351   }
1352   VecRestoreArray_Fast(xx,x);
1353   VecRestoreArray_Fast(zz,z);
1354   PLogFlops(2*a->nz*bs2 );
1355   return 0;
1356 }
1357 
1358 #undef __FUNC__
1359 #define __FUNC__ "MatMultTrans_SeqBAIJ"
1360 static int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
1361 {
1362   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1363   Scalar          *xg,*zg,*zb;
1364   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1365   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1366   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1367 
1368 
1369   VecGetArray_Fast(xx,xg); x = xg;
1370   VecGetArray_Fast(zz,zg); z = zg;
1371   PetscMemzero(z,N*sizeof(Scalar));
1372 
1373   idx   = a->j;
1374   v     = a->a;
1375   ii    = a->i;
1376 
1377   switch (bs) {
1378   case 1:
1379     for ( i=0; i<mbs; i++ ) {
1380       n  = ii[1] - ii[0]; ii++;
1381       xb = x + i; x1 = xb[0];
1382       ib = idx + ai[i];
1383       for ( j=0; j<n; j++ ) {
1384         rval    = ib[j];
1385         z[rval] += *v++ * x1;
1386       }
1387     }
1388     break;
1389   case 2:
1390     for ( i=0; i<mbs; i++ ) {
1391       n  = ii[1] - ii[0]; ii++;
1392       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1393       ib = idx + ai[i];
1394       for ( j=0; j<n; j++ ) {
1395         rval      = ib[j]*2;
1396         z[rval++] += v[0]*x1 + v[1]*x2;
1397         z[rval++] += v[2]*x1 + v[3]*x2;
1398         v += 4;
1399       }
1400     }
1401     break;
1402   case 3:
1403     for ( i=0; i<mbs; i++ ) {
1404       n  = ii[1] - ii[0]; ii++;
1405       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1406       ib = idx + ai[i];
1407       for ( j=0; j<n; j++ ) {
1408         rval      = ib[j]*3;
1409         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1410         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1411         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1412         v += 9;
1413       }
1414     }
1415     break;
1416   case 4:
1417     for ( i=0; i<mbs; i++ ) {
1418       n  = ii[1] - ii[0]; ii++;
1419       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1420       ib = idx + ai[i];
1421       for ( j=0; j<n; j++ ) {
1422         rval      = ib[j]*4;
1423         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1424         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1425         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1426         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1427         v += 16;
1428       }
1429     }
1430     break;
1431   case 5:
1432     for ( i=0; i<mbs; i++ ) {
1433       n  = ii[1] - ii[0]; ii++;
1434       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1435       x4 = xb[3];   x5 = xb[4];
1436       ib = idx + ai[i];
1437       for ( j=0; j<n; j++ ) {
1438         rval      = ib[j]*5;
1439         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1440         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1441         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1442         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1443         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1444         v += 25;
1445       }
1446     }
1447     break;
1448       /* block sizes larger then 5 by 5 are handled by BLAS */
1449     default: {
1450       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1451       if (!a->mult_work) {
1452         k = PetscMax(a->m,a->n);
1453         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1454         CHKPTRQ(a->mult_work);
1455       }
1456       work = a->mult_work;
1457       for ( i=0; i<mbs; i++ ) {
1458         n     = ii[1] - ii[0]; ii++;
1459         ncols = n*bs;
1460         PetscMemzero(work,ncols*sizeof(Scalar));
1461         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1462         v += n*bs2;
1463         x += bs;
1464         workt = work;
1465         for ( j=0; j<n; j++ ) {
1466           zb = z + bs*(*idx++);
1467           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1468           workt += bs;
1469         }
1470       }
1471     }
1472   }
1473   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1474   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1475   PLogFlops(2*a->nz*a->bs2 - a->n);
1476   return 0;
1477 }
1478 
1479 #undef __FUNC__
1480 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1481 static int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1482 {
1483   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1484   Scalar          *xg,*zg,*zb;
1485   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1486   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1487   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1488 
1489 
1490 
1491   VecGetArray_Fast(xx,xg); x = xg;
1492   VecGetArray_Fast(zz,zg); z = zg;
1493 
1494   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1495   else PetscMemzero(z,N*sizeof(Scalar));
1496 
1497 
1498   idx   = a->j;
1499   v     = a->a;
1500   ii    = a->i;
1501 
1502   switch (bs) {
1503   case 1:
1504     for ( i=0; i<mbs; i++ ) {
1505       n  = ii[1] - ii[0]; ii++;
1506       xb = x + i; x1 = xb[0];
1507       ib = idx + ai[i];
1508       for ( j=0; j<n; j++ ) {
1509         rval    = ib[j];
1510         z[rval] += *v++ * x1;
1511       }
1512     }
1513     break;
1514   case 2:
1515     for ( i=0; i<mbs; i++ ) {
1516       n  = ii[1] - ii[0]; ii++;
1517       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1518       ib = idx + ai[i];
1519       for ( j=0; j<n; j++ ) {
1520         rval      = ib[j]*2;
1521         z[rval++] += v[0]*x1 + v[1]*x2;
1522         z[rval++] += v[2]*x1 + v[3]*x2;
1523         v += 4;
1524       }
1525     }
1526     break;
1527   case 3:
1528     for ( i=0; i<mbs; i++ ) {
1529       n  = ii[1] - ii[0]; ii++;
1530       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1531       ib = idx + ai[i];
1532       for ( j=0; j<n; j++ ) {
1533         rval      = ib[j]*3;
1534         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1535         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1536         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1537         v += 9;
1538       }
1539     }
1540     break;
1541   case 4:
1542     for ( i=0; i<mbs; i++ ) {
1543       n  = ii[1] - ii[0]; ii++;
1544       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1545       ib = idx + ai[i];
1546       for ( j=0; j<n; j++ ) {
1547         rval      = ib[j]*4;
1548         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1549         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1550         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1551         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1552         v += 16;
1553       }
1554     }
1555     break;
1556   case 5:
1557     for ( i=0; i<mbs; i++ ) {
1558       n  = ii[1] - ii[0]; ii++;
1559       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1560       x4 = xb[3];   x5 = xb[4];
1561       ib = idx + ai[i];
1562       for ( j=0; j<n; j++ ) {
1563         rval      = ib[j]*5;
1564         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1565         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1566         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1567         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1568         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1569         v += 25;
1570       }
1571     }
1572     break;
1573       /* block sizes larger then 5 by 5 are handled by BLAS */
1574     default: {
1575       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1576       if (!a->mult_work) {
1577         k = PetscMax(a->m,a->n);
1578         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1579         CHKPTRQ(a->mult_work);
1580       }
1581       work = a->mult_work;
1582       for ( i=0; i<mbs; i++ ) {
1583         n     = ii[1] - ii[0]; ii++;
1584         ncols = n*bs;
1585         PetscMemzero(work,ncols*sizeof(Scalar));
1586         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1587         v += n*bs2;
1588         x += bs;
1589         workt = work;
1590         for ( j=0; j<n; j++ ) {
1591           zb = z + bs*(*idx++);
1592           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1593           workt += bs;
1594         }
1595       }
1596     }
1597   }
1598   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1599   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1600   PLogFlops(2*a->nz*a->bs2);
1601   return 0;
1602 }
1603 
1604 #undef __FUNC__
1605 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1606 static int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1607 {
1608   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1609 
1610   info->rows_global    = (double)a->m;
1611   info->columns_global = (double)a->n;
1612   info->rows_local     = (double)a->m;
1613   info->columns_local  = (double)a->n;
1614   info->block_size     = a->bs2;
1615   info->nz_allocated   = a->maxnz;
1616   info->nz_used        = a->bs2*a->nz;
1617   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1618   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1619     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1620   info->assemblies   = A->num_ass;
1621   info->mallocs      = a->reallocs;
1622   info->memory       = A->mem;
1623   if (A->factor) {
1624     info->fill_ratio_given  = A->info.fill_ratio_given;
1625     info->fill_ratio_needed = A->info.fill_ratio_needed;
1626     info->factor_mallocs    = A->info.factor_mallocs;
1627   } else {
1628     info->fill_ratio_given  = 0;
1629     info->fill_ratio_needed = 0;
1630     info->factor_mallocs    = 0;
1631   }
1632   return 0;
1633 }
1634 
1635 #undef __FUNC__
1636 #define __FUNC__ "MatEqual_SeqBAIJ"
1637 static int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1638 {
1639   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1640 
1641   if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type");
1642 
1643   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1644   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1645       (a->nz != b->nz)) {
1646     *flg = PETSC_FALSE; return 0;
1647   }
1648 
1649   /* if the a->i are the same */
1650   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1651     *flg = PETSC_FALSE; return 0;
1652   }
1653 
1654   /* if a->j are the same */
1655   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1656     *flg = PETSC_FALSE; return 0;
1657   }
1658 
1659   /* if a->a are the same */
1660   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1661     *flg = PETSC_FALSE; return 0;
1662   }
1663   *flg = PETSC_TRUE;
1664   return 0;
1665 
1666 }
1667 
1668 #undef __FUNC__
1669 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1670 static int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1671 {
1672   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1673   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1674   Scalar      *x, zero = 0.0,*aa,*aa_j;
1675 
1676   if (A->factor) SETERRQ(1,0,"Not for factored matrix");
1677   bs   = a->bs;
1678   aa   = a->a;
1679   ai   = a->i;
1680   aj   = a->j;
1681   ambs = a->mbs;
1682   bs2  = a->bs2;
1683 
1684   VecSet(&zero,v);
1685   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1686   if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector");
1687   for ( i=0; i<ambs; i++ ) {
1688     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1689       if (aj[j] == i) {
1690         row  = i*bs;
1691         aa_j = aa+j*bs2;
1692         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1693         break;
1694       }
1695     }
1696   }
1697   return 0;
1698 }
1699 
1700 #undef __FUNC__
1701 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1702 static int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1703 {
1704   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1705   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1706   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1707 
1708   ai  = a->i;
1709   aj  = a->j;
1710   aa  = a->a;
1711   m   = a->m;
1712   n   = a->n;
1713   bs  = a->bs;
1714   mbs = a->mbs;
1715   bs2 = a->bs2;
1716   if (ll) {
1717     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1718     if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length");
1719     for ( i=0; i<mbs; i++ ) { /* for each block row */
1720       M  = ai[i+1] - ai[i];
1721       li = l + i*bs;
1722       v  = aa + bs2*ai[i];
1723       for ( j=0; j<M; j++ ) { /* for each block */
1724         for ( k=0; k<bs2; k++ ) {
1725           (*v++) *= li[k%bs];
1726         }
1727       }
1728     }
1729   }
1730 
1731   if (rr) {
1732     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1733     if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length");
1734     for ( i=0; i<mbs; i++ ) { /* for each block row */
1735       M  = ai[i+1] - ai[i];
1736       v  = aa + bs2*ai[i];
1737       for ( j=0; j<M; j++ ) { /* for each block */
1738         ri = r + bs*aj[ai[i]+j];
1739         for ( k=0; k<bs; k++ ) {
1740           x = ri[k];
1741           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1742         }
1743       }
1744     }
1745   }
1746   return 0;
1747 }
1748 
1749 
1750 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1751 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1752 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1753 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1754 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1755 
1756 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1757 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1758 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1759 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1760 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1761 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1762 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec);
1763 
1764 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1765 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1766 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1767 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1768 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1769 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1770 
1771 #undef __FUNC__
1772 #define __FUNC__ "MatNorm_SeqBAIJ"
1773 static int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1774 {
1775   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1776   Scalar      *v = a->a;
1777   double      sum = 0.0;
1778   int         i,nz=a->nz,bs2=a->bs2;
1779 
1780   if (type == NORM_FROBENIUS) {
1781     for (i=0; i< bs2*nz; i++ ) {
1782 #if defined(PETSC_COMPLEX)
1783       sum += real(conj(*v)*(*v)); v++;
1784 #else
1785       sum += (*v)*(*v); v++;
1786 #endif
1787     }
1788     *norm = sqrt(sum);
1789   }
1790   else {
1791     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1792   }
1793   return 0;
1794 }
1795 
1796 /*
1797      note: This can only work for identity for row and col. It would
1798    be good to check this and otherwise generate an error.
1799 */
1800 #undef __FUNC__
1801 #define __FUNC__ "MatILUFactor_SeqBAIJ"
1802 static int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1803 {
1804   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1805   Mat         outA;
1806   int         ierr;
1807 
1808   if (fill != 0) SETERRQ(1,0,"Only fill=0 supported");
1809 
1810   outA          = inA;
1811   inA->factor   = FACTOR_LU;
1812   a->row        = row;
1813   a->col        = col;
1814 
1815   a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1816 
1817   if (!a->diag) {
1818     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1819   }
1820   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1821   return 0;
1822 }
1823 
1824 #undef __FUNC__
1825 #define __FUNC__ "MatScale_SeqBAIJ"
1826 static int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1827 {
1828   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1829   int         one = 1, totalnz = a->bs2*a->nz;
1830   BLscal_( &totalnz, alpha, a->a, &one );
1831   PLogFlops(totalnz);
1832   return 0;
1833 }
1834 
1835 #undef __FUNC__
1836 #define __FUNC__ "MatGetValues_SeqBAIJ"
1837 static int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1838 {
1839   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1840   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1841   int        *ai = a->i, *ailen = a->ilen;
1842   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1843   Scalar     *ap, *aa = a->a, zero = 0.0;
1844 
1845   for ( k=0; k<m; k++ ) { /* loop over rows */
1846     row  = im[k]; brow = row/bs;
1847     if (row < 0) SETERRQ(1,0,"Negative row");
1848     if (row >= a->m) SETERRQ(1,0,"Row too large");
1849     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1850     nrow = ailen[brow];
1851     for ( l=0; l<n; l++ ) { /* loop over columns */
1852       if (in[l] < 0) SETERRQ(1,0,"Negative column");
1853       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
1854       col  = in[l] ;
1855       bcol = col/bs;
1856       cidx = col%bs;
1857       ridx = row%bs;
1858       high = nrow;
1859       low  = 0; /* assume unsorted */
1860       while (high-low > 5) {
1861         t = (low+high)/2;
1862         if (rp[t] > bcol) high = t;
1863         else             low  = t;
1864       }
1865       for ( i=low; i<high; i++ ) {
1866         if (rp[i] > bcol) break;
1867         if (rp[i] == bcol) {
1868           *v++ = ap[bs2*i+bs*cidx+ridx];
1869           goto finished;
1870         }
1871       }
1872       *v++ = zero;
1873       finished:;
1874     }
1875   }
1876   return 0;
1877 }
1878 
1879 #undef __FUNC__
1880 #define __FUNC__ "MatGetBlockSize_SeqBAIJ"
1881 static int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1882 {
1883   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1884   *bs = baij->bs;
1885   return 0;
1886 }
1887 
1888 /* idx should be of length atlease bs */
1889 #undef __FUNC__
1890 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block"
1891 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1892 {
1893   int i,row;
1894   row = idx[0];
1895   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1896 
1897   for ( i=1; i<bs; i++ ) {
1898     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1899   }
1900   *flg = PETSC_TRUE;
1901   return 0;
1902 }
1903 
1904 #undef __FUNC__
1905 #define __FUNC__ "MatZeroRows_SeqBAIJ"
1906 static int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1907 {
1908   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1909   IS          is_local;
1910   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1911   PetscTruth  flg;
1912   Scalar      zero = 0.0,*aa;
1913 
1914   /* Make a copy of the IS and  sort it */
1915   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1916   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1917   ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1918   ierr = ISSort(is_local); CHKERRQ(ierr);
1919   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1920 
1921   i = 0;
1922   while (i < is_n) {
1923     if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range");
1924     flg = PETSC_FALSE;
1925     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1926     if (flg) { /* There exists a block of rows to be Zerowed */
1927       baij->ilen[rows[i]/bs] = 0;
1928       i += bs;
1929     } else { /* Zero out only the requested row */
1930       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1931       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1932       for ( j=0; j<count; j++ ) {
1933         aa[0] = zero;
1934         aa+=bs;
1935       }
1936       i++;
1937     }
1938   }
1939   if (diag) {
1940     for ( j=0; j<is_n; j++ ) {
1941       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1942     }
1943   }
1944   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
1945   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
1946   ierr = ISDestroy(is_local); CHKERRQ(ierr);
1947   ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1948 
1949   return 0;
1950 }
1951 
1952 #undef __FUNC__
1953 #define __FUNC__ "MatPrintHelp_SeqBAIJ"
1954 int MatPrintHelp_SeqBAIJ(Mat A)
1955 {
1956   static int called = 0;
1957   MPI_Comm   comm = A->comm;
1958 
1959   if (called) return 0; else called = 1;
1960   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
1961   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
1962   return 0;
1963 }
1964 
1965 /* -------------------------------------------------------------------*/
1966 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
1967        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
1968        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
1969        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
1970        MatSolve_SeqBAIJ_N,0,
1971        0,0,
1972        MatLUFactor_SeqBAIJ,0,
1973        0,
1974        MatTranspose_SeqBAIJ,
1975        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
1976        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
1977        0,MatAssemblyEnd_SeqBAIJ,
1978        0,
1979        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
1980        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
1981        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
1982        MatILUFactorSymbolic_SeqBAIJ,0,
1983        0,0,
1984        MatConvertSameType_SeqBAIJ,0,0,
1985        MatILUFactor_SeqBAIJ,0,0,
1986        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
1987        MatGetValues_SeqBAIJ,0,
1988        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
1989        0,0,0,MatGetBlockSize_SeqBAIJ,
1990        MatGetRowIJ_SeqBAIJ,
1991        MatRestoreRowIJ_SeqBAIJ,
1992        0,0,0,0,0,0,
1993        MatSetValuesBlocked_SeqBAIJ};
1994 
1995 #undef __FUNC__
1996 #define __FUNC__ "MatCreateSeqBAIJ"
1997 /*@C
1998    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
1999    compressed row) format.  For good matrix assembly performance the
2000    user should preallocate the matrix storage by setting the parameter nz
2001    (or the array nzz).  By setting these parameters accurately, performance
2002    during matrix assembly can be increased by more than a factor of 50.
2003 
2004    Input Parameters:
2005 .  comm - MPI communicator, set to MPI_COMM_SELF
2006 .  bs - size of block
2007 .  m - number of rows
2008 .  n - number of columns
2009 .  nz - number of block nonzeros per block row (same for all rows)
2010 .  nzz - array containing the number of block nonzeros in the various block rows
2011          (possibly different for each block row) or PETSC_NULL
2012 
2013    Output Parameter:
2014 .  A - the matrix
2015 
2016    Options Database Keys:
2017 $    -mat_no_unroll - uses code that does not unroll the loops in the
2018 $                     block calculations (much solver)
2019 $    -mat_block_size - size of the blocks to use
2020 
2021    Notes:
2022    The block AIJ format is fully compatible with standard Fortran 77
2023    storage.  That is, the stored row and column indices can begin at
2024    either one (as in Fortran) or zero.  See the users' manual for details.
2025 
2026    Specify the preallocated storage with either nz or nnz (not both).
2027    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
2028    allocation.  For additional details, see the users manual chapter on
2029    matrices.
2030 
2031 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2032 @*/
2033 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
2034 {
2035   Mat         B;
2036   Mat_SeqBAIJ *b;
2037   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size;
2038 
2039   MPI_Comm_size(comm,&size);
2040   if (size > 1) SETERRQ(1,0,"Comm must be of size 1");
2041 
2042   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr);
2043 
2044   if (mbs*bs!=m || nbs*bs!=n)
2045     SETERRQ(1,0,"Number rows, cols must be divisible by blocksize");
2046 
2047   *A = 0;
2048   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
2049   PLogObjectCreate(B);
2050   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
2051   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
2052   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
2053   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
2054   if (!flg) {
2055     switch (bs) {
2056     case 1:
2057       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2058       B->ops.solve           = MatSolve_SeqBAIJ_1;
2059       B->ops.mult            = MatMult_SeqBAIJ_1;
2060       B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
2061       break;
2062     case 2:
2063       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
2064       B->ops.solve           = MatSolve_SeqBAIJ_2;
2065       B->ops.mult            = MatMult_SeqBAIJ_2;
2066       B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
2067       break;
2068     case 3:
2069       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
2070       B->ops.solve           = MatSolve_SeqBAIJ_3;
2071       B->ops.mult            = MatMult_SeqBAIJ_3;
2072       B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
2073       break;
2074     case 4:
2075       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
2076       B->ops.solve           = MatSolve_SeqBAIJ_4;
2077       B->ops.mult            = MatMult_SeqBAIJ_4;
2078       B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
2079       break;
2080     case 5:
2081       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
2082       B->ops.solve           = MatSolve_SeqBAIJ_5;
2083       B->ops.mult            = MatMult_SeqBAIJ_5;
2084       B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
2085       break;
2086     case 7:
2087       B->ops.mult            = MatMult_SeqBAIJ_7;
2088       B->ops.solve           = MatSolve_SeqBAIJ_7;
2089       B->ops.multadd         = MatMultAdd_SeqBAIJ_7;
2090       break;
2091     }
2092   }
2093   B->destroy          = MatDestroy_SeqBAIJ;
2094   B->view             = MatView_SeqBAIJ;
2095   B->factor           = 0;
2096   B->lupivotthreshold = 1.0;
2097   B->mapping          = 0;
2098   b->row              = 0;
2099   b->col              = 0;
2100   b->reallocs         = 0;
2101 
2102   b->m       = m; B->m = m; B->M = m;
2103   b->n       = n; B->n = n; B->N = n;
2104   b->mbs     = mbs;
2105   b->nbs     = nbs;
2106   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
2107   if (nnz == PETSC_NULL) {
2108     if (nz == PETSC_DEFAULT) nz = 5;
2109     else if (nz <= 0)        nz = 1;
2110     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
2111     nz = nz*mbs;
2112   }
2113   else {
2114     nz = 0;
2115     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
2116   }
2117 
2118   /* allocate the matrix space */
2119   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
2120   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
2121   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
2122   b->j  = (int *) (b->a + nz*bs2);
2123   PetscMemzero(b->j,nz*sizeof(int));
2124   b->i  = b->j + nz;
2125   b->singlemalloc = 1;
2126 
2127   b->i[0] = 0;
2128   for (i=1; i<mbs+1; i++) {
2129     b->i[i] = b->i[i-1] + b->imax[i-1];
2130   }
2131 
2132   /* b->ilen will count nonzeros in each block row so far. */
2133   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
2134   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
2135   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
2136 
2137   b->bs               = bs;
2138   b->bs2              = bs2;
2139   b->mbs              = mbs;
2140   b->nz               = 0;
2141   b->maxnz            = nz*bs2;
2142   b->sorted           = 0;
2143   b->roworiented      = 1;
2144   b->nonew            = 0;
2145   b->diag             = 0;
2146   b->solve_work       = 0;
2147   b->mult_work        = 0;
2148   b->spptr            = 0;
2149   B->info.nz_unneeded = (double)b->maxnz;
2150 
2151   *A = B;
2152   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
2153   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
2154   return 0;
2155 }
2156 
2157 #undef __FUNC__
2158 #define __FUNC__ "MatConvertSameType_SeqBAIJ"
2159 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
2160 {
2161   Mat         C;
2162   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
2163   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
2164 
2165   if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix");
2166 
2167   *B = 0;
2168   PetscHeaderCreate(C,_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
2169   PLogObjectCreate(C);
2170   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
2171   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
2172   C->destroy    = MatDestroy_SeqBAIJ;
2173   C->view       = MatView_SeqBAIJ;
2174   C->factor     = A->factor;
2175   c->row        = 0;
2176   c->col        = 0;
2177   C->assembled  = PETSC_TRUE;
2178 
2179   c->m = C->m   = a->m;
2180   c->n = C->n   = a->n;
2181   C->M          = a->m;
2182   C->N          = a->n;
2183 
2184   c->bs         = a->bs;
2185   c->bs2        = a->bs2;
2186   c->mbs        = a->mbs;
2187   c->nbs        = a->nbs;
2188 
2189   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
2190   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
2191   for ( i=0; i<mbs; i++ ) {
2192     c->imax[i] = a->imax[i];
2193     c->ilen[i] = a->ilen[i];
2194   }
2195 
2196   /* allocate the matrix space */
2197   c->singlemalloc = 1;
2198   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
2199   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
2200   c->j  = (int *) (c->a + nz*bs2);
2201   c->i  = c->j + nz;
2202   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
2203   if (mbs > 0) {
2204     PetscMemcpy(c->j,a->j,nz*sizeof(int));
2205     if (cpvalues == COPY_VALUES) {
2206       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
2207     }
2208   }
2209 
2210   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
2211   c->sorted      = a->sorted;
2212   c->roworiented = a->roworiented;
2213   c->nonew       = a->nonew;
2214 
2215   if (a->diag) {
2216     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
2217     PLogObjectMemory(C,(mbs+1)*sizeof(int));
2218     for ( i=0; i<mbs; i++ ) {
2219       c->diag[i] = a->diag[i];
2220     }
2221   }
2222   else c->diag          = 0;
2223   c->nz                 = a->nz;
2224   c->maxnz              = a->maxnz;
2225   c->solve_work         = 0;
2226   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
2227   c->mult_work          = 0;
2228   *B = C;
2229   return 0;
2230 }
2231 
2232 #undef __FUNC__
2233 #define __FUNC__ "MatLoad_SeqBAIJ"
2234 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
2235 {
2236   Mat_SeqBAIJ  *a;
2237   Mat          B;
2238   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
2239   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
2240   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
2241   int          *masked, nmask,tmp,bs2,ishift;
2242   Scalar       *aa;
2243   MPI_Comm     comm = ((PetscObject) viewer)->comm;
2244 
2245   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2246   bs2  = bs*bs;
2247 
2248   MPI_Comm_size(comm,&size);
2249   if (size > 1) SETERRQ(1,0,"view must have one processor");
2250   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2251   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
2252   if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object");
2253   M = header[1]; N = header[2]; nz = header[3];
2254 
2255   if (M != N) SETERRQ(1,0,"Can only do square matrices");
2256 
2257   /*
2258      This code adds extra rows to make sure the number of rows is
2259     divisible by the blocksize
2260   */
2261   mbs        = M/bs;
2262   extra_rows = bs - M + bs*(mbs);
2263   if (extra_rows == bs) extra_rows = 0;
2264   else                  mbs++;
2265   if (extra_rows) {
2266     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n");
2267   }
2268 
2269   /* read in row lengths */
2270   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
2271   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
2272   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2273 
2274   /* read in column indices */
2275   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
2276   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
2277   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
2278 
2279   /* loop over row lengths determining block row lengths */
2280   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
2281   PetscMemzero(browlengths,mbs*sizeof(int));
2282   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
2283   PetscMemzero(mask,mbs*sizeof(int));
2284   masked = mask + mbs;
2285   rowcount = 0; nzcount = 0;
2286   for ( i=0; i<mbs; i++ ) {
2287     nmask = 0;
2288     for ( j=0; j<bs; j++ ) {
2289       kmax = rowlengths[rowcount];
2290       for ( k=0; k<kmax; k++ ) {
2291         tmp = jj[nzcount++]/bs;
2292         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
2293       }
2294       rowcount++;
2295     }
2296     browlengths[i] += nmask;
2297     /* zero out the mask elements we set */
2298     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2299   }
2300 
2301   /* create our matrix */
2302   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);
2303          CHKERRQ(ierr);
2304   B = *A;
2305   a = (Mat_SeqBAIJ *) B->data;
2306 
2307   /* set matrix "i" values */
2308   a->i[0] = 0;
2309   for ( i=1; i<= mbs; i++ ) {
2310     a->i[i]      = a->i[i-1] + browlengths[i-1];
2311     a->ilen[i-1] = browlengths[i-1];
2312   }
2313   a->nz         = 0;
2314   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
2315 
2316   /* read in nonzero values */
2317   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
2318   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
2319   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
2320 
2321   /* set "a" and "j" values into matrix */
2322   nzcount = 0; jcount = 0;
2323   for ( i=0; i<mbs; i++ ) {
2324     nzcountb = nzcount;
2325     nmask    = 0;
2326     for ( j=0; j<bs; j++ ) {
2327       kmax = rowlengths[i*bs+j];
2328       for ( k=0; k<kmax; k++ ) {
2329         tmp = jj[nzcount++]/bs;
2330 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
2331       }
2332       rowcount++;
2333     }
2334     /* sort the masked values */
2335     PetscSortInt(nmask,masked);
2336 
2337     /* set "j" values into matrix */
2338     maskcount = 1;
2339     for ( j=0; j<nmask; j++ ) {
2340       a->j[jcount++]  = masked[j];
2341       mask[masked[j]] = maskcount++;
2342     }
2343     /* set "a" values into matrix */
2344     ishift = bs2*a->i[i];
2345     for ( j=0; j<bs; j++ ) {
2346       kmax = rowlengths[i*bs+j];
2347       for ( k=0; k<kmax; k++ ) {
2348         tmp    = jj[nzcountb]/bs ;
2349         block  = mask[tmp] - 1;
2350         point  = jj[nzcountb] - bs*tmp;
2351         idx    = ishift + bs2*block + j + bs*point;
2352         a->a[idx] = aa[nzcountb++];
2353       }
2354     }
2355     /* zero out the mask elements we set */
2356     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2357   }
2358   if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix");
2359 
2360   PetscFree(rowlengths);
2361   PetscFree(browlengths);
2362   PetscFree(aa);
2363   PetscFree(jj);
2364   PetscFree(mask);
2365 
2366   B->assembled = PETSC_TRUE;
2367 
2368   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
2369   if (flg) {
2370     Viewer tviewer;
2371     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2372     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO,0);CHKERRQ(ierr);
2373     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2374     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2375   }
2376   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
2377   if (flg) {
2378     Viewer tviewer;
2379     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2380     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO_LONG,0);CHKERRQ(ierr);
2381     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2382     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2383   }
2384   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
2385   if (flg) {
2386     Viewer tviewer;
2387     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2388     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2389     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2390   }
2391   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
2392   if (flg) {
2393     Viewer tviewer;
2394     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2395     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_MATLAB,"M");CHKERRQ(ierr);
2396     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2397     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2398   }
2399   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
2400   if (flg) {
2401     Viewer tviewer;
2402     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
2403     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
2404     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
2405     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2406   }
2407   return 0;
2408 }
2409 
2410 
2411 
2412