xref: /petsc/src/mat/impls/baij/seq/baij.c (revision 8ba3a721c9105e9240dbd90dceddbf8833cabcd1)
1 #ifndef lint
2 static char vcid[] = "$Id: baij.c,v 1.89 1997/02/05 23:30:18 balay Exp balay $";
3 #endif
4 
5 /*
6     Defines the basic matrix operations for the BAIJ (compressed row)
7   matrix storage format.
8 */
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/spops.h"
12 #include "petsc.h"
13 
14 
15 /*
16      Adds diagonal pointers to sparse matrix structure.
17 */
18 
19 #undef __FUNC__
20 #define __FUNC__ "MatMarkDiag_SeqBAIJ"
21 int MatMarkDiag_SeqBAIJ(Mat A)
22 {
23   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
24   int         i,j, *diag, m = a->mbs;
25 
26   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
27   PLogObjectMemory(A,(m+1)*sizeof(int));
28   for ( i=0; i<m; i++ ) {
29     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
30       if (a->j[j] == i) {
31         diag[i] = j;
32         break;
33       }
34     }
35   }
36   a->diag = diag;
37   return 0;
38 }
39 
40 #include "draw.h"
41 #include "pinclude/pviewer.h"
42 #include "sys.h"
43 
44 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
45 
46 #undef __FUNC__
47 #define __FUNC__ "MatGetRowIJ_SeqBAIJ"
48 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
49                             PetscTruth *done)
50 {
51   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
52   int         ierr, n = a->mbs,i;
53 
54   *nn = n;
55   if (!ia) return 0;
56   if (symmetric) {
57     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
58   } else if (oshift == 1) {
59     /* temporarily add 1 to i and j indices */
60     int nz = a->i[n] + 1;
61     for ( i=0; i<nz; i++ ) a->j[i]++;
62     for ( i=0; i<n+1; i++ ) a->i[i]++;
63     *ia = a->i; *ja = a->j;
64   } else {
65     *ia = a->i; *ja = a->j;
66   }
67 
68   return 0;
69 }
70 
71 #undef __FUNC__
72 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ"
73 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
74                                 PetscTruth *done)
75 {
76   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
77   int         i,n = a->mbs;
78 
79   if (!ia) return 0;
80   if (symmetric) {
81     PetscFree(*ia);
82     PetscFree(*ja);
83   } else if (oshift == 1) {
84     int nz = a->i[n];
85     for ( i=0; i<nz; i++ ) a->j[i]--;
86     for ( i=0; i<n+1; i++ ) a->i[i]--;
87   }
88   return 0;
89 }
90 
91 
92 #undef __FUNC__
93 #define __FUNC__ "MatView_SeqBAIJ_Binary"
94 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
95 {
96   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
97   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
98   Scalar      *aa;
99 
100   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
101   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
102   col_lens[0] = MAT_COOKIE;
103 
104   col_lens[1] = a->m;
105   col_lens[2] = a->n;
106   col_lens[3] = a->nz*bs2;
107 
108   /* store lengths of each row and write (including header) to file */
109   count = 0;
110   for ( i=0; i<a->mbs; i++ ) {
111     for ( j=0; j<bs; j++ ) {
112       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
113     }
114   }
115   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
116   PetscFree(col_lens);
117 
118   /* store column indices (zero start index) */
119   jj = (int *) PetscMalloc( a->nz*bs2*sizeof(int) ); CHKPTRQ(jj);
120   count = 0;
121   for ( i=0; i<a->mbs; i++ ) {
122     for ( j=0; j<bs; j++ ) {
123       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
124         for ( l=0; l<bs; l++ ) {
125           jj[count++] = bs*a->j[k] + l;
126         }
127       }
128     }
129   }
130   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
131   PetscFree(jj);
132 
133   /* store nonzero values */
134   aa = (Scalar *) PetscMalloc(a->nz*bs2*sizeof(Scalar)); CHKPTRQ(aa);
135   count = 0;
136   for ( i=0; i<a->mbs; i++ ) {
137     for ( j=0; j<bs; j++ ) {
138       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
139         for ( l=0; l<bs; l++ ) {
140           aa[count++] = a->a[bs2*k + l*bs + j];
141         }
142       }
143     }
144   }
145   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
146   PetscFree(aa);
147   return 0;
148 }
149 
150 #undef __FUNC__
151 #define __FUNC__ "MatView_SeqBAIJ_ASCII"
152 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
153 {
154   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
155   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
156   FILE        *fd;
157   char        *outputname;
158 
159   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
160   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
161   ierr = ViewerGetFormat(viewer,&format);
162   if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
163     fprintf(fd,"  block size is %d\n",bs);
164   }
165   else if (format == VIEWER_FORMAT_ASCII_MATLAB) {
166     SETERRQ(1,0,"Matlab format not supported");
167   }
168   else if (format == VIEWER_FORMAT_ASCII_COMMON) {
169     for ( i=0; i<a->mbs; i++ ) {
170       for ( j=0; j<bs; j++ ) {
171         fprintf(fd,"row %d:",i*bs+j);
172         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
173           for ( l=0; l<bs; l++ ) {
174 #if defined(PETSC_COMPLEX)
175           if (imag(a->a[bs2*k + l*bs + j]) != 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
176             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
177               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
178           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
179             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
180 #else
181           if (a->a[bs2*k + l*bs + j] != 0.0)
182             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
183 #endif
184           }
185         }
186         fprintf(fd,"\n");
187       }
188     }
189   }
190   else {
191     for ( i=0; i<a->mbs; i++ ) {
192       for ( j=0; j<bs; j++ ) {
193         fprintf(fd,"row %d:",i*bs+j);
194         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
195           for ( l=0; l<bs; l++ ) {
196 #if defined(PETSC_COMPLEX)
197           if (imag(a->a[bs2*k + l*bs + j]) != 0.0) {
198             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
199               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
200           }
201           else {
202             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
203           }
204 #else
205             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
206 #endif
207           }
208         }
209         fprintf(fd,"\n");
210       }
211     }
212   }
213   fflush(fd);
214   return 0;
215 }
216 
217 #undef __FUNC__
218 #define __FUNC__ "MatView_SeqBAIJ_Draw"
219 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer)
220 {
221   Mat_SeqBAIJ  *a=(Mat_SeqBAIJ *) A->data;
222   int          row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2;
223   double       xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r;
224   Scalar       *aa;
225   Draw         draw;
226   DrawButton   button;
227   PetscTruth   isnull;
228 
229   ViewerDrawGetDraw(viewer,&draw);
230   ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
231 
232   xr  = a->n; yr = a->m; h = yr/10.0; w = xr/10.0;
233   xr += w;    yr += h;  xl = -w;     yl = -h;
234   ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
235   /* loop over matrix elements drawing boxes */
236   color = DRAW_BLUE;
237   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
238     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
239       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
240       x_l = a->j[j]*bs; x_r = x_l + 1.0;
241       aa = a->a + j*bs2;
242       for ( k=0; k<bs; k++ ) {
243         for ( l=0; l<bs; l++ ) {
244           if (PetscReal(*aa++) >=  0.) continue;
245           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
246         }
247       }
248     }
249   }
250   color = DRAW_CYAN;
251   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
252     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
253       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
254       x_l = a->j[j]*bs; x_r = x_l + 1.0;
255       aa = a->a + j*bs2;
256       for ( k=0; k<bs; k++ ) {
257         for ( l=0; l<bs; l++ ) {
258           if (PetscReal(*aa++) != 0.) continue;
259           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
260         }
261       }
262     }
263   }
264 
265   color = DRAW_RED;
266   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
267     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
268       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
269       x_l = a->j[j]*bs; x_r = x_l + 1.0;
270       aa = a->a + j*bs2;
271       for ( k=0; k<bs; k++ ) {
272         for ( l=0; l<bs; l++ ) {
273           if (PetscReal(*aa++) <= 0.) continue;
274           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
275         }
276       }
277     }
278   }
279 
280   DrawFlush(draw);
281   DrawGetPause(draw,&pause);
282   if (pause >= 0) { PetscSleep(pause); return 0;}
283 
284   /* allow the matrix to zoom or shrink */
285   ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
286   while (button != BUTTON_RIGHT) {
287     DrawClear(draw);
288     if (button == BUTTON_LEFT) scale = .5;
289     else if (button == BUTTON_CENTER) scale = 2.;
290     xl = scale*(xl + w - xc) + xc - w*scale;
291     xr = scale*(xr - w - xc) + xc + w*scale;
292     yl = scale*(yl + h - yc) + yc - h*scale;
293     yr = scale*(yr - h - yc) + yc + h*scale;
294     w *= scale; h *= scale;
295     ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
296     color = DRAW_BLUE;
297     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
298       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
299         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
300         x_l = a->j[j]*bs; x_r = x_l + 1.0;
301         aa = a->a + j*bs2;
302         for ( k=0; k<bs; k++ ) {
303           for ( l=0; l<bs; l++ ) {
304             if (PetscReal(*aa++) >=  0.) continue;
305             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
306           }
307         }
308       }
309     }
310     color = DRAW_CYAN;
311     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
312       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
313         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
314         x_l = a->j[j]*bs; x_r = x_l + 1.0;
315         aa = a->a + j*bs2;
316         for ( k=0; k<bs; k++ ) {
317           for ( l=0; l<bs; l++ ) {
318           if (PetscReal(*aa++) != 0.) continue;
319           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
320           }
321         }
322       }
323     }
324 
325     color = DRAW_RED;
326     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
327       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
328         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
329         x_l = a->j[j]*bs; x_r = x_l + 1.0;
330         aa = a->a + j*bs2;
331         for ( k=0; k<bs; k++ ) {
332           for ( l=0; l<bs; l++ ) {
333             if (PetscReal(*aa++) <= 0.) continue;
334             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
335           }
336         }
337       }
338     }
339     ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
340   }
341   return 0;
342 }
343 
344 #undef __FUNC__
345 #define __FUNC__ "MatView_SeqBAIJ"
346 static int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
347 {
348   Mat         A = (Mat) obj;
349   ViewerType  vtype;
350   int         ierr;
351 
352   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
353   if (vtype == MATLAB_VIEWER) {
354     SETERRQ(1,0,"Matlab viewer not supported");
355   }
356   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
357     return MatView_SeqBAIJ_ASCII(A,viewer);
358   }
359   else if (vtype == BINARY_FILE_VIEWER) {
360     return MatView_SeqBAIJ_Binary(A,viewer);
361   }
362   else if (vtype == DRAW_VIEWER) {
363     return MatView_SeqBAIJ_Draw(A,viewer);
364   }
365   return 0;
366 }
367 
368 #define CHUNKSIZE  10
369 
370 #undef __FUNC__
371 #define __FUNC__ "MatSetValues_SeqBAIJ"
372 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
373 {
374   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
375   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
376   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
377   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
378   int          ridx,cidx,bs2=a->bs2;
379   Scalar      *ap,value,*aa=a->a,*bap;
380 
381   for ( k=0; k<m; k++ ) { /* loop over added rows */
382     row  = im[k]; brow = row/bs;
383 #if defined(PETSC_BOPT_g)
384     if (row < 0) SETERRQ(1,0,"Negative row");
385     if (row >= a->m) SETERRQ(1,0,"Row too large");
386 #endif
387     rp   = aj + ai[brow];
388     ap   = aa + bs2*ai[brow];
389     rmax = imax[brow];
390     nrow = ailen[brow];
391     low  = 0;
392     for ( l=0; l<n; l++ ) { /* loop over added columns */
393 #if defined(PETSC_BOPT_g)
394       if (in[l] < 0) SETERRQ(1,0,"Negative column");
395       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
396 #endif
397       col = in[l]; bcol = col/bs;
398       ridx = row % bs; cidx = col % bs;
399       if (roworiented) {
400         value = *v++;
401       }
402       else {
403         value = v[k + l*m];
404       }
405       if (!sorted) low = 0; high = nrow;
406       while (high-low > 7) {
407         t = (low+high)/2;
408         if (rp[t] > bcol) high = t;
409         else              low  = t;
410       }
411       for ( i=low; i<high; i++ ) {
412         if (rp[i] > bcol) break;
413         if (rp[i] == bcol) {
414           bap  = ap +  bs2*i + bs*cidx + ridx;
415           if (is == ADD_VALUES) *bap += value;
416           else                  *bap  = value;
417           goto noinsert;
418         }
419       }
420       if (nonew) goto noinsert;
421       if (nrow >= rmax) {
422         /* there is no extra room in row, therefore enlarge */
423         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
424         Scalar *new_a;
425 
426         /* malloc new storage space */
427         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
428         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
429         new_j   = (int *) (new_a + bs2*new_nz);
430         new_i   = new_j + new_nz;
431 
432         /* copy over old data into new slots */
433         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
434         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
435         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
436         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
437         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
438                                                            len*sizeof(int));
439         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
440         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
441         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
442                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
443         /* free up old matrix storage */
444         PetscFree(a->a);
445         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
446         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
447         a->singlemalloc = 1;
448 
449         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
450         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
451         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
452         a->maxnz += bs2*CHUNKSIZE;
453         a->reallocs++;
454         a->nz++;
455       }
456       N = nrow++ - 1;
457       /* shift up all the later entries in this row */
458       for ( ii=N; ii>=i; ii-- ) {
459         rp[ii+1] = rp[ii];
460         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
461       }
462       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
463       rp[i]                      = bcol;
464       ap[bs2*i + bs*cidx + ridx] = value;
465       noinsert:;
466       low = i;
467     }
468     ailen[brow] = nrow;
469   }
470   return 0;
471 }
472 
473 #undef __FUNC__
474 #define __FUNC__ "MatSetValues_SeqBAIJ"
475 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
476 {
477   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
478   int         *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
479   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen;
480   int         roworiented=a->roworiented;
481   int         *aj=a->j,nonew=a->nonew;
482   int          bs2=a->bs2,bs=a->bs,stepval;
483   Scalar      *ap,*value=v,*aa=a->a,*bap;
484 
485   if (roworiented) {
486     stepval = (n-1)*bs;
487   } else {
488     stepval = (m-1)*bs;
489   }
490   for ( k=0; k<m; k++ ) { /* loop over added rows */
491     row  = im[k];
492 #if defined(PETSC_BOPT_g)
493     if (row < 0) SETERRQ(1,0,"Negative row");
494     if (row >= a->mbs) SETERRQ(1,0,"Row too large");
495 #endif
496     rp   = aj + ai[row];
497     ap   = aa + bs2*ai[row];
498     rmax = imax[row];
499     nrow = ailen[row];
500     low  = 0;
501     for ( l=0; l<n; l++ ) { /* loop over added columns */
502 #if defined(PETSC_BOPT_g)
503       if (in[l] < 0) SETERRQ(1,0,"Negative column");
504       if (in[l] >= a->nbs) SETERRQ(1,0,"Column too large");
505 #endif
506       col = in[l];
507       if (roworiented) {
508         value = v + k*(stepval+bs)*bs + l*bs;
509       } else {
510         value = v + l*(stepval+bs)*bs + k*bs;
511       }
512       if (!sorted) low = 0; high = nrow;
513       while (high-low > 7) {
514         t = (low+high)/2;
515         if (rp[t] > col) high = t;
516         else             low  = t;
517       }
518       for ( i=low; i<high; i++ ) {
519         if (rp[i] > col) break;
520         if (rp[i] == col) {
521           bap  = ap +  bs2*i;
522           if (roworiented) {
523             if (is == ADD_VALUES) {
524               for ( ii=0; ii<bs; ii++,value+=stepval )
525                 for (jj=ii; jj<bs2; jj+=bs )
526                   bap[jj] += *value++;
527             } else {
528               for ( ii=0; ii<bs; ii++,value+=stepval )
529                 for (jj=ii; jj<bs2; jj+=bs )
530                   bap[jj] = *value++;
531             }
532           } else {
533             if (is == ADD_VALUES) {
534               for ( ii=0; ii<bs; ii++,value+=stepval )
535                 for (jj=0; jj<bs; jj++ )
536                   *bap++ += *value++;
537             } else {
538               for ( ii=0; ii<bs; ii++,value+=stepval )
539                 for (jj=0; jj<bs; jj++ )
540                   *bap++  = *value++;
541             }
542           }
543           goto noinsert;
544         }
545       }
546       if (nonew) goto noinsert;
547       if (nrow >= rmax) {
548         /* there is no extra room in row, therefore enlarge */
549         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
550         Scalar *new_a;
551 
552         /* malloc new storage space */
553         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
554         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
555         new_j   = (int *) (new_a + bs2*new_nz);
556         new_i   = new_j + new_nz;
557 
558         /* copy over old data into new slots */
559         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];}
560         for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
561         PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int));
562         len = (new_nz - CHUNKSIZE - ai[row] - nrow);
563         PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,
564                                                            len*sizeof(int));
565         PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar));
566         PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
567         PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),
568                     aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar));
569         /* free up old matrix storage */
570         PetscFree(a->a);
571         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
572         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
573         a->singlemalloc = 1;
574 
575         rp   = aj + ai[row]; ap = aa + bs2*ai[row];
576         rmax = imax[row] = imax[row] + CHUNKSIZE;
577         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
578         a->maxnz += bs2*CHUNKSIZE;
579         a->reallocs++;
580         a->nz++;
581       }
582       N = nrow++ - 1;
583       /* shift up all the later entries in this row */
584       for ( ii=N; ii>=i; ii-- ) {
585         rp[ii+1] = rp[ii];
586         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
587       }
588       if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
589       rp[i] = col;
590       bap   = ap +  bs2*i;
591       if (roworiented) {
592         for ( ii=0; ii<bs; ii++,value+=stepval)
593           for (jj=ii; jj<bs2; jj+=bs )
594             bap[jj] = *value++;
595       } else {
596         for ( ii=0; ii<bs; ii++,value+=stepval )
597           for (jj=0; jj<bs; jj++ )
598             *bap++  = *value++;
599       }
600       noinsert:;
601       low = i;
602     }
603     ailen[row] = nrow;
604   }
605   return 0;
606 }
607 
608 #undef __FUNC__
609 #define __FUNC__ "MatGetSize_SeqBAIJ"
610 static int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
611 {
612   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
613   *m = a->m; *n = a->n;
614   return 0;
615 }
616 
617 #undef __FUNC__
618 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ"
619 static int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
620 {
621   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
622   *m = 0; *n = a->m;
623   return 0;
624 }
625 
626 #undef __FUNC__
627 #define __FUNC__ "MatGetRow_SeqBAIJ"
628 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
629 {
630   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
631   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
632   Scalar      *aa,*v_i,*aa_i;
633 
634   bs  = a->bs;
635   ai  = a->i;
636   aj  = a->j;
637   aa  = a->a;
638   bs2 = a->bs2;
639 
640   if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range");
641 
642   bn  = row/bs;   /* Block number */
643   bp  = row % bs; /* Block Position */
644   M   = ai[bn+1] - ai[bn];
645   *nz = bs*M;
646 
647   if (v) {
648     *v = 0;
649     if (*nz) {
650       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
651       for ( i=0; i<M; i++ ) { /* for each block in the block row */
652         v_i  = *v + i*bs;
653         aa_i = aa + bs2*(ai[bn] + i);
654         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
655       }
656     }
657   }
658 
659   if (idx) {
660     *idx = 0;
661     if (*nz) {
662       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
663       for ( i=0; i<M; i++ ) { /* for each block in the block row */
664         idx_i = *idx + i*bs;
665         itmp  = bs*aj[ai[bn] + i];
666         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
667       }
668     }
669   }
670   return 0;
671 }
672 
673 #undef __FUNC__
674 #define __FUNC__ "MatRestoreRow_SeqBAIJ"
675 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
676 {
677   if (idx) {if (*idx) PetscFree(*idx);}
678   if (v)   {if (*v)   PetscFree(*v);}
679   return 0;
680 }
681 
682 #undef __FUNC__
683 #define __FUNC__ "MatTranspose_SeqBAIJ"
684 static int MatTranspose_SeqBAIJ(Mat A,Mat *B)
685 {
686   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
687   Mat         C;
688   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
689   int         *rows,*cols,bs2=a->bs2;
690   Scalar      *array=a->a;
691 
692   if (B==PETSC_NULL && mbs!=nbs)
693     SETERRQ(1,0,"Square matrix only for in-place");
694   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
695   PetscMemzero(col,(1+nbs)*sizeof(int));
696 
697   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
698   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
699   PetscFree(col);
700   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
701   cols = rows + bs;
702   for ( i=0; i<mbs; i++ ) {
703     cols[0] = i*bs;
704     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
705     len = ai[i+1] - ai[i];
706     for ( j=0; j<len; j++ ) {
707       rows[0] = (*aj++)*bs;
708       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
709       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
710       array += bs2;
711     }
712   }
713   PetscFree(rows);
714 
715   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
716   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
717 
718   if (B != PETSC_NULL) {
719     *B = C;
720   } else {
721     /* This isn't really an in-place transpose */
722     PetscFree(a->a);
723     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
724     if (a->diag) PetscFree(a->diag);
725     if (a->ilen) PetscFree(a->ilen);
726     if (a->imax) PetscFree(a->imax);
727     if (a->solve_work) PetscFree(a->solve_work);
728     PetscFree(a);
729     PetscMemcpy(A,C,sizeof(struct _Mat));
730     PetscHeaderDestroy(C);
731   }
732   return 0;
733 }
734 
735 
736 #undef __FUNC__
737 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ"
738 static int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
739 {
740   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
741   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
742   int        m = a->m,*ip, N, *ailen = a->ilen;
743   int        mbs = a->mbs, bs2 = a->bs2,rmax;
744   Scalar     *aa = a->a, *ap;
745 
746   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
747 
748   rmax = ailen[0];
749   for ( i=1; i<mbs; i++ ) {
750     /* move each row back by the amount of empty slots (fshift) before it*/
751     fshift += imax[i-1] - ailen[i-1];
752     rmax   = PetscMax(rmax,ailen[i]);
753     if (fshift) {
754       ip = aj + ai[i]; ap = aa + bs2*ai[i];
755       N = ailen[i];
756       for ( j=0; j<N; j++ ) {
757         ip[j-fshift] = ip[j];
758         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
759       }
760     }
761     ai[i] = ai[i-1] + ailen[i-1];
762   }
763   if (mbs) {
764     fshift += imax[mbs-1] - ailen[mbs-1];
765     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
766   }
767   /* reset ilen and imax for each row */
768   for ( i=0; i<mbs; i++ ) {
769     ailen[i] = imax[i] = ai[i+1] - ai[i];
770   }
771   a->nz = ai[mbs];
772 
773   /* diagonals may have moved, so kill the diagonal pointers */
774   if (fshift && a->diag) {
775     PetscFree(a->diag);
776     PLogObjectMemory(A,-(m+1)*sizeof(int));
777     a->diag = 0;
778   }
779   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n",
780            m,a->n,a->bs,fshift*bs2,a->nz*bs2);
781   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n",
782            a->reallocs);
783   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax);
784   a->reallocs          = 0;
785   A->info.nz_unneeded  = (double)fshift*bs2;
786 
787   return 0;
788 }
789 
790 #undef __FUNC__
791 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
792 static int MatZeroEntries_SeqBAIJ(Mat A)
793 {
794   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
795   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
796   return 0;
797 }
798 
799 #undef __FUNC__
800 #define __FUNC__ "MatDestroy_SeqBAIJ"
801 int MatDestroy_SeqBAIJ(PetscObject obj)
802 {
803   Mat         A  = (Mat) obj;
804   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
805   int         ierr;
806 
807 #if defined(PETSC_LOG)
808   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
809 #endif
810   PetscFree(a->a);
811   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
812   if (a->diag) PetscFree(a->diag);
813   if (a->ilen) PetscFree(a->ilen);
814   if (a->imax) PetscFree(a->imax);
815   if (a->solve_work) PetscFree(a->solve_work);
816   if (a->mult_work) PetscFree(a->mult_work);
817   PetscFree(a);
818   if (A->mapping) {
819     ierr = ISLocalToGlobalMappingDestroy(A->mapping); CHKERRQ(ierr);
820   }
821   PLogObjectDestroy(A);
822   PetscHeaderDestroy(A);
823   return 0;
824 }
825 
826 #undef __FUNC__
827 #define __FUNC__ "MatSetOption_SeqBAIJ"
828 static int MatSetOption_SeqBAIJ(Mat A,MatOption op)
829 {
830   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
831   if      (op == MAT_ROW_ORIENTED)              a->roworiented = 1;
832   else if (op == MAT_COLUMN_ORIENTED)           a->roworiented = 0;
833   else if (op == MAT_COLUMNS_SORTED)            a->sorted      = 1;
834   else if (op == MAT_COLUMNS_UNSORTED)          a->sorted      = 0;
835   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)  a->nonew       = 1;
836   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS) a->nonew       = 0;
837   else if (op == MAT_ROWS_SORTED ||
838            op == MAT_ROWS_UNSORTED ||
839            op == MAT_SYMMETRIC ||
840            op == MAT_STRUCTURALLY_SYMMETRIC ||
841            op == MAT_YES_NEW_DIAGONALS ||
842            op == MAT_IGNORE_OFF_PROCESSOR_ENTRIES)
843     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
844   else if (op == MAT_NO_NEW_DIAGONALS)
845     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
846   else
847     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
848   return 0;
849 }
850 
851 
852 /* -------------------------------------------------------*/
853 /* Should check that shapes of vectors and matrices match */
854 /* -------------------------------------------------------*/
855 #include "pinclude/plapack.h"
856 
857 #undef __FUNC__
858 #define __FUNC__ "MatMult_SeqBAIJ_1"
859 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
860 {
861   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
862   register Scalar *x,*z,*v,sum;
863   int             mbs=a->mbs,i,*idx,*ii,n;
864 
865   VecGetArray_Fast(xx,x);
866   VecGetArray_Fast(zz,z);
867 
868   idx   = a->j;
869   v     = a->a;
870   ii    = a->i;
871 
872   for ( i=0; i<mbs; i++ ) {
873     n    = ii[1] - ii[0]; ii++;
874     sum  = 0.0;
875     while (n--) sum += *v++ * x[*idx++];
876     z[i] = sum;
877   }
878   VecRestoreArray_Fast(xx,x);
879   VecRestoreArray_Fast(zz,z);
880   PLogFlops(2*a->nz - a->m);
881   return 0;
882 }
883 
884 #undef __FUNC__
885 #define __FUNC__ "MatMult_SeqBAIJ_2"
886 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
887 {
888   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
889   register Scalar *x,*z,*v,*xb,sum1,sum2;
890   register Scalar x1,x2;
891   int             mbs=a->mbs,i,*idx,*ii;
892   int             j,n;
893 
894   VecGetArray_Fast(xx,x);
895   VecGetArray_Fast(zz,z);
896 
897   idx   = a->j;
898   v     = a->a;
899   ii    = a->i;
900 
901   for ( i=0; i<mbs; i++ ) {
902     n  = ii[1] - ii[0]; ii++;
903     sum1 = 0.0; sum2 = 0.0;
904     for ( j=0; j<n; j++ ) {
905       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
906       sum1 += v[0]*x1 + v[2]*x2;
907       sum2 += v[1]*x1 + v[3]*x2;
908       v += 4;
909     }
910     z[0] = sum1; z[1] = sum2;
911     z += 2;
912   }
913   VecRestoreArray_Fast(xx,x);
914   VecRestoreArray_Fast(zz,z);
915   PLogFlops(4*a->nz - a->m);
916   return 0;
917 }
918 
919 #undef __FUNC__
920 #define __FUNC__ "MatMult_SeqBAIJ_3"
921 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
922 {
923   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
924   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
925   int             mbs=a->mbs,i,*idx,*ii,j,n;
926 
927   VecGetArray_Fast(xx,x);
928   VecGetArray_Fast(zz,z);
929 
930   idx   = a->j;
931   v     = a->a;
932   ii    = a->i;
933 
934   for ( i=0; i<mbs; i++ ) {
935     n  = ii[1] - ii[0]; ii++;
936     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
937     for ( j=0; j<n; j++ ) {
938       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
939       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
940       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
941       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
942       v += 9;
943     }
944     z[0] = sum1; z[1] = sum2; z[2] = sum3;
945     z += 3;
946   }
947   VecRestoreArray_Fast(xx,x);
948   VecRestoreArray_Fast(zz,z);
949   PLogFlops(18*a->nz - a->m);
950   return 0;
951 }
952 
953 #undef __FUNC__
954 #define __FUNC__ "MatMult_SeqBAIJ_4"
955 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
956 {
957   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
958   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
959   register Scalar x1,x2,x3,x4;
960   int             mbs=a->mbs,i,*idx,*ii;
961   int             j,n;
962 
963   VecGetArray_Fast(xx,x);
964   VecGetArray_Fast(zz,z);
965 
966   idx   = a->j;
967   v     = a->a;
968   ii    = a->i;
969 
970   for ( i=0; i<mbs; i++ ) {
971     n  = ii[1] - ii[0]; ii++;
972     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
973     for ( j=0; j<n; j++ ) {
974       xb = x + 4*(*idx++);
975       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
976       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
977       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
978       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
979       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
980       v += 16;
981     }
982     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
983     z += 4;
984   }
985   VecRestoreArray_Fast(xx,x);
986   VecRestoreArray_Fast(zz,z);
987   PLogFlops(32*a->nz - a->m);
988   return 0;
989 }
990 
991 #undef __FUNC__
992 #define __FUNC__ "MatMult_SeqBAIJ_5"
993 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
994 {
995   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
996   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
997   register Scalar x1,x2,x3,x4,x5;
998   int             mbs=a->mbs,i,*idx,*ii,j,n;
999 
1000   VecGetArray_Fast(xx,x);
1001   VecGetArray_Fast(zz,z);
1002 
1003   idx   = a->j;
1004   v     = a->a;
1005   ii    = a->i;
1006 
1007   for ( i=0; i<mbs; i++ ) {
1008     n  = ii[1] - ii[0]; ii++;
1009     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
1010     for ( j=0; j<n; j++ ) {
1011       xb = x + 5*(*idx++);
1012       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1013       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1014       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1015       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1016       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1017       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1018       v += 25;
1019     }
1020     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1021     z += 5;
1022   }
1023   VecRestoreArray_Fast(xx,x);
1024   VecRestoreArray_Fast(zz,z);
1025   PLogFlops(50*a->nz - a->m);
1026   return 0;
1027 }
1028 
1029 #undef __FUNC__
1030 #define __FUNC__ "MatMult_SeqBAIJ_7"
1031 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1032 {
1033   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1034   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1035   register Scalar x1,x2,x3,x4,x5,x6,x7;
1036   int             mbs=a->mbs,i,*idx,*ii,j,n;
1037 
1038   VecGetArray_Fast(xx,x);
1039   VecGetArray_Fast(zz,z);
1040 
1041   idx   = a->j;
1042   v     = a->a;
1043   ii    = a->i;
1044 
1045   for ( i=0; i<mbs; i++ ) {
1046     n  = ii[1] - ii[0]; ii++;
1047     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1048     for ( j=0; j<n; j++ ) {
1049       xb = x + 7*(*idx++);
1050       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1051       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1052       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1053       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1054       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1055       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1056       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1057       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1058       v += 49;
1059     }
1060     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1061     z += 7;
1062   }
1063 
1064   VecRestoreArray_Fast(xx,x);
1065   VecRestoreArray_Fast(zz,z);
1066   PLogFlops(98*a->nz - a->m);
1067   return 0;
1068 }
1069 
1070 #undef __FUNC__
1071 #define __FUNC__ "MatMult_SeqBAIJ_N"
1072 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1073 {
1074   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1075   register Scalar *x,*z,*v,*xb;
1076   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
1077   int             _One = 1,ncols,k;
1078   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
1079 
1080   VecGetArray_Fast(xx,x);
1081   VecGetArray_Fast(zz,z);
1082 
1083   idx   = a->j;
1084   v     = a->a;
1085   ii    = a->i;
1086 
1087 
1088   if (!a->mult_work) {
1089     k = PetscMax(a->m,a->n);
1090     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1091   }
1092   work = a->mult_work;
1093   for ( i=0; i<mbs; i++ ) {
1094     n     = ii[1] - ii[0]; ii++;
1095     ncols = n*bs;
1096     workt = work;
1097     for ( j=0; j<n; j++ ) {
1098       xb = x + bs*(*idx++);
1099       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1100       workt += bs;
1101     }
1102     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
1103     v += n*bs2;
1104     z += bs;
1105   }
1106   VecRestoreArray_Fast(xx,x);
1107   VecRestoreArray_Fast(zz,z);
1108   PLogFlops(2*a->nz*bs2 - a->m);
1109   return 0;
1110 }
1111 
1112 #undef __FUNC__
1113 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
1114 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1115 {
1116   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1117   register Scalar *x,*y,*z,*v,sum;
1118   int             mbs=a->mbs,i,*idx,*ii,n;
1119 
1120   VecGetArray_Fast(xx,x);
1121   VecGetArray_Fast(yy,y);
1122   VecGetArray_Fast(zz,z);
1123 
1124   idx   = a->j;
1125   v     = a->a;
1126   ii    = a->i;
1127 
1128   for ( i=0; i<mbs; i++ ) {
1129     n    = ii[1] - ii[0]; ii++;
1130     sum  = y[i];
1131     while (n--) sum += *v++ * x[*idx++];
1132     z[i] = sum;
1133   }
1134   VecRestoreArray_Fast(xx,x);
1135   VecRestoreArray_Fast(yy,y);
1136   VecRestoreArray_Fast(zz,z);
1137   PLogFlops(2*a->nz);
1138   return 0;
1139 }
1140 
1141 #undef __FUNC__
1142 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
1143 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1144 {
1145   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1146   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
1147   register Scalar x1,x2;
1148   int             mbs=a->mbs,i,*idx,*ii;
1149   int             j,n;
1150 
1151   VecGetArray_Fast(xx,x);
1152   VecGetArray_Fast(yy,y);
1153   VecGetArray_Fast(zz,z);
1154 
1155   idx   = a->j;
1156   v     = a->a;
1157   ii    = a->i;
1158 
1159   for ( i=0; i<mbs; i++ ) {
1160     n  = ii[1] - ii[0]; ii++;
1161     sum1 = y[0]; sum2 = y[1];
1162     for ( j=0; j<n; j++ ) {
1163       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1164       sum1 += v[0]*x1 + v[2]*x2;
1165       sum2 += v[1]*x1 + v[3]*x2;
1166       v += 4;
1167     }
1168     z[0] = sum1; z[1] = sum2;
1169     z += 2; y += 2;
1170   }
1171   VecRestoreArray_Fast(xx,x);
1172   VecRestoreArray_Fast(yy,y);
1173   VecRestoreArray_Fast(zz,z);
1174   PLogFlops(4*a->nz);
1175   return 0;
1176 }
1177 
1178 #undef __FUNC__
1179 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
1180 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1181 {
1182   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1183   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
1184   int             mbs=a->mbs,i,*idx,*ii,j,n;
1185 
1186   VecGetArray_Fast(xx,x);
1187   VecGetArray_Fast(yy,y);
1188   VecGetArray_Fast(zz,z);
1189 
1190   idx   = a->j;
1191   v     = a->a;
1192   ii    = a->i;
1193 
1194   for ( i=0; i<mbs; i++ ) {
1195     n  = ii[1] - ii[0]; ii++;
1196     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1197     for ( j=0; j<n; j++ ) {
1198       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1199       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1200       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1201       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1202       v += 9;
1203     }
1204     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1205     z += 3; y += 3;
1206   }
1207   VecRestoreArray_Fast(xx,x);
1208   VecRestoreArray_Fast(yy,y);
1209   VecRestoreArray_Fast(zz,z);
1210   PLogFlops(18*a->nz);
1211   return 0;
1212 }
1213 
1214 #undef __FUNC__
1215 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
1216 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1217 {
1218   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1219   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
1220   register Scalar x1,x2,x3,x4;
1221   int             mbs=a->mbs,i,*idx,*ii;
1222   int             j,n;
1223 
1224   VecGetArray_Fast(xx,x);
1225   VecGetArray_Fast(yy,y);
1226   VecGetArray_Fast(zz,z);
1227 
1228   idx   = a->j;
1229   v     = a->a;
1230   ii    = a->i;
1231 
1232   for ( i=0; i<mbs; i++ ) {
1233     n  = ii[1] - ii[0]; ii++;
1234     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1235     for ( j=0; j<n; j++ ) {
1236       xb = x + 4*(*idx++);
1237       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1238       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1239       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1240       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1241       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1242       v += 16;
1243     }
1244     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1245     z += 4; y += 4;
1246   }
1247   VecRestoreArray_Fast(xx,x);
1248   VecRestoreArray_Fast(yy,y);
1249   VecRestoreArray_Fast(zz,z);
1250   PLogFlops(32*a->nz);
1251   return 0;
1252 }
1253 
1254 #undef __FUNC__
1255 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
1256 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1257 {
1258   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1259   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1260   register Scalar x1,x2,x3,x4,x5;
1261   int             mbs=a->mbs,i,*idx,*ii,j,n;
1262 
1263   VecGetArray_Fast(xx,x);
1264   VecGetArray_Fast(yy,y);
1265   VecGetArray_Fast(zz,z);
1266 
1267   idx   = a->j;
1268   v     = a->a;
1269   ii    = a->i;
1270 
1271   for ( i=0; i<mbs; i++ ) {
1272     n  = ii[1] - ii[0]; ii++;
1273     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1274     for ( j=0; j<n; j++ ) {
1275       xb = x + 5*(*idx++);
1276       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1277       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1278       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1279       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1280       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1281       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1282       v += 25;
1283     }
1284     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1285     z += 5; y += 5;
1286   }
1287   VecRestoreArray_Fast(xx,x);
1288   VecRestoreArray_Fast(yy,y);
1289   VecRestoreArray_Fast(zz,z);
1290   PLogFlops(50*a->nz);
1291   return 0;
1292 }
1293 
1294 #undef __FUNC__
1295 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
1296 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1297 {
1298   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1299   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1300   register Scalar x1,x2,x3,x4,x5,x6,x7;
1301   int             mbs=a->mbs,i,*idx,*ii,j,n;
1302 
1303   VecGetArray_Fast(xx,x);
1304   VecGetArray_Fast(yy,y);
1305   VecGetArray_Fast(zz,z);
1306 
1307   idx   = a->j;
1308   v     = a->a;
1309   ii    = a->i;
1310 
1311   for ( i=0; i<mbs; i++ ) {
1312     n  = ii[1] - ii[0]; ii++;
1313     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1314     for ( j=0; j<n; j++ ) {
1315       xb = x + 7*(*idx++);
1316       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1317       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1318       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1319       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1320       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1321       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1322       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1323       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1324       v += 49;
1325     }
1326     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1327     z += 7; y += 7;
1328   }
1329   VecRestoreArray_Fast(xx,x);
1330   VecRestoreArray_Fast(yy,y);
1331   VecRestoreArray_Fast(zz,z);
1332   PLogFlops(98*a->nz);
1333   return 0;
1334 }
1335 
1336 
1337 #undef __FUNC__
1338 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
1339 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1340 {
1341   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1342   register Scalar *x,*z,*v,*xb;
1343   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
1344   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1345 
1346   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1347 
1348   VecGetArray_Fast(xx,x);
1349   VecGetArray_Fast(zz,z);
1350 
1351   idx   = a->j;
1352   v     = a->a;
1353   ii    = a->i;
1354 
1355 
1356   if (!a->mult_work) {
1357     k = PetscMax(a->m,a->n);
1358     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
1359   }
1360   work = a->mult_work;
1361   for ( i=0; i<mbs; i++ ) {
1362     n     = ii[1] - ii[0]; ii++;
1363     ncols = n*bs;
1364     workt = work;
1365     for ( j=0; j<n; j++ ) {
1366       xb = x + bs*(*idx++);
1367       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1368       workt += bs;
1369     }
1370     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
1371     v += n*bs2;
1372     z += bs;
1373   }
1374   VecRestoreArray_Fast(xx,x);
1375   VecRestoreArray_Fast(zz,z);
1376   PLogFlops(2*a->nz*bs2 );
1377   return 0;
1378 }
1379 
1380 #undef __FUNC__
1381 #define __FUNC__ "MatMultTrans_SeqBAIJ"
1382 static int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
1383 {
1384   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1385   Scalar          *xg,*zg,*zb;
1386   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1387   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1388   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1389 
1390 
1391   VecGetArray_Fast(xx,xg); x = xg;
1392   VecGetArray_Fast(zz,zg); z = zg;
1393   PetscMemzero(z,N*sizeof(Scalar));
1394 
1395   idx   = a->j;
1396   v     = a->a;
1397   ii    = a->i;
1398 
1399   switch (bs) {
1400   case 1:
1401     for ( i=0; i<mbs; i++ ) {
1402       n  = ii[1] - ii[0]; ii++;
1403       xb = x + i; x1 = xb[0];
1404       ib = idx + ai[i];
1405       for ( j=0; j<n; j++ ) {
1406         rval    = ib[j];
1407         z[rval] += *v++ * x1;
1408       }
1409     }
1410     break;
1411   case 2:
1412     for ( i=0; i<mbs; i++ ) {
1413       n  = ii[1] - ii[0]; ii++;
1414       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1415       ib = idx + ai[i];
1416       for ( j=0; j<n; j++ ) {
1417         rval      = ib[j]*2;
1418         z[rval++] += v[0]*x1 + v[1]*x2;
1419         z[rval++] += v[2]*x1 + v[3]*x2;
1420         v += 4;
1421       }
1422     }
1423     break;
1424   case 3:
1425     for ( i=0; i<mbs; i++ ) {
1426       n  = ii[1] - ii[0]; ii++;
1427       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1428       ib = idx + ai[i];
1429       for ( j=0; j<n; j++ ) {
1430         rval      = ib[j]*3;
1431         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1432         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1433         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1434         v += 9;
1435       }
1436     }
1437     break;
1438   case 4:
1439     for ( i=0; i<mbs; i++ ) {
1440       n  = ii[1] - ii[0]; ii++;
1441       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1442       ib = idx + ai[i];
1443       for ( j=0; j<n; j++ ) {
1444         rval      = ib[j]*4;
1445         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1446         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1447         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1448         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1449         v += 16;
1450       }
1451     }
1452     break;
1453   case 5:
1454     for ( i=0; i<mbs; i++ ) {
1455       n  = ii[1] - ii[0]; ii++;
1456       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1457       x4 = xb[3];   x5 = xb[4];
1458       ib = idx + ai[i];
1459       for ( j=0; j<n; j++ ) {
1460         rval      = ib[j]*5;
1461         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1462         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1463         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1464         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1465         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1466         v += 25;
1467       }
1468     }
1469     break;
1470       /* block sizes larger then 5 by 5 are handled by BLAS */
1471     default: {
1472       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1473       if (!a->mult_work) {
1474         k = PetscMax(a->m,a->n);
1475         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1476         CHKPTRQ(a->mult_work);
1477       }
1478       work = a->mult_work;
1479       for ( i=0; i<mbs; i++ ) {
1480         n     = ii[1] - ii[0]; ii++;
1481         ncols = n*bs;
1482         PetscMemzero(work,ncols*sizeof(Scalar));
1483         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1484         v += n*bs2;
1485         x += bs;
1486         workt = work;
1487         for ( j=0; j<n; j++ ) {
1488           zb = z + bs*(*idx++);
1489           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1490           workt += bs;
1491         }
1492       }
1493     }
1494   }
1495   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1496   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1497   PLogFlops(2*a->nz*a->bs2 - a->n);
1498   return 0;
1499 }
1500 
1501 #undef __FUNC__
1502 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1503 static int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1504 {
1505   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1506   Scalar          *xg,*zg,*zb;
1507   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1508   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1509   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1510 
1511 
1512 
1513   VecGetArray_Fast(xx,xg); x = xg;
1514   VecGetArray_Fast(zz,zg); z = zg;
1515 
1516   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1517   else PetscMemzero(z,N*sizeof(Scalar));
1518 
1519 
1520   idx   = a->j;
1521   v     = a->a;
1522   ii    = a->i;
1523 
1524   switch (bs) {
1525   case 1:
1526     for ( i=0; i<mbs; i++ ) {
1527       n  = ii[1] - ii[0]; ii++;
1528       xb = x + i; x1 = xb[0];
1529       ib = idx + ai[i];
1530       for ( j=0; j<n; j++ ) {
1531         rval    = ib[j];
1532         z[rval] += *v++ * x1;
1533       }
1534     }
1535     break;
1536   case 2:
1537     for ( i=0; i<mbs; i++ ) {
1538       n  = ii[1] - ii[0]; ii++;
1539       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1540       ib = idx + ai[i];
1541       for ( j=0; j<n; j++ ) {
1542         rval      = ib[j]*2;
1543         z[rval++] += v[0]*x1 + v[1]*x2;
1544         z[rval++] += v[2]*x1 + v[3]*x2;
1545         v += 4;
1546       }
1547     }
1548     break;
1549   case 3:
1550     for ( i=0; i<mbs; i++ ) {
1551       n  = ii[1] - ii[0]; ii++;
1552       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1553       ib = idx + ai[i];
1554       for ( j=0; j<n; j++ ) {
1555         rval      = ib[j]*3;
1556         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1557         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1558         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1559         v += 9;
1560       }
1561     }
1562     break;
1563   case 4:
1564     for ( i=0; i<mbs; i++ ) {
1565       n  = ii[1] - ii[0]; ii++;
1566       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1567       ib = idx + ai[i];
1568       for ( j=0; j<n; j++ ) {
1569         rval      = ib[j]*4;
1570         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1571         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1572         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1573         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1574         v += 16;
1575       }
1576     }
1577     break;
1578   case 5:
1579     for ( i=0; i<mbs; i++ ) {
1580       n  = ii[1] - ii[0]; ii++;
1581       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1582       x4 = xb[3];   x5 = xb[4];
1583       ib = idx + ai[i];
1584       for ( j=0; j<n; j++ ) {
1585         rval      = ib[j]*5;
1586         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1587         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1588         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1589         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1590         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1591         v += 25;
1592       }
1593     }
1594     break;
1595       /* block sizes larger then 5 by 5 are handled by BLAS */
1596     default: {
1597       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1598       if (!a->mult_work) {
1599         k = PetscMax(a->m,a->n);
1600         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1601         CHKPTRQ(a->mult_work);
1602       }
1603       work = a->mult_work;
1604       for ( i=0; i<mbs; i++ ) {
1605         n     = ii[1] - ii[0]; ii++;
1606         ncols = n*bs;
1607         PetscMemzero(work,ncols*sizeof(Scalar));
1608         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1609         v += n*bs2;
1610         x += bs;
1611         workt = work;
1612         for ( j=0; j<n; j++ ) {
1613           zb = z + bs*(*idx++);
1614           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1615           workt += bs;
1616         }
1617       }
1618     }
1619   }
1620   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1621   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1622   PLogFlops(2*a->nz*a->bs2);
1623   return 0;
1624 }
1625 
1626 #undef __FUNC__
1627 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1628 static int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1629 {
1630   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1631 
1632   info->rows_global    = (double)a->m;
1633   info->columns_global = (double)a->n;
1634   info->rows_local     = (double)a->m;
1635   info->columns_local  = (double)a->n;
1636   info->block_size     = a->bs2;
1637   info->nz_allocated   = a->maxnz;
1638   info->nz_used        = a->bs2*a->nz;
1639   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1640   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1641     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1642   info->assemblies   = A->num_ass;
1643   info->mallocs      = a->reallocs;
1644   info->memory       = A->mem;
1645   if (A->factor) {
1646     info->fill_ratio_given  = A->info.fill_ratio_given;
1647     info->fill_ratio_needed = A->info.fill_ratio_needed;
1648     info->factor_mallocs    = A->info.factor_mallocs;
1649   } else {
1650     info->fill_ratio_given  = 0;
1651     info->fill_ratio_needed = 0;
1652     info->factor_mallocs    = 0;
1653   }
1654   return 0;
1655 }
1656 
1657 #undef __FUNC__
1658 #define __FUNC__ "MatEqual_SeqBAIJ"
1659 static int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1660 {
1661   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1662 
1663   if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type");
1664 
1665   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1666   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1667       (a->nz != b->nz)) {
1668     *flg = PETSC_FALSE; return 0;
1669   }
1670 
1671   /* if the a->i are the same */
1672   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1673     *flg = PETSC_FALSE; return 0;
1674   }
1675 
1676   /* if a->j are the same */
1677   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1678     *flg = PETSC_FALSE; return 0;
1679   }
1680 
1681   /* if a->a are the same */
1682   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1683     *flg = PETSC_FALSE; return 0;
1684   }
1685   *flg = PETSC_TRUE;
1686   return 0;
1687 
1688 }
1689 
1690 #undef __FUNC__
1691 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1692 static int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1693 {
1694   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1695   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1696   Scalar      *x, zero = 0.0,*aa,*aa_j;
1697 
1698   if (A->factor) SETERRQ(1,0,"Not for factored matrix");
1699   bs   = a->bs;
1700   aa   = a->a;
1701   ai   = a->i;
1702   aj   = a->j;
1703   ambs = a->mbs;
1704   bs2  = a->bs2;
1705 
1706   VecSet(&zero,v);
1707   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1708   if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector");
1709   for ( i=0; i<ambs; i++ ) {
1710     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1711       if (aj[j] == i) {
1712         row  = i*bs;
1713         aa_j = aa+j*bs2;
1714         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1715         break;
1716       }
1717     }
1718   }
1719   return 0;
1720 }
1721 
1722 #undef __FUNC__
1723 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1724 static int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1725 {
1726   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1727   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1728   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1729 
1730   ai  = a->i;
1731   aj  = a->j;
1732   aa  = a->a;
1733   m   = a->m;
1734   n   = a->n;
1735   bs  = a->bs;
1736   mbs = a->mbs;
1737   bs2 = a->bs2;
1738   if (ll) {
1739     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1740     if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length");
1741     for ( i=0; i<mbs; i++ ) { /* for each block row */
1742       M  = ai[i+1] - ai[i];
1743       li = l + i*bs;
1744       v  = aa + bs2*ai[i];
1745       for ( j=0; j<M; j++ ) { /* for each block */
1746         for ( k=0; k<bs2; k++ ) {
1747           (*v++) *= li[k%bs];
1748         }
1749       }
1750     }
1751   }
1752 
1753   if (rr) {
1754     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1755     if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length");
1756     for ( i=0; i<mbs; i++ ) { /* for each block row */
1757       M  = ai[i+1] - ai[i];
1758       v  = aa + bs2*ai[i];
1759       for ( j=0; j<M; j++ ) { /* for each block */
1760         ri = r + bs*aj[ai[i]+j];
1761         for ( k=0; k<bs; k++ ) {
1762           x = ri[k];
1763           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1764         }
1765       }
1766     }
1767   }
1768   return 0;
1769 }
1770 
1771 
1772 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1773 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1774 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1775 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1776 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1777 
1778 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1779 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1780 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1781 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1782 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1783 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1784 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec);
1785 
1786 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1787 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1788 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1789 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1790 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1791 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1792 
1793 #undef __FUNC__
1794 #define __FUNC__ "MatNorm_SeqBAIJ"
1795 static int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1796 {
1797   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1798   Scalar      *v = a->a;
1799   double      sum = 0.0;
1800   int         i,nz=a->nz,bs2=a->bs2;
1801 
1802   if (type == NORM_FROBENIUS) {
1803     for (i=0; i< bs2*nz; i++ ) {
1804 #if defined(PETSC_COMPLEX)
1805       sum += real(conj(*v)*(*v)); v++;
1806 #else
1807       sum += (*v)*(*v); v++;
1808 #endif
1809     }
1810     *norm = sqrt(sum);
1811   }
1812   else {
1813     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1814   }
1815   return 0;
1816 }
1817 
1818 /*
1819      note: This can only work for identity for row and col. It would
1820    be good to check this and otherwise generate an error.
1821 */
1822 #undef __FUNC__
1823 #define __FUNC__ "MatILUFactor_SeqBAIJ"
1824 static int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1825 {
1826   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1827   Mat         outA;
1828   int         ierr;
1829 
1830   if (fill != 0) SETERRQ(1,0,"Only fill=0 supported");
1831 
1832   outA          = inA;
1833   inA->factor   = FACTOR_LU;
1834   a->row        = row;
1835   a->col        = col;
1836 
1837   a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1838 
1839   if (!a->diag) {
1840     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1841   }
1842   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1843   return 0;
1844 }
1845 
1846 #undef __FUNC__
1847 #define __FUNC__ "MatScale_SeqBAIJ"
1848 static int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1849 {
1850   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1851   int         one = 1, totalnz = a->bs2*a->nz;
1852   BLscal_( &totalnz, alpha, a->a, &one );
1853   PLogFlops(totalnz);
1854   return 0;
1855 }
1856 
1857 #undef __FUNC__
1858 #define __FUNC__ "MatGetValues_SeqBAIJ"
1859 static int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1860 {
1861   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1862   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1863   int        *ai = a->i, *ailen = a->ilen;
1864   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1865   Scalar     *ap, *aa = a->a, zero = 0.0;
1866 
1867   for ( k=0; k<m; k++ ) { /* loop over rows */
1868     row  = im[k]; brow = row/bs;
1869     if (row < 0) SETERRQ(1,0,"Negative row");
1870     if (row >= a->m) SETERRQ(1,0,"Row too large");
1871     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1872     nrow = ailen[brow];
1873     for ( l=0; l<n; l++ ) { /* loop over columns */
1874       if (in[l] < 0) SETERRQ(1,0,"Negative column");
1875       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
1876       col  = in[l] ;
1877       bcol = col/bs;
1878       cidx = col%bs;
1879       ridx = row%bs;
1880       high = nrow;
1881       low  = 0; /* assume unsorted */
1882       while (high-low > 5) {
1883         t = (low+high)/2;
1884         if (rp[t] > bcol) high = t;
1885         else             low  = t;
1886       }
1887       for ( i=low; i<high; i++ ) {
1888         if (rp[i] > bcol) break;
1889         if (rp[i] == bcol) {
1890           *v++ = ap[bs2*i+bs*cidx+ridx];
1891           goto finished;
1892         }
1893       }
1894       *v++ = zero;
1895       finished:;
1896     }
1897   }
1898   return 0;
1899 }
1900 
1901 #undef __FUNC__
1902 #define __FUNC__ "MatGetBlockSize_SeqBAIJ"
1903 static int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1904 {
1905   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1906   *bs = baij->bs;
1907   return 0;
1908 }
1909 
1910 /* idx should be of length atlease bs */
1911 #undef __FUNC__
1912 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block"
1913 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1914 {
1915   int i,row;
1916   row = idx[0];
1917   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1918 
1919   for ( i=1; i<bs; i++ ) {
1920     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1921   }
1922   *flg = PETSC_TRUE;
1923   return 0;
1924 }
1925 
1926 #undef __FUNC__
1927 #define __FUNC__ "MatZeroRows_SeqBAIJ"
1928 static int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1929 {
1930   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1931   IS          is_local;
1932   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1933   PetscTruth  flg;
1934   Scalar      zero = 0.0,*aa;
1935 
1936   /* Make a copy of the IS and  sort it */
1937   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1938   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1939   ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1940   ierr = ISSort(is_local); CHKERRQ(ierr);
1941   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1942 
1943   i = 0;
1944   while (i < is_n) {
1945     if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range");
1946     flg = PETSC_FALSE;
1947     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1948     if (flg) { /* There exists a block of rows to be Zerowed */
1949       baij->ilen[rows[i]/bs] = 0;
1950       i += bs;
1951     } else { /* Zero out only the requested row */
1952       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1953       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1954       for ( j=0; j<count; j++ ) {
1955         aa[0] = zero;
1956         aa+=bs;
1957       }
1958       i++;
1959     }
1960   }
1961   if (diag) {
1962     for ( j=0; j<is_n; j++ ) {
1963       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1964     }
1965   }
1966   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
1967   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
1968   ierr = ISDestroy(is_local); CHKERRQ(ierr);
1969   ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1970 
1971   return 0;
1972 }
1973 
1974 #undef __FUNC__
1975 #define __FUNC__ "MatPrintHelp_SeqBAIJ"
1976 int MatPrintHelp_SeqBAIJ(Mat A)
1977 {
1978   static int called = 0;
1979   MPI_Comm   comm = A->comm;
1980 
1981   if (called) return 0; else called = 1;
1982   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
1983   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
1984   return 0;
1985 }
1986 
1987 /* -------------------------------------------------------------------*/
1988 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
1989        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
1990        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
1991        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
1992        MatSolve_SeqBAIJ_N,0,
1993        0,0,
1994        MatLUFactor_SeqBAIJ,0,
1995        0,
1996        MatTranspose_SeqBAIJ,
1997        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
1998        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
1999        0,MatAssemblyEnd_SeqBAIJ,
2000        0,
2001        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
2002        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
2003        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
2004        MatILUFactorSymbolic_SeqBAIJ,0,
2005        0,0,
2006        MatConvertSameType_SeqBAIJ,0,0,
2007        MatILUFactor_SeqBAIJ,0,0,
2008        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
2009        MatGetValues_SeqBAIJ,0,
2010        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
2011        0,0,0,MatGetBlockSize_SeqBAIJ,
2012        MatGetRowIJ_SeqBAIJ,
2013        MatRestoreRowIJ_SeqBAIJ,
2014        0,0,0,0,0,0,
2015        MatSetValuesBlocked_SeqBAIJ};
2016 
2017 #undef __FUNC__
2018 #define __FUNC__ "MatCreateSeqBAIJ"
2019 /*@C
2020    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
2021    compressed row) format.  For good matrix assembly performance the
2022    user should preallocate the matrix storage by setting the parameter nz
2023    (or the array nzz).  By setting these parameters accurately, performance
2024    during matrix assembly can be increased by more than a factor of 50.
2025 
2026    Input Parameters:
2027 .  comm - MPI communicator, set to MPI_COMM_SELF
2028 .  bs - size of block
2029 .  m - number of rows
2030 .  n - number of columns
2031 .  nz - number of block nonzeros per block row (same for all rows)
2032 .  nzz - array containing the number of block nonzeros in the various block rows
2033          (possibly different for each block row) or PETSC_NULL
2034 
2035    Output Parameter:
2036 .  A - the matrix
2037 
2038    Options Database Keys:
2039 $    -mat_no_unroll - uses code that does not unroll the loops in the
2040 $                     block calculations (much solver)
2041 $    -mat_block_size - size of the blocks to use
2042 
2043    Notes:
2044    The block AIJ format is fully compatible with standard Fortran 77
2045    storage.  That is, the stored row and column indices can begin at
2046    either one (as in Fortran) or zero.  See the users' manual for details.
2047 
2048    Specify the preallocated storage with either nz or nnz (not both).
2049    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
2050    allocation.  For additional details, see the users manual chapter on
2051    matrices.
2052 
2053 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2054 @*/
2055 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
2056 {
2057   Mat         B;
2058   Mat_SeqBAIJ *b;
2059   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size;
2060 
2061   MPI_Comm_size(comm,&size);
2062   if (size > 1) SETERRQ(1,0,"Comm must be of size 1");
2063 
2064   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr);
2065 
2066   if (mbs*bs!=m || nbs*bs!=n)
2067     SETERRQ(1,0,"Number rows, cols must be divisible by blocksize");
2068 
2069   *A = 0;
2070   PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
2071   PLogObjectCreate(B);
2072   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
2073   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
2074   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
2075   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
2076   if (!flg) {
2077     switch (bs) {
2078     case 1:
2079       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2080       B->ops.solve           = MatSolve_SeqBAIJ_1;
2081       B->ops.mult            = MatMult_SeqBAIJ_1;
2082       B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
2083       break;
2084     case 2:
2085       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
2086       B->ops.solve           = MatSolve_SeqBAIJ_2;
2087       B->ops.mult            = MatMult_SeqBAIJ_2;
2088       B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
2089       break;
2090     case 3:
2091       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
2092       B->ops.solve           = MatSolve_SeqBAIJ_3;
2093       B->ops.mult            = MatMult_SeqBAIJ_3;
2094       B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
2095       break;
2096     case 4:
2097       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
2098       B->ops.solve           = MatSolve_SeqBAIJ_4;
2099       B->ops.mult            = MatMult_SeqBAIJ_4;
2100       B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
2101       break;
2102     case 5:
2103       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
2104       B->ops.solve           = MatSolve_SeqBAIJ_5;
2105       B->ops.mult            = MatMult_SeqBAIJ_5;
2106       B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
2107       break;
2108     case 7:
2109       B->ops.mult            = MatMult_SeqBAIJ_7;
2110       B->ops.solve           = MatSolve_SeqBAIJ_7;
2111       B->ops.multadd         = MatMultAdd_SeqBAIJ_7;
2112       break;
2113     }
2114   }
2115   B->destroy          = MatDestroy_SeqBAIJ;
2116   B->view             = MatView_SeqBAIJ;
2117   B->factor           = 0;
2118   B->lupivotthreshold = 1.0;
2119   B->mapping          = 0;
2120   b->row              = 0;
2121   b->col              = 0;
2122   b->reallocs         = 0;
2123 
2124   b->m       = m; B->m = m; B->M = m;
2125   b->n       = n; B->n = n; B->N = n;
2126   b->mbs     = mbs;
2127   b->nbs     = nbs;
2128   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
2129   if (nnz == PETSC_NULL) {
2130     if (nz == PETSC_DEFAULT) nz = 5;
2131     else if (nz <= 0)        nz = 1;
2132     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
2133     nz = nz*mbs;
2134   }
2135   else {
2136     nz = 0;
2137     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
2138   }
2139 
2140   /* allocate the matrix space */
2141   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
2142   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
2143   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
2144   b->j  = (int *) (b->a + nz*bs2);
2145   PetscMemzero(b->j,nz*sizeof(int));
2146   b->i  = b->j + nz;
2147   b->singlemalloc = 1;
2148 
2149   b->i[0] = 0;
2150   for (i=1; i<mbs+1; i++) {
2151     b->i[i] = b->i[i-1] + b->imax[i-1];
2152   }
2153 
2154   /* b->ilen will count nonzeros in each block row so far. */
2155   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
2156   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
2157   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
2158 
2159   b->bs               = bs;
2160   b->bs2              = bs2;
2161   b->mbs              = mbs;
2162   b->nz               = 0;
2163   b->maxnz            = nz*bs2;
2164   b->sorted           = 0;
2165   b->roworiented      = 1;
2166   b->nonew            = 0;
2167   b->diag             = 0;
2168   b->solve_work       = 0;
2169   b->mult_work        = 0;
2170   b->spptr            = 0;
2171   B->info.nz_unneeded = (double)b->maxnz;
2172 
2173   *A = B;
2174   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
2175   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
2176   return 0;
2177 }
2178 
2179 #undef __FUNC__
2180 #define __FUNC__ "MatConvertSameType_SeqBAIJ"
2181 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
2182 {
2183   Mat         C;
2184   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
2185   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
2186 
2187   if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix");
2188 
2189   *B = 0;
2190   PetscHeaderCreate(C,_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
2191   PLogObjectCreate(C);
2192   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
2193   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
2194   C->destroy    = MatDestroy_SeqBAIJ;
2195   C->view       = MatView_SeqBAIJ;
2196   C->factor     = A->factor;
2197   c->row        = 0;
2198   c->col        = 0;
2199   C->assembled  = PETSC_TRUE;
2200 
2201   c->m = C->m   = a->m;
2202   c->n = C->n   = a->n;
2203   C->M          = a->m;
2204   C->N          = a->n;
2205 
2206   c->bs         = a->bs;
2207   c->bs2        = a->bs2;
2208   c->mbs        = a->mbs;
2209   c->nbs        = a->nbs;
2210 
2211   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
2212   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
2213   for ( i=0; i<mbs; i++ ) {
2214     c->imax[i] = a->imax[i];
2215     c->ilen[i] = a->ilen[i];
2216   }
2217 
2218   /* allocate the matrix space */
2219   c->singlemalloc = 1;
2220   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
2221   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
2222   c->j  = (int *) (c->a + nz*bs2);
2223   c->i  = c->j + nz;
2224   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
2225   if (mbs > 0) {
2226     PetscMemcpy(c->j,a->j,nz*sizeof(int));
2227     if (cpvalues == COPY_VALUES) {
2228       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
2229     }
2230   }
2231 
2232   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_SeqBAIJ));
2233   c->sorted      = a->sorted;
2234   c->roworiented = a->roworiented;
2235   c->nonew       = a->nonew;
2236 
2237   if (a->diag) {
2238     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
2239     PLogObjectMemory(C,(mbs+1)*sizeof(int));
2240     for ( i=0; i<mbs; i++ ) {
2241       c->diag[i] = a->diag[i];
2242     }
2243   }
2244   else c->diag          = 0;
2245   c->nz                 = a->nz;
2246   c->maxnz              = a->maxnz;
2247   c->solve_work         = 0;
2248   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
2249   c->mult_work          = 0;
2250   *B = C;
2251   return 0;
2252 }
2253 
2254 #undef __FUNC__
2255 #define __FUNC__ "MatLoad_SeqBAIJ"
2256 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
2257 {
2258   Mat_SeqBAIJ  *a;
2259   Mat          B;
2260   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
2261   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
2262   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
2263   int          *masked, nmask,tmp,bs2,ishift;
2264   Scalar       *aa;
2265   MPI_Comm     comm = ((PetscObject) viewer)->comm;
2266 
2267   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2268   bs2  = bs*bs;
2269 
2270   MPI_Comm_size(comm,&size);
2271   if (size > 1) SETERRQ(1,0,"view must have one processor");
2272   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2273   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
2274   if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object");
2275   M = header[1]; N = header[2]; nz = header[3];
2276 
2277   if (M != N) SETERRQ(1,0,"Can only do square matrices");
2278 
2279   /*
2280      This code adds extra rows to make sure the number of rows is
2281     divisible by the blocksize
2282   */
2283   mbs        = M/bs;
2284   extra_rows = bs - M + bs*(mbs);
2285   if (extra_rows == bs) extra_rows = 0;
2286   else                  mbs++;
2287   if (extra_rows) {
2288     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n");
2289   }
2290 
2291   /* read in row lengths */
2292   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
2293   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
2294   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2295 
2296   /* read in column indices */
2297   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
2298   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
2299   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
2300 
2301   /* loop over row lengths determining block row lengths */
2302   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
2303   PetscMemzero(browlengths,mbs*sizeof(int));
2304   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
2305   PetscMemzero(mask,mbs*sizeof(int));
2306   masked = mask + mbs;
2307   rowcount = 0; nzcount = 0;
2308   for ( i=0; i<mbs; i++ ) {
2309     nmask = 0;
2310     for ( j=0; j<bs; j++ ) {
2311       kmax = rowlengths[rowcount];
2312       for ( k=0; k<kmax; k++ ) {
2313         tmp = jj[nzcount++]/bs;
2314         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
2315       }
2316       rowcount++;
2317     }
2318     browlengths[i] += nmask;
2319     /* zero out the mask elements we set */
2320     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2321   }
2322 
2323   /* create our matrix */
2324   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);
2325          CHKERRQ(ierr);
2326   B = *A;
2327   a = (Mat_SeqBAIJ *) B->data;
2328 
2329   /* set matrix "i" values */
2330   a->i[0] = 0;
2331   for ( i=1; i<= mbs; i++ ) {
2332     a->i[i]      = a->i[i-1] + browlengths[i-1];
2333     a->ilen[i-1] = browlengths[i-1];
2334   }
2335   a->nz         = 0;
2336   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
2337 
2338   /* read in nonzero values */
2339   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
2340   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
2341   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
2342 
2343   /* set "a" and "j" values into matrix */
2344   nzcount = 0; jcount = 0;
2345   for ( i=0; i<mbs; i++ ) {
2346     nzcountb = nzcount;
2347     nmask    = 0;
2348     for ( j=0; j<bs; j++ ) {
2349       kmax = rowlengths[i*bs+j];
2350       for ( k=0; k<kmax; k++ ) {
2351         tmp = jj[nzcount++]/bs;
2352 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
2353       }
2354       rowcount++;
2355     }
2356     /* sort the masked values */
2357     PetscSortInt(nmask,masked);
2358 
2359     /* set "j" values into matrix */
2360     maskcount = 1;
2361     for ( j=0; j<nmask; j++ ) {
2362       a->j[jcount++]  = masked[j];
2363       mask[masked[j]] = maskcount++;
2364     }
2365     /* set "a" values into matrix */
2366     ishift = bs2*a->i[i];
2367     for ( j=0; j<bs; j++ ) {
2368       kmax = rowlengths[i*bs+j];
2369       for ( k=0; k<kmax; k++ ) {
2370         tmp    = jj[nzcountb]/bs ;
2371         block  = mask[tmp] - 1;
2372         point  = jj[nzcountb] - bs*tmp;
2373         idx    = ishift + bs2*block + j + bs*point;
2374         a->a[idx] = aa[nzcountb++];
2375       }
2376     }
2377     /* zero out the mask elements we set */
2378     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2379   }
2380   if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix");
2381 
2382   PetscFree(rowlengths);
2383   PetscFree(browlengths);
2384   PetscFree(aa);
2385   PetscFree(jj);
2386   PetscFree(mask);
2387 
2388   B->assembled = PETSC_TRUE;
2389 
2390   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
2391   if (flg) {
2392     Viewer tviewer;
2393     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2394     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO,0);CHKERRQ(ierr);
2395     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2396     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2397   }
2398   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
2399   if (flg) {
2400     Viewer tviewer;
2401     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2402     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO_LONG,0);CHKERRQ(ierr);
2403     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2404     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2405   }
2406   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
2407   if (flg) {
2408     Viewer tviewer;
2409     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2410     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2411     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2412   }
2413   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
2414   if (flg) {
2415     Viewer tviewer;
2416     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2417     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_MATLAB,"M");CHKERRQ(ierr);
2418     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2419     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2420   }
2421   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
2422   if (flg) {
2423     Viewer tviewer;
2424     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
2425     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
2426     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
2427     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2428   }
2429   return 0;
2430 }
2431 
2432 
2433 
2434