xref: /petsc/src/mat/impls/baij/seq/baij.c (revision bbd702db22f56e953ae24ab45c34809dbb746cb2)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij.c,v 1.111 1997/08/29 17:15:56 curfman Exp bsmith $";
3 #endif
4 
5 /*
6     Defines the basic matrix operations for the BAIJ (compressed row)
7   matrix storage format.
8 */
9 
10 #include "pinclude/pviewer.h"
11 #include "sys.h"
12 #include "src/mat/impls/baij/seq/baij.h"
13 #include "src/vec/vecimpl.h"
14 #include "src/inline/spops.h"
15 #include "petsc.h"
16 
17 
18 /*
19      Adds diagonal pointers to sparse matrix structure.
20 */
21 
22 #undef __FUNC__
23 #define __FUNC__ "MatMarkDiag_SeqBAIJ"
24 int MatMarkDiag_SeqBAIJ(Mat A)
25 {
26   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
27   int         i,j, *diag, m = a->mbs;
28 
29   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
30   PLogObjectMemory(A,(m+1)*sizeof(int));
31   for ( i=0; i<m; i++ ) {
32     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
33       if (a->j[j] == i) {
34         diag[i] = j;
35         break;
36       }
37     }
38   }
39   a->diag = diag;
40   return 0;
41 }
42 
43 
44 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
45 
46 #undef __FUNC__
47 #define __FUNC__ "MatGetRowIJ_SeqBAIJ"
48 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
49                             PetscTruth *done)
50 {
51   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
52   int         ierr, n = a->mbs,i;
53 
54   *nn = n;
55   if (!ia) return 0;
56   if (symmetric) {
57     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
58   } else if (oshift == 1) {
59     /* temporarily add 1 to i and j indices */
60     int nz = a->i[n] + 1;
61     for ( i=0; i<nz; i++ ) a->j[i]++;
62     for ( i=0; i<n+1; i++ ) a->i[i]++;
63     *ia = a->i; *ja = a->j;
64   } else {
65     *ia = a->i; *ja = a->j;
66   }
67 
68   return 0;
69 }
70 
71 #undef __FUNC__
72 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ"
73 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
74                                 PetscTruth *done)
75 {
76   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
77   int         i,n = a->mbs;
78 
79   if (!ia) return 0;
80   if (symmetric) {
81     PetscFree(*ia);
82     PetscFree(*ja);
83   } else if (oshift == 1) {
84     int nz = a->i[n];
85     for ( i=0; i<nz; i++ ) a->j[i]--;
86     for ( i=0; i<n+1; i++ ) a->i[i]--;
87   }
88   return 0;
89 }
90 
91 
92 #undef __FUNC__
93 #define __FUNC__ "MatView_SeqBAIJ_Binary"
94 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
95 {
96   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
97   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
98   Scalar      *aa;
99 
100   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
101   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
102   col_lens[0] = MAT_COOKIE;
103 
104   col_lens[1] = a->m;
105   col_lens[2] = a->n;
106   col_lens[3] = a->nz*bs2;
107 
108   /* store lengths of each row and write (including header) to file */
109   count = 0;
110   for ( i=0; i<a->mbs; i++ ) {
111     for ( j=0; j<bs; j++ ) {
112       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
113     }
114   }
115   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,PETSC_INT,1); CHKERRQ(ierr);
116   PetscFree(col_lens);
117 
118   /* store column indices (zero start index) */
119   jj = (int *) PetscMalloc( (a->nz+1)*bs2*sizeof(int) ); CHKPTRQ(jj);
120   count = 0;
121   for ( i=0; i<a->mbs; i++ ) {
122     for ( j=0; j<bs; j++ ) {
123       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
124         for ( l=0; l<bs; l++ ) {
125           jj[count++] = bs*a->j[k] + l;
126         }
127       }
128     }
129   }
130   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,PETSC_INT,0); CHKERRQ(ierr);
131   PetscFree(jj);
132 
133   /* store nonzero values */
134   aa = (Scalar *) PetscMalloc((a->nz+1)*bs2*sizeof(Scalar)); CHKPTRQ(aa);
135   count = 0;
136   for ( i=0; i<a->mbs; i++ ) {
137     for ( j=0; j<bs; j++ ) {
138       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
139         for ( l=0; l<bs; l++ ) {
140           aa[count++] = a->a[bs2*k + l*bs + j];
141         }
142       }
143     }
144   }
145   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,PETSC_SCALAR,0); CHKERRQ(ierr);
146   PetscFree(aa);
147   return 0;
148 }
149 
150 #undef __FUNC__
151 #define __FUNC__ "MatView_SeqBAIJ_ASCII"
152 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
153 {
154   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
155   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
156   FILE        *fd;
157   char        *outputname;
158 
159   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
160   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
161   ierr = ViewerGetFormat(viewer,&format);
162   if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
163     fprintf(fd,"  block size is %d\n",bs);
164   }
165   else if (format == VIEWER_FORMAT_ASCII_MATLAB) {
166     SETERRQ(1,0,"Matlab format not supported");
167   }
168   else if (format == VIEWER_FORMAT_ASCII_COMMON) {
169     for ( i=0; i<a->mbs; i++ ) {
170       for ( j=0; j<bs; j++ ) {
171         fprintf(fd,"row %d:",i*bs+j);
172         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
173           for ( l=0; l<bs; l++ ) {
174 #if defined(PETSC_COMPLEX)
175           if (imag(a->a[bs2*k + l*bs + j]) > 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
176             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
177               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
178           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
179             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
180               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
181           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
182             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
183 #else
184           if (a->a[bs2*k + l*bs + j] != 0.0)
185             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
186 #endif
187           }
188         }
189         fprintf(fd,"\n");
190       }
191     }
192   }
193   else {
194     for ( i=0; i<a->mbs; i++ ) {
195       for ( j=0; j<bs; j++ ) {
196         fprintf(fd,"row %d:",i*bs+j);
197         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
198           for ( l=0; l<bs; l++ ) {
199 #if defined(PETSC_COMPLEX)
200           if (imag(a->a[bs2*k + l*bs + j]) > 0.0) {
201             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
202               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
203           }
204           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0) {
205             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
206               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
207           }
208           else {
209             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
210           }
211 #else
212             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
213 #endif
214           }
215         }
216         fprintf(fd,"\n");
217       }
218     }
219   }
220   fflush(fd);
221   return 0;
222 }
223 
224 #undef __FUNC__
225 #define __FUNC__ "MatView_SeqBAIJ_Draw"
226 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer)
227 {
228   Mat_SeqBAIJ  *a=(Mat_SeqBAIJ *) A->data;
229   int          row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2;
230   double       xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r;
231   Scalar       *aa;
232   Draw         draw;
233   DrawButton   button;
234   PetscTruth   isnull;
235 
236   ViewerDrawGetDraw(viewer,&draw);
237   ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
238 
239   xr  = a->n; yr = a->m; h = yr/10.0; w = xr/10.0;
240   xr += w;    yr += h;  xl = -w;     yl = -h;
241   ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
242   /* loop over matrix elements drawing boxes */
243   color = DRAW_BLUE;
244   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
245     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
246       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
247       x_l = a->j[j]*bs; x_r = x_l + 1.0;
248       aa = a->a + j*bs2;
249       for ( k=0; k<bs; k++ ) {
250         for ( l=0; l<bs; l++ ) {
251           if (PetscReal(*aa++) >=  0.) continue;
252           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
253         }
254       }
255     }
256   }
257   color = DRAW_CYAN;
258   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
259     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
260       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
261       x_l = a->j[j]*bs; x_r = x_l + 1.0;
262       aa = a->a + j*bs2;
263       for ( k=0; k<bs; k++ ) {
264         for ( l=0; l<bs; l++ ) {
265           if (PetscReal(*aa++) != 0.) continue;
266           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
267         }
268       }
269     }
270   }
271 
272   color = DRAW_RED;
273   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
274     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
275       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
276       x_l = a->j[j]*bs; x_r = x_l + 1.0;
277       aa = a->a + j*bs2;
278       for ( k=0; k<bs; k++ ) {
279         for ( l=0; l<bs; l++ ) {
280           if (PetscReal(*aa++) <= 0.) continue;
281           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
282         }
283       }
284     }
285   }
286 
287   DrawFlush(draw);
288   DrawGetPause(draw,&pause);
289   if (pause >= 0) { PetscSleep(pause); return 0;}
290 
291   /* allow the matrix to zoom or shrink */
292   ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
293   while (button != BUTTON_RIGHT) {
294     DrawClear(draw);
295     if (button == BUTTON_LEFT) scale = .5;
296     else if (button == BUTTON_CENTER) scale = 2.;
297     xl = scale*(xl + w - xc) + xc - w*scale;
298     xr = scale*(xr - w - xc) + xc + w*scale;
299     yl = scale*(yl + h - yc) + yc - h*scale;
300     yr = scale*(yr - h - yc) + yc + h*scale;
301     w *= scale; h *= scale;
302     ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
303     color = DRAW_BLUE;
304     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
305       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
306         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
307         x_l = a->j[j]*bs; x_r = x_l + 1.0;
308         aa = a->a + j*bs2;
309         for ( k=0; k<bs; k++ ) {
310           for ( l=0; l<bs; l++ ) {
311             if (PetscReal(*aa++) >=  0.) continue;
312             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
313           }
314         }
315       }
316     }
317     color = DRAW_CYAN;
318     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
319       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
320         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
321         x_l = a->j[j]*bs; x_r = x_l + 1.0;
322         aa = a->a + j*bs2;
323         for ( k=0; k<bs; k++ ) {
324           for ( l=0; l<bs; l++ ) {
325           if (PetscReal(*aa++) != 0.) continue;
326           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
327           }
328         }
329       }
330     }
331 
332     color = DRAW_RED;
333     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
334       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
335         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
336         x_l = a->j[j]*bs; x_r = x_l + 1.0;
337         aa = a->a + j*bs2;
338         for ( k=0; k<bs; k++ ) {
339           for ( l=0; l<bs; l++ ) {
340             if (PetscReal(*aa++) <= 0.) continue;
341             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
342           }
343         }
344       }
345     }
346     ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
347   }
348   return 0;
349 }
350 
351 #undef __FUNC__
352 #define __FUNC__ "MatView_SeqBAIJ"
353 int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
354 {
355   Mat         A = (Mat) obj;
356   ViewerType  vtype;
357   int         ierr;
358 
359   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
360   if (vtype == MATLAB_VIEWER) {
361     SETERRQ(1,0,"Matlab viewer not supported");
362   }
363   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
364     return MatView_SeqBAIJ_ASCII(A,viewer);
365   }
366   else if (vtype == BINARY_FILE_VIEWER) {
367     return MatView_SeqBAIJ_Binary(A,viewer);
368   }
369   else if (vtype == DRAW_VIEWER) {
370     return MatView_SeqBAIJ_Draw(A,viewer);
371   }
372   return 0;
373 }
374 
375 #define CHUNKSIZE  10
376 
377 #undef __FUNC__
378 #define __FUNC__ "MatSetValues_SeqBAIJ"
379 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
380 {
381   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
382   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
383   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
384   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
385   int          ridx,cidx,bs2=a->bs2;
386   Scalar      *ap,value,*aa=a->a,*bap;
387 
388   for ( k=0; k<m; k++ ) { /* loop over added rows */
389     row  = im[k]; brow = row/bs;
390 #if defined(PETSC_BOPT_g)
391     if (row < 0) SETERRQ(1,0,"Negative row");
392     if (row >= a->m) SETERRQ(1,0,"Row too large");
393 #endif
394     rp   = aj + ai[brow];
395     ap   = aa + bs2*ai[brow];
396     rmax = imax[brow];
397     nrow = ailen[brow];
398     low  = 0;
399     for ( l=0; l<n; l++ ) { /* loop over added columns */
400 #if defined(PETSC_BOPT_g)
401       if (in[l] < 0) SETERRQ(1,0,"Negative column");
402       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
403 #endif
404       col = in[l]; bcol = col/bs;
405       ridx = row % bs; cidx = col % bs;
406       if (roworiented) {
407         value = *v++;
408       }
409       else {
410         value = v[k + l*m];
411       }
412       if (!sorted) low = 0; high = nrow;
413       while (high-low > 7) {
414         t = (low+high)/2;
415         if (rp[t] > bcol) high = t;
416         else              low  = t;
417       }
418       for ( i=low; i<high; i++ ) {
419         if (rp[i] > bcol) break;
420         if (rp[i] == bcol) {
421           bap  = ap +  bs2*i + bs*cidx + ridx;
422           if (is == ADD_VALUES) *bap += value;
423           else                  *bap  = value;
424           goto noinsert1;
425         }
426       }
427       if (nonew == 1) goto noinsert1;
428       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
429       if (nrow >= rmax) {
430         /* there is no extra room in row, therefore enlarge */
431         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
432         Scalar *new_a;
433 
434         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
435 
436         /* Malloc new storage space */
437         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
438         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
439         new_j   = (int *) (new_a + bs2*new_nz);
440         new_i   = new_j + new_nz;
441 
442         /* copy over old data into new slots */
443         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
444         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
445         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
446         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
447         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
448                                                            len*sizeof(int));
449         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
450         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
451         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
452                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
453         /* free up old matrix storage */
454         PetscFree(a->a);
455         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
456         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
457         a->singlemalloc = 1;
458 
459         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
460         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
461         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
462         a->maxnz += bs2*CHUNKSIZE;
463         a->reallocs++;
464         a->nz++;
465       }
466       N = nrow++ - 1;
467       /* shift up all the later entries in this row */
468       for ( ii=N; ii>=i; ii-- ) {
469         rp[ii+1] = rp[ii];
470         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
471       }
472       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
473       rp[i]                      = bcol;
474       ap[bs2*i + bs*cidx + ridx] = value;
475       noinsert1:;
476       low = i;
477     }
478     ailen[brow] = nrow;
479   }
480   return 0;
481 }
482 
483 #undef __FUNC__
484 #define __FUNC__ "MatSetValuesBlocked_SeqBAIJ"
485 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
486 {
487   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
488   int         *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
489   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen, roworiented=a->roworiented;
490   int         *aj=a->j,nonew=a->nonew,bs2=a->bs2,bs=a->bs,stepval;
491   Scalar      *ap,*value=v,*aa=a->a,*bap;
492 
493   if (roworiented) {
494     stepval = (n-1)*bs;
495   } else {
496     stepval = (m-1)*bs;
497   }
498   for ( k=0; k<m; k++ ) { /* loop over added rows */
499     row  = im[k];
500 #if defined(PETSC_BOPT_g)
501     if (row < 0) SETERRQ(1,0,"Negative row");
502     if (row >= a->mbs) SETERRQ(1,0,"Row too large");
503 #endif
504     rp   = aj + ai[row];
505     ap   = aa + bs2*ai[row];
506     rmax = imax[row];
507     nrow = ailen[row];
508     low  = 0;
509     for ( l=0; l<n; l++ ) { /* loop over added columns */
510 #if defined(PETSC_BOPT_g)
511       if (in[l] < 0) SETERRQ(1,0,"Negative column");
512       if (in[l] >= a->nbs) SETERRQ(1,0,"Column too large");
513 #endif
514       col = in[l];
515       if (roworiented) {
516         value = v + k*(stepval+bs)*bs + l*bs;
517       } else {
518         value = v + l*(stepval+bs)*bs + k*bs;
519       }
520       if (!sorted) low = 0; high = nrow;
521       while (high-low > 7) {
522         t = (low+high)/2;
523         if (rp[t] > col) high = t;
524         else             low  = t;
525       }
526       for ( i=low; i<high; i++ ) {
527         if (rp[i] > col) break;
528         if (rp[i] == col) {
529           bap  = ap +  bs2*i;
530           if (roworiented) {
531             if (is == ADD_VALUES) {
532               for ( ii=0; ii<bs; ii++,value+=stepval ) {
533                 for (jj=ii; jj<bs2; jj+=bs ) {
534                   bap[jj] += *value++;
535                 }
536               }
537             } else {
538               for ( ii=0; ii<bs; ii++,value+=stepval ) {
539                 for (jj=ii; jj<bs2; jj+=bs ) {
540                   bap[jj] = *value++;
541                 }
542               }
543             }
544           } else {
545             if (is == ADD_VALUES) {
546               for ( ii=0; ii<bs; ii++,value+=stepval ) {
547                 for (jj=0; jj<bs; jj++ ) {
548                   *bap++ += *value++;
549                 }
550               }
551             } else {
552               for ( ii=0; ii<bs; ii++,value+=stepval ) {
553                 for (jj=0; jj<bs; jj++ ) {
554                   *bap++  = *value++;
555                 }
556               }
557             }
558           }
559           goto noinsert2;
560         }
561       }
562       if (nonew == 1) goto noinsert2;
563       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
564       if (nrow >= rmax) {
565         /* there is no extra room in row, therefore enlarge */
566         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
567         Scalar *new_a;
568 
569         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
570 
571         /* malloc new storage space */
572         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
573         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
574         new_j   = (int *) (new_a + bs2*new_nz);
575         new_i   = new_j + new_nz;
576 
577         /* copy over old data into new slots */
578         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];}
579         for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
580         PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int));
581         len = (new_nz - CHUNKSIZE - ai[row] - nrow);
582         PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,len*sizeof(int));
583         PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar));
584         PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
585         PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar));
586         /* free up old matrix storage */
587         PetscFree(a->a);
588         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
589         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
590         a->singlemalloc = 1;
591 
592         rp   = aj + ai[row]; ap = aa + bs2*ai[row];
593         rmax = imax[row] = imax[row] + CHUNKSIZE;
594         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
595         a->maxnz += bs2*CHUNKSIZE;
596         a->reallocs++;
597         a->nz++;
598       }
599       N = nrow++ - 1;
600       /* shift up all the later entries in this row */
601       for ( ii=N; ii>=i; ii-- ) {
602         rp[ii+1] = rp[ii];
603         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
604       }
605       if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
606       rp[i] = col;
607       bap   = ap +  bs2*i;
608       if (roworiented) {
609         for ( ii=0; ii<bs; ii++,value+=stepval) {
610           for (jj=ii; jj<bs2; jj+=bs ) {
611             bap[jj] = *value++;
612           }
613         }
614       } else {
615         for ( ii=0; ii<bs; ii++,value+=stepval ) {
616           for (jj=0; jj<bs; jj++ ) {
617             *bap++  = *value++;
618           }
619         }
620       }
621       noinsert2:;
622       low = i;
623     }
624     ailen[row] = nrow;
625   }
626   return 0;
627 }
628 
629 #undef __FUNC__
630 #define __FUNC__ "MatGetSize_SeqBAIJ"
631 int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
632 {
633   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
634   if (m) *m = a->m;
635   if (n) *n = a->n;
636   return 0;
637 }
638 
639 #undef __FUNC__
640 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ"
641 int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
642 {
643   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
644   *m = 0; *n = a->m;
645   return 0;
646 }
647 
648 #undef __FUNC__
649 #define __FUNC__ "MatGetRow_SeqBAIJ"
650 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
651 {
652   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
653   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
654   Scalar      *aa,*v_i,*aa_i;
655 
656   bs  = a->bs;
657   ai  = a->i;
658   aj  = a->j;
659   aa  = a->a;
660   bs2 = a->bs2;
661 
662   if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range");
663 
664   bn  = row/bs;   /* Block number */
665   bp  = row % bs; /* Block Position */
666   M   = ai[bn+1] - ai[bn];
667   *nz = bs*M;
668 
669   if (v) {
670     *v = 0;
671     if (*nz) {
672       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
673       for ( i=0; i<M; i++ ) { /* for each block in the block row */
674         v_i  = *v + i*bs;
675         aa_i = aa + bs2*(ai[bn] + i);
676         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
677       }
678     }
679   }
680 
681   if (idx) {
682     *idx = 0;
683     if (*nz) {
684       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
685       for ( i=0; i<M; i++ ) { /* for each block in the block row */
686         idx_i = *idx + i*bs;
687         itmp  = bs*aj[ai[bn] + i];
688         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
689       }
690     }
691   }
692   return 0;
693 }
694 
695 #undef __FUNC__
696 #define __FUNC__ "MatRestoreRow_SeqBAIJ"
697 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
698 {
699   if (idx) {if (*idx) PetscFree(*idx);}
700   if (v)   {if (*v)   PetscFree(*v);}
701   return 0;
702 }
703 
704 #undef __FUNC__
705 #define __FUNC__ "MatTranspose_SeqBAIJ"
706 int MatTranspose_SeqBAIJ(Mat A,Mat *B)
707 {
708   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
709   Mat         C;
710   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
711   int         *rows,*cols,bs2=a->bs2;
712   Scalar      *array=a->a;
713 
714   if (B==PETSC_NULL && mbs!=nbs)
715     SETERRQ(1,0,"Square matrix only for in-place");
716   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
717   PetscMemzero(col,(1+nbs)*sizeof(int));
718 
719   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
720   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
721   PetscFree(col);
722   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
723   cols = rows + bs;
724   for ( i=0; i<mbs; i++ ) {
725     cols[0] = i*bs;
726     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
727     len = ai[i+1] - ai[i];
728     for ( j=0; j<len; j++ ) {
729       rows[0] = (*aj++)*bs;
730       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
731       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
732       array += bs2;
733     }
734   }
735   PetscFree(rows);
736 
737   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
738   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
739 
740   if (B != PETSC_NULL) {
741     *B = C;
742   } else {
743     /* This isn't really an in-place transpose */
744     PetscFree(a->a);
745     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
746     if (a->diag) PetscFree(a->diag);
747     if (a->ilen) PetscFree(a->ilen);
748     if (a->imax) PetscFree(a->imax);
749     if (a->solve_work) PetscFree(a->solve_work);
750     PetscFree(a);
751     PetscMemcpy(A,C,sizeof(struct _p_Mat));
752     PetscHeaderDestroy(C);
753   }
754   return 0;
755 }
756 
757 
758 #undef __FUNC__
759 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ"
760 int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
761 {
762   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
763   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
764   int        m = a->m,*ip, N, *ailen = a->ilen;
765   int        mbs = a->mbs, bs2 = a->bs2,rmax = 0;
766   Scalar     *aa = a->a, *ap;
767 
768   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
769 
770   if (m) rmax = ailen[0];
771   for ( i=1; i<mbs; i++ ) {
772     /* move each row back by the amount of empty slots (fshift) before it*/
773     fshift += imax[i-1] - ailen[i-1];
774     rmax   = PetscMax(rmax,ailen[i]);
775     if (fshift) {
776       ip = aj + ai[i]; ap = aa + bs2*ai[i];
777       N = ailen[i];
778       for ( j=0; j<N; j++ ) {
779         ip[j-fshift] = ip[j];
780         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
781       }
782     }
783     ai[i] = ai[i-1] + ailen[i-1];
784   }
785   if (mbs) {
786     fshift += imax[mbs-1] - ailen[mbs-1];
787     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
788   }
789   /* reset ilen and imax for each row */
790   for ( i=0; i<mbs; i++ ) {
791     ailen[i] = imax[i] = ai[i+1] - ai[i];
792   }
793   a->nz = ai[mbs];
794 
795   /* diagonals may have moved, so kill the diagonal pointers */
796   if (fshift && a->diag) {
797     PetscFree(a->diag);
798     PLogObjectMemory(A,-(m+1)*sizeof(int));
799     a->diag = 0;
800   }
801   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n",
802            m,a->n,a->bs,fshift*bs2,a->nz*bs2);
803   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n",
804            a->reallocs);
805   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax);
806   a->reallocs          = 0;
807   A->info.nz_unneeded  = (double)fshift*bs2;
808 
809   return 0;
810 }
811 
812 #undef __FUNC__
813 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
814 int MatZeroEntries_SeqBAIJ(Mat A)
815 {
816   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
817   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
818   return 0;
819 }
820 
821 #undef __FUNC__
822 #define __FUNC__ "MatDestroy_SeqBAIJ"
823 int MatDestroy_SeqBAIJ(PetscObject obj)
824 {
825   Mat         A  = (Mat) obj;
826   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
827 
828 #if defined(PETSC_LOG)
829   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
830 #endif
831   PetscFree(a->a);
832   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
833   if (a->diag) PetscFree(a->diag);
834   if (a->ilen) PetscFree(a->ilen);
835   if (a->imax) PetscFree(a->imax);
836   if (a->solve_work) PetscFree(a->solve_work);
837   if (a->mult_work) PetscFree(a->mult_work);
838   PetscFree(a);
839   PLogObjectDestroy(A);
840   PetscHeaderDestroy(A);
841   return 0;
842 }
843 
844 #undef __FUNC__
845 #define __FUNC__ "MatSetOption_SeqBAIJ"
846 int MatSetOption_SeqBAIJ(Mat A,MatOption op)
847 {
848   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
849   if      (op == MAT_ROW_ORIENTED)                 a->roworiented = 1;
850   else if (op == MAT_COLUMN_ORIENTED)              a->roworiented = 0;
851   else if (op == MAT_COLUMNS_SORTED)               a->sorted      = 1;
852   else if (op == MAT_COLUMNS_UNSORTED)             a->sorted      = 0;
853   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)     a->nonew       = 1;
854   else if (op == MAT_NEW_NONZERO_LOCATION_ERROR)   a->nonew       = -1;
855   else if (op == MAT_NEW_NONZERO_ALLOCATION_ERROR) a->nonew       = -2;
856   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS)    a->nonew       = 0;
857   else if (op == MAT_ROWS_SORTED ||
858            op == MAT_ROWS_UNSORTED ||
859            op == MAT_SYMMETRIC ||
860            op == MAT_STRUCTURALLY_SYMMETRIC ||
861            op == MAT_YES_NEW_DIAGONALS ||
862            op == MAT_IGNORE_OFF_PROC_ENTRIES)
863     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
864   else if (op == MAT_NO_NEW_DIAGONALS)
865     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
866   else
867     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
868   return 0;
869 }
870 
871 
872 /* -------------------------------------------------------*/
873 /* Should check that shapes of vectors and matrices match */
874 /* -------------------------------------------------------*/
875 #include "pinclude/plapack.h"
876 
877 #undef __FUNC__
878 #define __FUNC__ "MatMult_SeqBAIJ_1"
879 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
880 {
881   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
882   register Scalar *x,*z,*v,sum;
883   int             mbs=a->mbs,i,*idx,*ii,n;
884 
885   VecGetArray_Fast(xx,x);
886   VecGetArray_Fast(zz,z);
887 
888   idx   = a->j;
889   v     = a->a;
890   ii    = a->i;
891 
892   for ( i=0; i<mbs; i++ ) {
893     n    = ii[1] - ii[0]; ii++;
894     sum  = 0.0;
895     while (n--) sum += *v++ * x[*idx++];
896     z[i] = sum;
897   }
898   VecRestoreArray_Fast(xx,x);
899   VecRestoreArray_Fast(zz,z);
900   PLogFlops(2*a->nz - a->m);
901   return 0;
902 }
903 
904 #undef __FUNC__
905 #define __FUNC__ "MatMult_SeqBAIJ_2"
906 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
907 {
908   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
909   register Scalar *x,*z,*v,*xb,sum1,sum2;
910   register Scalar x1,x2;
911   int             mbs=a->mbs,i,*idx,*ii;
912   int             j,n;
913 
914   VecGetArray_Fast(xx,x);
915   VecGetArray_Fast(zz,z);
916 
917   idx   = a->j;
918   v     = a->a;
919   ii    = a->i;
920 
921   for ( i=0; i<mbs; i++ ) {
922     n  = ii[1] - ii[0]; ii++;
923     sum1 = 0.0; sum2 = 0.0;
924     for ( j=0; j<n; j++ ) {
925       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
926       sum1 += v[0]*x1 + v[2]*x2;
927       sum2 += v[1]*x1 + v[3]*x2;
928       v += 4;
929     }
930     z[0] = sum1; z[1] = sum2;
931     z += 2;
932   }
933   VecRestoreArray_Fast(xx,x);
934   VecRestoreArray_Fast(zz,z);
935   PLogFlops(4*a->nz - a->m);
936   return 0;
937 }
938 
939 #undef __FUNC__
940 #define __FUNC__ "MatMult_SeqBAIJ_3"
941 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
942 {
943   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
944   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
945   int             mbs=a->mbs,i,*idx,*ii,j,n;
946 
947   VecGetArray_Fast(xx,x);
948   VecGetArray_Fast(zz,z);
949 
950   idx   = a->j;
951   v     = a->a;
952   ii    = a->i;
953 
954   for ( i=0; i<mbs; i++ ) {
955     n  = ii[1] - ii[0]; ii++;
956     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
957     for ( j=0; j<n; j++ ) {
958       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
959       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
960       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
961       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
962       v += 9;
963     }
964     z[0] = sum1; z[1] = sum2; z[2] = sum3;
965     z += 3;
966   }
967   VecRestoreArray_Fast(xx,x);
968   VecRestoreArray_Fast(zz,z);
969   PLogFlops(18*a->nz - a->m);
970   return 0;
971 }
972 
973 #undef __FUNC__
974 #define __FUNC__ "MatMult_SeqBAIJ_4"
975 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
976 {
977   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
978   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
979   register Scalar x1,x2,x3,x4;
980   int             mbs=a->mbs,i,*idx,*ii;
981   int             j,n;
982 
983   VecGetArray_Fast(xx,x);
984   VecGetArray_Fast(zz,z);
985 
986   idx   = a->j;
987   v     = a->a;
988   ii    = a->i;
989 
990   for ( i=0; i<mbs; i++ ) {
991     n  = ii[1] - ii[0]; ii++;
992     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
993     for ( j=0; j<n; j++ ) {
994       xb = x + 4*(*idx++);
995       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
996       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
997       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
998       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
999       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1000       v += 16;
1001     }
1002     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1003     z += 4;
1004   }
1005   VecRestoreArray_Fast(xx,x);
1006   VecRestoreArray_Fast(zz,z);
1007   PLogFlops(32*a->nz - a->m);
1008   return 0;
1009 }
1010 
1011 #undef __FUNC__
1012 #define __FUNC__ "MatMult_SeqBAIJ_5"
1013 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
1014 {
1015   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1016   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1017   register Scalar x1,x2,x3,x4,x5;
1018   int             mbs=a->mbs,i,*idx,*ii,j,n;
1019 
1020   VecGetArray_Fast(xx,x);
1021   VecGetArray_Fast(zz,z);
1022 
1023   idx   = a->j;
1024   v     = a->a;
1025   ii    = a->i;
1026 
1027   for ( i=0; i<mbs; i++ ) {
1028     n  = ii[1] - ii[0]; ii++;
1029     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
1030     for ( j=0; j<n; j++ ) {
1031       xb = x + 5*(*idx++);
1032       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1033       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1034       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1035       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1036       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1037       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1038       v += 25;
1039     }
1040     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1041     z += 5;
1042   }
1043   VecRestoreArray_Fast(xx,x);
1044   VecRestoreArray_Fast(zz,z);
1045   PLogFlops(50*a->nz - a->m);
1046   return 0;
1047 }
1048 
1049 #undef __FUNC__
1050 #define __FUNC__ "MatMult_SeqBAIJ_7"
1051 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1052 {
1053   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1054   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1055   register Scalar x1,x2,x3,x4,x5,x6,x7;
1056   int             mbs=a->mbs,i,*idx,*ii,j,n;
1057 
1058   VecGetArray_Fast(xx,x);
1059   VecGetArray_Fast(zz,z);
1060 
1061   idx   = a->j;
1062   v     = a->a;
1063   ii    = a->i;
1064 
1065   for ( i=0; i<mbs; i++ ) {
1066     n  = ii[1] - ii[0]; ii++;
1067     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1068     for ( j=0; j<n; j++ ) {
1069       xb = x + 7*(*idx++);
1070       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1071       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1072       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1073       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1074       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1075       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1076       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1077       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1078       v += 49;
1079     }
1080     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1081     z += 7;
1082   }
1083 
1084   VecRestoreArray_Fast(xx,x);
1085   VecRestoreArray_Fast(zz,z);
1086   PLogFlops(98*a->nz - a->m);
1087   return 0;
1088 }
1089 
1090 #undef __FUNC__
1091 #define __FUNC__ "MatMult_SeqBAIJ_N"
1092 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1093 {
1094   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1095   register Scalar *x,*z,*v,*xb;
1096   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
1097   int             _One = 1,ncols,k;
1098   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
1099 
1100   VecGetArray_Fast(xx,x);
1101   VecGetArray_Fast(zz,z);
1102 
1103   idx   = a->j;
1104   v     = a->a;
1105   ii    = a->i;
1106 
1107 
1108   if (!a->mult_work) {
1109     k = PetscMax(a->m,a->n);
1110     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1111   }
1112   work = a->mult_work;
1113   for ( i=0; i<mbs; i++ ) {
1114     n     = ii[1] - ii[0]; ii++;
1115     ncols = n*bs;
1116     workt = work;
1117     for ( j=0; j<n; j++ ) {
1118       xb = x + bs*(*idx++);
1119       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1120       workt += bs;
1121     }
1122     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
1123     v += n*bs2;
1124     z += bs;
1125   }
1126   VecRestoreArray_Fast(xx,x);
1127   VecRestoreArray_Fast(zz,z);
1128   PLogFlops(2*a->nz*bs2 - a->m);
1129   return 0;
1130 }
1131 
1132 #undef __FUNC__
1133 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
1134 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1135 {
1136   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1137   register Scalar *x,*y,*z,*v,sum;
1138   int             mbs=a->mbs,i,*idx,*ii,n;
1139 
1140   VecGetArray_Fast(xx,x);
1141   VecGetArray_Fast(yy,y);
1142   VecGetArray_Fast(zz,z);
1143 
1144   idx   = a->j;
1145   v     = a->a;
1146   ii    = a->i;
1147 
1148   for ( i=0; i<mbs; i++ ) {
1149     n    = ii[1] - ii[0]; ii++;
1150     sum  = y[i];
1151     while (n--) sum += *v++ * x[*idx++];
1152     z[i] = sum;
1153   }
1154   VecRestoreArray_Fast(xx,x);
1155   VecRestoreArray_Fast(yy,y);
1156   VecRestoreArray_Fast(zz,z);
1157   PLogFlops(2*a->nz);
1158   return 0;
1159 }
1160 
1161 #undef __FUNC__
1162 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
1163 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1164 {
1165   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1166   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
1167   register Scalar x1,x2;
1168   int             mbs=a->mbs,i,*idx,*ii;
1169   int             j,n;
1170 
1171   VecGetArray_Fast(xx,x);
1172   VecGetArray_Fast(yy,y);
1173   VecGetArray_Fast(zz,z);
1174 
1175   idx   = a->j;
1176   v     = a->a;
1177   ii    = a->i;
1178 
1179   for ( i=0; i<mbs; i++ ) {
1180     n  = ii[1] - ii[0]; ii++;
1181     sum1 = y[0]; sum2 = y[1];
1182     for ( j=0; j<n; j++ ) {
1183       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1184       sum1 += v[0]*x1 + v[2]*x2;
1185       sum2 += v[1]*x1 + v[3]*x2;
1186       v += 4;
1187     }
1188     z[0] = sum1; z[1] = sum2;
1189     z += 2; y += 2;
1190   }
1191   VecRestoreArray_Fast(xx,x);
1192   VecRestoreArray_Fast(yy,y);
1193   VecRestoreArray_Fast(zz,z);
1194   PLogFlops(4*a->nz);
1195   return 0;
1196 }
1197 
1198 #undef __FUNC__
1199 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
1200 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1201 {
1202   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1203   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
1204   int             mbs=a->mbs,i,*idx,*ii,j,n;
1205 
1206   VecGetArray_Fast(xx,x);
1207   VecGetArray_Fast(yy,y);
1208   VecGetArray_Fast(zz,z);
1209 
1210   idx   = a->j;
1211   v     = a->a;
1212   ii    = a->i;
1213 
1214   for ( i=0; i<mbs; i++ ) {
1215     n  = ii[1] - ii[0]; ii++;
1216     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1217     for ( j=0; j<n; j++ ) {
1218       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1219       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1220       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1221       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1222       v += 9;
1223     }
1224     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1225     z += 3; y += 3;
1226   }
1227   VecRestoreArray_Fast(xx,x);
1228   VecRestoreArray_Fast(yy,y);
1229   VecRestoreArray_Fast(zz,z);
1230   PLogFlops(18*a->nz);
1231   return 0;
1232 }
1233 
1234 #undef __FUNC__
1235 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
1236 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1237 {
1238   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1239   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
1240   register Scalar x1,x2,x3,x4;
1241   int             mbs=a->mbs,i,*idx,*ii;
1242   int             j,n;
1243 
1244   VecGetArray_Fast(xx,x);
1245   VecGetArray_Fast(yy,y);
1246   VecGetArray_Fast(zz,z);
1247 
1248   idx   = a->j;
1249   v     = a->a;
1250   ii    = a->i;
1251 
1252   for ( i=0; i<mbs; i++ ) {
1253     n  = ii[1] - ii[0]; ii++;
1254     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1255     for ( j=0; j<n; j++ ) {
1256       xb = x + 4*(*idx++);
1257       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1258       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1259       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1260       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1261       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1262       v += 16;
1263     }
1264     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1265     z += 4; y += 4;
1266   }
1267   VecRestoreArray_Fast(xx,x);
1268   VecRestoreArray_Fast(yy,y);
1269   VecRestoreArray_Fast(zz,z);
1270   PLogFlops(32*a->nz);
1271   return 0;
1272 }
1273 
1274 #undef __FUNC__
1275 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
1276 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1277 {
1278   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1279   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1280   register Scalar x1,x2,x3,x4,x5;
1281   int             mbs=a->mbs,i,*idx,*ii,j,n;
1282 
1283   VecGetArray_Fast(xx,x);
1284   VecGetArray_Fast(yy,y);
1285   VecGetArray_Fast(zz,z);
1286 
1287   idx   = a->j;
1288   v     = a->a;
1289   ii    = a->i;
1290 
1291   for ( i=0; i<mbs; i++ ) {
1292     n  = ii[1] - ii[0]; ii++;
1293     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1294     for ( j=0; j<n; j++ ) {
1295       xb = x + 5*(*idx++);
1296       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1297       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1298       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1299       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1300       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1301       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1302       v += 25;
1303     }
1304     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1305     z += 5; y += 5;
1306   }
1307   VecRestoreArray_Fast(xx,x);
1308   VecRestoreArray_Fast(yy,y);
1309   VecRestoreArray_Fast(zz,z);
1310   PLogFlops(50*a->nz);
1311   return 0;
1312 }
1313 
1314 #undef __FUNC__
1315 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
1316 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1317 {
1318   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1319   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1320   register Scalar x1,x2,x3,x4,x5,x6,x7;
1321   int             mbs=a->mbs,i,*idx,*ii,j,n;
1322 
1323   VecGetArray_Fast(xx,x);
1324   VecGetArray_Fast(yy,y);
1325   VecGetArray_Fast(zz,z);
1326 
1327   idx   = a->j;
1328   v     = a->a;
1329   ii    = a->i;
1330 
1331   for ( i=0; i<mbs; i++ ) {
1332     n  = ii[1] - ii[0]; ii++;
1333     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1334     for ( j=0; j<n; j++ ) {
1335       xb = x + 7*(*idx++);
1336       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1337       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1338       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1339       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1340       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1341       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1342       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1343       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1344       v += 49;
1345     }
1346     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1347     z += 7; y += 7;
1348   }
1349   VecRestoreArray_Fast(xx,x);
1350   VecRestoreArray_Fast(yy,y);
1351   VecRestoreArray_Fast(zz,z);
1352   PLogFlops(98*a->nz);
1353   return 0;
1354 }
1355 
1356 
1357 #undef __FUNC__
1358 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
1359 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1360 {
1361   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1362   register Scalar *x,*z,*v,*xb;
1363   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
1364   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1365 
1366   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1367 
1368   VecGetArray_Fast(xx,x);
1369   VecGetArray_Fast(zz,z);
1370 
1371   idx   = a->j;
1372   v     = a->a;
1373   ii    = a->i;
1374 
1375 
1376   if (!a->mult_work) {
1377     k = PetscMax(a->m,a->n);
1378     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
1379   }
1380   work = a->mult_work;
1381   for ( i=0; i<mbs; i++ ) {
1382     n     = ii[1] - ii[0]; ii++;
1383     ncols = n*bs;
1384     workt = work;
1385     for ( j=0; j<n; j++ ) {
1386       xb = x + bs*(*idx++);
1387       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1388       workt += bs;
1389     }
1390     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
1391     v += n*bs2;
1392     z += bs;
1393   }
1394   VecRestoreArray_Fast(xx,x);
1395   VecRestoreArray_Fast(zz,z);
1396   PLogFlops(2*a->nz*bs2 );
1397   return 0;
1398 }
1399 
1400 #undef __FUNC__
1401 #define __FUNC__ "MatMultTrans_SeqBAIJ"
1402 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
1403 {
1404   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1405   Scalar          *xg,*zg,*zb;
1406   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1407   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1408   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1409 
1410 
1411   VecGetArray_Fast(xx,xg); x = xg;
1412   VecGetArray_Fast(zz,zg); z = zg;
1413   PetscMemzero(z,N*sizeof(Scalar));
1414 
1415   idx   = a->j;
1416   v     = a->a;
1417   ii    = a->i;
1418 
1419   switch (bs) {
1420   case 1:
1421     for ( i=0; i<mbs; i++ ) {
1422       n  = ii[1] - ii[0]; ii++;
1423       xb = x + i; x1 = xb[0];
1424       ib = idx + ai[i];
1425       for ( j=0; j<n; j++ ) {
1426         rval    = ib[j];
1427         z[rval] += *v++ * x1;
1428       }
1429     }
1430     break;
1431   case 2:
1432     for ( i=0; i<mbs; i++ ) {
1433       n  = ii[1] - ii[0]; ii++;
1434       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1435       ib = idx + ai[i];
1436       for ( j=0; j<n; j++ ) {
1437         rval      = ib[j]*2;
1438         z[rval++] += v[0]*x1 + v[1]*x2;
1439         z[rval++] += v[2]*x1 + v[3]*x2;
1440         v += 4;
1441       }
1442     }
1443     break;
1444   case 3:
1445     for ( i=0; i<mbs; i++ ) {
1446       n  = ii[1] - ii[0]; ii++;
1447       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1448       ib = idx + ai[i];
1449       for ( j=0; j<n; j++ ) {
1450         rval      = ib[j]*3;
1451         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1452         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1453         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1454         v += 9;
1455       }
1456     }
1457     break;
1458   case 4:
1459     for ( i=0; i<mbs; i++ ) {
1460       n  = ii[1] - ii[0]; ii++;
1461       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1462       ib = idx + ai[i];
1463       for ( j=0; j<n; j++ ) {
1464         rval      = ib[j]*4;
1465         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1466         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1467         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1468         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1469         v += 16;
1470       }
1471     }
1472     break;
1473   case 5:
1474     for ( i=0; i<mbs; i++ ) {
1475       n  = ii[1] - ii[0]; ii++;
1476       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1477       x4 = xb[3];   x5 = xb[4];
1478       ib = idx + ai[i];
1479       for ( j=0; j<n; j++ ) {
1480         rval      = ib[j]*5;
1481         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1482         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1483         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1484         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1485         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1486         v += 25;
1487       }
1488     }
1489     break;
1490       /* block sizes larger then 5 by 5 are handled by BLAS */
1491     default: {
1492       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1493       if (!a->mult_work) {
1494         k = PetscMax(a->m,a->n);
1495         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1496         CHKPTRQ(a->mult_work);
1497       }
1498       work = a->mult_work;
1499       for ( i=0; i<mbs; i++ ) {
1500         n     = ii[1] - ii[0]; ii++;
1501         ncols = n*bs;
1502         PetscMemzero(work,ncols*sizeof(Scalar));
1503         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1504         v += n*bs2;
1505         x += bs;
1506         workt = work;
1507         for ( j=0; j<n; j++ ) {
1508           zb = z + bs*(*idx++);
1509           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1510           workt += bs;
1511         }
1512       }
1513     }
1514   }
1515   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1516   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1517   PLogFlops(2*a->nz*a->bs2 - a->n);
1518   return 0;
1519 }
1520 
1521 #undef __FUNC__
1522 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1523 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1524 {
1525   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1526   Scalar          *xg,*zg,*zb;
1527   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1528   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1529   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1530 
1531 
1532 
1533   VecGetArray_Fast(xx,xg); x = xg;
1534   VecGetArray_Fast(zz,zg); z = zg;
1535 
1536   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1537   else PetscMemzero(z,N*sizeof(Scalar));
1538 
1539 
1540   idx   = a->j;
1541   v     = a->a;
1542   ii    = a->i;
1543 
1544   switch (bs) {
1545   case 1:
1546     for ( i=0; i<mbs; i++ ) {
1547       n  = ii[1] - ii[0]; ii++;
1548       xb = x + i; x1 = xb[0];
1549       ib = idx + ai[i];
1550       for ( j=0; j<n; j++ ) {
1551         rval    = ib[j];
1552         z[rval] += *v++ * x1;
1553       }
1554     }
1555     break;
1556   case 2:
1557     for ( i=0; i<mbs; i++ ) {
1558       n  = ii[1] - ii[0]; ii++;
1559       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1560       ib = idx + ai[i];
1561       for ( j=0; j<n; j++ ) {
1562         rval      = ib[j]*2;
1563         z[rval++] += v[0]*x1 + v[1]*x2;
1564         z[rval++] += v[2]*x1 + v[3]*x2;
1565         v += 4;
1566       }
1567     }
1568     break;
1569   case 3:
1570     for ( i=0; i<mbs; i++ ) {
1571       n  = ii[1] - ii[0]; ii++;
1572       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1573       ib = idx + ai[i];
1574       for ( j=0; j<n; j++ ) {
1575         rval      = ib[j]*3;
1576         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1577         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1578         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1579         v += 9;
1580       }
1581     }
1582     break;
1583   case 4:
1584     for ( i=0; i<mbs; i++ ) {
1585       n  = ii[1] - ii[0]; ii++;
1586       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1587       ib = idx + ai[i];
1588       for ( j=0; j<n; j++ ) {
1589         rval      = ib[j]*4;
1590         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1591         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1592         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1593         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1594         v += 16;
1595       }
1596     }
1597     break;
1598   case 5:
1599     for ( i=0; i<mbs; i++ ) {
1600       n  = ii[1] - ii[0]; ii++;
1601       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1602       x4 = xb[3];   x5 = xb[4];
1603       ib = idx + ai[i];
1604       for ( j=0; j<n; j++ ) {
1605         rval      = ib[j]*5;
1606         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1607         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1608         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1609         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1610         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1611         v += 25;
1612       }
1613     }
1614     break;
1615       /* block sizes larger then 5 by 5 are handled by BLAS */
1616     default: {
1617       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1618       if (!a->mult_work) {
1619         k = PetscMax(a->m,a->n);
1620         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1621         CHKPTRQ(a->mult_work);
1622       }
1623       work = a->mult_work;
1624       for ( i=0; i<mbs; i++ ) {
1625         n     = ii[1] - ii[0]; ii++;
1626         ncols = n*bs;
1627         PetscMemzero(work,ncols*sizeof(Scalar));
1628         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1629         v += n*bs2;
1630         x += bs;
1631         workt = work;
1632         for ( j=0; j<n; j++ ) {
1633           zb = z + bs*(*idx++);
1634           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1635           workt += bs;
1636         }
1637       }
1638     }
1639   }
1640   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1641   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1642   PLogFlops(2*a->nz*a->bs2);
1643   return 0;
1644 }
1645 
1646 #undef __FUNC__
1647 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1648 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1649 {
1650   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1651 
1652   info->rows_global    = (double)a->m;
1653   info->columns_global = (double)a->n;
1654   info->rows_local     = (double)a->m;
1655   info->columns_local  = (double)a->n;
1656   info->block_size     = a->bs2;
1657   info->nz_allocated   = a->maxnz;
1658   info->nz_used        = a->bs2*a->nz;
1659   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1660   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1661     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1662   info->assemblies   = A->num_ass;
1663   info->mallocs      = a->reallocs;
1664   info->memory       = A->mem;
1665   if (A->factor) {
1666     info->fill_ratio_given  = A->info.fill_ratio_given;
1667     info->fill_ratio_needed = A->info.fill_ratio_needed;
1668     info->factor_mallocs    = A->info.factor_mallocs;
1669   } else {
1670     info->fill_ratio_given  = 0;
1671     info->fill_ratio_needed = 0;
1672     info->factor_mallocs    = 0;
1673   }
1674   return 0;
1675 }
1676 
1677 #undef __FUNC__
1678 #define __FUNC__ "MatEqual_SeqBAIJ"
1679 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1680 {
1681   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1682 
1683   if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type");
1684 
1685   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1686   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1687       (a->nz != b->nz)) {
1688     *flg = PETSC_FALSE; return 0;
1689   }
1690 
1691   /* if the a->i are the same */
1692   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1693     *flg = PETSC_FALSE; return 0;
1694   }
1695 
1696   /* if a->j are the same */
1697   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1698     *flg = PETSC_FALSE; return 0;
1699   }
1700 
1701   /* if a->a are the same */
1702   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1703     *flg = PETSC_FALSE; return 0;
1704   }
1705   *flg = PETSC_TRUE;
1706   return 0;
1707 
1708 }
1709 
1710 #undef __FUNC__
1711 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1712 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1713 {
1714   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1715   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1716   Scalar      *x, zero = 0.0,*aa,*aa_j;
1717 
1718   if (A->factor) SETERRQ(1,0,"Not for factored matrix");
1719   bs   = a->bs;
1720   aa   = a->a;
1721   ai   = a->i;
1722   aj   = a->j;
1723   ambs = a->mbs;
1724   bs2  = a->bs2;
1725 
1726   VecSet(&zero,v);
1727   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1728   if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector");
1729   for ( i=0; i<ambs; i++ ) {
1730     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1731       if (aj[j] == i) {
1732         row  = i*bs;
1733         aa_j = aa+j*bs2;
1734         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1735         break;
1736       }
1737     }
1738   }
1739   return 0;
1740 }
1741 
1742 #undef __FUNC__
1743 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1744 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1745 {
1746   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1747   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1748   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1749 
1750   ai  = a->i;
1751   aj  = a->j;
1752   aa  = a->a;
1753   m   = a->m;
1754   n   = a->n;
1755   bs  = a->bs;
1756   mbs = a->mbs;
1757   bs2 = a->bs2;
1758   if (ll) {
1759     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1760     if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length");
1761     for ( i=0; i<mbs; i++ ) { /* for each block row */
1762       M  = ai[i+1] - ai[i];
1763       li = l + i*bs;
1764       v  = aa + bs2*ai[i];
1765       for ( j=0; j<M; j++ ) { /* for each block */
1766         for ( k=0; k<bs2; k++ ) {
1767           (*v++) *= li[k%bs];
1768         }
1769       }
1770     }
1771   }
1772 
1773   if (rr) {
1774     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1775     if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length");
1776     for ( i=0; i<mbs; i++ ) { /* for each block row */
1777       M  = ai[i+1] - ai[i];
1778       v  = aa + bs2*ai[i];
1779       for ( j=0; j<M; j++ ) { /* for each block */
1780         ri = r + bs*aj[ai[i]+j];
1781         for ( k=0; k<bs; k++ ) {
1782           x = ri[k];
1783           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1784         }
1785       }
1786     }
1787   }
1788   return 0;
1789 }
1790 
1791 
1792 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1793 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1794 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1795 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1796 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1797 
1798 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1799 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1800 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1801 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1802 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1803 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1804 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec);
1805 
1806 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1807 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1808 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1809 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1810 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1811 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1812 
1813 #undef __FUNC__
1814 #define __FUNC__ "MatNorm_SeqBAIJ"
1815 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1816 {
1817   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1818   Scalar      *v = a->a;
1819   double      sum = 0.0;
1820   int         i,nz=a->nz,bs2=a->bs2;
1821 
1822   if (type == NORM_FROBENIUS) {
1823     for (i=0; i< bs2*nz; i++ ) {
1824 #if defined(PETSC_COMPLEX)
1825       sum += real(conj(*v)*(*v)); v++;
1826 #else
1827       sum += (*v)*(*v); v++;
1828 #endif
1829     }
1830     *norm = sqrt(sum);
1831   }
1832   else {
1833     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1834   }
1835   return 0;
1836 }
1837 
1838 extern int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec);
1839 /*
1840      note: This can only work for identity for row and col. It would
1841    be good to check this and otherwise generate an error.
1842 */
1843 #undef __FUNC__
1844 #define __FUNC__ "MatILUFactor_SeqBAIJ"
1845 int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1846 {
1847   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1848   Mat         outA;
1849   int         ierr;
1850 
1851   if (fill != 0) SETERRQ(1,0,"Only fill=0 supported");
1852 
1853   outA          = inA;
1854   inA->factor   = FACTOR_LU;
1855   a->row        = row;
1856   a->col        = col;
1857 
1858   if (!a->solve_work) {
1859     a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1860     PLogObjectMemory(inA,(a->m+a->bs)*sizeof(Scalar));
1861   }
1862 
1863   if (!a->diag) {
1864     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1865   }
1866   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1867 
1868   /*
1869       Blocksize 4 has a special faster solver for ILU(0) factorization
1870     with natural ordering
1871   */
1872   if (a->bs == 4) {
1873     inA->ops.solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
1874   }
1875 
1876   return 0;
1877 }
1878 
1879 #undef __FUNC__
1880 #define __FUNC__ "MatScale_SeqBAIJ"
1881 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1882 {
1883   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1884   int         one = 1, totalnz = a->bs2*a->nz;
1885   BLscal_( &totalnz, alpha, a->a, &one );
1886   PLogFlops(totalnz);
1887   return 0;
1888 }
1889 
1890 #undef __FUNC__
1891 #define __FUNC__ "MatGetValues_SeqBAIJ"
1892 int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1893 {
1894   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1895   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1896   int        *ai = a->i, *ailen = a->ilen;
1897   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1898   Scalar     *ap, *aa = a->a, zero = 0.0;
1899 
1900   for ( k=0; k<m; k++ ) { /* loop over rows */
1901     row  = im[k]; brow = row/bs;
1902     if (row < 0) SETERRQ(1,0,"Negative row");
1903     if (row >= a->m) SETERRQ(1,0,"Row too large");
1904     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1905     nrow = ailen[brow];
1906     for ( l=0; l<n; l++ ) { /* loop over columns */
1907       if (in[l] < 0) SETERRQ(1,0,"Negative column");
1908       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
1909       col  = in[l] ;
1910       bcol = col/bs;
1911       cidx = col%bs;
1912       ridx = row%bs;
1913       high = nrow;
1914       low  = 0; /* assume unsorted */
1915       while (high-low > 5) {
1916         t = (low+high)/2;
1917         if (rp[t] > bcol) high = t;
1918         else             low  = t;
1919       }
1920       for ( i=low; i<high; i++ ) {
1921         if (rp[i] > bcol) break;
1922         if (rp[i] == bcol) {
1923           *v++ = ap[bs2*i+bs*cidx+ridx];
1924           goto finished;
1925         }
1926       }
1927       *v++ = zero;
1928       finished:;
1929     }
1930   }
1931   return 0;
1932 }
1933 
1934 #undef __FUNC__
1935 #define __FUNC__ "MatGetBlockSize_SeqBAIJ"
1936 int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1937 {
1938   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1939   *bs = baij->bs;
1940   return 0;
1941 }
1942 
1943 /* idx should be of length atlease bs */
1944 #undef __FUNC__
1945 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block"
1946 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1947 {
1948   int i,row;
1949   row = idx[0];
1950   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1951 
1952   for ( i=1; i<bs; i++ ) {
1953     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1954   }
1955   *flg = PETSC_TRUE;
1956   return 0;
1957 }
1958 
1959 #undef __FUNC__
1960 #define __FUNC__ "MatZeroRows_SeqBAIJ"
1961 int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1962 {
1963   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1964   IS          is_local;
1965   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1966   PetscTruth  flg;
1967   Scalar      zero = 0.0,*aa;
1968 
1969   /* Make a copy of the IS and  sort it */
1970   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1971   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1972   ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1973   ierr = ISSort(is_local); CHKERRQ(ierr);
1974   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1975 
1976   i = 0;
1977   while (i < is_n) {
1978     if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range");
1979     flg = PETSC_FALSE;
1980     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1981     if (flg) { /* There exists a block of rows to be Zerowed */
1982       baij->ilen[rows[i]/bs] = 0;
1983       i += bs;
1984     } else { /* Zero out only the requested row */
1985       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1986       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1987       for ( j=0; j<count; j++ ) {
1988         aa[0] = zero;
1989         aa+=bs;
1990       }
1991       i++;
1992     }
1993   }
1994   if (diag) {
1995     for ( j=0; j<is_n; j++ ) {
1996       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1997     }
1998   }
1999   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
2000   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
2001   ierr = ISDestroy(is_local); CHKERRQ(ierr);
2002   ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2003 
2004   return 0;
2005 }
2006 
2007 #undef __FUNC__
2008 #define __FUNC__ "MatPrintHelp_SeqBAIJ"
2009 int MatPrintHelp_SeqBAIJ(Mat A)
2010 {
2011   static int called = 0;
2012   MPI_Comm   comm = A->comm;
2013 
2014   if (called) return 0; else called = 1;
2015   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
2016   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
2017   return 0;
2018 }
2019 
2020 /* -------------------------------------------------------------------*/
2021 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
2022        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
2023        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
2024        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
2025        MatSolve_SeqBAIJ_N,0,
2026        0,0,
2027        MatLUFactor_SeqBAIJ,0,
2028        0,
2029        MatTranspose_SeqBAIJ,
2030        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
2031        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
2032        0,MatAssemblyEnd_SeqBAIJ,
2033        0,
2034        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
2035        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
2036        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
2037        MatILUFactorSymbolic_SeqBAIJ,0,
2038        0,0,
2039        MatConvertSameType_SeqBAIJ,0,0,
2040        MatILUFactor_SeqBAIJ,0,0,
2041        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
2042        MatGetValues_SeqBAIJ,0,
2043        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
2044        0,0,0,MatGetBlockSize_SeqBAIJ,
2045        MatGetRowIJ_SeqBAIJ,
2046        MatRestoreRowIJ_SeqBAIJ,
2047        0,0,0,0,0,0,
2048        MatSetValuesBlocked_SeqBAIJ};
2049 
2050 #undef __FUNC__
2051 #define __FUNC__ "MatCreateSeqBAIJ"
2052 /*@C
2053    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
2054    compressed row) format.  For good matrix assembly performance the
2055    user should preallocate the matrix storage by setting the parameter nz
2056    (or the array nzz).  By setting these parameters accurately, performance
2057    during matrix assembly can be increased by more than a factor of 50.
2058 
2059    Input Parameters:
2060 .  comm - MPI communicator, set to PETSC_COMM_SELF
2061 .  bs - size of block
2062 .  m - number of rows
2063 .  n - number of columns
2064 .  nz - number of block nonzeros per block row (same for all rows)
2065 .  nzz - array containing the number of block nonzeros in the various block rows
2066          (possibly different for each block row) or PETSC_NULL
2067 
2068    Output Parameter:
2069 .  A - the matrix
2070 
2071    Options Database Keys:
2072 $    -mat_no_unroll - uses code that does not unroll the loops in the
2073 $                     block calculations (much slower)
2074 $    -mat_block_size - size of the blocks to use
2075 
2076    Notes:
2077    The block AIJ format is fully compatible with standard Fortran 77
2078    storage.  That is, the stored row and column indices can begin at
2079    either one (as in Fortran) or zero.  See the users' manual for details.
2080 
2081    Specify the preallocated storage with either nz or nnz (not both).
2082    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
2083    allocation.  For additional details, see the users manual chapter on
2084    matrices.
2085 
2086 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2087 @*/
2088 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
2089 {
2090   Mat         B;
2091   Mat_SeqBAIJ *b;
2092   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size;
2093 
2094   MPI_Comm_size(comm,&size);
2095   if (size > 1) SETERRQ(1,0,"Comm must be of size 1");
2096 
2097   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr);
2098 
2099   if (mbs*bs!=m || nbs*bs!=n)
2100     SETERRQ(1,0,"Number rows, cols must be divisible by blocksize");
2101 
2102   *A = 0;
2103   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATSEQBAIJ,comm,MatDestroy,MatView);
2104   PLogObjectCreate(B);
2105   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
2106   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
2107   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
2108   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
2109   if (!flg) {
2110     switch (bs) {
2111     case 1:
2112       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2113       B->ops.solve           = MatSolve_SeqBAIJ_1;
2114       B->ops.mult            = MatMult_SeqBAIJ_1;
2115       B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
2116       break;
2117     case 2:
2118       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
2119       B->ops.solve           = MatSolve_SeqBAIJ_2;
2120       B->ops.mult            = MatMult_SeqBAIJ_2;
2121       B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
2122       break;
2123     case 3:
2124       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
2125       B->ops.solve           = MatSolve_SeqBAIJ_3;
2126       B->ops.mult            = MatMult_SeqBAIJ_3;
2127       B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
2128       break;
2129     case 4:
2130       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
2131       B->ops.solve           = MatSolve_SeqBAIJ_4;
2132       B->ops.mult            = MatMult_SeqBAIJ_4;
2133       B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
2134       break;
2135     case 5:
2136       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
2137       B->ops.solve           = MatSolve_SeqBAIJ_5;
2138       B->ops.mult            = MatMult_SeqBAIJ_5;
2139       B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
2140       break;
2141     case 7:
2142       B->ops.mult            = MatMult_SeqBAIJ_7;
2143       B->ops.solve           = MatSolve_SeqBAIJ_7;
2144       B->ops.multadd         = MatMultAdd_SeqBAIJ_7;
2145       break;
2146     }
2147   }
2148   B->destroy          = MatDestroy_SeqBAIJ;
2149   B->view             = MatView_SeqBAIJ;
2150   B->factor           = 0;
2151   B->lupivotthreshold = 1.0;
2152   B->mapping          = 0;
2153   b->row              = 0;
2154   b->col              = 0;
2155   b->reallocs         = 0;
2156 
2157   b->m       = m; B->m = m; B->M = m;
2158   b->n       = n; B->n = n; B->N = n;
2159   b->mbs     = mbs;
2160   b->nbs     = nbs;
2161   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
2162   if (nnz == PETSC_NULL) {
2163     if (nz == PETSC_DEFAULT) nz = 5;
2164     else if (nz <= 0)        nz = 1;
2165     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
2166     nz = nz*mbs;
2167   }
2168   else {
2169     nz = 0;
2170     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
2171   }
2172 
2173   /* allocate the matrix space */
2174   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
2175   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
2176   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
2177   b->j  = (int *) (b->a + nz*bs2);
2178   PetscMemzero(b->j,nz*sizeof(int));
2179   b->i  = b->j + nz;
2180   b->singlemalloc = 1;
2181 
2182   b->i[0] = 0;
2183   for (i=1; i<mbs+1; i++) {
2184     b->i[i] = b->i[i-1] + b->imax[i-1];
2185   }
2186 
2187   /* b->ilen will count nonzeros in each block row so far. */
2188   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
2189   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2190   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
2191 
2192   b->bs               = bs;
2193   b->bs2              = bs2;
2194   b->mbs              = mbs;
2195   b->nz               = 0;
2196   b->maxnz            = nz*bs2;
2197   b->sorted           = 0;
2198   b->roworiented      = 1;
2199   b->nonew            = 0;
2200   b->diag             = 0;
2201   b->solve_work       = 0;
2202   b->mult_work        = 0;
2203   b->spptr            = 0;
2204   B->info.nz_unneeded = (double)b->maxnz;
2205 
2206   *A = B;
2207   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
2208   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
2209   return 0;
2210 }
2211 
2212 #undef __FUNC__
2213 #define __FUNC__ "MatConvertSameType_SeqBAIJ"
2214 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
2215 {
2216   Mat         C;
2217   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
2218   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
2219 
2220   if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix");
2221 
2222   *B = 0;
2223   PetscHeaderCreate(C,_p_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm,MatDestroy,MatView);
2224   PLogObjectCreate(C);
2225   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
2226   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
2227   C->destroy    = MatDestroy_SeqBAIJ;
2228   C->view       = MatView_SeqBAIJ;
2229   C->factor     = A->factor;
2230   c->row        = 0;
2231   c->col        = 0;
2232   C->assembled  = PETSC_TRUE;
2233 
2234   c->m = C->m   = a->m;
2235   c->n = C->n   = a->n;
2236   C->M          = a->m;
2237   C->N          = a->n;
2238 
2239   c->bs         = a->bs;
2240   c->bs2        = a->bs2;
2241   c->mbs        = a->mbs;
2242   c->nbs        = a->nbs;
2243 
2244   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
2245   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
2246   for ( i=0; i<mbs; i++ ) {
2247     c->imax[i] = a->imax[i];
2248     c->ilen[i] = a->ilen[i];
2249   }
2250 
2251   /* allocate the matrix space */
2252   c->singlemalloc = 1;
2253   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
2254   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
2255   c->j  = (int *) (c->a + nz*bs2);
2256   c->i  = c->j + nz;
2257   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
2258   if (mbs > 0) {
2259     PetscMemcpy(c->j,a->j,nz*sizeof(int));
2260     if (cpvalues == COPY_VALUES) {
2261       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
2262     }
2263   }
2264 
2265   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2266   c->sorted      = a->sorted;
2267   c->roworiented = a->roworiented;
2268   c->nonew       = a->nonew;
2269 
2270   if (a->diag) {
2271     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
2272     PLogObjectMemory(C,(mbs+1)*sizeof(int));
2273     for ( i=0; i<mbs; i++ ) {
2274       c->diag[i] = a->diag[i];
2275     }
2276   }
2277   else c->diag          = 0;
2278   c->nz                 = a->nz;
2279   c->maxnz              = a->maxnz;
2280   c->solve_work         = 0;
2281   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
2282   c->mult_work          = 0;
2283   *B = C;
2284   return 0;
2285 }
2286 
2287 #undef __FUNC__
2288 #define __FUNC__ "MatLoad_SeqBAIJ"
2289 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
2290 {
2291   Mat_SeqBAIJ  *a;
2292   Mat          B;
2293   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
2294   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
2295   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
2296   int          *masked, nmask,tmp,bs2,ishift;
2297   Scalar       *aa;
2298   MPI_Comm     comm = ((PetscObject) viewer)->comm;
2299 
2300   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2301   bs2  = bs*bs;
2302 
2303   MPI_Comm_size(comm,&size);
2304   if (size > 1) SETERRQ(1,0,"view must have one processor");
2305   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2306   ierr = PetscBinaryRead(fd,header,4,PETSC_INT); CHKERRQ(ierr);
2307   if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object");
2308   M = header[1]; N = header[2]; nz = header[3];
2309 
2310   if (M != N) SETERRQ(1,0,"Can only do square matrices");
2311 
2312   /*
2313      This code adds extra rows to make sure the number of rows is
2314     divisible by the blocksize
2315   */
2316   mbs        = M/bs;
2317   extra_rows = bs - M + bs*(mbs);
2318   if (extra_rows == bs) extra_rows = 0;
2319   else                  mbs++;
2320   if (extra_rows) {
2321     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n");
2322   }
2323 
2324   /* read in row lengths */
2325   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
2326   ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2327   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2328 
2329   /* read in column indices */
2330   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
2331   ierr = PetscBinaryRead(fd,jj,nz,PETSC_INT); CHKERRQ(ierr);
2332   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
2333 
2334   /* loop over row lengths determining block row lengths */
2335   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
2336   PetscMemzero(browlengths,mbs*sizeof(int));
2337   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
2338   PetscMemzero(mask,mbs*sizeof(int));
2339   masked = mask + mbs;
2340   rowcount = 0; nzcount = 0;
2341   for ( i=0; i<mbs; i++ ) {
2342     nmask = 0;
2343     for ( j=0; j<bs; j++ ) {
2344       kmax = rowlengths[rowcount];
2345       for ( k=0; k<kmax; k++ ) {
2346         tmp = jj[nzcount++]/bs;
2347         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
2348       }
2349       rowcount++;
2350     }
2351     browlengths[i] += nmask;
2352     /* zero out the mask elements we set */
2353     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2354   }
2355 
2356   /* create our matrix */
2357   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);CHKERRQ(ierr);
2358   B = *A;
2359   a = (Mat_SeqBAIJ *) B->data;
2360 
2361   /* set matrix "i" values */
2362   a->i[0] = 0;
2363   for ( i=1; i<= mbs; i++ ) {
2364     a->i[i]      = a->i[i-1] + browlengths[i-1];
2365     a->ilen[i-1] = browlengths[i-1];
2366   }
2367   a->nz         = 0;
2368   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
2369 
2370   /* read in nonzero values */
2371   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
2372   ierr = PetscBinaryRead(fd,aa,nz,PETSC_SCALAR); CHKERRQ(ierr);
2373   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
2374 
2375   /* set "a" and "j" values into matrix */
2376   nzcount = 0; jcount = 0;
2377   for ( i=0; i<mbs; i++ ) {
2378     nzcountb = nzcount;
2379     nmask    = 0;
2380     for ( j=0; j<bs; j++ ) {
2381       kmax = rowlengths[i*bs+j];
2382       for ( k=0; k<kmax; k++ ) {
2383         tmp = jj[nzcount++]/bs;
2384 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
2385       }
2386       rowcount++;
2387     }
2388     /* sort the masked values */
2389     PetscSortInt(nmask,masked);
2390 
2391     /* set "j" values into matrix */
2392     maskcount = 1;
2393     for ( j=0; j<nmask; j++ ) {
2394       a->j[jcount++]  = masked[j];
2395       mask[masked[j]] = maskcount++;
2396     }
2397     /* set "a" values into matrix */
2398     ishift = bs2*a->i[i];
2399     for ( j=0; j<bs; j++ ) {
2400       kmax = rowlengths[i*bs+j];
2401       for ( k=0; k<kmax; k++ ) {
2402         tmp    = jj[nzcountb]/bs ;
2403         block  = mask[tmp] - 1;
2404         point  = jj[nzcountb] - bs*tmp;
2405         idx    = ishift + bs2*block + j + bs*point;
2406         a->a[idx] = aa[nzcountb++];
2407       }
2408     }
2409     /* zero out the mask elements we set */
2410     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2411   }
2412   if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix");
2413 
2414   PetscFree(rowlengths);
2415   PetscFree(browlengths);
2416   PetscFree(aa);
2417   PetscFree(jj);
2418   PetscFree(mask);
2419 
2420   B->assembled = PETSC_TRUE;
2421 
2422   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
2423   if (flg) {
2424     Viewer tviewer;
2425     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2426     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO,0);CHKERRQ(ierr);
2427     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2428     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2429   }
2430   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
2431   if (flg) {
2432     Viewer tviewer;
2433     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2434     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO_LONG,0);CHKERRQ(ierr);
2435     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2436     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2437   }
2438   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
2439   if (flg) {
2440     Viewer tviewer;
2441     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2442     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2443     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2444   }
2445   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
2446   if (flg) {
2447     Viewer tviewer;
2448     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2449     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_MATLAB,"M");CHKERRQ(ierr);
2450     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2451     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2452   }
2453   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
2454   if (flg) {
2455     Viewer tviewer;
2456     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
2457     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
2458     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
2459     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2460   }
2461   return 0;
2462 }
2463 
2464 
2465 
2466