xref: /petsc/src/mat/impls/baij/seq/baij.c (revision 95b207ea01a3fbf8432ea0e0186aba3bf9cd0565)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij.c,v 1.107 1997/07/09 20:55:07 balay Exp bsmith $";
3 #endif
4 
5 /*
6     Defines the basic matrix operations for the BAIJ (compressed row)
7   matrix storage format.
8 */
9 
10 #include "pinclude/pviewer.h"
11 #include "sys.h"
12 #include "src/mat/impls/baij/seq/baij.h"
13 #include "src/vec/vecimpl.h"
14 #include "src/inline/spops.h"
15 #include "petsc.h"
16 
17 
18 /*
19      Adds diagonal pointers to sparse matrix structure.
20 */
21 
22 #undef __FUNC__
23 #define __FUNC__ "MatMarkDiag_SeqBAIJ"
24 int MatMarkDiag_SeqBAIJ(Mat A)
25 {
26   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
27   int         i,j, *diag, m = a->mbs;
28 
29   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
30   PLogObjectMemory(A,(m+1)*sizeof(int));
31   for ( i=0; i<m; i++ ) {
32     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
33       if (a->j[j] == i) {
34         diag[i] = j;
35         break;
36       }
37     }
38   }
39   a->diag = diag;
40   return 0;
41 }
42 
43 
44 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
45 
46 #undef __FUNC__
47 #define __FUNC__ "MatGetRowIJ_SeqBAIJ"
48 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
49                             PetscTruth *done)
50 {
51   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
52   int         ierr, n = a->mbs,i;
53 
54   *nn = n;
55   if (!ia) return 0;
56   if (symmetric) {
57     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
58   } else if (oshift == 1) {
59     /* temporarily add 1 to i and j indices */
60     int nz = a->i[n] + 1;
61     for ( i=0; i<nz; i++ ) a->j[i]++;
62     for ( i=0; i<n+1; i++ ) a->i[i]++;
63     *ia = a->i; *ja = a->j;
64   } else {
65     *ia = a->i; *ja = a->j;
66   }
67 
68   return 0;
69 }
70 
71 #undef __FUNC__
72 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ" /* ADIC Ignore */
73 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
74                                 PetscTruth *done)
75 {
76   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
77   int         i,n = a->mbs;
78 
79   if (!ia) return 0;
80   if (symmetric) {
81     PetscFree(*ia);
82     PetscFree(*ja);
83   } else if (oshift == 1) {
84     int nz = a->i[n];
85     for ( i=0; i<nz; i++ ) a->j[i]--;
86     for ( i=0; i<n+1; i++ ) a->i[i]--;
87   }
88   return 0;
89 }
90 
91 
92 #undef __FUNC__
93 #define __FUNC__ "MatView_SeqBAIJ_Binary" /* ADIC Ignore */
94 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
95 {
96   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
97   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
98   Scalar      *aa;
99 
100   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
101   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
102   col_lens[0] = MAT_COOKIE;
103 
104   col_lens[1] = a->m;
105   col_lens[2] = a->n;
106   col_lens[3] = a->nz*bs2;
107 
108   /* store lengths of each row and write (including header) to file */
109   count = 0;
110   for ( i=0; i<a->mbs; i++ ) {
111     for ( j=0; j<bs; j++ ) {
112       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
113     }
114   }
115   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
116   PetscFree(col_lens);
117 
118   /* store column indices (zero start index) */
119   jj = (int *) PetscMalloc( (a->nz+1)*bs2*sizeof(int) ); CHKPTRQ(jj);
120   count = 0;
121   for ( i=0; i<a->mbs; i++ ) {
122     for ( j=0; j<bs; j++ ) {
123       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
124         for ( l=0; l<bs; l++ ) {
125           jj[count++] = bs*a->j[k] + l;
126         }
127       }
128     }
129   }
130   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
131   PetscFree(jj);
132 
133   /* store nonzero values */
134   aa = (Scalar *) PetscMalloc((a->nz+1)*bs2*sizeof(Scalar)); CHKPTRQ(aa);
135   count = 0;
136   for ( i=0; i<a->mbs; i++ ) {
137     for ( j=0; j<bs; j++ ) {
138       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
139         for ( l=0; l<bs; l++ ) {
140           aa[count++] = a->a[bs2*k + l*bs + j];
141         }
142       }
143     }
144   }
145   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
146   PetscFree(aa);
147   return 0;
148 }
149 
150 #undef __FUNC__
151 #define __FUNC__ "MatView_SeqBAIJ_ASCII" /* ADIC Ignore */
152 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
153 {
154   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
155   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
156   FILE        *fd;
157   char        *outputname;
158 
159   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
160   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
161   ierr = ViewerGetFormat(viewer,&format);
162   if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
163     fprintf(fd,"  block size is %d\n",bs);
164   }
165   else if (format == VIEWER_FORMAT_ASCII_MATLAB) {
166     SETERRQ(1,0,"Matlab format not supported");
167   }
168   else if (format == VIEWER_FORMAT_ASCII_COMMON) {
169     for ( i=0; i<a->mbs; i++ ) {
170       for ( j=0; j<bs; j++ ) {
171         fprintf(fd,"row %d:",i*bs+j);
172         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
173           for ( l=0; l<bs; l++ ) {
174 #if defined(PETSC_COMPLEX)
175           if (imag(a->a[bs2*k + l*bs + j]) > 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
176             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
177               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
178           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
179             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
180               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
181           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
182             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
183 #else
184           if (a->a[bs2*k + l*bs + j] != 0.0)
185             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
186 #endif
187           }
188         }
189         fprintf(fd,"\n");
190       }
191     }
192   }
193   else {
194     for ( i=0; i<a->mbs; i++ ) {
195       for ( j=0; j<bs; j++ ) {
196         fprintf(fd,"row %d:",i*bs+j);
197         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
198           for ( l=0; l<bs; l++ ) {
199 #if defined(PETSC_COMPLEX)
200           if (imag(a->a[bs2*k + l*bs + j]) > 0.0) {
201             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
202               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
203           }
204           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0) {
205             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
206               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
207           }
208           else {
209             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
210           }
211 #else
212             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
213 #endif
214           }
215         }
216         fprintf(fd,"\n");
217       }
218     }
219   }
220   fflush(fd);
221   return 0;
222 }
223 
224 #undef __FUNC__
225 #define __FUNC__ "MatView_SeqBAIJ_Draw" /* ADIC Ignore */
226 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer)
227 {
228   Mat_SeqBAIJ  *a=(Mat_SeqBAIJ *) A->data;
229   int          row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2;
230   double       xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r;
231   Scalar       *aa;
232   Draw         draw;
233   DrawButton   button;
234   PetscTruth   isnull;
235 
236   ViewerDrawGetDraw(viewer,&draw);
237   ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
238 
239   xr  = a->n; yr = a->m; h = yr/10.0; w = xr/10.0;
240   xr += w;    yr += h;  xl = -w;     yl = -h;
241   ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
242   /* loop over matrix elements drawing boxes */
243   color = DRAW_BLUE;
244   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
245     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
246       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
247       x_l = a->j[j]*bs; x_r = x_l + 1.0;
248       aa = a->a + j*bs2;
249       for ( k=0; k<bs; k++ ) {
250         for ( l=0; l<bs; l++ ) {
251           if (PetscReal(*aa++) >=  0.) continue;
252           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
253         }
254       }
255     }
256   }
257   color = DRAW_CYAN;
258   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
259     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
260       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
261       x_l = a->j[j]*bs; x_r = x_l + 1.0;
262       aa = a->a + j*bs2;
263       for ( k=0; k<bs; k++ ) {
264         for ( l=0; l<bs; l++ ) {
265           if (PetscReal(*aa++) != 0.) continue;
266           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
267         }
268       }
269     }
270   }
271 
272   color = DRAW_RED;
273   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
274     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
275       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
276       x_l = a->j[j]*bs; x_r = x_l + 1.0;
277       aa = a->a + j*bs2;
278       for ( k=0; k<bs; k++ ) {
279         for ( l=0; l<bs; l++ ) {
280           if (PetscReal(*aa++) <= 0.) continue;
281           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
282         }
283       }
284     }
285   }
286 
287   DrawFlush(draw);
288   DrawGetPause(draw,&pause);
289   if (pause >= 0) { PetscSleep(pause); return 0;}
290 
291   /* allow the matrix to zoom or shrink */
292   ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
293   while (button != BUTTON_RIGHT) {
294     DrawClear(draw);
295     if (button == BUTTON_LEFT) scale = .5;
296     else if (button == BUTTON_CENTER) scale = 2.;
297     xl = scale*(xl + w - xc) + xc - w*scale;
298     xr = scale*(xr - w - xc) + xc + w*scale;
299     yl = scale*(yl + h - yc) + yc - h*scale;
300     yr = scale*(yr - h - yc) + yc + h*scale;
301     w *= scale; h *= scale;
302     ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
303     color = DRAW_BLUE;
304     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
305       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
306         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
307         x_l = a->j[j]*bs; x_r = x_l + 1.0;
308         aa = a->a + j*bs2;
309         for ( k=0; k<bs; k++ ) {
310           for ( l=0; l<bs; l++ ) {
311             if (PetscReal(*aa++) >=  0.) continue;
312             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
313           }
314         }
315       }
316     }
317     color = DRAW_CYAN;
318     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
319       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
320         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
321         x_l = a->j[j]*bs; x_r = x_l + 1.0;
322         aa = a->a + j*bs2;
323         for ( k=0; k<bs; k++ ) {
324           for ( l=0; l<bs; l++ ) {
325           if (PetscReal(*aa++) != 0.) continue;
326           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
327           }
328         }
329       }
330     }
331 
332     color = DRAW_RED;
333     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
334       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
335         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
336         x_l = a->j[j]*bs; x_r = x_l + 1.0;
337         aa = a->a + j*bs2;
338         for ( k=0; k<bs; k++ ) {
339           for ( l=0; l<bs; l++ ) {
340             if (PetscReal(*aa++) <= 0.) continue;
341             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
342           }
343         }
344       }
345     }
346     ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
347   }
348   return 0;
349 }
350 
351 #undef __FUNC__
352 #define __FUNC__ "MatView_SeqBAIJ" /* ADIC Ignore */
353 int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
354 {
355   Mat         A = (Mat) obj;
356   ViewerType  vtype;
357   int         ierr;
358 
359   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
360   if (vtype == MATLAB_VIEWER) {
361     SETERRQ(1,0,"Matlab viewer not supported");
362   }
363   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
364     return MatView_SeqBAIJ_ASCII(A,viewer);
365   }
366   else if (vtype == BINARY_FILE_VIEWER) {
367     return MatView_SeqBAIJ_Binary(A,viewer);
368   }
369   else if (vtype == DRAW_VIEWER) {
370     return MatView_SeqBAIJ_Draw(A,viewer);
371   }
372   return 0;
373 }
374 
375 #define CHUNKSIZE  10
376 
377 #undef __FUNC__
378 #define __FUNC__ "MatSetValues_SeqBAIJ"
379 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
380 {
381   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
382   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
383   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
384   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
385   int          ridx,cidx,bs2=a->bs2;
386   Scalar      *ap,value,*aa=a->a,*bap;
387 
388   for ( k=0; k<m; k++ ) { /* loop over added rows */
389     row  = im[k]; brow = row/bs;
390 #if defined(PETSC_BOPT_g)
391     if (row < 0) SETERRQ(1,0,"Negative row");
392     if (row >= a->m) SETERRQ(1,0,"Row too large");
393 #endif
394     rp   = aj + ai[brow];
395     ap   = aa + bs2*ai[brow];
396     rmax = imax[brow];
397     nrow = ailen[brow];
398     low  = 0;
399     for ( l=0; l<n; l++ ) { /* loop over added columns */
400 #if defined(PETSC_BOPT_g)
401       if (in[l] < 0) SETERRQ(1,0,"Negative column");
402       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
403 #endif
404       col = in[l]; bcol = col/bs;
405       ridx = row % bs; cidx = col % bs;
406       if (roworiented) {
407         value = *v++;
408       }
409       else {
410         value = v[k + l*m];
411       }
412       if (!sorted) low = 0; high = nrow;
413       while (high-low > 7) {
414         t = (low+high)/2;
415         if (rp[t] > bcol) high = t;
416         else              low  = t;
417       }
418       for ( i=low; i<high; i++ ) {
419         if (rp[i] > bcol) break;
420         if (rp[i] == bcol) {
421           bap  = ap +  bs2*i + bs*cidx + ridx;
422           if (is == ADD_VALUES) *bap += value;
423           else                  *bap  = value;
424           goto noinsert1;
425         }
426       }
427       if (nonew == 1) goto noinsert1;
428       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
429       if (nrow >= rmax) {
430         /* there is no extra room in row, therefore enlarge */
431         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
432         Scalar *new_a;
433 
434         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
435 
436         /* Malloc new storage space */
437         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
438         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
439         new_j   = (int *) (new_a + bs2*new_nz);
440         new_i   = new_j + new_nz;
441 
442         /* copy over old data into new slots */
443         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
444         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
445         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
446         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
447         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
448                                                            len*sizeof(int));
449         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
450         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
451         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
452                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
453         /* free up old matrix storage */
454         PetscFree(a->a);
455         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
456         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
457         a->singlemalloc = 1;
458 
459         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
460         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
461         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
462         a->maxnz += bs2*CHUNKSIZE;
463         a->reallocs++;
464         a->nz++;
465       }
466       N = nrow++ - 1;
467       /* shift up all the later entries in this row */
468       for ( ii=N; ii>=i; ii-- ) {
469         rp[ii+1] = rp[ii];
470         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
471       }
472       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
473       rp[i]                      = bcol;
474       ap[bs2*i + bs*cidx + ridx] = value;
475       noinsert1:;
476       low = i;
477     }
478     ailen[brow] = nrow;
479   }
480   return 0;
481 }
482 
483 #undef __FUNC__
484 #define __FUNC__ "MatSetValuesBlocked_SeqBAIJ"
485 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
486 {
487   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
488   int         *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
489   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen;
490   int         roworiented=a->roworiented;
491   int         *aj=a->j,nonew=a->nonew;
492   int          bs2=a->bs2,bs=a->bs,stepval;
493   Scalar      *ap,*value=v,*aa=a->a,*bap;
494 
495   if (roworiented) {
496     stepval = (n-1)*bs;
497   } else {
498     stepval = (m-1)*bs;
499   }
500   for ( k=0; k<m; k++ ) { /* loop over added rows */
501     row  = im[k];
502 #if defined(PETSC_BOPT_g)
503     if (row < 0) SETERRQ(1,0,"Negative row");
504     if (row >= a->mbs) SETERRQ(1,0,"Row too large");
505 #endif
506     rp   = aj + ai[row];
507     ap   = aa + bs2*ai[row];
508     rmax = imax[row];
509     nrow = ailen[row];
510     low  = 0;
511     for ( l=0; l<n; l++ ) { /* loop over added columns */
512 #if defined(PETSC_BOPT_g)
513       if (in[l] < 0) SETERRQ(1,0,"Negative column");
514       if (in[l] >= a->nbs) SETERRQ(1,0,"Column too large");
515 #endif
516       col = in[l];
517       if (roworiented) {
518         value = v + k*(stepval+bs)*bs + l*bs;
519       } else {
520         value = v + l*(stepval+bs)*bs + k*bs;
521       }
522       if (!sorted) low = 0; high = nrow;
523       while (high-low > 7) {
524         t = (low+high)/2;
525         if (rp[t] > col) high = t;
526         else             low  = t;
527       }
528       for ( i=low; i<high; i++ ) {
529         if (rp[i] > col) break;
530         if (rp[i] == col) {
531           bap  = ap +  bs2*i;
532           if (roworiented) {
533             if (is == ADD_VALUES) {
534               for ( ii=0; ii<bs; ii++,value+=stepval )
535                 for (jj=ii; jj<bs2; jj+=bs )
536                   bap[jj] += *value++;
537             } else {
538               for ( ii=0; ii<bs; ii++,value+=stepval )
539                 for (jj=ii; jj<bs2; jj+=bs )
540                   bap[jj] = *value++;
541             }
542           } else {
543             if (is == ADD_VALUES) {
544               for ( ii=0; ii<bs; ii++,value+=stepval )
545                 for (jj=0; jj<bs; jj++ )
546                   *bap++ += *value++;
547             } else {
548               for ( ii=0; ii<bs; ii++,value+=stepval )
549                 for (jj=0; jj<bs; jj++ )
550                   *bap++  = *value++;
551             }
552           }
553           goto noinsert2;
554         }
555       }
556       if (nonew == 1) goto noinsert2;
557       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
558       if (nrow >= rmax) {
559         /* there is no extra room in row, therefore enlarge */
560         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
561         Scalar *new_a;
562 
563         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
564 
565         /* malloc new storage space */
566         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
567         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
568         new_j   = (int *) (new_a + bs2*new_nz);
569         new_i   = new_j + new_nz;
570 
571         /* copy over old data into new slots */
572         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];}
573         for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
574         PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int));
575         len = (new_nz - CHUNKSIZE - ai[row] - nrow);
576         PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,
577                                                            len*sizeof(int));
578         PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar));
579         PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
580         PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),
581                     aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar));
582         /* free up old matrix storage */
583         PetscFree(a->a);
584         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
585         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
586         a->singlemalloc = 1;
587 
588         rp   = aj + ai[row]; ap = aa + bs2*ai[row];
589         rmax = imax[row] = imax[row] + CHUNKSIZE;
590         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
591         a->maxnz += bs2*CHUNKSIZE;
592         a->reallocs++;
593         a->nz++;
594       }
595       N = nrow++ - 1;
596       /* shift up all the later entries in this row */
597       for ( ii=N; ii>=i; ii-- ) {
598         rp[ii+1] = rp[ii];
599         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
600       }
601       if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
602       rp[i] = col;
603       bap   = ap +  bs2*i;
604       if (roworiented) {
605         for ( ii=0; ii<bs; ii++,value+=stepval)
606           for (jj=ii; jj<bs2; jj+=bs )
607             bap[jj] = *value++;
608       } else {
609         for ( ii=0; ii<bs; ii++,value+=stepval )
610           for (jj=0; jj<bs; jj++ )
611             *bap++  = *value++;
612       }
613       noinsert2:;
614       low = i;
615     }
616     ailen[row] = nrow;
617   }
618   return 0;
619 }
620 
621 #undef __FUNC__
622 #define __FUNC__ "MatGetSize_SeqBAIJ" /* ADIC Ignore */
623 int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
624 {
625   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
626   *m = a->m; *n = a->n;
627   return 0;
628 }
629 
630 #undef __FUNC__
631 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ" /* ADIC Ignore */
632 int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
633 {
634   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
635   *m = 0; *n = a->m;
636   return 0;
637 }
638 
639 #undef __FUNC__
640 #define __FUNC__ "MatGetRow_SeqBAIJ"
641 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
642 {
643   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
644   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
645   Scalar      *aa,*v_i,*aa_i;
646 
647   bs  = a->bs;
648   ai  = a->i;
649   aj  = a->j;
650   aa  = a->a;
651   bs2 = a->bs2;
652 
653   if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range");
654 
655   bn  = row/bs;   /* Block number */
656   bp  = row % bs; /* Block Position */
657   M   = ai[bn+1] - ai[bn];
658   *nz = bs*M;
659 
660   if (v) {
661     *v = 0;
662     if (*nz) {
663       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
664       for ( i=0; i<M; i++ ) { /* for each block in the block row */
665         v_i  = *v + i*bs;
666         aa_i = aa + bs2*(ai[bn] + i);
667         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
668       }
669     }
670   }
671 
672   if (idx) {
673     *idx = 0;
674     if (*nz) {
675       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
676       for ( i=0; i<M; i++ ) { /* for each block in the block row */
677         idx_i = *idx + i*bs;
678         itmp  = bs*aj[ai[bn] + i];
679         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
680       }
681     }
682   }
683   return 0;
684 }
685 
686 #undef __FUNC__
687 #define __FUNC__ "MatRestoreRow_SeqBAIJ" /* ADIC Ignore */
688 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
689 {
690   if (idx) {if (*idx) PetscFree(*idx);}
691   if (v)   {if (*v)   PetscFree(*v);}
692   return 0;
693 }
694 
695 #undef __FUNC__
696 #define __FUNC__ "MatTranspose_SeqBAIJ"
697 int MatTranspose_SeqBAIJ(Mat A,Mat *B)
698 {
699   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
700   Mat         C;
701   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
702   int         *rows,*cols,bs2=a->bs2;
703   Scalar      *array=a->a;
704 
705   if (B==PETSC_NULL && mbs!=nbs)
706     SETERRQ(1,0,"Square matrix only for in-place");
707   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
708   PetscMemzero(col,(1+nbs)*sizeof(int));
709 
710   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
711   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
712   PetscFree(col);
713   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
714   cols = rows + bs;
715   for ( i=0; i<mbs; i++ ) {
716     cols[0] = i*bs;
717     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
718     len = ai[i+1] - ai[i];
719     for ( j=0; j<len; j++ ) {
720       rows[0] = (*aj++)*bs;
721       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
722       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
723       array += bs2;
724     }
725   }
726   PetscFree(rows);
727 
728   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
729   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
730 
731   if (B != PETSC_NULL) {
732     *B = C;
733   } else {
734     /* This isn't really an in-place transpose */
735     PetscFree(a->a);
736     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
737     if (a->diag) PetscFree(a->diag);
738     if (a->ilen) PetscFree(a->ilen);
739     if (a->imax) PetscFree(a->imax);
740     if (a->solve_work) PetscFree(a->solve_work);
741     PetscFree(a);
742     PetscMemcpy(A,C,sizeof(struct _p_Mat));
743     PetscHeaderDestroy(C);
744   }
745   return 0;
746 }
747 
748 
749 #undef __FUNC__
750 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ"
751 int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
752 {
753   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
754   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
755   int        m = a->m,*ip, N, *ailen = a->ilen;
756   int        mbs = a->mbs, bs2 = a->bs2,rmax = 0;
757   Scalar     *aa = a->a, *ap;
758 
759   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
760 
761   if (m) rmax = ailen[0];
762   for ( i=1; i<mbs; i++ ) {
763     /* move each row back by the amount of empty slots (fshift) before it*/
764     fshift += imax[i-1] - ailen[i-1];
765     rmax   = PetscMax(rmax,ailen[i]);
766     if (fshift) {
767       ip = aj + ai[i]; ap = aa + bs2*ai[i];
768       N = ailen[i];
769       for ( j=0; j<N; j++ ) {
770         ip[j-fshift] = ip[j];
771         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
772       }
773     }
774     ai[i] = ai[i-1] + ailen[i-1];
775   }
776   if (mbs) {
777     fshift += imax[mbs-1] - ailen[mbs-1];
778     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
779   }
780   /* reset ilen and imax for each row */
781   for ( i=0; i<mbs; i++ ) {
782     ailen[i] = imax[i] = ai[i+1] - ai[i];
783   }
784   a->nz = ai[mbs];
785 
786   /* diagonals may have moved, so kill the diagonal pointers */
787   if (fshift && a->diag) {
788     PetscFree(a->diag);
789     PLogObjectMemory(A,-(m+1)*sizeof(int));
790     a->diag = 0;
791   }
792   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n",
793            m,a->n,a->bs,fshift*bs2,a->nz*bs2);
794   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n",
795            a->reallocs);
796   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax);
797   a->reallocs          = 0;
798   A->info.nz_unneeded  = (double)fshift*bs2;
799 
800   return 0;
801 }
802 
803 #undef __FUNC__
804 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
805 int MatZeroEntries_SeqBAIJ(Mat A)
806 {
807   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
808   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
809   return 0;
810 }
811 
812 #undef __FUNC__
813 #define __FUNC__ "MatDestroy_SeqBAIJ" /* ADIC Ignore */
814 int MatDestroy_SeqBAIJ(PetscObject obj)
815 {
816   Mat         A  = (Mat) obj;
817   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
818 
819 #if defined(PETSC_LOG)
820   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
821 #endif
822   PetscFree(a->a);
823   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
824   if (a->diag) PetscFree(a->diag);
825   if (a->ilen) PetscFree(a->ilen);
826   if (a->imax) PetscFree(a->imax);
827   if (a->solve_work) PetscFree(a->solve_work);
828   if (a->mult_work) PetscFree(a->mult_work);
829   PetscFree(a);
830   PLogObjectDestroy(A);
831   PetscHeaderDestroy(A);
832   return 0;
833 }
834 
835 #undef __FUNC__
836 #define __FUNC__ "MatSetOption_SeqBAIJ" /* ADIC Ignore */
837 int MatSetOption_SeqBAIJ(Mat A,MatOption op)
838 {
839   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
840   if      (op == MAT_ROW_ORIENTED)                 a->roworiented = 1;
841   else if (op == MAT_COLUMN_ORIENTED)              a->roworiented = 0;
842   else if (op == MAT_COLUMNS_SORTED)               a->sorted      = 1;
843   else if (op == MAT_COLUMNS_UNSORTED)             a->sorted      = 0;
844   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)     a->nonew       = 1;
845   else if (op == MAT_NEW_NONZERO_LOCATION_ERROR)   a->nonew       = -1;
846   else if (op == MAT_NEW_NONZERO_ALLOCATION_ERROR) a->nonew       = -2;
847   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS)    a->nonew       = 0;
848   else if (op == MAT_ROWS_SORTED ||
849            op == MAT_ROWS_UNSORTED ||
850            op == MAT_SYMMETRIC ||
851            op == MAT_STRUCTURALLY_SYMMETRIC ||
852            op == MAT_YES_NEW_DIAGONALS ||
853            op == MAT_IGNORE_OFF_PROC_ENTRIES)
854     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
855   else if (op == MAT_NO_NEW_DIAGONALS)
856     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
857   else
858     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
859   return 0;
860 }
861 
862 
863 /* -------------------------------------------------------*/
864 /* Should check that shapes of vectors and matrices match */
865 /* -------------------------------------------------------*/
866 #include "pinclude/plapack.h"
867 
868 #undef __FUNC__
869 #define __FUNC__ "MatMult_SeqBAIJ_1"
870 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
871 {
872   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
873   register Scalar *x,*z,*v,sum;
874   int             mbs=a->mbs,i,*idx,*ii,n;
875 
876   VecGetArray_Fast(xx,x);
877   VecGetArray_Fast(zz,z);
878 
879   idx   = a->j;
880   v     = a->a;
881   ii    = a->i;
882 
883   for ( i=0; i<mbs; i++ ) {
884     n    = ii[1] - ii[0]; ii++;
885     sum  = 0.0;
886     while (n--) sum += *v++ * x[*idx++];
887     z[i] = sum;
888   }
889   VecRestoreArray_Fast(xx,x);
890   VecRestoreArray_Fast(zz,z);
891   PLogFlops(2*a->nz - a->m);
892   return 0;
893 }
894 
895 #undef __FUNC__
896 #define __FUNC__ "MatMult_SeqBAIJ_2"
897 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
898 {
899   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
900   register Scalar *x,*z,*v,*xb,sum1,sum2;
901   register Scalar x1,x2;
902   int             mbs=a->mbs,i,*idx,*ii;
903   int             j,n;
904 
905   VecGetArray_Fast(xx,x);
906   VecGetArray_Fast(zz,z);
907 
908   idx   = a->j;
909   v     = a->a;
910   ii    = a->i;
911 
912   for ( i=0; i<mbs; i++ ) {
913     n  = ii[1] - ii[0]; ii++;
914     sum1 = 0.0; sum2 = 0.0;
915     for ( j=0; j<n; j++ ) {
916       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
917       sum1 += v[0]*x1 + v[2]*x2;
918       sum2 += v[1]*x1 + v[3]*x2;
919       v += 4;
920     }
921     z[0] = sum1; z[1] = sum2;
922     z += 2;
923   }
924   VecRestoreArray_Fast(xx,x);
925   VecRestoreArray_Fast(zz,z);
926   PLogFlops(4*a->nz - a->m);
927   return 0;
928 }
929 
930 #undef __FUNC__
931 #define __FUNC__ "MatMult_SeqBAIJ_3"
932 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
933 {
934   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
935   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
936   int             mbs=a->mbs,i,*idx,*ii,j,n;
937 
938   VecGetArray_Fast(xx,x);
939   VecGetArray_Fast(zz,z);
940 
941   idx   = a->j;
942   v     = a->a;
943   ii    = a->i;
944 
945   for ( i=0; i<mbs; i++ ) {
946     n  = ii[1] - ii[0]; ii++;
947     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
948     for ( j=0; j<n; j++ ) {
949       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
950       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
951       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
952       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
953       v += 9;
954     }
955     z[0] = sum1; z[1] = sum2; z[2] = sum3;
956     z += 3;
957   }
958   VecRestoreArray_Fast(xx,x);
959   VecRestoreArray_Fast(zz,z);
960   PLogFlops(18*a->nz - a->m);
961   return 0;
962 }
963 
964 #undef __FUNC__
965 #define __FUNC__ "MatMult_SeqBAIJ_4"
966 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
967 {
968   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
969   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
970   register Scalar x1,x2,x3,x4;
971   int             mbs=a->mbs,i,*idx,*ii;
972   int             j,n;
973 
974   VecGetArray_Fast(xx,x);
975   VecGetArray_Fast(zz,z);
976 
977   idx   = a->j;
978   v     = a->a;
979   ii    = a->i;
980 
981   for ( i=0; i<mbs; i++ ) {
982     n  = ii[1] - ii[0]; ii++;
983     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
984     for ( j=0; j<n; j++ ) {
985       xb = x + 4*(*idx++);
986       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
987       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
988       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
989       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
990       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
991       v += 16;
992     }
993     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
994     z += 4;
995   }
996   VecRestoreArray_Fast(xx,x);
997   VecRestoreArray_Fast(zz,z);
998   PLogFlops(32*a->nz - a->m);
999   return 0;
1000 }
1001 
1002 #undef __FUNC__
1003 #define __FUNC__ "MatMult_SeqBAIJ_5"
1004 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
1005 {
1006   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1007   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1008   register Scalar x1,x2,x3,x4,x5;
1009   int             mbs=a->mbs,i,*idx,*ii,j,n;
1010 
1011   VecGetArray_Fast(xx,x);
1012   VecGetArray_Fast(zz,z);
1013 
1014   idx   = a->j;
1015   v     = a->a;
1016   ii    = a->i;
1017 
1018   for ( i=0; i<mbs; i++ ) {
1019     n  = ii[1] - ii[0]; ii++;
1020     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
1021     for ( j=0; j<n; j++ ) {
1022       xb = x + 5*(*idx++);
1023       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1024       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1025       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1026       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1027       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1028       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1029       v += 25;
1030     }
1031     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1032     z += 5;
1033   }
1034   VecRestoreArray_Fast(xx,x);
1035   VecRestoreArray_Fast(zz,z);
1036   PLogFlops(50*a->nz - a->m);
1037   return 0;
1038 }
1039 
1040 #undef __FUNC__
1041 #define __FUNC__ "MatMult_SeqBAIJ_7"
1042 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1043 {
1044   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1045   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1046   register Scalar x1,x2,x3,x4,x5,x6,x7;
1047   int             mbs=a->mbs,i,*idx,*ii,j,n;
1048 
1049   VecGetArray_Fast(xx,x);
1050   VecGetArray_Fast(zz,z);
1051 
1052   idx   = a->j;
1053   v     = a->a;
1054   ii    = a->i;
1055 
1056   for ( i=0; i<mbs; i++ ) {
1057     n  = ii[1] - ii[0]; ii++;
1058     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1059     for ( j=0; j<n; j++ ) {
1060       xb = x + 7*(*idx++);
1061       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1062       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1063       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1064       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1065       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1066       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1067       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1068       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1069       v += 49;
1070     }
1071     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1072     z += 7;
1073   }
1074 
1075   VecRestoreArray_Fast(xx,x);
1076   VecRestoreArray_Fast(zz,z);
1077   PLogFlops(98*a->nz - a->m);
1078   return 0;
1079 }
1080 
1081 #undef __FUNC__
1082 #define __FUNC__ "MatMult_SeqBAIJ_N"
1083 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1084 {
1085   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1086   register Scalar *x,*z,*v,*xb;
1087   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
1088   int             _One = 1,ncols,k;
1089   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
1090 
1091   VecGetArray_Fast(xx,x);
1092   VecGetArray_Fast(zz,z);
1093 
1094   idx   = a->j;
1095   v     = a->a;
1096   ii    = a->i;
1097 
1098 
1099   if (!a->mult_work) {
1100     k = PetscMax(a->m,a->n);
1101     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1102   }
1103   work = a->mult_work;
1104   for ( i=0; i<mbs; i++ ) {
1105     n     = ii[1] - ii[0]; ii++;
1106     ncols = n*bs;
1107     workt = work;
1108     for ( j=0; j<n; j++ ) {
1109       xb = x + bs*(*idx++);
1110       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1111       workt += bs;
1112     }
1113     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
1114     v += n*bs2;
1115     z += bs;
1116   }
1117   VecRestoreArray_Fast(xx,x);
1118   VecRestoreArray_Fast(zz,z);
1119   PLogFlops(2*a->nz*bs2 - a->m);
1120   return 0;
1121 }
1122 
1123 #undef __FUNC__
1124 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
1125 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1126 {
1127   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1128   register Scalar *x,*y,*z,*v,sum;
1129   int             mbs=a->mbs,i,*idx,*ii,n;
1130 
1131   VecGetArray_Fast(xx,x);
1132   VecGetArray_Fast(yy,y);
1133   VecGetArray_Fast(zz,z);
1134 
1135   idx   = a->j;
1136   v     = a->a;
1137   ii    = a->i;
1138 
1139   for ( i=0; i<mbs; i++ ) {
1140     n    = ii[1] - ii[0]; ii++;
1141     sum  = y[i];
1142     while (n--) sum += *v++ * x[*idx++];
1143     z[i] = sum;
1144   }
1145   VecRestoreArray_Fast(xx,x);
1146   VecRestoreArray_Fast(yy,y);
1147   VecRestoreArray_Fast(zz,z);
1148   PLogFlops(2*a->nz);
1149   return 0;
1150 }
1151 
1152 #undef __FUNC__
1153 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
1154 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1155 {
1156   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1157   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
1158   register Scalar x1,x2;
1159   int             mbs=a->mbs,i,*idx,*ii;
1160   int             j,n;
1161 
1162   VecGetArray_Fast(xx,x);
1163   VecGetArray_Fast(yy,y);
1164   VecGetArray_Fast(zz,z);
1165 
1166   idx   = a->j;
1167   v     = a->a;
1168   ii    = a->i;
1169 
1170   for ( i=0; i<mbs; i++ ) {
1171     n  = ii[1] - ii[0]; ii++;
1172     sum1 = y[0]; sum2 = y[1];
1173     for ( j=0; j<n; j++ ) {
1174       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1175       sum1 += v[0]*x1 + v[2]*x2;
1176       sum2 += v[1]*x1 + v[3]*x2;
1177       v += 4;
1178     }
1179     z[0] = sum1; z[1] = sum2;
1180     z += 2; y += 2;
1181   }
1182   VecRestoreArray_Fast(xx,x);
1183   VecRestoreArray_Fast(yy,y);
1184   VecRestoreArray_Fast(zz,z);
1185   PLogFlops(4*a->nz);
1186   return 0;
1187 }
1188 
1189 #undef __FUNC__
1190 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
1191 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1192 {
1193   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1194   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
1195   int             mbs=a->mbs,i,*idx,*ii,j,n;
1196 
1197   VecGetArray_Fast(xx,x);
1198   VecGetArray_Fast(yy,y);
1199   VecGetArray_Fast(zz,z);
1200 
1201   idx   = a->j;
1202   v     = a->a;
1203   ii    = a->i;
1204 
1205   for ( i=0; i<mbs; i++ ) {
1206     n  = ii[1] - ii[0]; ii++;
1207     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1208     for ( j=0; j<n; j++ ) {
1209       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1210       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1211       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1212       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1213       v += 9;
1214     }
1215     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1216     z += 3; y += 3;
1217   }
1218   VecRestoreArray_Fast(xx,x);
1219   VecRestoreArray_Fast(yy,y);
1220   VecRestoreArray_Fast(zz,z);
1221   PLogFlops(18*a->nz);
1222   return 0;
1223 }
1224 
1225 #undef __FUNC__
1226 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
1227 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1228 {
1229   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1230   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
1231   register Scalar x1,x2,x3,x4;
1232   int             mbs=a->mbs,i,*idx,*ii;
1233   int             j,n;
1234 
1235   VecGetArray_Fast(xx,x);
1236   VecGetArray_Fast(yy,y);
1237   VecGetArray_Fast(zz,z);
1238 
1239   idx   = a->j;
1240   v     = a->a;
1241   ii    = a->i;
1242 
1243   for ( i=0; i<mbs; i++ ) {
1244     n  = ii[1] - ii[0]; ii++;
1245     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1246     for ( j=0; j<n; j++ ) {
1247       xb = x + 4*(*idx++);
1248       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1249       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1250       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1251       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1252       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1253       v += 16;
1254     }
1255     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1256     z += 4; y += 4;
1257   }
1258   VecRestoreArray_Fast(xx,x);
1259   VecRestoreArray_Fast(yy,y);
1260   VecRestoreArray_Fast(zz,z);
1261   PLogFlops(32*a->nz);
1262   return 0;
1263 }
1264 
1265 #undef __FUNC__
1266 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
1267 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1268 {
1269   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1270   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1271   register Scalar x1,x2,x3,x4,x5;
1272   int             mbs=a->mbs,i,*idx,*ii,j,n;
1273 
1274   VecGetArray_Fast(xx,x);
1275   VecGetArray_Fast(yy,y);
1276   VecGetArray_Fast(zz,z);
1277 
1278   idx   = a->j;
1279   v     = a->a;
1280   ii    = a->i;
1281 
1282   for ( i=0; i<mbs; i++ ) {
1283     n  = ii[1] - ii[0]; ii++;
1284     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1285     for ( j=0; j<n; j++ ) {
1286       xb = x + 5*(*idx++);
1287       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1288       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1289       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1290       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1291       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1292       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1293       v += 25;
1294     }
1295     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1296     z += 5; y += 5;
1297   }
1298   VecRestoreArray_Fast(xx,x);
1299   VecRestoreArray_Fast(yy,y);
1300   VecRestoreArray_Fast(zz,z);
1301   PLogFlops(50*a->nz);
1302   return 0;
1303 }
1304 
1305 #undef __FUNC__
1306 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
1307 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1308 {
1309   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1310   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1311   register Scalar x1,x2,x3,x4,x5,x6,x7;
1312   int             mbs=a->mbs,i,*idx,*ii,j,n;
1313 
1314   VecGetArray_Fast(xx,x);
1315   VecGetArray_Fast(yy,y);
1316   VecGetArray_Fast(zz,z);
1317 
1318   idx   = a->j;
1319   v     = a->a;
1320   ii    = a->i;
1321 
1322   for ( i=0; i<mbs; i++ ) {
1323     n  = ii[1] - ii[0]; ii++;
1324     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1325     for ( j=0; j<n; j++ ) {
1326       xb = x + 7*(*idx++);
1327       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1328       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1329       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1330       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1331       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1332       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1333       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1334       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1335       v += 49;
1336     }
1337     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1338     z += 7; y += 7;
1339   }
1340   VecRestoreArray_Fast(xx,x);
1341   VecRestoreArray_Fast(yy,y);
1342   VecRestoreArray_Fast(zz,z);
1343   PLogFlops(98*a->nz);
1344   return 0;
1345 }
1346 
1347 
1348 #undef __FUNC__
1349 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
1350 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1351 {
1352   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1353   register Scalar *x,*z,*v,*xb;
1354   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
1355   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1356 
1357   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1358 
1359   VecGetArray_Fast(xx,x);
1360   VecGetArray_Fast(zz,z);
1361 
1362   idx   = a->j;
1363   v     = a->a;
1364   ii    = a->i;
1365 
1366 
1367   if (!a->mult_work) {
1368     k = PetscMax(a->m,a->n);
1369     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
1370   }
1371   work = a->mult_work;
1372   for ( i=0; i<mbs; i++ ) {
1373     n     = ii[1] - ii[0]; ii++;
1374     ncols = n*bs;
1375     workt = work;
1376     for ( j=0; j<n; j++ ) {
1377       xb = x + bs*(*idx++);
1378       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1379       workt += bs;
1380     }
1381     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
1382     v += n*bs2;
1383     z += bs;
1384   }
1385   VecRestoreArray_Fast(xx,x);
1386   VecRestoreArray_Fast(zz,z);
1387   PLogFlops(2*a->nz*bs2 );
1388   return 0;
1389 }
1390 
1391 #undef __FUNC__
1392 #define __FUNC__ "MatMultTrans_SeqBAIJ"
1393 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
1394 {
1395   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1396   Scalar          *xg,*zg,*zb;
1397   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1398   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1399   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1400 
1401 
1402   VecGetArray_Fast(xx,xg); x = xg;
1403   VecGetArray_Fast(zz,zg); z = zg;
1404   PetscMemzero(z,N*sizeof(Scalar));
1405 
1406   idx   = a->j;
1407   v     = a->a;
1408   ii    = a->i;
1409 
1410   switch (bs) {
1411   case 1:
1412     for ( i=0; i<mbs; i++ ) {
1413       n  = ii[1] - ii[0]; ii++;
1414       xb = x + i; x1 = xb[0];
1415       ib = idx + ai[i];
1416       for ( j=0; j<n; j++ ) {
1417         rval    = ib[j];
1418         z[rval] += *v++ * x1;
1419       }
1420     }
1421     break;
1422   case 2:
1423     for ( i=0; i<mbs; i++ ) {
1424       n  = ii[1] - ii[0]; ii++;
1425       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1426       ib = idx + ai[i];
1427       for ( j=0; j<n; j++ ) {
1428         rval      = ib[j]*2;
1429         z[rval++] += v[0]*x1 + v[1]*x2;
1430         z[rval++] += v[2]*x1 + v[3]*x2;
1431         v += 4;
1432       }
1433     }
1434     break;
1435   case 3:
1436     for ( i=0; i<mbs; i++ ) {
1437       n  = ii[1] - ii[0]; ii++;
1438       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1439       ib = idx + ai[i];
1440       for ( j=0; j<n; j++ ) {
1441         rval      = ib[j]*3;
1442         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1443         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1444         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1445         v += 9;
1446       }
1447     }
1448     break;
1449   case 4:
1450     for ( i=0; i<mbs; i++ ) {
1451       n  = ii[1] - ii[0]; ii++;
1452       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1453       ib = idx + ai[i];
1454       for ( j=0; j<n; j++ ) {
1455         rval      = ib[j]*4;
1456         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1457         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1458         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1459         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1460         v += 16;
1461       }
1462     }
1463     break;
1464   case 5:
1465     for ( i=0; i<mbs; i++ ) {
1466       n  = ii[1] - ii[0]; ii++;
1467       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1468       x4 = xb[3];   x5 = xb[4];
1469       ib = idx + ai[i];
1470       for ( j=0; j<n; j++ ) {
1471         rval      = ib[j]*5;
1472         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1473         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1474         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1475         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1476         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1477         v += 25;
1478       }
1479     }
1480     break;
1481       /* block sizes larger then 5 by 5 are handled by BLAS */
1482     default: {
1483       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1484       if (!a->mult_work) {
1485         k = PetscMax(a->m,a->n);
1486         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1487         CHKPTRQ(a->mult_work);
1488       }
1489       work = a->mult_work;
1490       for ( i=0; i<mbs; i++ ) {
1491         n     = ii[1] - ii[0]; ii++;
1492         ncols = n*bs;
1493         PetscMemzero(work,ncols*sizeof(Scalar));
1494         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1495         v += n*bs2;
1496         x += bs;
1497         workt = work;
1498         for ( j=0; j<n; j++ ) {
1499           zb = z + bs*(*idx++);
1500           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1501           workt += bs;
1502         }
1503       }
1504     }
1505   }
1506   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1507   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1508   PLogFlops(2*a->nz*a->bs2 - a->n);
1509   return 0;
1510 }
1511 
1512 #undef __FUNC__
1513 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1514 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1515 {
1516   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1517   Scalar          *xg,*zg,*zb;
1518   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1519   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1520   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1521 
1522 
1523 
1524   VecGetArray_Fast(xx,xg); x = xg;
1525   VecGetArray_Fast(zz,zg); z = zg;
1526 
1527   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1528   else PetscMemzero(z,N*sizeof(Scalar));
1529 
1530 
1531   idx   = a->j;
1532   v     = a->a;
1533   ii    = a->i;
1534 
1535   switch (bs) {
1536   case 1:
1537     for ( i=0; i<mbs; i++ ) {
1538       n  = ii[1] - ii[0]; ii++;
1539       xb = x + i; x1 = xb[0];
1540       ib = idx + ai[i];
1541       for ( j=0; j<n; j++ ) {
1542         rval    = ib[j];
1543         z[rval] += *v++ * x1;
1544       }
1545     }
1546     break;
1547   case 2:
1548     for ( i=0; i<mbs; i++ ) {
1549       n  = ii[1] - ii[0]; ii++;
1550       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1551       ib = idx + ai[i];
1552       for ( j=0; j<n; j++ ) {
1553         rval      = ib[j]*2;
1554         z[rval++] += v[0]*x1 + v[1]*x2;
1555         z[rval++] += v[2]*x1 + v[3]*x2;
1556         v += 4;
1557       }
1558     }
1559     break;
1560   case 3:
1561     for ( i=0; i<mbs; i++ ) {
1562       n  = ii[1] - ii[0]; ii++;
1563       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1564       ib = idx + ai[i];
1565       for ( j=0; j<n; j++ ) {
1566         rval      = ib[j]*3;
1567         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1568         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1569         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1570         v += 9;
1571       }
1572     }
1573     break;
1574   case 4:
1575     for ( i=0; i<mbs; i++ ) {
1576       n  = ii[1] - ii[0]; ii++;
1577       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1578       ib = idx + ai[i];
1579       for ( j=0; j<n; j++ ) {
1580         rval      = ib[j]*4;
1581         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1582         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1583         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1584         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1585         v += 16;
1586       }
1587     }
1588     break;
1589   case 5:
1590     for ( i=0; i<mbs; i++ ) {
1591       n  = ii[1] - ii[0]; ii++;
1592       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1593       x4 = xb[3];   x5 = xb[4];
1594       ib = idx + ai[i];
1595       for ( j=0; j<n; j++ ) {
1596         rval      = ib[j]*5;
1597         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1598         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1599         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1600         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1601         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1602         v += 25;
1603       }
1604     }
1605     break;
1606       /* block sizes larger then 5 by 5 are handled by BLAS */
1607     default: {
1608       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1609       if (!a->mult_work) {
1610         k = PetscMax(a->m,a->n);
1611         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1612         CHKPTRQ(a->mult_work);
1613       }
1614       work = a->mult_work;
1615       for ( i=0; i<mbs; i++ ) {
1616         n     = ii[1] - ii[0]; ii++;
1617         ncols = n*bs;
1618         PetscMemzero(work,ncols*sizeof(Scalar));
1619         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1620         v += n*bs2;
1621         x += bs;
1622         workt = work;
1623         for ( j=0; j<n; j++ ) {
1624           zb = z + bs*(*idx++);
1625           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1626           workt += bs;
1627         }
1628       }
1629     }
1630   }
1631   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1632   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1633   PLogFlops(2*a->nz*a->bs2);
1634   return 0;
1635 }
1636 
1637 #undef __FUNC__
1638 #define __FUNC__ "MatGetInfo_SeqBAIJ" /* ADIC Ignore */
1639 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1640 {
1641   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1642 
1643   info->rows_global    = (double)a->m;
1644   info->columns_global = (double)a->n;
1645   info->rows_local     = (double)a->m;
1646   info->columns_local  = (double)a->n;
1647   info->block_size     = a->bs2;
1648   info->nz_allocated   = a->maxnz;
1649   info->nz_used        = a->bs2*a->nz;
1650   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1651   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1652     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1653   info->assemblies   = A->num_ass;
1654   info->mallocs      = a->reallocs;
1655   info->memory       = A->mem;
1656   if (A->factor) {
1657     info->fill_ratio_given  = A->info.fill_ratio_given;
1658     info->fill_ratio_needed = A->info.fill_ratio_needed;
1659     info->factor_mallocs    = A->info.factor_mallocs;
1660   } else {
1661     info->fill_ratio_given  = 0;
1662     info->fill_ratio_needed = 0;
1663     info->factor_mallocs    = 0;
1664   }
1665   return 0;
1666 }
1667 
1668 #undef __FUNC__
1669 #define __FUNC__ "MatEqual_SeqBAIJ"
1670 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1671 {
1672   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1673 
1674   if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type");
1675 
1676   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1677   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1678       (a->nz != b->nz)) {
1679     *flg = PETSC_FALSE; return 0;
1680   }
1681 
1682   /* if the a->i are the same */
1683   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1684     *flg = PETSC_FALSE; return 0;
1685   }
1686 
1687   /* if a->j are the same */
1688   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1689     *flg = PETSC_FALSE; return 0;
1690   }
1691 
1692   /* if a->a are the same */
1693   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1694     *flg = PETSC_FALSE; return 0;
1695   }
1696   *flg = PETSC_TRUE;
1697   return 0;
1698 
1699 }
1700 
1701 #undef __FUNC__
1702 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1703 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1704 {
1705   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1706   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1707   Scalar      *x, zero = 0.0,*aa,*aa_j;
1708 
1709   if (A->factor) SETERRQ(1,0,"Not for factored matrix");
1710   bs   = a->bs;
1711   aa   = a->a;
1712   ai   = a->i;
1713   aj   = a->j;
1714   ambs = a->mbs;
1715   bs2  = a->bs2;
1716 
1717   VecSet(&zero,v);
1718   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1719   if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector");
1720   for ( i=0; i<ambs; i++ ) {
1721     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1722       if (aj[j] == i) {
1723         row  = i*bs;
1724         aa_j = aa+j*bs2;
1725         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1726         break;
1727       }
1728     }
1729   }
1730   return 0;
1731 }
1732 
1733 #undef __FUNC__
1734 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1735 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1736 {
1737   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1738   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1739   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1740 
1741   ai  = a->i;
1742   aj  = a->j;
1743   aa  = a->a;
1744   m   = a->m;
1745   n   = a->n;
1746   bs  = a->bs;
1747   mbs = a->mbs;
1748   bs2 = a->bs2;
1749   if (ll) {
1750     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1751     if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length");
1752     for ( i=0; i<mbs; i++ ) { /* for each block row */
1753       M  = ai[i+1] - ai[i];
1754       li = l + i*bs;
1755       v  = aa + bs2*ai[i];
1756       for ( j=0; j<M; j++ ) { /* for each block */
1757         for ( k=0; k<bs2; k++ ) {
1758           (*v++) *= li[k%bs];
1759         }
1760       }
1761     }
1762   }
1763 
1764   if (rr) {
1765     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1766     if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length");
1767     for ( i=0; i<mbs; i++ ) { /* for each block row */
1768       M  = ai[i+1] - ai[i];
1769       v  = aa + bs2*ai[i];
1770       for ( j=0; j<M; j++ ) { /* for each block */
1771         ri = r + bs*aj[ai[i]+j];
1772         for ( k=0; k<bs; k++ ) {
1773           x = ri[k];
1774           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1775         }
1776       }
1777     }
1778   }
1779   return 0;
1780 }
1781 
1782 
1783 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1784 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1785 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1786 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1787 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1788 
1789 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1790 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1791 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1792 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1793 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1794 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1795 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec);
1796 
1797 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1798 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1799 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1800 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1801 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1802 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1803 
1804 #undef __FUNC__
1805 #define __FUNC__ "MatNorm_SeqBAIJ"
1806 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1807 {
1808   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1809   Scalar      *v = a->a;
1810   double      sum = 0.0;
1811   int         i,nz=a->nz,bs2=a->bs2;
1812 
1813   if (type == NORM_FROBENIUS) {
1814     for (i=0; i< bs2*nz; i++ ) {
1815 #if defined(PETSC_COMPLEX)
1816       sum += real(conj(*v)*(*v)); v++;
1817 #else
1818       sum += (*v)*(*v); v++;
1819 #endif
1820     }
1821     *norm = sqrt(sum);
1822   }
1823   else {
1824     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1825   }
1826   return 0;
1827 }
1828 
1829 extern int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec);
1830 /*
1831      note: This can only work for identity for row and col. It would
1832    be good to check this and otherwise generate an error.
1833 */
1834 #undef __FUNC__
1835 #define __FUNC__ "MatILUFactor_SeqBAIJ"
1836 int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1837 {
1838   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1839   Mat         outA;
1840   int         ierr;
1841 
1842   if (fill != 0) SETERRQ(1,0,"Only fill=0 supported");
1843 
1844   outA          = inA;
1845   inA->factor   = FACTOR_LU;
1846   a->row        = row;
1847   a->col        = col;
1848 
1849   if (!a->solve_work) {
1850     a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1851     PLogObjectMemory(inA,(a->m+a->bs)*sizeof(Scalar));
1852   }
1853 
1854   if (!a->diag) {
1855     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1856   }
1857   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1858 
1859   /*
1860       Blocksize 4 has a special faster solver for ILU(0) factorization
1861     with natural ordering
1862   */
1863   if (a->bs == 4) {
1864     inA->ops.solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
1865   }
1866 
1867   return 0;
1868 }
1869 
1870 #undef __FUNC__
1871 #define __FUNC__ "MatScale_SeqBAIJ"
1872 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1873 {
1874   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1875   int         one = 1, totalnz = a->bs2*a->nz;
1876   BLscal_( &totalnz, alpha, a->a, &one );
1877   PLogFlops(totalnz);
1878   return 0;
1879 }
1880 
1881 #undef __FUNC__
1882 #define __FUNC__ "MatGetValues_SeqBAIJ"
1883 int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1884 {
1885   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1886   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1887   int        *ai = a->i, *ailen = a->ilen;
1888   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1889   Scalar     *ap, *aa = a->a, zero = 0.0;
1890 
1891   for ( k=0; k<m; k++ ) { /* loop over rows */
1892     row  = im[k]; brow = row/bs;
1893     if (row < 0) SETERRQ(1,0,"Negative row");
1894     if (row >= a->m) SETERRQ(1,0,"Row too large");
1895     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1896     nrow = ailen[brow];
1897     for ( l=0; l<n; l++ ) { /* loop over columns */
1898       if (in[l] < 0) SETERRQ(1,0,"Negative column");
1899       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
1900       col  = in[l] ;
1901       bcol = col/bs;
1902       cidx = col%bs;
1903       ridx = row%bs;
1904       high = nrow;
1905       low  = 0; /* assume unsorted */
1906       while (high-low > 5) {
1907         t = (low+high)/2;
1908         if (rp[t] > bcol) high = t;
1909         else             low  = t;
1910       }
1911       for ( i=low; i<high; i++ ) {
1912         if (rp[i] > bcol) break;
1913         if (rp[i] == bcol) {
1914           *v++ = ap[bs2*i+bs*cidx+ridx];
1915           goto finished;
1916         }
1917       }
1918       *v++ = zero;
1919       finished:;
1920     }
1921   }
1922   return 0;
1923 }
1924 
1925 #undef __FUNC__
1926 #define __FUNC__ "MatGetBlockSize_SeqBAIJ" /* ADIC Ignore */
1927 int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1928 {
1929   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1930   *bs = baij->bs;
1931   return 0;
1932 }
1933 
1934 /* idx should be of length atlease bs */
1935 #undef __FUNC__
1936 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block"
1937 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1938 {
1939   int i,row;
1940   row = idx[0];
1941   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1942 
1943   for ( i=1; i<bs; i++ ) {
1944     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1945   }
1946   *flg = PETSC_TRUE;
1947   return 0;
1948 }
1949 
1950 #undef __FUNC__
1951 #define __FUNC__ "MatZeroRows_SeqBAIJ"
1952 int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1953 {
1954   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1955   IS          is_local;
1956   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1957   PetscTruth  flg;
1958   Scalar      zero = 0.0,*aa;
1959 
1960   /* Make a copy of the IS and  sort it */
1961   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1962   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1963   ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1964   ierr = ISSort(is_local); CHKERRQ(ierr);
1965   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1966 
1967   i = 0;
1968   while (i < is_n) {
1969     if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range");
1970     flg = PETSC_FALSE;
1971     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1972     if (flg) { /* There exists a block of rows to be Zerowed */
1973       baij->ilen[rows[i]/bs] = 0;
1974       i += bs;
1975     } else { /* Zero out only the requested row */
1976       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1977       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1978       for ( j=0; j<count; j++ ) {
1979         aa[0] = zero;
1980         aa+=bs;
1981       }
1982       i++;
1983     }
1984   }
1985   if (diag) {
1986     for ( j=0; j<is_n; j++ ) {
1987       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1988     }
1989   }
1990   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
1991   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
1992   ierr = ISDestroy(is_local); CHKERRQ(ierr);
1993   ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1994 
1995   return 0;
1996 }
1997 
1998 #undef __FUNC__
1999 #define __FUNC__ "MatPrintHelp_SeqBAIJ" /* ADIC Ignore */
2000 int MatPrintHelp_SeqBAIJ(Mat A)
2001 {
2002   static int called = 0;
2003   MPI_Comm   comm = A->comm;
2004 
2005   if (called) return 0; else called = 1;
2006   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
2007   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
2008   return 0;
2009 }
2010 
2011 /* -------------------------------------------------------------------*/
2012 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
2013        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
2014        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
2015        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
2016        MatSolve_SeqBAIJ_N,0,
2017        0,0,
2018        MatLUFactor_SeqBAIJ,0,
2019        0,
2020        MatTranspose_SeqBAIJ,
2021        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
2022        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
2023        0,MatAssemblyEnd_SeqBAIJ,
2024        0,
2025        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
2026        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
2027        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
2028        MatILUFactorSymbolic_SeqBAIJ,0,
2029        0,0,
2030        MatConvertSameType_SeqBAIJ,0,0,
2031        MatILUFactor_SeqBAIJ,0,0,
2032        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
2033        MatGetValues_SeqBAIJ,0,
2034        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
2035        0,0,0,MatGetBlockSize_SeqBAIJ,
2036        MatGetRowIJ_SeqBAIJ,
2037        MatRestoreRowIJ_SeqBAIJ,
2038        0,0,0,0,0,0,
2039        MatSetValuesBlocked_SeqBAIJ};
2040 
2041 #undef __FUNC__
2042 #define __FUNC__ "MatCreateSeqBAIJ"
2043 /*@C
2044    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
2045    compressed row) format.  For good matrix assembly performance the
2046    user should preallocate the matrix storage by setting the parameter nz
2047    (or the array nzz).  By setting these parameters accurately, performance
2048    during matrix assembly can be increased by more than a factor of 50.
2049 
2050    Input Parameters:
2051 .  comm - MPI communicator, set to PETSC_COMM_SELF
2052 .  bs - size of block
2053 .  m - number of rows
2054 .  n - number of columns
2055 .  nz - number of block nonzeros per block row (same for all rows)
2056 .  nzz - array containing the number of block nonzeros in the various block rows
2057          (possibly different for each block row) or PETSC_NULL
2058 
2059    Output Parameter:
2060 .  A - the matrix
2061 
2062    Options Database Keys:
2063 $    -mat_no_unroll - uses code that does not unroll the loops in the
2064 $                     block calculations (much solver)
2065 $    -mat_block_size - size of the blocks to use
2066 
2067    Notes:
2068    The block AIJ format is fully compatible with standard Fortran 77
2069    storage.  That is, the stored row and column indices can begin at
2070    either one (as in Fortran) or zero.  See the users' manual for details.
2071 
2072    Specify the preallocated storage with either nz or nnz (not both).
2073    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
2074    allocation.  For additional details, see the users manual chapter on
2075    matrices.
2076 
2077 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2078 @*/
2079 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
2080 {
2081   Mat         B;
2082   Mat_SeqBAIJ *b;
2083   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size;
2084 
2085   MPI_Comm_size(comm,&size);
2086   if (size > 1) SETERRQ(1,0,"Comm must be of size 1");
2087 
2088   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr);
2089 
2090   if (mbs*bs!=m || nbs*bs!=n)
2091     SETERRQ(1,0,"Number rows, cols must be divisible by blocksize");
2092 
2093   *A = 0;
2094   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
2095   PLogObjectCreate(B);
2096   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
2097   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
2098   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
2099   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
2100   if (!flg) {
2101     switch (bs) {
2102     case 1:
2103       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2104       B->ops.solve           = MatSolve_SeqBAIJ_1;
2105       B->ops.mult            = MatMult_SeqBAIJ_1;
2106       B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
2107       break;
2108     case 2:
2109       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
2110       B->ops.solve           = MatSolve_SeqBAIJ_2;
2111       B->ops.mult            = MatMult_SeqBAIJ_2;
2112       B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
2113       break;
2114     case 3:
2115       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
2116       B->ops.solve           = MatSolve_SeqBAIJ_3;
2117       B->ops.mult            = MatMult_SeqBAIJ_3;
2118       B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
2119       break;
2120     case 4:
2121       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
2122       B->ops.solve           = MatSolve_SeqBAIJ_4;
2123       B->ops.mult            = MatMult_SeqBAIJ_4;
2124       B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
2125       break;
2126     case 5:
2127       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
2128       B->ops.solve           = MatSolve_SeqBAIJ_5;
2129       B->ops.mult            = MatMult_SeqBAIJ_5;
2130       B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
2131       break;
2132     case 7:
2133       B->ops.mult            = MatMult_SeqBAIJ_7;
2134       B->ops.solve           = MatSolve_SeqBAIJ_7;
2135       B->ops.multadd         = MatMultAdd_SeqBAIJ_7;
2136       break;
2137     }
2138   }
2139   B->destroy          = MatDestroy_SeqBAIJ;
2140   B->view             = MatView_SeqBAIJ;
2141   B->factor           = 0;
2142   B->lupivotthreshold = 1.0;
2143   B->mapping          = 0;
2144   b->row              = 0;
2145   b->col              = 0;
2146   b->reallocs         = 0;
2147 
2148   b->m       = m; B->m = m; B->M = m;
2149   b->n       = n; B->n = n; B->N = n;
2150   b->mbs     = mbs;
2151   b->nbs     = nbs;
2152   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
2153   if (nnz == PETSC_NULL) {
2154     if (nz == PETSC_DEFAULT) nz = 5;
2155     else if (nz <= 0)        nz = 1;
2156     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
2157     nz = nz*mbs;
2158   }
2159   else {
2160     nz = 0;
2161     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
2162   }
2163 
2164   /* allocate the matrix space */
2165   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
2166   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
2167   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
2168   b->j  = (int *) (b->a + nz*bs2);
2169   PetscMemzero(b->j,nz*sizeof(int));
2170   b->i  = b->j + nz;
2171   b->singlemalloc = 1;
2172 
2173   b->i[0] = 0;
2174   for (i=1; i<mbs+1; i++) {
2175     b->i[i] = b->i[i-1] + b->imax[i-1];
2176   }
2177 
2178   /* b->ilen will count nonzeros in each block row so far. */
2179   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
2180   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2181   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
2182 
2183   b->bs               = bs;
2184   b->bs2              = bs2;
2185   b->mbs              = mbs;
2186   b->nz               = 0;
2187   b->maxnz            = nz*bs2;
2188   b->sorted           = 0;
2189   b->roworiented      = 1;
2190   b->nonew            = 0;
2191   b->diag             = 0;
2192   b->solve_work       = 0;
2193   b->mult_work        = 0;
2194   b->spptr            = 0;
2195   B->info.nz_unneeded = (double)b->maxnz;
2196 
2197   *A = B;
2198   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
2199   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
2200   return 0;
2201 }
2202 
2203 #undef __FUNC__
2204 #define __FUNC__ "MatConvertSameType_SeqBAIJ"
2205 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
2206 {
2207   Mat         C;
2208   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
2209   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
2210 
2211   if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix");
2212 
2213   *B = 0;
2214   PetscHeaderCreate(C,_p_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
2215   PLogObjectCreate(C);
2216   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
2217   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
2218   C->destroy    = MatDestroy_SeqBAIJ;
2219   C->view       = MatView_SeqBAIJ;
2220   C->factor     = A->factor;
2221   c->row        = 0;
2222   c->col        = 0;
2223   C->assembled  = PETSC_TRUE;
2224 
2225   c->m = C->m   = a->m;
2226   c->n = C->n   = a->n;
2227   C->M          = a->m;
2228   C->N          = a->n;
2229 
2230   c->bs         = a->bs;
2231   c->bs2        = a->bs2;
2232   c->mbs        = a->mbs;
2233   c->nbs        = a->nbs;
2234 
2235   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
2236   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
2237   for ( i=0; i<mbs; i++ ) {
2238     c->imax[i] = a->imax[i];
2239     c->ilen[i] = a->ilen[i];
2240   }
2241 
2242   /* allocate the matrix space */
2243   c->singlemalloc = 1;
2244   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
2245   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
2246   c->j  = (int *) (c->a + nz*bs2);
2247   c->i  = c->j + nz;
2248   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
2249   if (mbs > 0) {
2250     PetscMemcpy(c->j,a->j,nz*sizeof(int));
2251     if (cpvalues == COPY_VALUES) {
2252       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
2253     }
2254   }
2255 
2256   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2257   c->sorted      = a->sorted;
2258   c->roworiented = a->roworiented;
2259   c->nonew       = a->nonew;
2260 
2261   if (a->diag) {
2262     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
2263     PLogObjectMemory(C,(mbs+1)*sizeof(int));
2264     for ( i=0; i<mbs; i++ ) {
2265       c->diag[i] = a->diag[i];
2266     }
2267   }
2268   else c->diag          = 0;
2269   c->nz                 = a->nz;
2270   c->maxnz              = a->maxnz;
2271   c->solve_work         = 0;
2272   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
2273   c->mult_work          = 0;
2274   *B = C;
2275   return 0;
2276 }
2277 
2278 #undef __FUNC__
2279 #define __FUNC__ "MatLoad_SeqBAIJ"
2280 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
2281 {
2282   Mat_SeqBAIJ  *a;
2283   Mat          B;
2284   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
2285   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
2286   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
2287   int          *masked, nmask,tmp,bs2,ishift;
2288   Scalar       *aa;
2289   MPI_Comm     comm = ((PetscObject) viewer)->comm;
2290 
2291   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2292   bs2  = bs*bs;
2293 
2294   MPI_Comm_size(comm,&size);
2295   if (size > 1) SETERRQ(1,0,"view must have one processor");
2296   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2297   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
2298   if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object");
2299   M = header[1]; N = header[2]; nz = header[3];
2300 
2301   if (M != N) SETERRQ(1,0,"Can only do square matrices");
2302 
2303   /*
2304      This code adds extra rows to make sure the number of rows is
2305     divisible by the blocksize
2306   */
2307   mbs        = M/bs;
2308   extra_rows = bs - M + bs*(mbs);
2309   if (extra_rows == bs) extra_rows = 0;
2310   else                  mbs++;
2311   if (extra_rows) {
2312     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n");
2313   }
2314 
2315   /* read in row lengths */
2316   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
2317   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
2318   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2319 
2320   /* read in column indices */
2321   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
2322   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
2323   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
2324 
2325   /* loop over row lengths determining block row lengths */
2326   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
2327   PetscMemzero(browlengths,mbs*sizeof(int));
2328   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
2329   PetscMemzero(mask,mbs*sizeof(int));
2330   masked = mask + mbs;
2331   rowcount = 0; nzcount = 0;
2332   for ( i=0; i<mbs; i++ ) {
2333     nmask = 0;
2334     for ( j=0; j<bs; j++ ) {
2335       kmax = rowlengths[rowcount];
2336       for ( k=0; k<kmax; k++ ) {
2337         tmp = jj[nzcount++]/bs;
2338         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
2339       }
2340       rowcount++;
2341     }
2342     browlengths[i] += nmask;
2343     /* zero out the mask elements we set */
2344     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2345   }
2346 
2347   /* create our matrix */
2348   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);CHKERRQ(ierr);
2349   B = *A;
2350   a = (Mat_SeqBAIJ *) B->data;
2351 
2352   /* set matrix "i" values */
2353   a->i[0] = 0;
2354   for ( i=1; i<= mbs; i++ ) {
2355     a->i[i]      = a->i[i-1] + browlengths[i-1];
2356     a->ilen[i-1] = browlengths[i-1];
2357   }
2358   a->nz         = 0;
2359   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
2360 
2361   /* read in nonzero values */
2362   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
2363   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
2364   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
2365 
2366   /* set "a" and "j" values into matrix */
2367   nzcount = 0; jcount = 0;
2368   for ( i=0; i<mbs; i++ ) {
2369     nzcountb = nzcount;
2370     nmask    = 0;
2371     for ( j=0; j<bs; j++ ) {
2372       kmax = rowlengths[i*bs+j];
2373       for ( k=0; k<kmax; k++ ) {
2374         tmp = jj[nzcount++]/bs;
2375 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
2376       }
2377       rowcount++;
2378     }
2379     /* sort the masked values */
2380     PetscSortInt(nmask,masked);
2381 
2382     /* set "j" values into matrix */
2383     maskcount = 1;
2384     for ( j=0; j<nmask; j++ ) {
2385       a->j[jcount++]  = masked[j];
2386       mask[masked[j]] = maskcount++;
2387     }
2388     /* set "a" values into matrix */
2389     ishift = bs2*a->i[i];
2390     for ( j=0; j<bs; j++ ) {
2391       kmax = rowlengths[i*bs+j];
2392       for ( k=0; k<kmax; k++ ) {
2393         tmp    = jj[nzcountb]/bs ;
2394         block  = mask[tmp] - 1;
2395         point  = jj[nzcountb] - bs*tmp;
2396         idx    = ishift + bs2*block + j + bs*point;
2397         a->a[idx] = aa[nzcountb++];
2398       }
2399     }
2400     /* zero out the mask elements we set */
2401     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2402   }
2403   if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix");
2404 
2405   PetscFree(rowlengths);
2406   PetscFree(browlengths);
2407   PetscFree(aa);
2408   PetscFree(jj);
2409   PetscFree(mask);
2410 
2411   B->assembled = PETSC_TRUE;
2412 
2413   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
2414   if (flg) {
2415     Viewer tviewer;
2416     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2417     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO,0);CHKERRQ(ierr);
2418     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2419     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2420   }
2421   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
2422   if (flg) {
2423     Viewer tviewer;
2424     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2425     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO_LONG,0);CHKERRQ(ierr);
2426     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2427     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2428   }
2429   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
2430   if (flg) {
2431     Viewer tviewer;
2432     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2433     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2434     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2435   }
2436   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
2437   if (flg) {
2438     Viewer tviewer;
2439     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2440     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_MATLAB,"M");CHKERRQ(ierr);
2441     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2442     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2443   }
2444   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
2445   if (flg) {
2446     Viewer tviewer;
2447     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
2448     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
2449     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
2450     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2451   }
2452   return 0;
2453 }
2454 
2455 
2456 
2457