xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 6a6a5d1dfdd5fcb6e31127fc1e7bbfb8f4c48c0a)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.21 1997/12/05 20:01:54 balay Exp bsmith $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "sys.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/spops.h"
10 #include "petsc.h"
11 #include "src/inline/bitarray.h"
12 
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
16 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
17 {
18   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
19   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
20   int         start, end, *ai, *aj,bs,*nidx2;
21   BT          table;
22 
23   PetscFunctionBegin;
24   m     = a->mbs;
25   ai    = a->i;
26   aj    = a->j;
27   bs    = a->bs;
28 
29   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
30 
31   ierr  = BTCreate(m,table); CHKERRQ(ierr);
32   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
33   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
34 
35   for ( i=0; i<is_max; i++ ) {
36     /* Initialise the two local arrays */
37     isz  = 0;
38     BTMemzero(m,table);
39 
40     /* Extract the indices, assume there can be duplicate entries */
41     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
42     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
43 
44     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
45     for ( j=0; j<n ; ++j){
46       ival = idx[j]/bs; /* convert the indices into block indices */
47       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
48       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
49     }
50     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
51     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
52 
53     k = 0;
54     for ( j=0; j<ov; j++){ /* for each overlap*/
55       n = isz;
56       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
57         row   = nidx[k];
58         start = ai[row];
59         end   = ai[row+1];
60         for ( l = start; l<end ; l++){
61           val = aj[l];
62           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
63         }
64       }
65     }
66     /* expand the Index Set */
67     for (j=0; j<isz; j++ ) {
68       for (k=0; k<bs; k++ )
69         nidx2[j*bs+k] = nidx[j]*bs+k;
70     }
71     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
72   }
73   BTDestroy(table);
74   PetscFree(nidx);
75   PetscFree(nidx2);
76   PetscFunctionReturn(0);
77 }
78 
79 #undef __FUNC__
80 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
81 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
82 {
83   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
84   int          nznew, *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
85   int          row,mat_i,*mat_j,tcol,*mat_ilen;
86   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
87   int          *aj = a->j, *ai = a->i;
88   Scalar       *mat_a;
89   Mat          C;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,(PetscTruth*)&i);
93   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
97   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
98   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
99 
100   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
101   ssmap = smap;
102   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
103   PetscMemzero(smap,oldcols*sizeof(int));
104   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
105   /* determine lens of each row */
106   for (i=0; i<nrows; i++) {
107     kstart  = ai[irow[i]];
108     kend    = kstart + a->ilen[irow[i]];
109     lens[i] = 0;
110       for ( k=kstart; k<kend; k++ ) {
111         if (ssmap[aj[k]]) {
112           lens[i]++;
113         }
114       }
115     }
116   /* Create and fill new matrix */
117   if (scall == MAT_REUSE_MATRIX) {
118     c = (Mat_SeqBAIJ *)((*B)->data);
119 
120     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
121     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
122       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
123     }
124     PetscMemzero(c->ilen,c->mbs*sizeof(int));
125     C = *B;
126   } else {
127     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
128   }
129   c = (Mat_SeqBAIJ *)(C->data);
130   for (i=0; i<nrows; i++) {
131     row    = irow[i];
132     nznew  = 0;
133     kstart = ai[row];
134     kend   = kstart + a->ilen[row];
135     mat_i  = c->i[i];
136     mat_j  = c->j + mat_i;
137     mat_a  = c->a + mat_i*bs2;
138     mat_ilen = c->ilen + i;
139     for ( k=kstart; k<kend; k++ ) {
140       if ((tcol=ssmap[a->j[k]])) {
141         *mat_j++ = tcol - 1;
142         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2;
143         (*mat_ilen)++;
144 
145       }
146     }
147   }
148 
149   /* Free work space */
150   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
151   PetscFree(smap); PetscFree(lens);
152   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
153   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
154 
155   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
156   *B = C;
157   PetscFunctionReturn(0);
158 }
159 
160 #undef __FUNC__
161 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
162 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
163 {
164   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
165   IS          is1,is2;
166   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
167 
168   PetscFunctionBegin;
169   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
170   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
171   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
172   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
173 
174   /* Verify if the indices corespond to each element in a block
175    and form the IS with compressed IS */
176   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
177   iary = vary + a->mbs;
178   PetscMemzero(vary,(a->mbs)*sizeof(int));
179   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
180   count = 0;
181   for (i=0; i<a->mbs; i++) {
182     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
183     if (vary[i]==bs) iary[count++] = i;
184   }
185   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
186 
187   PetscMemzero(vary,(a->mbs)*sizeof(int));
188   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
189   count = 0;
190   for (i=0; i<a->mbs; i++) {
191     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
192     if (vary[i]==bs) iary[count++] = i;
193   }
194   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
195   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
196   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
197   PetscFree(vary);
198 
199   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr);
200   ISDestroy(is1);
201   ISDestroy(is2);
202   PetscFunctionReturn(0);
203 }
204 
205 #undef __FUNC__
206 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
207 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall,Mat **B)
208 {
209   int ierr,i;
210 
211   PetscFunctionBegin;
212   if (scall == MAT_INITIAL_MATRIX) {
213     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
214   }
215 
216   for ( i=0; i<n; i++ ) {
217     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
218   }
219   PetscFunctionReturn(0);
220 }
221 
222 
223 /* -------------------------------------------------------*/
224 /* Should check that shapes of vectors and matrices match */
225 /* -------------------------------------------------------*/
226 #include "pinclude/plapack.h"
227 
228 #undef __FUNC__
229 #define __FUNC__ "MatMult_SeqBAIJ_1"
230 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
231 {
232   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
233   register Scalar *x,*z,*v,sum;
234   int             mbs=a->mbs,i,*idx,*ii,n;
235 
236   PetscFunctionBegin;
237   VecGetArray_Fast(xx,x);
238   VecGetArray_Fast(zz,z);
239 
240   idx   = a->j;
241   v     = a->a;
242   ii    = a->i;
243 
244   for ( i=0; i<mbs; i++ ) {
245     n    = ii[1] - ii[0]; ii++;
246     sum  = 0.0;
247     while (n--) sum += *v++ * x[*idx++];
248     z[i] = sum;
249   }
250   VecRestoreArray_Fast(xx,x);
251   VecRestoreArray_Fast(zz,z);
252   PLogFlops(2*a->nz - a->m);
253   PetscFunctionReturn(0);
254 }
255 
256 #undef __FUNC__
257 #define __FUNC__ "MatMult_SeqBAIJ_2"
258 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
259 {
260   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
261   register Scalar *x,*z,*v,*xb,sum1,sum2;
262   register Scalar x1,x2;
263   int             mbs=a->mbs,i,*idx,*ii,j,n;
264 
265   PetscFunctionBegin;
266   VecGetArray_Fast(xx,x);
267   VecGetArray_Fast(zz,z);
268 
269   idx   = a->j;
270   v     = a->a;
271   ii    = a->i;
272 
273   for ( i=0; i<mbs; i++ ) {
274     n  = ii[1] - ii[0]; ii++;
275     sum1 = 0.0; sum2 = 0.0;
276     for ( j=0; j<n; j++ ) {
277       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
278       sum1 += v[0]*x1 + v[2]*x2;
279       sum2 += v[1]*x1 + v[3]*x2;
280       v += 4;
281     }
282     z[0] = sum1; z[1] = sum2;
283     z += 2;
284   }
285   VecRestoreArray_Fast(xx,x);
286   VecRestoreArray_Fast(zz,z);
287   PLogFlops(4*a->nz - a->m);
288   PetscFunctionReturn(0);
289 }
290 
291 #undef __FUNC__
292 #define __FUNC__ "MatMult_SeqBAIJ_3"
293 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
294 {
295   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
296   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
297   int             mbs=a->mbs,i,*idx,*ii,j,n;
298 
299   PetscFunctionBegin;
300   VecGetArray_Fast(xx,x);
301   VecGetArray_Fast(zz,z);
302 
303   idx   = a->j;
304   v     = a->a;
305   ii    = a->i;
306 
307   for ( i=0; i<mbs; i++ ) {
308     n  = ii[1] - ii[0]; ii++;
309     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
310     for ( j=0; j<n; j++ ) {
311       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
312       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
313       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
314       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
315       v += 9;
316     }
317     z[0] = sum1; z[1] = sum2; z[2] = sum3;
318     z += 3;
319   }
320   VecRestoreArray_Fast(xx,x);
321   VecRestoreArray_Fast(zz,z);
322   PLogFlops(18*a->nz - a->m);
323   PetscFunctionReturn(0);
324 }
325 
326 #undef __FUNC__
327 #define __FUNC__ "MatMult_SeqBAIJ_4"
328 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
329 {
330   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
331   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
332   int             mbs=a->mbs,i,*idx,*ii,j,n;
333 
334   PetscFunctionBegin;
335   VecGetArray_Fast(xx,x);
336   VecGetArray_Fast(zz,z);
337 
338   idx   = a->j;
339   v     = a->a;
340   ii    = a->i;
341 
342   for ( i=0; i<mbs; i++ ) {
343     n  = ii[1] - ii[0]; ii++;
344     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
345     for ( j=0; j<n; j++ ) {
346       xb = x + 4*(*idx++);
347       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
348       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
349       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
350       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
351       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
352       v += 16;
353     }
354     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
355     z += 4;
356   }
357   VecRestoreArray_Fast(xx,x);
358   VecRestoreArray_Fast(zz,z);
359   PLogFlops(32*a->nz - a->m);
360   PetscFunctionReturn(0);
361 }
362 
363 #undef __FUNC__
364 #define __FUNC__ "MatMult_SeqBAIJ_5"
365 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
366 {
367   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
368   register Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
369   register Scalar * restrict v,* restrict xb,* restrict z, * restrict x;
370   int             mbs=a->mbs,i,*idx,*ii,j,n;
371 
372   VecGetArray_Fast(xx,x);
373   VecGetArray_Fast(zz,z);
374 
375   idx   = a->j;
376   v     = a->a;
377   ii    = a->i;
378 
379   for ( i=0; i<mbs; i++ ) {
380     n  = ii[1] - ii[0]; ii++;
381     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
382     for ( j=0; j<n; j++ ) {
383       xb = x + 5*(*idx++);
384       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
385       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
386       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
387       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
388       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
389       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
390       v += 25;
391     }
392     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
393     z += 5;
394   }
395   VecRestoreArray_Fast(xx,x);
396   VecRestoreArray_Fast(zz,z);
397   PLogFlops(50*a->nz - a->m);
398   PetscFunctionReturn(0);
399 }
400 
401 #undef __FUNC__
402 #define __FUNC__ "MatMult_SeqBAIJ_7"
403 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
404 {
405   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
406   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
407   register Scalar x1,x2,x3,x4,x5,x6,x7;
408   int             mbs=a->mbs,i,*idx,*ii,j,n;
409 
410   VecGetArray_Fast(xx,x);
411   VecGetArray_Fast(zz,z);
412 
413   idx   = a->j;
414   v     = a->a;
415   ii    = a->i;
416 
417   for ( i=0; i<mbs; i++ ) {
418     n  = ii[1] - ii[0]; ii++;
419     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
420     for ( j=0; j<n; j++ ) {
421       xb = x + 7*(*idx++);
422       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
423       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
424       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
425       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
426       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
427       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
428       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
429       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
430       v += 49;
431     }
432     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
433     z += 7;
434   }
435 
436   VecRestoreArray_Fast(xx,x);
437   VecRestoreArray_Fast(zz,z);
438   PLogFlops(98*a->nz - a->m);
439   PetscFunctionReturn(0);
440 }
441 
442 #undef __FUNC__
443 #define __FUNC__ "MatMult_SeqBAIJ_N"
444 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
445 {
446   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
447   register Scalar *x,*z,*v,*xb;
448   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
449   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
450   int             _One = 1,ncols,k;
451 
452   PetscFunctionBegin;
453   VecGetArray_Fast(xx,x);
454   VecGetArray_Fast(zz,z);
455 
456   idx   = a->j;
457   v     = a->a;
458   ii    = a->i;
459 
460 
461   if (!a->mult_work) {
462     k = PetscMax(a->m,a->n);
463     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
464   }
465   work = a->mult_work;
466   for ( i=0; i<mbs; i++ ) {
467     n     = ii[1] - ii[0]; ii++;
468     ncols = n*bs;
469     workt = work;
470     for ( j=0; j<n; j++ ) {
471       xb = x + bs*(*idx++);
472       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
473       workt += bs;
474     }
475     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
476     v += n*bs2;
477     z += bs;
478   }
479   VecRestoreArray_Fast(xx,x);
480   VecRestoreArray_Fast(zz,z);
481   PLogFlops(2*a->nz*bs2 - a->m);
482   PetscFunctionReturn(0);
483 }
484 
485 #undef __FUNC__
486 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
487 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
488 {
489   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
490   register Scalar *x,*y,*z,*v,sum;
491   int             mbs=a->mbs,i,*idx,*ii,n;
492 
493   PetscFunctionBegin;
494   VecGetArray_Fast(xx,x);
495   VecGetArray_Fast(yy,y);
496   VecGetArray_Fast(zz,z);
497 
498   idx   = a->j;
499   v     = a->a;
500   ii    = a->i;
501 
502   for ( i=0; i<mbs; i++ ) {
503     n    = ii[1] - ii[0]; ii++;
504     sum  = y[i];
505     while (n--) sum += *v++ * x[*idx++];
506     z[i] = sum;
507   }
508   VecRestoreArray_Fast(xx,x);
509   VecRestoreArray_Fast(yy,y);
510   VecRestoreArray_Fast(zz,z);
511   PLogFlops(2*a->nz);
512   PetscFunctionReturn(0);
513 }
514 
515 #undef __FUNC__
516 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
517 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
518 {
519   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
520   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
521   register Scalar x1,x2;
522   int             mbs=a->mbs,i,*idx,*ii,j,n;
523 
524   PetscFunctionBegin;
525   VecGetArray_Fast(xx,x);
526   VecGetArray_Fast(yy,y);
527   VecGetArray_Fast(zz,z);
528 
529   idx   = a->j;
530   v     = a->a;
531   ii    = a->i;
532 
533   for ( i=0; i<mbs; i++ ) {
534     n  = ii[1] - ii[0]; ii++;
535     sum1 = y[0]; sum2 = y[1];
536     for ( j=0; j<n; j++ ) {
537       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
538       sum1 += v[0]*x1 + v[2]*x2;
539       sum2 += v[1]*x1 + v[3]*x2;
540       v += 4;
541     }
542     z[0] = sum1; z[1] = sum2;
543     z += 2; y += 2;
544   }
545   VecRestoreArray_Fast(xx,x);
546   VecRestoreArray_Fast(yy,y);
547   VecRestoreArray_Fast(zz,z);
548   PLogFlops(4*a->nz);
549   PetscFunctionReturn(0);
550 }
551 
552 #undef __FUNC__
553 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
554 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
555 {
556   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
557   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
558   int             mbs=a->mbs,i,*idx,*ii,j,n;
559 
560   PetscFunctionBegin;
561   VecGetArray_Fast(xx,x);
562   VecGetArray_Fast(yy,y);
563   VecGetArray_Fast(zz,z);
564 
565   idx   = a->j;
566   v     = a->a;
567   ii    = a->i;
568 
569   for ( i=0; i<mbs; i++ ) {
570     n  = ii[1] - ii[0]; ii++;
571     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
572     for ( j=0; j<n; j++ ) {
573       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
574       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
575       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
576       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
577       v += 9;
578     }
579     z[0] = sum1; z[1] = sum2; z[2] = sum3;
580     z += 3; y += 3;
581   }
582   VecRestoreArray_Fast(xx,x);
583   VecRestoreArray_Fast(yy,y);
584   VecRestoreArray_Fast(zz,z);
585   PLogFlops(18*a->nz);
586   PetscFunctionReturn(0);
587 }
588 
589 #undef __FUNC__
590 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
591 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
592 {
593   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
594   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
595   int             mbs=a->mbs,i,*idx,*ii;
596   int             j,n;
597 
598   PetscFunctionBegin;
599   VecGetArray_Fast(xx,x);
600   VecGetArray_Fast(yy,y);
601   VecGetArray_Fast(zz,z);
602 
603   idx   = a->j;
604   v     = a->a;
605   ii    = a->i;
606 
607   for ( i=0; i<mbs; i++ ) {
608     n  = ii[1] - ii[0]; ii++;
609     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
610     for ( j=0; j<n; j++ ) {
611       xb = x + 4*(*idx++);
612       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
613       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
614       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
615       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
616       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
617       v += 16;
618     }
619     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
620     z += 4; y += 4;
621   }
622   VecRestoreArray_Fast(xx,x);
623   VecRestoreArray_Fast(yy,y);
624   VecRestoreArray_Fast(zz,z);
625   PLogFlops(32*a->nz);
626   PetscFunctionReturn(0);
627 }
628 
629 #undef __FUNC__
630 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
631 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
632 {
633   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
634   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
635   int             mbs=a->mbs,i,*idx,*ii,j,n;
636 
637   PetscFunctionBegin;
638   VecGetArray_Fast(xx,x);
639   VecGetArray_Fast(yy,y);
640   VecGetArray_Fast(zz,z);
641 
642   idx   = a->j;
643   v     = a->a;
644   ii    = a->i;
645 
646   for ( i=0; i<mbs; i++ ) {
647     n  = ii[1] - ii[0]; ii++;
648     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
649     for ( j=0; j<n; j++ ) {
650       xb = x + 5*(*idx++);
651       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
652       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
653       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
654       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
655       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
656       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
657       v += 25;
658     }
659     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
660     z += 5; y += 5;
661   }
662   VecRestoreArray_Fast(xx,x);
663   VecRestoreArray_Fast(yy,y);
664   VecRestoreArray_Fast(zz,z);
665   PLogFlops(50*a->nz);
666   PetscFunctionReturn(0);
667 }
668 
669 #undef __FUNC__
670 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
671 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
672 {
673   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
674   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
675   register Scalar x1,x2,x3,x4,x5,x6,x7;
676   int             mbs=a->mbs,i,*idx,*ii,j,n;
677 
678   PetscFunctionBegin;
679   VecGetArray_Fast(xx,x);
680   VecGetArray_Fast(yy,y);
681   VecGetArray_Fast(zz,z);
682 
683   idx   = a->j;
684   v     = a->a;
685   ii    = a->i;
686 
687   for ( i=0; i<mbs; i++ ) {
688     n  = ii[1] - ii[0]; ii++;
689     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
690     for ( j=0; j<n; j++ ) {
691       xb = x + 7*(*idx++);
692       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
693       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
694       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
695       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
696       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
697       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
698       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
699       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
700       v += 49;
701     }
702     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
703     z += 7; y += 7;
704   }
705   VecRestoreArray_Fast(xx,x);
706   VecRestoreArray_Fast(yy,y);
707   VecRestoreArray_Fast(zz,z);
708   PLogFlops(98*a->nz);
709   PetscFunctionReturn(0);
710 }
711 
712 
713 #undef __FUNC__
714 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
715 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
716 {
717   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
718   register Scalar *x,*z,*v,*xb;
719   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
720   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
721 
722   PetscFunctionBegin;
723   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
724 
725   VecGetArray_Fast(xx,x);
726   VecGetArray_Fast(zz,z);
727 
728   idx   = a->j;
729   v     = a->a;
730   ii    = a->i;
731 
732 
733   if (!a->mult_work) {
734     k = PetscMax(a->m,a->n);
735     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
736   }
737   work = a->mult_work;
738   for ( i=0; i<mbs; i++ ) {
739     n     = ii[1] - ii[0]; ii++;
740     ncols = n*bs;
741     workt = work;
742     for ( j=0; j<n; j++ ) {
743       xb = x + bs*(*idx++);
744       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
745       workt += bs;
746     }
747     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
748     v += n*bs2;
749     z += bs;
750   }
751   VecRestoreArray_Fast(xx,x);
752   VecRestoreArray_Fast(zz,z);
753   PLogFlops(2*a->nz*bs2 );
754   PetscFunctionReturn(0);
755 }
756 
757 #undef __FUNC__
758 #define __FUNC__ "MatMultTrans_SeqBAIJ"
759 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
760 {
761   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
762   Scalar          *xg,*zg,*zb;
763   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
764   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
765   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
766 
767 
768   PetscFunctionBegin;
769   VecGetArray_Fast(xx,xg); x = xg;
770   VecGetArray_Fast(zz,zg); z = zg;
771   PetscMemzero(z,N*sizeof(Scalar));
772 
773   idx   = a->j;
774   v     = a->a;
775   ii    = a->i;
776 
777   switch (bs) {
778   case 1:
779     for ( i=0; i<mbs; i++ ) {
780       n  = ii[1] - ii[0]; ii++;
781       xb = x + i; x1 = xb[0];
782       ib = idx + ai[i];
783       for ( j=0; j<n; j++ ) {
784         rval    = ib[j];
785         z[rval] += *v++ * x1;
786       }
787     }
788     break;
789   case 2:
790     for ( i=0; i<mbs; i++ ) {
791       n  = ii[1] - ii[0]; ii++;
792       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
793       ib = idx + ai[i];
794       for ( j=0; j<n; j++ ) {
795         rval      = ib[j]*2;
796         z[rval++] += v[0]*x1 + v[1]*x2;
797         z[rval++] += v[2]*x1 + v[3]*x2;
798         v += 4;
799       }
800     }
801     break;
802   case 3:
803     for ( i=0; i<mbs; i++ ) {
804       n  = ii[1] - ii[0]; ii++;
805       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
806       ib = idx + ai[i];
807       for ( j=0; j<n; j++ ) {
808         rval      = ib[j]*3;
809         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
810         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
811         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
812         v += 9;
813       }
814     }
815     break;
816   case 4:
817     for ( i=0; i<mbs; i++ ) {
818       n  = ii[1] - ii[0]; ii++;
819       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
820       ib = idx + ai[i];
821       for ( j=0; j<n; j++ ) {
822         rval      = ib[j]*4;
823         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
824         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
825         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
826         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
827         v += 16;
828       }
829     }
830     break;
831   case 5:
832     for ( i=0; i<mbs; i++ ) {
833       n  = ii[1] - ii[0]; ii++;
834       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
835       x4 = xb[3];   x5 = xb[4];
836       ib = idx + ai[i];
837       for ( j=0; j<n; j++ ) {
838         rval      = ib[j]*5;
839         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
840         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
841         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
842         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
843         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
844         v += 25;
845       }
846     }
847     break;
848       /* block sizes larger then 5 by 5 are handled by BLAS */
849     default: {
850       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
851       if (!a->mult_work) {
852         k = PetscMax(a->m,a->n);
853         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
854         CHKPTRQ(a->mult_work);
855       }
856       work = a->mult_work;
857       for ( i=0; i<mbs; i++ ) {
858         n     = ii[1] - ii[0]; ii++;
859         ncols = n*bs;
860         PetscMemzero(work,ncols*sizeof(Scalar));
861         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
862         v += n*bs2;
863         x += bs;
864         workt = work;
865         for ( j=0; j<n; j++ ) {
866           zb = z + bs*(*idx++);
867           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
868           workt += bs;
869         }
870       }
871     }
872   }
873   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
874   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
875   PLogFlops(2*a->nz*a->bs2 - a->n);
876   PetscFunctionReturn(0);
877 }
878 
879 #undef __FUNC__
880 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
881 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
882 
883 {
884   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
885   Scalar          *xg,*zg,*zb;
886   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
887   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
888   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
889 
890   PetscFunctionBegin;
891   VecGetArray_Fast(xx,xg); x = xg;
892   VecGetArray_Fast(zz,zg); z = zg;
893 
894   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
895   else PetscMemzero(z,N*sizeof(Scalar));
896 
897   idx   = a->j;
898   v     = a->a;
899   ii    = a->i;
900 
901   switch (bs) {
902   case 1:
903     for ( i=0; i<mbs; i++ ) {
904       n  = ii[1] - ii[0]; ii++;
905       xb = x + i; x1 = xb[0];
906       ib = idx + ai[i];
907       for ( j=0; j<n; j++ ) {
908         rval    = ib[j];
909         z[rval] += *v++ * x1;
910       }
911     }
912     break;
913   case 2:
914     for ( i=0; i<mbs; i++ ) {
915       n  = ii[1] - ii[0]; ii++;
916       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
917       ib = idx + ai[i];
918       for ( j=0; j<n; j++ ) {
919         rval      = ib[j]*2;
920         z[rval++] += v[0]*x1 + v[1]*x2;
921         z[rval++] += v[2]*x1 + v[3]*x2;
922         v += 4;
923       }
924     }
925     break;
926   case 3:
927     for ( i=0; i<mbs; i++ ) {
928       n  = ii[1] - ii[0]; ii++;
929       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
930       ib = idx + ai[i];
931       for ( j=0; j<n; j++ ) {
932         rval      = ib[j]*3;
933         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
934         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
935         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
936         v += 9;
937       }
938     }
939     break;
940   case 4:
941     for ( i=0; i<mbs; i++ ) {
942       n  = ii[1] - ii[0]; ii++;
943       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
944       ib = idx + ai[i];
945       for ( j=0; j<n; j++ ) {
946         rval      = ib[j]*4;
947         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
948         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
949         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
950         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
951         v += 16;
952       }
953     }
954     break;
955   case 5:
956     for ( i=0; i<mbs; i++ ) {
957       n  = ii[1] - ii[0]; ii++;
958       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
959       x4 = xb[3];   x5 = xb[4];
960       ib = idx + ai[i];
961       for ( j=0; j<n; j++ ) {
962         rval      = ib[j]*5;
963         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
964         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
965         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
966         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
967         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
968         v += 25;
969       }
970     }
971     break;
972       /* block sizes larger then 5 by 5 are handled by BLAS */
973     default: {
974       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
975       if (!a->mult_work) {
976         k = PetscMax(a->m,a->n);
977         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
978         CHKPTRQ(a->mult_work);
979       }
980       work = a->mult_work;
981       for ( i=0; i<mbs; i++ ) {
982         n     = ii[1] - ii[0]; ii++;
983         ncols = n*bs;
984         PetscMemzero(work,ncols*sizeof(Scalar));
985         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
986         v += n*bs2;
987         x += bs;
988         workt = work;
989         for ( j=0; j<n; j++ ) {
990           zb = z + bs*(*idx++);
991           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
992           workt += bs;
993         }
994       }
995     }
996   }
997   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
998   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
999   PLogFlops(2*a->nz*a->bs2);
1000   PetscFunctionReturn(0);
1001 }
1002 
1003 #undef __FUNC__
1004 #define __FUNC__ "MatScale_SeqBAIJ"
1005 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1006 {
1007   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1008   int         one = 1, totalnz = a->bs2*a->nz;
1009 
1010   PetscFunctionBegin;
1011   BLscal_( &totalnz, alpha, a->a, &one );
1012   PLogFlops(totalnz);
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 #undef __FUNC__
1017 #define __FUNC__ "MatNorm_SeqBAIJ"
1018 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1019 {
1020   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1021   Scalar      *v = a->a;
1022   double      sum = 0.0;
1023   int         i,nz=a->nz,bs2=a->bs2;
1024 
1025   PetscFunctionBegin;
1026   if (type == NORM_FROBENIUS) {
1027     for (i=0; i< bs2*nz; i++ ) {
1028 #if defined(USE_PETSC_COMPLEX)
1029       sum += real(conj(*v)*(*v)); v++;
1030 #else
1031       sum += (*v)*(*v); v++;
1032 #endif
1033     }
1034     *norm = sqrt(sum);
1035   }
1036   else {
1037     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1038   }
1039   PetscFunctionReturn(0);
1040 }
1041 
1042 
1043 #undef __FUNC__
1044 #define __FUNC__ "MatEqual_SeqBAIJ"
1045 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1046 {
1047   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1048 
1049   PetscFunctionBegin;
1050   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1051 
1052   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1053   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1054     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1055   }
1056 
1057   /* if the a->i are the same */
1058   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1059     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1060   }
1061 
1062   /* if a->j are the same */
1063   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1064     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1065   }
1066 
1067   /* if a->a are the same */
1068   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1069     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1070   }
1071   *flg = PETSC_TRUE;
1072   PetscFunctionReturn(0);
1073 
1074 }
1075 
1076 #undef __FUNC__
1077 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1078 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1079 {
1080   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1081   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1082   Scalar      *x, zero = 0.0,*aa,*aa_j;
1083 
1084   PetscFunctionBegin;
1085   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1086   bs   = a->bs;
1087   aa   = a->a;
1088   ai   = a->i;
1089   aj   = a->j;
1090   ambs = a->mbs;
1091   bs2  = a->bs2;
1092 
1093   VecSet(&zero,v);
1094   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1095   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1096   for ( i=0; i<ambs; i++ ) {
1097     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1098       if (aj[j] == i) {
1099         row  = i*bs;
1100         aa_j = aa+j*bs2;
1101         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1102         break;
1103       }
1104     }
1105   }
1106   PetscFunctionReturn(0);
1107 }
1108 
1109 #undef __FUNC__
1110 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1111 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1112 {
1113   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1114   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1115   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1116 
1117   PetscFunctionBegin;
1118   ai  = a->i;
1119   aj  = a->j;
1120   aa  = a->a;
1121   m   = a->m;
1122   n   = a->n;
1123   bs  = a->bs;
1124   mbs = a->mbs;
1125   bs2 = a->bs2;
1126   if (ll) {
1127     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1128     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1129     for ( i=0; i<mbs; i++ ) { /* for each block row */
1130       M  = ai[i+1] - ai[i];
1131       li = l + i*bs;
1132       v  = aa + bs2*ai[i];
1133       for ( j=0; j<M; j++ ) { /* for each block */
1134         for ( k=0; k<bs2; k++ ) {
1135           (*v++) *= li[k%bs];
1136         }
1137       }
1138     }
1139   }
1140 
1141   if (rr) {
1142     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1143     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1144     for ( i=0; i<mbs; i++ ) { /* for each block row */
1145       M  = ai[i+1] - ai[i];
1146       v  = aa + bs2*ai[i];
1147       for ( j=0; j<M; j++ ) { /* for each block */
1148         ri = r + bs*aj[ai[i]+j];
1149         for ( k=0; k<bs; k++ ) {
1150           x = ri[k];
1151           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1152         }
1153       }
1154     }
1155   }
1156   PetscFunctionReturn(0);
1157 }
1158 
1159 
1160 #undef __FUNC__
1161 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1162 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1163 {
1164   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1165 
1166   PetscFunctionBegin;
1167   info->rows_global    = (double)a->m;
1168   info->columns_global = (double)a->n;
1169   info->rows_local     = (double)a->m;
1170   info->columns_local  = (double)a->n;
1171   info->block_size     = a->bs2;
1172   info->nz_allocated   = a->maxnz;
1173   info->nz_used        = a->bs2*a->nz;
1174   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1175   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1176     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1177   info->assemblies   = A->num_ass;
1178   info->mallocs      = a->reallocs;
1179   info->memory       = A->mem;
1180   if (A->factor) {
1181     info->fill_ratio_given  = A->info.fill_ratio_given;
1182     info->fill_ratio_needed = A->info.fill_ratio_needed;
1183     info->factor_mallocs    = A->info.factor_mallocs;
1184   } else {
1185     info->fill_ratio_given  = 0;
1186     info->fill_ratio_needed = 0;
1187     info->factor_mallocs    = 0;
1188   }
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 
1193 #undef __FUNC__
1194 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1195 int MatZeroEntries_SeqBAIJ(Mat A)
1196 {
1197   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1198 
1199   PetscFunctionBegin;
1200   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
1201   PetscFunctionReturn(0);
1202 }
1203