xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision bc5ccf886fc7b029025ea4de8c4047e1db2135de)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.38 1999/01/12 23:15:41 bsmith Exp balay $";
3 #endif
4 
5 #include "sys.h"
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 #include "src/inline/ilu.h"
10 #include "bitarray.h"
11 
12 #undef __FUNC__
13 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
14 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
15 {
16   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
17   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
18   int         start, end, *ai, *aj,bs,*nidx2;
19   BT          table;
20 
21   PetscFunctionBegin;
22   m     = a->mbs;
23   ai    = a->i;
24   aj    = a->j;
25   bs    = a->bs;
26 
27   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
28 
29   ierr  = BTCreate(m,table); CHKERRQ(ierr);
30   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
31   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
32 
33   for ( i=0; i<is_max; i++ ) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     BTMemzero(m,table);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
40     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for ( j=0; j<n ; ++j){
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
46       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
47     }
48     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
50 
51     k = 0;
52     for ( j=0; j<ov; j++){ /* for each overlap*/
53       n = isz;
54       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for ( l = start; l<end ; l++){
59           val = aj[l];
60           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++ ) {
66       for (k=0; k<bs; k++ )
67         nidx2[j*bs+k] = nidx[j]*bs+k;
68     }
69     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
70   }
71   BTDestroy(table);
72   PetscFree(nidx);
73   PetscFree(nidx2);
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNC__
78 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
79 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
80 {
81   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
82   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
83   int          row,mat_i,*mat_j,tcol,*mat_ilen;
84   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
85   int          *aj = a->j, *ai = a->i;
86   MatScalar    *mat_a;
87   Mat          C;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,(PetscTruth*)&i);
91   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
95   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
96   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
97 
98   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
99   ssmap = smap;
100   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
101   PetscMemzero(smap,oldcols*sizeof(int));
102   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for ( k=kstart; k<kend; k++ ) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
119     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
120       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
121     }
122     PetscMemzero(c->ilen,c->mbs*sizeof(int));
123     C = *B;
124   } else {
125     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
126   }
127   c = (Mat_SeqBAIJ *)(C->data);
128   for (i=0; i<nrows; i++) {
129     row    = irow[i];
130     kstart = ai[row];
131     kend   = kstart + a->ilen[row];
132     mat_i  = c->i[i];
133     mat_j  = c->j + mat_i;
134     mat_a  = c->a + mat_i*bs2;
135     mat_ilen = c->ilen + i;
136     for ( k=kstart; k<kend; k++ ) {
137       if ((tcol=ssmap[a->j[k]])) {
138         *mat_j++ = tcol - 1;
139         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar)); mat_a+=bs2;
140         (*mat_ilen)++;
141       }
142     }
143   }
144 
145   /* Free work space */
146   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
147   PetscFree(smap); PetscFree(lens);
148   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
149   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
150 
151   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
152   *B = C;
153   PetscFunctionReturn(0);
154 }
155 
156 #undef __FUNC__
157 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
158 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
159 {
160   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
161   IS          is1,is2;
162   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
163 
164   PetscFunctionBegin;
165   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
166   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
167   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
168   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
169 
170   /* Verify if the indices corespond to each element in a block
171    and form the IS with compressed IS */
172   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
173   iary = vary + a->mbs;
174   PetscMemzero(vary,(a->mbs)*sizeof(int));
175   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
176   count = 0;
177   for (i=0; i<a->mbs; i++) {
178     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
179     if (vary[i]==bs) iary[count++] = i;
180   }
181   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
182 
183   PetscMemzero(vary,(a->mbs)*sizeof(int));
184   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
185   count = 0;
186   for (i=0; i<a->mbs; i++) {
187     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
188     if (vary[i]==bs) iary[count++] = i;
189   }
190   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
191   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
192   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
193   PetscFree(vary);
194 
195   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr);
196   ISDestroy(is1);
197   ISDestroy(is2);
198   PetscFunctionReturn(0);
199 }
200 
201 #undef __FUNC__
202 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
203 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatReuse scall,Mat **B)
204 {
205   int ierr,i;
206 
207   PetscFunctionBegin;
208   if (scall == MAT_INITIAL_MATRIX) {
209     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
210   }
211 
212   for ( i=0; i<n; i++ ) {
213     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
214   }
215   PetscFunctionReturn(0);
216 }
217 
218 
219 /* -------------------------------------------------------*/
220 /* Should check that shapes of vectors and matrices match */
221 /* -------------------------------------------------------*/
222 #include "pinclude/blaslapack.h"
223 
224 #undef __FUNC__
225 #define __FUNC__ "MatMult_SeqBAIJ_1"
226 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
227 {
228   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
229   Scalar          *x,*z,sum;
230   MatScalar       *v;
231   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
232 
233   PetscFunctionBegin;
234   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
235   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
236 
237   idx   = a->j;
238   v     = a->a;
239   ii    = a->i;
240 
241   for ( i=0; i<mbs; i++ ) {
242     n    = ii[1] - ii[0]; ii++;
243     sum  = 0.0;
244     while (n--) sum += *v++ * x[*idx++];
245     z[i] = sum;
246   }
247   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
248   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
249   PLogFlops(2*a->nz - a->m);
250   PetscFunctionReturn(0);
251 }
252 
253 #undef __FUNC__
254 #define __FUNC__ "MatMult_SeqBAIJ_2"
255 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
256 {
257   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
258   Scalar          *x,*z,*xb,sum1,sum2;
259   Scalar          x1,x2;
260   MatScalar       *v;
261   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
262 
263   PetscFunctionBegin;
264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
265   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
266 
267   idx   = a->j;
268   v     = a->a;
269   ii    = a->i;
270 
271   for ( i=0; i<mbs; i++ ) {
272     n  = ii[1] - ii[0]; ii++;
273     sum1 = 0.0; sum2 = 0.0;
274     for ( j=0; j<n; j++ ) {
275       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
276       sum1 += v[0]*x1 + v[2]*x2;
277       sum2 += v[1]*x1 + v[3]*x2;
278       v += 4;
279     }
280     z[0] = sum1; z[1] = sum2;
281     z += 2;
282   }
283   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
284   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
285   PLogFlops(4*a->nz - a->m);
286   PetscFunctionReturn(0);
287 }
288 
289 #undef __FUNC__
290 #define __FUNC__ "MatMult_SeqBAIJ_3"
291 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
292 {
293   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data;
294   Scalar       *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
295   MatScalar    *v;
296   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;
297 
298 #if defined(HAVE_PRAGMA_DISJOINT)
299 #pragma disjoint(*v,*z,*xb)
300 #endif
301 
302   PetscFunctionBegin;
303   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
304   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
305 
306   idx   = a->j;
307   v     = a->a;
308   ii    = a->i;
309 
310   for ( i=0; i<mbs; i++ ) {
311     n  = ii[1] - ii[0]; ii++;
312     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
313     for ( j=0; j<n; j++ ) {
314       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
315       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
316       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
317       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
318       v += 9;
319     }
320     z[0] = sum1; z[1] = sum2; z[2] = sum3;
321     z += 3;
322   }
323   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
324   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
325   PLogFlops(18*a->nz - a->m);
326   PetscFunctionReturn(0);
327 }
328 
329 #undef __FUNC__
330 #define __FUNC__ "MatMult_SeqBAIJ_4"
331 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
332 {
333   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
334   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
335   MatScalar       *v;
336   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
337 
338   PetscFunctionBegin;
339   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
340   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
341 
342   idx   = a->j;
343   v     = a->a;
344   ii    = a->i;
345 
346   for ( i=0; i<mbs; i++ ) {
347     n  = ii[1] - ii[0]; ii++;
348     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
349     for ( j=0; j<n; j++ ) {
350       xb = x + 4*(*idx++);
351       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
352       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
353       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
354       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
355       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
356       v += 16;
357     }
358     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
359     z += 4;
360   }
361   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
362   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
363   PLogFlops(32*a->nz - a->m);
364   PetscFunctionReturn(0);
365 }
366 
367 #undef __FUNC__
368 #define __FUNC__ "MatMult_SeqBAIJ_5"
369 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
370 {
371   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
372   Scalar          sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
373   MatScalar       *v;
374   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
375 
376   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
377   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
378 
379   idx   = a->j;
380   v     = a->a;
381   ii    = a->i;
382 
383   for ( i=0; i<mbs; i++ ) {
384     n  = ii[1] - ii[0]; ii++;
385     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
386     for ( j=0; j<n; j++ ) {
387       xb = x + 5*(*idx++);
388       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
389       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
390       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
391       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
392       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
393       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
394       v += 25;
395     }
396     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
397     z += 5;
398   }
399   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
400   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
401   PLogFlops(50*a->nz - a->m);
402   PetscFunctionReturn(0);
403 }
404 
405 #undef __FUNC__
406 #define __FUNC__ "MatMult_SeqBAIJ_7"
407 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
408 {
409   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
410   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
411   Scalar          x1,x2,x3,x4,x5,x6,x7;
412   MatScalar       *v;
413   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
414 
415   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
416   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
417 
418   idx   = a->j;
419   v     = a->a;
420   ii    = a->i;
421 
422   for ( i=0; i<mbs; i++ ) {
423     n  = ii[1] - ii[0]; ii++;
424     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
425     for ( j=0; j<n; j++ ) {
426       xb = x + 7*(*idx++);
427       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
428       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
429       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
430       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
431       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
432       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
433       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
434       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
435       v += 49;
436     }
437     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
438     z += 7;
439   }
440 
441   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
442   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
443   PLogFlops(98*a->nz - a->m);
444   PetscFunctionReturn(0);
445 }
446 
447 /*
448     This will not work with MatScalar == float because it calls the BLAS
449 */
450 #undef __FUNC__
451 #define __FUNC__ "MatMult_SeqBAIJ_N"
452 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
453 {
454   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
455   Scalar          *x,*z,*xb, *work,*workt;
456   MatScalar       *v;
457   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
458   int             ncols,k;
459 
460   PetscFunctionBegin;
461   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
462   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
463 
464   idx   = a->j;
465   v     = a->a;
466   ii    = a->i;
467 
468 
469   if (!a->mult_work) {
470     k = PetscMax(a->m,a->n);
471     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
472   }
473   work = a->mult_work;
474   for ( i=0; i<mbs; i++ ) {
475     n     = ii[1] - ii[0]; ii++;
476     ncols = n*bs;
477     workt = work;
478     for ( j=0; j<n; j++ ) {
479       xb = x + bs*(*idx++);
480       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
481       workt += bs;
482     }
483     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
484     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
485     v += n*bs2;
486     z += bs;
487   }
488   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
489   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
490   PLogFlops(2*a->nz*bs2 - a->m);
491   PetscFunctionReturn(0);
492 }
493 
494 #undef __FUNC__
495 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
496 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
497 {
498   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
499   Scalar          *x,*y,*z,sum;
500   MatScalar       *v;
501   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
502 
503   PetscFunctionBegin;
504   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
505   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
506   if (zz != yy) {
507     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
508   } else {
509     z = y;
510   }
511 
512   idx   = a->j;
513   v     = a->a;
514   ii    = a->i;
515 
516   for ( i=0; i<mbs; i++ ) {
517     n    = ii[1] - ii[0]; ii++;
518     sum  = y[i];
519     while (n--) sum += *v++ * x[*idx++];
520     z[i] = sum;
521   }
522   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
524   if (zz != yy) {
525     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
526   }
527   PLogFlops(2*a->nz);
528   PetscFunctionReturn(0);
529 }
530 
531 #undef __FUNC__
532 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
533 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
534 {
535   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
536   Scalar          *x,*y,*z,*xb,sum1,sum2;
537   Scalar          x1,x2;
538   MatScalar       *v;
539   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
540 
541   PetscFunctionBegin;
542   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
543   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
544   if (zz != yy) {
545     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
546   } else {
547     z = y;
548   }
549 
550   idx   = a->j;
551   v     = a->a;
552   ii    = a->i;
553 
554   for ( i=0; i<mbs; i++ ) {
555     n  = ii[1] - ii[0]; ii++;
556     sum1 = y[0]; sum2 = y[1];
557     for ( j=0; j<n; j++ ) {
558       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
559       sum1 += v[0]*x1 + v[2]*x2;
560       sum2 += v[1]*x1 + v[3]*x2;
561       v += 4;
562     }
563     z[0] = sum1; z[1] = sum2;
564     z += 2; y += 2;
565   }
566   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
567   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
568   if (zz != yy) {
569     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
570   }
571   PLogFlops(4*a->nz);
572   PetscFunctionReturn(0);
573 }
574 
575 #undef __FUNC__
576 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
577 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
578 {
579   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
580   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
581   MatScalar       *v;
582   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
583 
584   PetscFunctionBegin;
585   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
586   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
587   if (zz != yy) {
588     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
589   } else {
590     z = y;
591   }
592 
593   idx   = a->j;
594   v     = a->a;
595   ii    = a->i;
596 
597   for ( i=0; i<mbs; i++ ) {
598     n  = ii[1] - ii[0]; ii++;
599     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
600     for ( j=0; j<n; j++ ) {
601       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
602       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
603       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
604       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
605       v += 9;
606     }
607     z[0] = sum1; z[1] = sum2; z[2] = sum3;
608     z += 3; y += 3;
609   }
610   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
611   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
612   if (zz != yy) {
613     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
614   }
615   PLogFlops(18*a->nz);
616   PetscFunctionReturn(0);
617 }
618 
619 #undef __FUNC__
620 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
621 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
622 {
623   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
624   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
625   MatScalar       *v;
626   int             ierr,mbs=a->mbs,i,*idx,*ii;
627   int             j,n;
628 
629   PetscFunctionBegin;
630   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
631   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
632   if (zz != yy) {
633     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
634   } else {
635     z = y;
636   }
637 
638   idx   = a->j;
639   v     = a->a;
640   ii    = a->i;
641 
642   for ( i=0; i<mbs; i++ ) {
643     n  = ii[1] - ii[0]; ii++;
644     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
645     for ( j=0; j<n; j++ ) {
646       xb = x + 4*(*idx++);
647       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
648       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
649       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
650       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
651       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
652       v += 16;
653     }
654     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
655     z += 4; y += 4;
656   }
657   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
658   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
659   if (zz != yy) {
660     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
661   }
662   PLogFlops(32*a->nz);
663   PetscFunctionReturn(0);
664 }
665 
666 #undef __FUNC__
667 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
668 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
669 {
670   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
671   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
672   MatScalar       *v;
673   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
674 
675   PetscFunctionBegin;
676   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
677   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
678   if (zz != yy) {
679     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
680   } else {
681     z = y;
682   }
683 
684   idx   = a->j;
685   v     = a->a;
686   ii    = a->i;
687 
688   for ( i=0; i<mbs; i++ ) {
689     n  = ii[1] - ii[0]; ii++;
690     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
691     for ( j=0; j<n; j++ ) {
692       xb = x + 5*(*idx++);
693       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
694       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
695       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
696       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
697       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
698       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
699       v += 25;
700     }
701     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
702     z += 5; y += 5;
703   }
704   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
705   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
706   if (zz != yy) {
707     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
708   }
709   PLogFlops(50*a->nz);
710   PetscFunctionReturn(0);
711 }
712 
713 #undef __FUNC__
714 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
715 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
716 {
717   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
718   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
719   Scalar          x1,x2,x3,x4,x5,x6,x7;
720   MatScalar       *v;
721   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
722 
723   PetscFunctionBegin;
724   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
725   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
726   if (zz != yy) {
727     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
728   } else {
729     z = y;
730   }
731 
732   idx   = a->j;
733   v     = a->a;
734   ii    = a->i;
735 
736   for ( i=0; i<mbs; i++ ) {
737     n  = ii[1] - ii[0]; ii++;
738     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
739     for ( j=0; j<n; j++ ) {
740       xb = x + 7*(*idx++);
741       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
742       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
743       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
744       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
745       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
746       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
747       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
748       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
749       v += 49;
750     }
751     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
752     z += 7; y += 7;
753   }
754   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
755   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
756   if (zz != yy) {
757     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
758   }
759   PLogFlops(98*a->nz);
760   PetscFunctionReturn(0);
761 }
762 
763 #undef __FUNC__
764 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
765 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
766 {
767   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
768   Scalar         *x,*z,*xb,*work,*workt;
769   MatScalar      *v;
770   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
771   int            ncols,k;
772 
773   PetscFunctionBegin;
774   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
775 
776   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
777   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
778 
779   idx   = a->j;
780   v     = a->a;
781   ii    = a->i;
782 
783 
784   if (!a->mult_work) {
785     k = PetscMax(a->m,a->n);
786     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
787   }
788   work = a->mult_work;
789   for ( i=0; i<mbs; i++ ) {
790     n     = ii[1] - ii[0]; ii++;
791     ncols = n*bs;
792     workt = work;
793     for ( j=0; j<n; j++ ) {
794       xb = x + bs*(*idx++);
795       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
796       workt += bs;
797     }
798     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
799     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
800     v += n*bs2;
801     z += bs;
802   }
803   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
804   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
805   PLogFlops(2*a->nz*bs2 );
806   PetscFunctionReturn(0);
807 }
808 
809 #undef __FUNC__
810 #define __FUNC__ "MatMultTrans_SeqBAIJ"
811 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
812 {
813   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
814   Scalar          *xg,*zg,*zb;
815   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
816   MatScalar       *v;
817   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
818   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
819 
820 
821   PetscFunctionBegin;
822   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
823   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
824   PetscMemzero(z,N*sizeof(Scalar));
825 
826   idx   = a->j;
827   v     = a->a;
828   ii    = a->i;
829 
830   switch (bs) {
831   case 1:
832     for ( i=0; i<mbs; i++ ) {
833       n  = ii[1] - ii[0]; ii++;
834       xb = x + i; x1 = xb[0];
835       ib = idx + ai[i];
836       for ( j=0; j<n; j++ ) {
837         rval    = ib[j];
838         z[rval] += *v++ * x1;
839       }
840     }
841     break;
842   case 2:
843     for ( i=0; i<mbs; i++ ) {
844       n  = ii[1] - ii[0]; ii++;
845       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
846       ib = idx + ai[i];
847       for ( j=0; j<n; j++ ) {
848         rval      = ib[j]*2;
849         z[rval++] += v[0]*x1 + v[1]*x2;
850         z[rval++] += v[2]*x1 + v[3]*x2;
851         v += 4;
852       }
853     }
854     break;
855   case 3:
856     for ( i=0; i<mbs; i++ ) {
857       n  = ii[1] - ii[0]; ii++;
858       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
859       ib = idx + ai[i];
860       for ( j=0; j<n; j++ ) {
861         rval      = ib[j]*3;
862         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
863         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
864         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
865         v += 9;
866       }
867     }
868     break;
869   case 4:
870     for ( i=0; i<mbs; i++ ) {
871       n  = ii[1] - ii[0]; ii++;
872       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
873       ib = idx + ai[i];
874       for ( j=0; j<n; j++ ) {
875         rval      = ib[j]*4;
876         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
877         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
878         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
879         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
880         v += 16;
881       }
882     }
883     break;
884   case 5:
885     for ( i=0; i<mbs; i++ ) {
886       n  = ii[1] - ii[0]; ii++;
887       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
888       x4 = xb[3];   x5 = xb[4];
889       ib = idx + ai[i];
890       for ( j=0; j<n; j++ ) {
891         rval      = ib[j]*5;
892         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
893         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
894         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
895         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
896         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
897         v += 25;
898       }
899     }
900     break;
901       /* block sizes larger then 5 by 5 are handled by BLAS */
902     default: {
903       int       ncols,k;
904       Scalar    *work,*workt;
905 
906       if (!a->mult_work) {
907         k = PetscMax(a->m,a->n);
908         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
909       }
910       work = a->mult_work;
911       for ( i=0; i<mbs; i++ ) {
912         n     = ii[1] - ii[0]; ii++;
913         ncols = n*bs;
914         PetscMemzero(work,ncols*sizeof(Scalar));
915         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
916         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
917         v += n*bs2;
918         x += bs;
919         workt = work;
920         for ( j=0; j<n; j++ ) {
921           zb = z + bs*(*idx++);
922           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
923           workt += bs;
924         }
925       }
926     }
927   }
928   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
929   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
930   PLogFlops(2*a->nz*a->bs2 - a->n);
931   PetscFunctionReturn(0);
932 }
933 
934 #undef __FUNC__
935 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
936 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
937 
938 {
939   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
940   Scalar          *xg,*zg,*zb;
941   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
942   MatScalar       *v;
943   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
944   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
945 
946   PetscFunctionBegin;
947   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
948   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
949 
950   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
951   else PetscMemzero(z,N*sizeof(Scalar));
952 
953   idx   = a->j;
954   v     = a->a;
955   ii    = a->i;
956 
957   switch (bs) {
958   case 1:
959     for ( i=0; i<mbs; i++ ) {
960       n  = ii[1] - ii[0]; ii++;
961       xb = x + i; x1 = xb[0];
962       ib = idx + ai[i];
963       for ( j=0; j<n; j++ ) {
964         rval    = ib[j];
965         z[rval] += *v++ * x1;
966       }
967     }
968     break;
969   case 2:
970     for ( i=0; i<mbs; i++ ) {
971       n  = ii[1] - ii[0]; ii++;
972       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
973       ib = idx + ai[i];
974       for ( j=0; j<n; j++ ) {
975         rval      = ib[j]*2;
976         z[rval++] += v[0]*x1 + v[1]*x2;
977         z[rval++] += v[2]*x1 + v[3]*x2;
978         v += 4;
979       }
980     }
981     break;
982   case 3:
983     for ( i=0; i<mbs; i++ ) {
984       n  = ii[1] - ii[0]; ii++;
985       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
986       ib = idx + ai[i];
987       for ( j=0; j<n; j++ ) {
988         rval      = ib[j]*3;
989         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
990         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
991         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
992         v += 9;
993       }
994     }
995     break;
996   case 4:
997     for ( i=0; i<mbs; i++ ) {
998       n  = ii[1] - ii[0]; ii++;
999       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1000       ib = idx + ai[i];
1001       for ( j=0; j<n; j++ ) {
1002         rval      = ib[j]*4;
1003         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1004         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1005         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1006         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1007         v += 16;
1008       }
1009     }
1010     break;
1011   case 5:
1012     for ( i=0; i<mbs; i++ ) {
1013       n  = ii[1] - ii[0]; ii++;
1014       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1015       x4 = xb[3];   x5 = xb[4];
1016       ib = idx + ai[i];
1017       for ( j=0; j<n; j++ ) {
1018         rval      = ib[j]*5;
1019         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1020         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1021         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1022         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1023         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1024         v += 25;
1025       }
1026     }
1027     break;
1028       /* block sizes larger then 5 by 5 are handled by BLAS */
1029     default: {
1030       int       ncols,k;
1031       Scalar    *work,*workt;
1032 
1033       if (!a->mult_work) {
1034         k = PetscMax(a->m,a->n);
1035         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1036       }
1037       work = a->mult_work;
1038       for ( i=0; i<mbs; i++ ) {
1039         n     = ii[1] - ii[0]; ii++;
1040         ncols = n*bs;
1041         PetscMemzero(work,ncols*sizeof(Scalar));
1042         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1043         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1044         v += n*bs2;
1045         x += bs;
1046         workt = work;
1047         for ( j=0; j<n; j++ ) {
1048           zb = z + bs*(*idx++);
1049           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1050           workt += bs;
1051         }
1052       }
1053     }
1054   }
1055   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1056   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1057   PLogFlops(2*a->nz*a->bs2);
1058   PetscFunctionReturn(0);
1059 }
1060 
1061 #undef __FUNC__
1062 #define __FUNC__ "MatScale_SeqBAIJ"
1063 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1064 {
1065   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1066   int         one = 1, totalnz = a->bs2*a->nz;
1067 
1068   PetscFunctionBegin;
1069   BLscal_( &totalnz, alpha, a->a, &one );
1070   PLogFlops(totalnz);
1071   PetscFunctionReturn(0);
1072 }
1073 
1074 #undef __FUNC__
1075 #define __FUNC__ "MatNorm_SeqBAIJ"
1076 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1077 {
1078   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1079   MatScalar   *v = a->a;
1080   double      sum = 0.0;
1081   int         i,j,k,bs = a->bs, nz=a->nz,bs2=a->bs2;
1082 
1083   PetscFunctionBegin;
1084   if (type == NORM_FROBENIUS) {
1085     for (i=0; i< bs2*nz; i++ ) {
1086 #if defined(USE_PETSC_COMPLEX)
1087       sum += PetscReal(PetscConj(*v)*(*v)); v++;
1088 #else
1089       sum += (*v)*(*v); v++;
1090 #endif
1091     }
1092     *norm = sqrt(sum);
1093   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1094     *norm = 0.0;
1095     for ( k=0; k<bs; k++ ) {
1096       for ( j=0; j<a->m; j++ ) {
1097         v = a->a + bs2*a->i[j] + k;
1098         sum = 0.0;
1099         for ( i=0; i<a->i[j+1]-a->i[j]; i++ ) {
1100           sum += PetscAbsScalar(*v);
1101           v   += bs;
1102         }
1103         if (sum > *norm) *norm = sum;
1104       }
1105     }
1106   } else {
1107     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1108   }
1109   PetscFunctionReturn(0);
1110 }
1111 
1112 
1113 #undef __FUNC__
1114 #define __FUNC__ "MatEqual_SeqBAIJ"
1115 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1116 {
1117   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1118 
1119   PetscFunctionBegin;
1120   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1121 
1122   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1123   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1124     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1125   }
1126 
1127   /* if the a->i are the same */
1128   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1129     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1130   }
1131 
1132   /* if a->j are the same */
1133   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1134     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1135   }
1136 
1137   /* if a->a are the same */
1138   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1139     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1140   }
1141   *flg = PETSC_TRUE;
1142   PetscFunctionReturn(0);
1143 
1144 }
1145 
1146 #undef __FUNC__
1147 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1148 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1149 {
1150   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1151   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1152   Scalar      *x, zero = 0.0;
1153   MatScalar   *aa,*aa_j;
1154 
1155   PetscFunctionBegin;
1156   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1157   bs   = a->bs;
1158   aa   = a->a;
1159   ai   = a->i;
1160   aj   = a->j;
1161   ambs = a->mbs;
1162   bs2  = a->bs2;
1163 
1164   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1165   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1166   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1167   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1168   for ( i=0; i<ambs; i++ ) {
1169     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1170       if (aj[j] == i) {
1171         row  = i*bs;
1172         aa_j = aa+j*bs2;
1173         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1174         break;
1175       }
1176     }
1177   }
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNC__
1182 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1183 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1184 {
1185   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1186   Scalar      *l,*r,x,*li,*ri;
1187   MatScalar   *aa,*v;
1188   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1189 
1190   PetscFunctionBegin;
1191   ai  = a->i;
1192   aj  = a->j;
1193   aa  = a->a;
1194   m   = a->m;
1195   n   = a->n;
1196   bs  = a->bs;
1197   mbs = a->mbs;
1198   bs2 = a->bs2;
1199   if (ll) {
1200     ierr = VecGetArray(ll,&l); CHKERRQ(ierr);
1201     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1202     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1203     for ( i=0; i<mbs; i++ ) { /* for each block row */
1204       M  = ai[i+1] - ai[i];
1205       li = l + i*bs;
1206       v  = aa + bs2*ai[i];
1207       for ( j=0; j<M; j++ ) { /* for each block */
1208         for ( k=0; k<bs2; k++ ) {
1209           (*v++) *= li[k%bs];
1210         }
1211       }
1212     }
1213   }
1214 
1215   if (rr) {
1216     ierr = VecGetArray(rr,&r); CHKERRQ(ierr);
1217     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1218     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1219     for ( i=0; i<mbs; i++ ) { /* for each block row */
1220       M  = ai[i+1] - ai[i];
1221       v  = aa + bs2*ai[i];
1222       for ( j=0; j<M; j++ ) { /* for each block */
1223         ri = r + bs*aj[ai[i]+j];
1224         for ( k=0; k<bs; k++ ) {
1225           x = ri[k];
1226           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1227         }
1228       }
1229     }
1230   }
1231   PetscFunctionReturn(0);
1232 }
1233 
1234 
1235 #undef __FUNC__
1236 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1237 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1238 {
1239   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1240 
1241   PetscFunctionBegin;
1242   info->rows_global    = (double)a->m;
1243   info->columns_global = (double)a->n;
1244   info->rows_local     = (double)a->m;
1245   info->columns_local  = (double)a->n;
1246   info->block_size     = a->bs2;
1247   info->nz_allocated   = a->maxnz;
1248   info->nz_used        = a->bs2*a->nz;
1249   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1250   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1251     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1252   info->assemblies   = A->num_ass;
1253   info->mallocs      = a->reallocs;
1254   info->memory       = A->mem;
1255   if (A->factor) {
1256     info->fill_ratio_given  = A->info.fill_ratio_given;
1257     info->fill_ratio_needed = A->info.fill_ratio_needed;
1258     info->factor_mallocs    = A->info.factor_mallocs;
1259   } else {
1260     info->fill_ratio_given  = 0;
1261     info->fill_ratio_needed = 0;
1262     info->factor_mallocs    = 0;
1263   }
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 
1268 #undef __FUNC__
1269 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1270 int MatZeroEntries_SeqBAIJ(Mat A)
1271 {
1272   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1273 
1274   PetscFunctionBegin;
1275   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1276   PetscFunctionReturn(0);
1277 }
1278