xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 3f6c72634bd4059617f4e6838b577bb0b316aba4)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.40 1999/03/17 23:23:13 bsmith Exp balay $";
3 #endif
4 
5 #include "sys.h"
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 #include "src/inline/ilu.h"
10 #include "bitarray.h"
11 
12 #undef __FUNC__
13 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
14 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
15 {
16   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
17   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
18   int         start, end, *ai, *aj,bs,*nidx2;
19   BT          table;
20 
21   PetscFunctionBegin;
22   m     = a->mbs;
23   ai    = a->i;
24   aj    = a->j;
25   bs    = a->bs;
26 
27   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
28 
29   ierr  = BTCreate(m,table); CHKERRQ(ierr);
30   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
31   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
32 
33   for ( i=0; i<is_max; i++ ) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     BTMemzero(m,table);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
40     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for ( j=0; j<n ; ++j){
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
46       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
47     }
48     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
50 
51     k = 0;
52     for ( j=0; j<ov; j++){ /* for each overlap*/
53       n = isz;
54       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for ( l = start; l<end ; l++){
59           val = aj[l];
60           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++ ) {
66       for (k=0; k<bs; k++ )
67         nidx2[j*bs+k] = nidx[j]*bs+k;
68     }
69     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
70   }
71   BTDestroy(table);
72   PetscFree(nidx);
73   PetscFree(nidx2);
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNC__
78 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
79 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
80 {
81   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
82   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
83   int          row,mat_i,*mat_j,tcol,*mat_ilen;
84   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
85   int          *aj = a->j, *ai = a->i;
86   MatScalar    *mat_a;
87   Mat          C;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,(PetscTruth*)&i);
91   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
95   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
96   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
97 
98   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
99   ssmap = smap;
100   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
101   PetscMemzero(smap,oldcols*sizeof(int));
102   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for ( k=kstart; k<kend; k++ ) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
119     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
120       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
121     }
122     PetscMemzero(c->ilen,c->mbs*sizeof(int));
123     C = *B;
124   } else {
125     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
126   }
127   c = (Mat_SeqBAIJ *)(C->data);
128   for (i=0; i<nrows; i++) {
129     row    = irow[i];
130     kstart = ai[row];
131     kend   = kstart + a->ilen[row];
132     mat_i  = c->i[i];
133     mat_j  = c->j + mat_i;
134     mat_a  = c->a + mat_i*bs2;
135     mat_ilen = c->ilen + i;
136     for ( k=kstart; k<kend; k++ ) {
137       if ((tcol=ssmap[a->j[k]])) {
138         *mat_j++ = tcol - 1;
139         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar)); mat_a+=bs2;
140         (*mat_ilen)++;
141       }
142     }
143   }
144 
145   /* Free work space */
146   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
147   PetscFree(smap); PetscFree(lens);
148   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
149   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
150 
151   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
152   *B = C;
153   PetscFunctionReturn(0);
154 }
155 
156 #undef __FUNC__
157 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
158 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
159 {
160   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
161   IS          is1,is2;
162   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
163 
164   PetscFunctionBegin;
165   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
166   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
167   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
168   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
169 
170   /* Verify if the indices corespond to each element in a block
171    and form the IS with compressed IS */
172   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
173   iary = vary + a->mbs;
174   PetscMemzero(vary,(a->mbs)*sizeof(int));
175   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
176   count = 0;
177   for (i=0; i<a->mbs; i++) {
178     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
179     if (vary[i]==bs) iary[count++] = i;
180   }
181   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
182 
183   PetscMemzero(vary,(a->mbs)*sizeof(int));
184   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
185   count = 0;
186   for (i=0; i<a->mbs; i++) {
187     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
188     if (vary[i]==bs) iary[count++] = i;
189   }
190   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
191   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
192   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
193   PetscFree(vary);
194 
195   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr);
196   ISDestroy(is1);
197   ISDestroy(is2);
198   PetscFunctionReturn(0);
199 }
200 
201 #undef __FUNC__
202 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
203 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatReuse scall,Mat **B)
204 {
205   int ierr,i;
206 
207   PetscFunctionBegin;
208   if (scall == MAT_INITIAL_MATRIX) {
209     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
210   }
211 
212   for ( i=0; i<n; i++ ) {
213     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
214   }
215   PetscFunctionReturn(0);
216 }
217 
218 
219 /* -------------------------------------------------------*/
220 /* Should check that shapes of vectors and matrices match */
221 /* -------------------------------------------------------*/
222 #include "pinclude/blaslapack.h"
223 
224 #undef __FUNC__
225 #define __FUNC__ "MatMult_SeqBAIJ_1"
226 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
227 {
228   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
229   Scalar          *x,*z,sum;
230   MatScalar       *v;
231   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
232 
233   PetscFunctionBegin;
234   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
235   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
236 
237   idx   = a->j;
238   v     = a->a;
239   ii    = a->i;
240 
241   for ( i=0; i<mbs; i++ ) {
242     n    = ii[1] - ii[0]; ii++;
243     sum  = 0.0;
244     while (n--) sum += *v++ * x[*idx++];
245     z[i] = sum;
246   }
247   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
248   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
249   PLogFlops(2*a->nz - a->m);
250   PetscFunctionReturn(0);
251 }
252 
253 #undef __FUNC__
254 #define __FUNC__ "MatMult_SeqBAIJ_2"
255 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
256 {
257   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
258   Scalar          *x,*z,*xb,sum1,sum2;
259   Scalar          x1,x2;
260   MatScalar       *v;
261   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
262 
263   PetscFunctionBegin;
264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
265   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
266 
267   idx   = a->j;
268   v     = a->a;
269   ii    = a->i;
270 
271   for ( i=0; i<mbs; i++ ) {
272     n  = ii[1] - ii[0]; ii++;
273     sum1 = 0.0; sum2 = 0.0;
274     for ( j=0; j<n; j++ ) {
275       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
276       sum1 += v[0]*x1 + v[2]*x2;
277       sum2 += v[1]*x1 + v[3]*x2;
278       v += 4;
279     }
280     z[0] = sum1; z[1] = sum2;
281     z += 2;
282   }
283   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
284   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
285   PLogFlops(8*a->nz - a->m);
286   PetscFunctionReturn(0);
287 }
288 
289 #undef __FUNC__
290 #define __FUNC__ "MatMult_SeqBAIJ_3"
291 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
292 {
293   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data;
294   Scalar       *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
295   MatScalar    *v;
296   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;
297 
298 #if defined(HAVE_PRAGMA_DISJOINT)
299 #pragma disjoint(*v,*z,*xb)
300 #endif
301 
302   PetscFunctionBegin;
303   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
304   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
305 
306   idx   = a->j;
307   v     = a->a;
308   ii    = a->i;
309 
310   for ( i=0; i<mbs; i++ ) {
311     n  = ii[1] - ii[0]; ii++;
312     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
313     for ( j=0; j<n; j++ ) {
314       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
315       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
316       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
317       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
318       v += 9;
319     }
320     z[0] = sum1; z[1] = sum2; z[2] = sum3;
321     z += 3;
322   }
323   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
324   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
325   PLogFlops(18*a->nz - a->m);
326   PetscFunctionReturn(0);
327 }
328 
329 #undef __FUNC__
330 #define __FUNC__ "MatMult_SeqBAIJ_4"
331 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
332 {
333   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
334   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
335   MatScalar       *v;
336   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
337 
338   PetscFunctionBegin;
339   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
340   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
341 
342   idx   = a->j;
343   v     = a->a;
344   ii    = a->i;
345 
346   for ( i=0; i<mbs; i++ ) {
347     n  = ii[1] - ii[0]; ii++;
348     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
349     for ( j=0; j<n; j++ ) {
350       xb = x + 4*(*idx++);
351       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
352       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
353       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
354       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
355       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
356       v += 16;
357     }
358     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
359     z += 4;
360   }
361   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
362   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
363   PLogFlops(32*a->nz - a->m);
364   PetscFunctionReturn(0);
365 }
366 
367 #undef __FUNC__
368 #define __FUNC__ "MatMult_SeqBAIJ_5"
369 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
370 {
371   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
372   Scalar          sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
373   MatScalar       *v;
374   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
375 
376   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
377   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
378 
379   idx   = a->j;
380   v     = a->a;
381   ii    = a->i;
382 
383   for ( i=0; i<mbs; i++ ) {
384     n  = ii[1] - ii[0]; ii++;
385     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
386     for ( j=0; j<n; j++ ) {
387       xb = x + 5*(*idx++);
388       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
389       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
390       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
391       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
392       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
393       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
394       v += 25;
395     }
396     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
397     z += 5;
398   }
399   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
400   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
401   PLogFlops(50*a->nz - a->m);
402   PetscFunctionReturn(0);
403 }
404 
405 
406 #undef __FUNC__
407 #define __FUNC__ "MatMult_SeqBAIJ_6"
408 int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
409 {
410   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
411   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
412   Scalar          x1,x2,x3,x4,x5,x6;
413   MatScalar       *v;
414   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
415 
416   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
417   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
418 
419   idx   = a->j;
420   v     = a->a;
421   ii    = a->i;
422 
423   for ( i=0; i<mbs; i++ ) {
424     n  = ii[1] - ii[0]; ii++;
425     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
426     for ( j=0; j<n; j++ ) {
427       xb = x + 6*(*idx++);
428       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
429       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
430       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
431       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
432       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
433       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
434       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
435       v += 36;
436     }
437     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
438     z += 6;
439   }
440 
441   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
442   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
443   PLogFlops(72*a->nz - a->m);
444   PetscFunctionReturn(0);
445 }
446 #undef __FUNC__
447 #define __FUNC__ "MatMult_SeqBAIJ_7"
448 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
449 {
450   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
451   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
452   Scalar          x1,x2,x3,x4,x5,x6,x7;
453   MatScalar       *v;
454   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
455 
456   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
457   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
458 
459   idx   = a->j;
460   v     = a->a;
461   ii    = a->i;
462 
463   for ( i=0; i<mbs; i++ ) {
464     n  = ii[1] - ii[0]; ii++;
465     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
466     for ( j=0; j<n; j++ ) {
467       xb = x + 7*(*idx++);
468       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
469       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
470       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
471       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
472       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
473       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
474       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
475       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
476       v += 49;
477     }
478     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
479     z += 7;
480   }
481 
482   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
483   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
484   PLogFlops(98*a->nz - a->m);
485   PetscFunctionReturn(0);
486 }
487 
488 /*
489     This will not work with MatScalar == float because it calls the BLAS
490 */
491 #undef __FUNC__
492 #define __FUNC__ "MatMult_SeqBAIJ_N"
493 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
494 {
495   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
496   Scalar          *x,*z,*xb, *work,*workt;
497   MatScalar       *v;
498   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
499   int             ncols,k;
500 
501   PetscFunctionBegin;
502   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
503   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
504 
505   idx   = a->j;
506   v     = a->a;
507   ii    = a->i;
508 
509 
510   if (!a->mult_work) {
511     k = PetscMax(a->m,a->n);
512     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
513   }
514   work = a->mult_work;
515   for ( i=0; i<mbs; i++ ) {
516     n     = ii[1] - ii[0]; ii++;
517     ncols = n*bs;
518     workt = work;
519     for ( j=0; j<n; j++ ) {
520       xb = x + bs*(*idx++);
521       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
522       workt += bs;
523     }
524     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
525     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
526     v += n*bs2;
527     z += bs;
528   }
529   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
530   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
531   PLogFlops(2*a->nz*bs2 - a->m);
532   PetscFunctionReturn(0);
533 }
534 
535 #undef __FUNC__
536 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
537 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
538 {
539   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
540   Scalar          *x,*y,*z,sum;
541   MatScalar       *v;
542   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
543 
544   PetscFunctionBegin;
545   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
546   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
547   if (zz != yy) {
548     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
549   } else {
550     z = y;
551   }
552 
553   idx   = a->j;
554   v     = a->a;
555   ii    = a->i;
556 
557   for ( i=0; i<mbs; i++ ) {
558     n    = ii[1] - ii[0]; ii++;
559     sum  = y[i];
560     while (n--) sum += *v++ * x[*idx++];
561     z[i] = sum;
562   }
563   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
564   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
565   if (zz != yy) {
566     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
567   }
568   PLogFlops(2*a->nz);
569   PetscFunctionReturn(0);
570 }
571 
572 #undef __FUNC__
573 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
574 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
575 {
576   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
577   Scalar          *x,*y,*z,*xb,sum1,sum2;
578   Scalar          x1,x2;
579   MatScalar       *v;
580   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
581 
582   PetscFunctionBegin;
583   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
584   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
585   if (zz != yy) {
586     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
587   } else {
588     z = y;
589   }
590 
591   idx   = a->j;
592   v     = a->a;
593   ii    = a->i;
594 
595   for ( i=0; i<mbs; i++ ) {
596     n  = ii[1] - ii[0]; ii++;
597     sum1 = y[0]; sum2 = y[1];
598     for ( j=0; j<n; j++ ) {
599       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
600       sum1 += v[0]*x1 + v[2]*x2;
601       sum2 += v[1]*x1 + v[3]*x2;
602       v += 4;
603     }
604     z[0] = sum1; z[1] = sum2;
605     z += 2; y += 2;
606   }
607   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
609   if (zz != yy) {
610     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
611   }
612   PLogFlops(4*a->nz);
613   PetscFunctionReturn(0);
614 }
615 
616 #undef __FUNC__
617 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
618 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
619 {
620   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
621   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
622   MatScalar       *v;
623   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
624 
625   PetscFunctionBegin;
626   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
627   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
628   if (zz != yy) {
629     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
630   } else {
631     z = y;
632   }
633 
634   idx   = a->j;
635   v     = a->a;
636   ii    = a->i;
637 
638   for ( i=0; i<mbs; i++ ) {
639     n  = ii[1] - ii[0]; ii++;
640     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
641     for ( j=0; j<n; j++ ) {
642       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
643       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
644       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
645       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
646       v += 9;
647     }
648     z[0] = sum1; z[1] = sum2; z[2] = sum3;
649     z += 3; y += 3;
650   }
651   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
652   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
653   if (zz != yy) {
654     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
655   }
656   PLogFlops(18*a->nz);
657   PetscFunctionReturn(0);
658 }
659 
660 #undef __FUNC__
661 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
662 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
663 {
664   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
665   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
666   MatScalar       *v;
667   int             ierr,mbs=a->mbs,i,*idx,*ii;
668   int             j,n;
669 
670   PetscFunctionBegin;
671   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
672   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
673   if (zz != yy) {
674     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
675   } else {
676     z = y;
677   }
678 
679   idx   = a->j;
680   v     = a->a;
681   ii    = a->i;
682 
683   for ( i=0; i<mbs; i++ ) {
684     n  = ii[1] - ii[0]; ii++;
685     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
686     for ( j=0; j<n; j++ ) {
687       xb = x + 4*(*idx++);
688       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
689       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
690       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
691       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
692       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
693       v += 16;
694     }
695     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
696     z += 4; y += 4;
697   }
698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
699   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
700   if (zz != yy) {
701     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
702   }
703   PLogFlops(32*a->nz);
704   PetscFunctionReturn(0);
705 }
706 
707 #undef __FUNC__
708 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
709 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
710 {
711   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
712   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
713   MatScalar       *v;
714   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
715 
716   PetscFunctionBegin;
717   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
718   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
719   if (zz != yy) {
720     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
721   } else {
722     z = y;
723   }
724 
725   idx   = a->j;
726   v     = a->a;
727   ii    = a->i;
728 
729   for ( i=0; i<mbs; i++ ) {
730     n  = ii[1] - ii[0]; ii++;
731     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
732     for ( j=0; j<n; j++ ) {
733       xb = x + 5*(*idx++);
734       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
735       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
736       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
737       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
738       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
739       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
740       v += 25;
741     }
742     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
743     z += 5; y += 5;
744   }
745   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
746   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
747   if (zz != yy) {
748     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
749   }
750   PLogFlops(50*a->nz);
751   PetscFunctionReturn(0);
752 }
753 #undef __FUNC__
754 #define __FUNC__ "MatMultAdd_SeqBAIJ_6"
755 int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
756 {
757   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
758   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
759   Scalar          x1,x2,x3,x4,x5,x6;
760   MatScalar       *v;
761   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
762 
763   PetscFunctionBegin;
764   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
765   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
766   if (zz != yy) {
767     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
768   } else {
769     z = y;
770   }
771 
772   idx   = a->j;
773   v     = a->a;
774   ii    = a->i;
775 
776   for ( i=0; i<mbs; i++ ) {
777     n  = ii[1] - ii[0]; ii++;
778     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
779     for ( j=0; j<n; j++ ) {
780       xb = x + 6*(*idx++);
781       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
782       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
783       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
784       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
785       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
786       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
787       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
788       v += 36;
789     }
790     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
791     z += 6; y += 6;
792   }
793   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
794   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
795   if (zz != yy) {
796     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
797   }
798   PLogFlops(72*a->nz);
799   PetscFunctionReturn(0);
800 }
801 
802 #undef __FUNC__
803 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
804 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
805 {
806   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
807   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
808   Scalar          x1,x2,x3,x4,x5,x6,x7;
809   MatScalar       *v;
810   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
811 
812   PetscFunctionBegin;
813   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
815   if (zz != yy) {
816     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
817   } else {
818     z = y;
819   }
820 
821   idx   = a->j;
822   v     = a->a;
823   ii    = a->i;
824 
825   for ( i=0; i<mbs; i++ ) {
826     n  = ii[1] - ii[0]; ii++;
827     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
828     for ( j=0; j<n; j++ ) {
829       xb = x + 7*(*idx++);
830       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
831       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
832       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
833       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
834       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
835       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
836       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
837       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
838       v += 49;
839     }
840     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
841     z += 7; y += 7;
842   }
843   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
844   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
845   if (zz != yy) {
846     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
847   }
848   PLogFlops(98*a->nz);
849   PetscFunctionReturn(0);
850 }
851 
852 #undef __FUNC__
853 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
854 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
855 {
856   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
857   Scalar         *x,*z,*xb,*work,*workt;
858   MatScalar      *v;
859   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
860   int            ncols,k;
861 
862   PetscFunctionBegin;
863   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
864 
865   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
866   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
867 
868   idx   = a->j;
869   v     = a->a;
870   ii    = a->i;
871 
872 
873   if (!a->mult_work) {
874     k = PetscMax(a->m,a->n);
875     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
876   }
877   work = a->mult_work;
878   for ( i=0; i<mbs; i++ ) {
879     n     = ii[1] - ii[0]; ii++;
880     ncols = n*bs;
881     workt = work;
882     for ( j=0; j<n; j++ ) {
883       xb = x + bs*(*idx++);
884       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
885       workt += bs;
886     }
887     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
888     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
889     v += n*bs2;
890     z += bs;
891   }
892   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
894   PLogFlops(2*a->nz*bs2 );
895   PetscFunctionReturn(0);
896 }
897 
898 #undef __FUNC__
899 #define __FUNC__ "MatMultTrans_SeqBAIJ"
900 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
901 {
902   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
903   Scalar          *xg,*zg,*zb;
904   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
905   MatScalar       *v;
906   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
907   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
908 
909 
910   PetscFunctionBegin;
911   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
912   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
913   PetscMemzero(z,N*sizeof(Scalar));
914 
915   idx   = a->j;
916   v     = a->a;
917   ii    = a->i;
918 
919   switch (bs) {
920   case 1:
921     for ( i=0; i<mbs; i++ ) {
922       n  = ii[1] - ii[0]; ii++;
923       xb = x + i; x1 = xb[0];
924       ib = idx + ai[i];
925       for ( j=0; j<n; j++ ) {
926         rval    = ib[j];
927         z[rval] += *v++ * x1;
928       }
929     }
930     break;
931   case 2:
932     for ( i=0; i<mbs; i++ ) {
933       n  = ii[1] - ii[0]; ii++;
934       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
935       ib = idx + ai[i];
936       for ( j=0; j<n; j++ ) {
937         rval      = ib[j]*2;
938         z[rval++] += v[0]*x1 + v[1]*x2;
939         z[rval++] += v[2]*x1 + v[3]*x2;
940         v += 4;
941       }
942     }
943     break;
944   case 3:
945     for ( i=0; i<mbs; i++ ) {
946       n  = ii[1] - ii[0]; ii++;
947       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
948       ib = idx + ai[i];
949       for ( j=0; j<n; j++ ) {
950         rval      = ib[j]*3;
951         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
952         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
953         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
954         v += 9;
955       }
956     }
957     break;
958   case 4:
959     for ( i=0; i<mbs; i++ ) {
960       n  = ii[1] - ii[0]; ii++;
961       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
962       ib = idx + ai[i];
963       for ( j=0; j<n; j++ ) {
964         rval      = ib[j]*4;
965         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
966         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
967         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
968         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
969         v += 16;
970       }
971     }
972     break;
973   case 5:
974     for ( i=0; i<mbs; i++ ) {
975       n  = ii[1] - ii[0]; ii++;
976       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
977       x4 = xb[3];   x5 = xb[4];
978       ib = idx + ai[i];
979       for ( j=0; j<n; j++ ) {
980         rval      = ib[j]*5;
981         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
982         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
983         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
984         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
985         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
986         v += 25;
987       }
988     }
989     break;
990       /* block sizes larger then 5 by 5 are handled by BLAS */
991     default: {
992       int       ncols,k;
993       Scalar    *work,*workt;
994 
995       if (!a->mult_work) {
996         k = PetscMax(a->m,a->n);
997         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
998       }
999       work = a->mult_work;
1000       for ( i=0; i<mbs; i++ ) {
1001         n     = ii[1] - ii[0]; ii++;
1002         ncols = n*bs;
1003         PetscMemzero(work,ncols*sizeof(Scalar));
1004         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1005         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1006         v += n*bs2;
1007         x += bs;
1008         workt = work;
1009         for ( j=0; j<n; j++ ) {
1010           zb = z + bs*(*idx++);
1011           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1012           workt += bs;
1013         }
1014       }
1015     }
1016   }
1017   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1018   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1019   PLogFlops(2*a->nz*a->bs2 - a->n);
1020   PetscFunctionReturn(0);
1021 }
1022 
1023 #undef __FUNC__
1024 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1025 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1026 
1027 {
1028   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1029   Scalar          *xg,*zg,*zb;
1030   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
1031   MatScalar       *v;
1032   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1033   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1034 
1035   PetscFunctionBegin;
1036   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1037   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1038 
1039   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1040   else PetscMemzero(z,N*sizeof(Scalar));
1041 
1042   idx   = a->j;
1043   v     = a->a;
1044   ii    = a->i;
1045 
1046   switch (bs) {
1047   case 1:
1048     for ( i=0; i<mbs; i++ ) {
1049       n  = ii[1] - ii[0]; ii++;
1050       xb = x + i; x1 = xb[0];
1051       ib = idx + ai[i];
1052       for ( j=0; j<n; j++ ) {
1053         rval    = ib[j];
1054         z[rval] += *v++ * x1;
1055       }
1056     }
1057     break;
1058   case 2:
1059     for ( i=0; i<mbs; i++ ) {
1060       n  = ii[1] - ii[0]; ii++;
1061       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1062       ib = idx + ai[i];
1063       for ( j=0; j<n; j++ ) {
1064         rval      = ib[j]*2;
1065         z[rval++] += v[0]*x1 + v[1]*x2;
1066         z[rval++] += v[2]*x1 + v[3]*x2;
1067         v += 4;
1068       }
1069     }
1070     break;
1071   case 3:
1072     for ( i=0; i<mbs; i++ ) {
1073       n  = ii[1] - ii[0]; ii++;
1074       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1075       ib = idx + ai[i];
1076       for ( j=0; j<n; j++ ) {
1077         rval      = ib[j]*3;
1078         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1079         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1080         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1081         v += 9;
1082       }
1083     }
1084     break;
1085   case 4:
1086     for ( i=0; i<mbs; i++ ) {
1087       n  = ii[1] - ii[0]; ii++;
1088       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1089       ib = idx + ai[i];
1090       for ( j=0; j<n; j++ ) {
1091         rval      = ib[j]*4;
1092         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1093         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1094         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1095         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1096         v += 16;
1097       }
1098     }
1099     break;
1100   case 5:
1101     for ( i=0; i<mbs; i++ ) {
1102       n  = ii[1] - ii[0]; ii++;
1103       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1104       x4 = xb[3];   x5 = xb[4];
1105       ib = idx + ai[i];
1106       for ( j=0; j<n; j++ ) {
1107         rval      = ib[j]*5;
1108         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1109         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1110         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1111         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1112         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1113         v += 25;
1114       }
1115     }
1116     break;
1117       /* block sizes larger then 5 by 5 are handled by BLAS */
1118     default: {
1119       int       ncols,k;
1120       Scalar    *work,*workt;
1121 
1122       if (!a->mult_work) {
1123         k = PetscMax(a->m,a->n);
1124         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1125       }
1126       work = a->mult_work;
1127       for ( i=0; i<mbs; i++ ) {
1128         n     = ii[1] - ii[0]; ii++;
1129         ncols = n*bs;
1130         PetscMemzero(work,ncols*sizeof(Scalar));
1131         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1132         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1133         v += n*bs2;
1134         x += bs;
1135         workt = work;
1136         for ( j=0; j<n; j++ ) {
1137           zb = z + bs*(*idx++);
1138           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1139           workt += bs;
1140         }
1141       }
1142     }
1143   }
1144   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1145   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1146   PLogFlops(2*a->nz*a->bs2);
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 #undef __FUNC__
1151 #define __FUNC__ "MatScale_SeqBAIJ"
1152 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1153 {
1154   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1155   int         one = 1, totalnz = a->bs2*a->nz;
1156 
1157   PetscFunctionBegin;
1158   BLscal_( &totalnz, alpha, a->a, &one );
1159   PLogFlops(totalnz);
1160   PetscFunctionReturn(0);
1161 }
1162 
1163 #undef __FUNC__
1164 #define __FUNC__ "MatNorm_SeqBAIJ"
1165 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1166 {
1167   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1168   MatScalar   *v = a->a;
1169   double      sum = 0.0;
1170   int         i,j,k,bs = a->bs, nz=a->nz,bs2=a->bs2;
1171 
1172   PetscFunctionBegin;
1173   if (type == NORM_FROBENIUS) {
1174     for (i=0; i< bs2*nz; i++ ) {
1175 #if defined(USE_PETSC_COMPLEX)
1176       sum += PetscReal(PetscConj(*v)*(*v)); v++;
1177 #else
1178       sum += (*v)*(*v); v++;
1179 #endif
1180     }
1181     *norm = sqrt(sum);
1182   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1183     *norm = 0.0;
1184     for ( k=0; k<bs; k++ ) {
1185       for ( j=0; j<a->m; j++ ) {
1186         v = a->a + bs2*a->i[j] + k;
1187         sum = 0.0;
1188         for ( i=0; i<a->i[j+1]-a->i[j]; i++ ) {
1189           sum += PetscAbsScalar(*v);
1190           v   += bs;
1191         }
1192         if (sum > *norm) *norm = sum;
1193       }
1194     }
1195   } else {
1196     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1197   }
1198   PetscFunctionReturn(0);
1199 }
1200 
1201 
1202 #undef __FUNC__
1203 #define __FUNC__ "MatEqual_SeqBAIJ"
1204 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1205 {
1206   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1207 
1208   PetscFunctionBegin;
1209   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1210 
1211   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1212   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1213     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1214   }
1215 
1216   /* if the a->i are the same */
1217   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1218     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1219   }
1220 
1221   /* if a->j are the same */
1222   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1223     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1224   }
1225 
1226   /* if a->a are the same */
1227   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1228     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1229   }
1230   *flg = PETSC_TRUE;
1231   PetscFunctionReturn(0);
1232 
1233 }
1234 
1235 #undef __FUNC__
1236 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1237 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1238 {
1239   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1240   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1241   Scalar      *x, zero = 0.0;
1242   MatScalar   *aa,*aa_j;
1243 
1244   PetscFunctionBegin;
1245   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1246   bs   = a->bs;
1247   aa   = a->a;
1248   ai   = a->i;
1249   aj   = a->j;
1250   ambs = a->mbs;
1251   bs2  = a->bs2;
1252 
1253   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1254   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1255   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1256   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1257   for ( i=0; i<ambs; i++ ) {
1258     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1259       if (aj[j] == i) {
1260         row  = i*bs;
1261         aa_j = aa+j*bs2;
1262         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1263         break;
1264       }
1265     }
1266   }
1267   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1268   PetscFunctionReturn(0);
1269 }
1270 
1271 #undef __FUNC__
1272 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1273 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1274 {
1275   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1276   Scalar      *l,*r,x,*li,*ri;
1277   MatScalar   *aa,*v;
1278   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1279 
1280   PetscFunctionBegin;
1281   ai  = a->i;
1282   aj  = a->j;
1283   aa  = a->a;
1284   m   = a->m;
1285   n   = a->n;
1286   bs  = a->bs;
1287   mbs = a->mbs;
1288   bs2 = a->bs2;
1289   if (ll) {
1290     ierr = VecGetArray(ll,&l); CHKERRQ(ierr);
1291     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1292     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1293     for ( i=0; i<mbs; i++ ) { /* for each block row */
1294       M  = ai[i+1] - ai[i];
1295       li = l + i*bs;
1296       v  = aa + bs2*ai[i];
1297       for ( j=0; j<M; j++ ) { /* for each block */
1298         for ( k=0; k<bs2; k++ ) {
1299           (*v++) *= li[k%bs];
1300         }
1301       }
1302     }
1303   }
1304 
1305   if (rr) {
1306     ierr = VecGetArray(rr,&r); CHKERRQ(ierr);
1307     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1308     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1309     for ( i=0; i<mbs; i++ ) { /* for each block row */
1310       M  = ai[i+1] - ai[i];
1311       v  = aa + bs2*ai[i];
1312       for ( j=0; j<M; j++ ) { /* for each block */
1313         ri = r + bs*aj[ai[i]+j];
1314         for ( k=0; k<bs; k++ ) {
1315           x = ri[k];
1316           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1317         }
1318       }
1319     }
1320   }
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 
1325 #undef __FUNC__
1326 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1327 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1328 {
1329   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1330 
1331   PetscFunctionBegin;
1332   info->rows_global    = (double)a->m;
1333   info->columns_global = (double)a->n;
1334   info->rows_local     = (double)a->m;
1335   info->columns_local  = (double)a->n;
1336   info->block_size     = a->bs2;
1337   info->nz_allocated   = a->maxnz;
1338   info->nz_used        = a->bs2*a->nz;
1339   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1340   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1341     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1342   info->assemblies   = A->num_ass;
1343   info->mallocs      = a->reallocs;
1344   info->memory       = A->mem;
1345   if (A->factor) {
1346     info->fill_ratio_given  = A->info.fill_ratio_given;
1347     info->fill_ratio_needed = A->info.fill_ratio_needed;
1348     info->factor_mallocs    = A->info.factor_mallocs;
1349   } else {
1350     info->fill_ratio_given  = 0;
1351     info->fill_ratio_needed = 0;
1352     info->factor_mallocs    = 0;
1353   }
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 
1358 #undef __FUNC__
1359 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1360 int MatZeroEntries_SeqBAIJ(Mat A)
1361 {
1362   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1363 
1364   PetscFunctionBegin;
1365   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1366   PetscFunctionReturn(0);
1367 }
1368