xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 6831982abb6453c2f3e25efecb5d0951d333371e)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.50 1999/10/13 20:37:28 bsmith Exp bsmith $";
3 #endif
4 
5 #include "sys.h"
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 #include "src/inline/ilu.h"
10 #include "bitarray.h"
11 
12 #undef __FUNC__
13 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
14 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
15 {
16   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
17   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
18   int         start, end, *ai, *aj,bs,*nidx2;
19   BTPetsc     table;
20 
21   PetscFunctionBegin;
22   m     = a->mbs;
23   ai    = a->i;
24   aj    = a->j;
25   bs    = a->bs;
26 
27   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
28 
29   ierr  = PetscBTCreate(m,table);CHKERRQ(ierr);
30   nidx  = (int *) PetscMalloc((m+1)*sizeof(int));CHKPTRQ(nidx);
31   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int));CHKPTRQ(nidx2);
32 
33   for ( i=0; i<is_max; i++ ) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for ( j=0; j<n ; ++j){
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
46       if(!PetscBTLoopupSet(table, ival)) { nidx[isz++] = ival;}
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for ( j=0; j<ov; j++){ /* for each overlap*/
53       n = isz;
54       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for ( l = start; l<end ; l++){
59           val = aj[l];
60           if (!PetscBTLoopupSet(table,val)) {nidx[isz++] = val;}
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++ ) {
66       for (k=0; k<bs; k++ )
67         nidx2[j*bs+k] = nidx[j]*bs+k;
68     }
69     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i));CHKERRQ(ierr);
70   }
71   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
72   ierr = PetscFree(nidx);CHKERRQ(ierr);
73   ierr = PetscFree(nidx2);CHKERRQ(ierr);
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNC__
78 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
79 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
80 {
81   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
82   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
83   int          row,mat_i,*mat_j,tcol,*mat_ilen;
84   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
85   int          *aj = a->j, *ai = a->i;
86   MatScalar    *mat_a;
87   Mat          C;
88   PetscTruth   flag;
89 
90   PetscFunctionBegin;
91   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
92   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
93 
94   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
95   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
96   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
97   ierr = ISGetSize(iscol,&ncols);CHKERRQ(ierr);
98 
99   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
100   ssmap = smap;
101   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
102   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
103   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
104   /* determine lens of each row */
105   for (i=0; i<nrows; i++) {
106     kstart  = ai[irow[i]];
107     kend    = kstart + a->ilen[irow[i]];
108     lens[i] = 0;
109       for ( k=kstart; k<kend; k++ ) {
110         if (ssmap[aj[k]]) {
111           lens[i]++;
112         }
113       }
114     }
115   /* Create and fill new matrix */
116   if (scall == MAT_REUSE_MATRIX) {
117     c = (Mat_SeqBAIJ *)((*B)->data);
118 
119     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
120     ierr = PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
121     if (flag == PETSC_FALSE) {
122       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
123     }
124     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
125     C = *B;
126   } else {
127     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
128   }
129   c = (Mat_SeqBAIJ *)(C->data);
130   for (i=0; i<nrows; i++) {
131     row    = irow[i];
132     kstart = ai[row];
133     kend   = kstart + a->ilen[row];
134     mat_i  = c->i[i];
135     mat_j  = c->j + mat_i;
136     mat_a  = c->a + mat_i*bs2;
137     mat_ilen = c->ilen + i;
138     for ( k=kstart; k<kend; k++ ) {
139       if ((tcol=ssmap[a->j[k]])) {
140         *mat_j++ = tcol - 1;
141         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
142         mat_a   += bs2;
143         (*mat_ilen)++;
144       }
145     }
146   }
147 
148   /* Free work space */
149   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
150   ierr = PetscFree(smap);CHKERRQ(ierr);
151   ierr = PetscFree(lens);CHKERRQ(ierr);
152   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
153   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
154 
155   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
156   *B = C;
157   PetscFunctionReturn(0);
158 }
159 
160 #undef __FUNC__
161 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
162 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
163 {
164   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
165   IS          is1,is2;
166   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
167 
168   PetscFunctionBegin;
169   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
170   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
171   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
172   ierr = ISGetSize(iscol,&ncols);CHKERRQ(ierr);
173 
174   /* Verify if the indices corespond to each element in a block
175    and form the IS with compressed IS */
176   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
177   iary = vary + a->mbs;
178   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
179   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
180   count = 0;
181   for (i=0; i<a->mbs; i++) {
182     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
183     if (vary[i]==bs) iary[count++] = i;
184   }
185   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1);CHKERRQ(ierr);
186 
187   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
188   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
189   count = 0;
190   for (i=0; i<a->mbs; i++) {
191     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
192     if (vary[i]==bs) iary[count++] = i;
193   }
194   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2);CHKERRQ(ierr);
195   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
196   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
197   ierr = PetscFree(vary);CHKERRQ(ierr);
198 
199   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);CHKERRQ(ierr);
200   ISDestroy(is1);
201   ISDestroy(is2);
202   PetscFunctionReturn(0);
203 }
204 
205 #undef __FUNC__
206 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
207 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatReuse scall,Mat **B)
208 {
209   int ierr,i;
210 
211   PetscFunctionBegin;
212   if (scall == MAT_INITIAL_MATRIX) {
213     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) );CHKPTRQ(*B);
214   }
215 
216   for ( i=0; i<n; i++ ) {
217     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
218   }
219   PetscFunctionReturn(0);
220 }
221 
222 
223 /* -------------------------------------------------------*/
224 /* Should check that shapes of vectors and matrices match */
225 /* -------------------------------------------------------*/
226 #include "pinclude/blaslapack.h"
227 
228 #undef __FUNC__
229 #define __FUNC__ "MatMult_SeqBAIJ_1"
230 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
231 {
232   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
233   Scalar          *x,*z,sum;
234   MatScalar       *v;
235   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
236 
237   PetscFunctionBegin;
238   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
239   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
240 
241   idx   = a->j;
242   v     = a->a;
243   ii    = a->i;
244 
245   for ( i=0; i<mbs; i++ ) {
246     n    = ii[1] - ii[0]; ii++;
247     sum  = 0.0;
248     while (n--) sum += *v++ * x[*idx++];
249     z[i] = sum;
250   }
251   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
252   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
253   PLogFlops(2*a->nz - a->m);
254   PetscFunctionReturn(0);
255 }
256 
257 #undef __FUNC__
258 #define __FUNC__ "MatMult_SeqBAIJ_2"
259 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
260 {
261   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
262   Scalar          *x,*z,*xb,sum1,sum2;
263   Scalar          x1,x2;
264   MatScalar       *v;
265   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
266 
267   PetscFunctionBegin;
268   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
269   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
270 
271   idx   = a->j;
272   v     = a->a;
273   ii    = a->i;
274 
275   for ( i=0; i<mbs; i++ ) {
276     n  = ii[1] - ii[0]; ii++;
277     sum1 = 0.0; sum2 = 0.0;
278     for ( j=0; j<n; j++ ) {
279       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
280       sum1 += v[0]*x1 + v[2]*x2;
281       sum2 += v[1]*x1 + v[3]*x2;
282       v += 4;
283     }
284     z[0] = sum1; z[1] = sum2;
285     z += 2;
286   }
287   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
288   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
289   PLogFlops(8*a->nz - a->m);
290   PetscFunctionReturn(0);
291 }
292 
293 #undef __FUNC__
294 #define __FUNC__ "MatMult_SeqBAIJ_3"
295 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
296 {
297   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data;
298   Scalar       *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
299   MatScalar    *v;
300   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;
301 
302 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
303 #pragma disjoint(*v,*z,*xb)
304 #endif
305 
306   PetscFunctionBegin;
307   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
308   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
309 
310   idx   = a->j;
311   v     = a->a;
312   ii    = a->i;
313 
314   for ( i=0; i<mbs; i++ ) {
315     n  = ii[1] - ii[0]; ii++;
316     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
317     for ( j=0; j<n; j++ ) {
318       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
319       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
320       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
321       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
322       v += 9;
323     }
324     z[0] = sum1; z[1] = sum2; z[2] = sum3;
325     z += 3;
326   }
327   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
328   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
329   PLogFlops(18*a->nz - a->m);
330   PetscFunctionReturn(0);
331 }
332 
333 #undef __FUNC__
334 #define __FUNC__ "MatMult_SeqBAIJ_4"
335 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
336 {
337   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
338   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
339   MatScalar       *v;
340   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
341 
342   PetscFunctionBegin;
343   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
344   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
345 
346   idx   = a->j;
347   v     = a->a;
348   ii    = a->i;
349 
350   for ( i=0; i<mbs; i++ ) {
351     n  = ii[1] - ii[0]; ii++;
352     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
353     for ( j=0; j<n; j++ ) {
354       xb = x + 4*(*idx++);
355       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
356       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
357       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
358       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
359       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
360       v += 16;
361     }
362     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
363     z += 4;
364   }
365   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
366   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
367   PLogFlops(32*a->nz - a->m);
368   PetscFunctionReturn(0);
369 }
370 
371 #undef __FUNC__
372 #define __FUNC__ "MatMult_SeqBAIJ_5"
373 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
374 {
375   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
376   Scalar          sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
377   MatScalar       *v;
378   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
379 
380   PetscFunctionBegin;
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
383 
384   idx   = a->j;
385   v     = a->a;
386   ii    = a->i;
387 
388   for ( i=0; i<mbs; i++ ) {
389     n  = ii[1] - ii[0]; ii++;
390     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
391     for ( j=0; j<n; j++ ) {
392       xb = x + 5*(*idx++);
393       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
394       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
395       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
396       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
397       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
398       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
399       v += 25;
400     }
401     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
402     z += 5;
403   }
404   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
405   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
406   PLogFlops(50*a->nz - a->m);
407   PetscFunctionReturn(0);
408 }
409 
410 
411 #undef __FUNC__
412 #define __FUNC__ "MatMult_SeqBAIJ_6"
413 int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
414 {
415   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
416   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
417   Scalar          x1,x2,x3,x4,x5,x6;
418   MatScalar       *v;
419   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
420 
421   PetscFunctionBegin;
422   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
423   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
424 
425   idx   = a->j;
426   v     = a->a;
427   ii    = a->i;
428 
429   for ( i=0; i<mbs; i++ ) {
430     n  = ii[1] - ii[0]; ii++;
431     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
432     for ( j=0; j<n; j++ ) {
433       xb = x + 6*(*idx++);
434       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
435       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
436       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
437       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
438       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
439       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
440       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
441       v += 36;
442     }
443     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
444     z += 6;
445   }
446 
447   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
448   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449   PLogFlops(72*a->nz - a->m);
450   PetscFunctionReturn(0);
451 }
452 #undef __FUNC__
453 #define __FUNC__ "MatMult_SeqBAIJ_7"
454 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
455 {
456   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
457   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
458   Scalar          x1,x2,x3,x4,x5,x6,x7;
459   MatScalar       *v;
460   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
461 
462   PetscFunctionBegin;
463   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
464   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
465 
466   idx   = a->j;
467   v     = a->a;
468   ii    = a->i;
469 
470   for ( i=0; i<mbs; i++ ) {
471     n  = ii[1] - ii[0]; ii++;
472     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
473     for ( j=0; j<n; j++ ) {
474       xb = x + 7*(*idx++);
475       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
476       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
477       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
478       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
479       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
480       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
481       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
482       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
483       v += 49;
484     }
485     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
486     z += 7;
487   }
488 
489   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
490   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
491   PLogFlops(98*a->nz - a->m);
492   PetscFunctionReturn(0);
493 }
494 
495 /*
496     This will not work with MatScalar == float because it calls the BLAS
497 */
498 #undef __FUNC__
499 #define __FUNC__ "MatMult_SeqBAIJ_N"
500 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
501 {
502   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
503   Scalar          *x,*z,*xb, *work,*workt;
504   MatScalar       *v;
505   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
506   int             ncols,k;
507 
508   PetscFunctionBegin;
509   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
510   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
511 
512   idx   = a->j;
513   v     = a->a;
514   ii    = a->i;
515 
516 
517   if (!a->mult_work) {
518     k = PetscMax(a->m,a->n);
519     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
520   }
521   work = a->mult_work;
522   for ( i=0; i<mbs; i++ ) {
523     n     = ii[1] - ii[0]; ii++;
524     ncols = n*bs;
525     workt = work;
526     for ( j=0; j<n; j++ ) {
527       xb = x + bs*(*idx++);
528       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
529       workt += bs;
530     }
531     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
532     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
533     v += n*bs2;
534     z += bs;
535   }
536   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
537   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
538   PLogFlops(2*a->nz*bs2 - a->m);
539   PetscFunctionReturn(0);
540 }
541 
542 #undef __FUNC__
543 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
544 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
545 {
546   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
547   Scalar          *x,*y,*z,sum;
548   MatScalar       *v;
549   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
550 
551   PetscFunctionBegin;
552   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
553   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
554   if (zz != yy) {
555     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
556   } else {
557     z = y;
558   }
559 
560   idx   = a->j;
561   v     = a->a;
562   ii    = a->i;
563 
564   for ( i=0; i<mbs; i++ ) {
565     n    = ii[1] - ii[0]; ii++;
566     sum  = y[i];
567     while (n--) sum += *v++ * x[*idx++];
568     z[i] = sum;
569   }
570   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
571   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
572   if (zz != yy) {
573     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   }
575   PLogFlops(2*a->nz);
576   PetscFunctionReturn(0);
577 }
578 
579 #undef __FUNC__
580 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
581 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
582 {
583   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
584   Scalar          *x,*y,*z,*xb,sum1,sum2;
585   Scalar          x1,x2;
586   MatScalar       *v;
587   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
588 
589   PetscFunctionBegin;
590   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
591   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
592   if (zz != yy) {
593     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
594   } else {
595     z = y;
596   }
597 
598   idx   = a->j;
599   v     = a->a;
600   ii    = a->i;
601 
602   for ( i=0; i<mbs; i++ ) {
603     n  = ii[1] - ii[0]; ii++;
604     sum1 = y[0]; sum2 = y[1];
605     for ( j=0; j<n; j++ ) {
606       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
607       sum1 += v[0]*x1 + v[2]*x2;
608       sum2 += v[1]*x1 + v[3]*x2;
609       v += 4;
610     }
611     z[0] = sum1; z[1] = sum2;
612     z += 2; y += 2;
613   }
614   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
615   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
616   if (zz != yy) {
617     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
618   }
619   PLogFlops(4*a->nz);
620   PetscFunctionReturn(0);
621 }
622 
623 #undef __FUNC__
624 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
625 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
626 {
627   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
628   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
629   MatScalar       *v;
630   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
631 
632   PetscFunctionBegin;
633   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
634   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
635   if (zz != yy) {
636     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
637   } else {
638     z = y;
639   }
640 
641   idx   = a->j;
642   v     = a->a;
643   ii    = a->i;
644 
645   for ( i=0; i<mbs; i++ ) {
646     n  = ii[1] - ii[0]; ii++;
647     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
648     for ( j=0; j<n; j++ ) {
649       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
650       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
651       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
652       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
653       v += 9;
654     }
655     z[0] = sum1; z[1] = sum2; z[2] = sum3;
656     z += 3; y += 3;
657   }
658   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
659   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
660   if (zz != yy) {
661     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
662   }
663   PLogFlops(18*a->nz);
664   PetscFunctionReturn(0);
665 }
666 
667 #undef __FUNC__
668 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
669 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
670 {
671   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
672   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
673   MatScalar       *v;
674   int             ierr,mbs=a->mbs,i,*idx,*ii;
675   int             j,n;
676 
677   PetscFunctionBegin;
678   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
679   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
680   if (zz != yy) {
681     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
682   } else {
683     z = y;
684   }
685 
686   idx   = a->j;
687   v     = a->a;
688   ii    = a->i;
689 
690   for ( i=0; i<mbs; i++ ) {
691     n  = ii[1] - ii[0]; ii++;
692     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
693     for ( j=0; j<n; j++ ) {
694       xb = x + 4*(*idx++);
695       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
696       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
697       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
698       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
699       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
700       v += 16;
701     }
702     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
703     z += 4; y += 4;
704   }
705   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
706   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
707   if (zz != yy) {
708     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
709   }
710   PLogFlops(32*a->nz);
711   PetscFunctionReturn(0);
712 }
713 
714 #undef __FUNC__
715 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
716 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
717 {
718   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
719   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
720   MatScalar       *v;
721   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
722 
723   PetscFunctionBegin;
724   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
725   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
726   if (zz != yy) {
727     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
728   } else {
729     z = y;
730   }
731 
732   idx   = a->j;
733   v     = a->a;
734   ii    = a->i;
735 
736   for ( i=0; i<mbs; i++ ) {
737     n  = ii[1] - ii[0]; ii++;
738     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
739     for ( j=0; j<n; j++ ) {
740       xb = x + 5*(*idx++);
741       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
742       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
743       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
744       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
745       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
746       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
747       v += 25;
748     }
749     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
750     z += 5; y += 5;
751   }
752   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
753   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
754   if (zz != yy) {
755     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
756   }
757   PLogFlops(50*a->nz);
758   PetscFunctionReturn(0);
759 }
760 #undef __FUNC__
761 #define __FUNC__ "MatMultAdd_SeqBAIJ_6"
762 int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
763 {
764   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
765   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
766   Scalar          x1,x2,x3,x4,x5,x6;
767   MatScalar       *v;
768   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
769 
770   PetscFunctionBegin;
771   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
772   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
773   if (zz != yy) {
774     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
775   } else {
776     z = y;
777   }
778 
779   idx   = a->j;
780   v     = a->a;
781   ii    = a->i;
782 
783   for ( i=0; i<mbs; i++ ) {
784     n  = ii[1] - ii[0]; ii++;
785     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
786     for ( j=0; j<n; j++ ) {
787       xb = x + 6*(*idx++);
788       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
789       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
790       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
791       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
792       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
793       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
794       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
795       v += 36;
796     }
797     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
798     z += 6; y += 6;
799   }
800   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
801   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
802   if (zz != yy) {
803     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
804   }
805   PLogFlops(72*a->nz);
806   PetscFunctionReturn(0);
807 }
808 
809 #undef __FUNC__
810 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
811 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
812 {
813   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
814   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
815   Scalar          x1,x2,x3,x4,x5,x6,x7;
816   MatScalar       *v;
817   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
818 
819   PetscFunctionBegin;
820   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
821   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
822   if (zz != yy) {
823     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
824   } else {
825     z = y;
826   }
827 
828   idx   = a->j;
829   v     = a->a;
830   ii    = a->i;
831 
832   for ( i=0; i<mbs; i++ ) {
833     n  = ii[1] - ii[0]; ii++;
834     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
835     for ( j=0; j<n; j++ ) {
836       xb = x + 7*(*idx++);
837       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
838       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
839       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
840       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
841       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
842       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
843       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
844       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
845       v += 49;
846     }
847     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
848     z += 7; y += 7;
849   }
850   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
851   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
852   if (zz != yy) {
853     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
854   }
855   PLogFlops(98*a->nz);
856   PetscFunctionReturn(0);
857 }
858 
859 #undef __FUNC__
860 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
861 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
862 {
863   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
864   Scalar         *x,*z,*xb,*work,*workt;
865   MatScalar      *v;
866   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
867   int            ncols,k;
868 
869   PetscFunctionBegin;
870   if ( xx != yy) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
871 
872   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
873   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
874 
875   idx   = a->j;
876   v     = a->a;
877   ii    = a->i;
878 
879 
880   if (!a->mult_work) {
881     k = PetscMax(a->m,a->n);
882     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
883   }
884   work = a->mult_work;
885   for ( i=0; i<mbs; i++ ) {
886     n     = ii[1] - ii[0]; ii++;
887     ncols = n*bs;
888     workt = work;
889     for ( j=0; j<n; j++ ) {
890       xb = x + bs*(*idx++);
891       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
892       workt += bs;
893     }
894     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
895     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
896     v += n*bs2;
897     z += bs;
898   }
899   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
900   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
901   PLogFlops(2*a->nz*bs2 );
902   PetscFunctionReturn(0);
903 }
904 
905 #undef __FUNC__
906 #define __FUNC__ "MatMultTrans_SeqBAIJ"
907 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
908 {
909   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
910   Scalar          *xg,*zg,*zb;
911   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
912   MatScalar       *v;
913   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
914   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
915 
916 
917   PetscFunctionBegin;
918   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
919   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
920   ierr = PetscMemzero(z,N*sizeof(Scalar));CHKERRQ(ierr);
921 
922   idx   = a->j;
923   v     = a->a;
924   ii    = a->i;
925 
926   switch (bs) {
927   case 1:
928     for ( i=0; i<mbs; i++ ) {
929       n  = ii[1] - ii[0]; ii++;
930       xb = x + i; x1 = xb[0];
931       ib = idx + ai[i];
932       for ( j=0; j<n; j++ ) {
933         rval    = ib[j];
934         z[rval] += *v++ * x1;
935       }
936     }
937     break;
938   case 2:
939     for ( i=0; i<mbs; i++ ) {
940       n  = ii[1] - ii[0]; ii++;
941       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
942       ib = idx + ai[i];
943       for ( j=0; j<n; j++ ) {
944         rval      = ib[j]*2;
945         z[rval++] += v[0]*x1 + v[1]*x2;
946         z[rval++] += v[2]*x1 + v[3]*x2;
947         v += 4;
948       }
949     }
950     break;
951   case 3:
952     for ( i=0; i<mbs; i++ ) {
953       n  = ii[1] - ii[0]; ii++;
954       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
955       ib = idx + ai[i];
956       for ( j=0; j<n; j++ ) {
957         rval      = ib[j]*3;
958         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
959         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
960         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
961         v += 9;
962       }
963     }
964     break;
965   case 4:
966     for ( i=0; i<mbs; i++ ) {
967       n  = ii[1] - ii[0]; ii++;
968       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
969       ib = idx + ai[i];
970       for ( j=0; j<n; j++ ) {
971         rval      = ib[j]*4;
972         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
973         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
974         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
975         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
976         v += 16;
977       }
978     }
979     break;
980   case 5:
981     for ( i=0; i<mbs; i++ ) {
982       n  = ii[1] - ii[0]; ii++;
983       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
984       x4 = xb[3];   x5 = xb[4];
985       ib = idx + ai[i];
986       for ( j=0; j<n; j++ ) {
987         rval      = ib[j]*5;
988         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
989         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
990         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
991         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
992         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
993         v += 25;
994       }
995     }
996     break;
997       /* block sizes larger then 5 by 5 are handled by BLAS */
998     default: {
999       int       ncols,k;
1000       Scalar    *work,*workt;
1001 
1002       if (!a->mult_work) {
1003         k = PetscMax(a->m,a->n);
1004         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1005       }
1006       work = a->mult_work;
1007       for ( i=0; i<mbs; i++ ) {
1008         n     = ii[1] - ii[0]; ii++;
1009         ncols = n*bs;
1010         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1011         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1012         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1013         v += n*bs2;
1014         x += bs;
1015         workt = work;
1016         for ( j=0; j<n; j++ ) {
1017           zb = z + bs*(*idx++);
1018           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1019           workt += bs;
1020         }
1021       }
1022     }
1023   }
1024   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1025   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1026   PLogFlops(2*a->nz*a->bs2 - a->n);
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 #undef __FUNC__
1031 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1032 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1033 
1034 {
1035   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1036   Scalar          *xg,*zg,*zb;
1037   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
1038   MatScalar       *v;
1039   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1040   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1041 
1042   PetscFunctionBegin;
1043   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1044   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1045 
1046   if ( yy != zz ) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1047   else {
1048     ierr = PetscMemzero(z,N*sizeof(Scalar));CHKERRQ(ierr);
1049   }
1050 
1051   idx   = a->j;
1052   v     = a->a;
1053   ii    = a->i;
1054 
1055   switch (bs) {
1056   case 1:
1057     for ( i=0; i<mbs; i++ ) {
1058       n  = ii[1] - ii[0]; ii++;
1059       xb = x + i; x1 = xb[0];
1060       ib = idx + ai[i];
1061       for ( j=0; j<n; j++ ) {
1062         rval    = ib[j];
1063         z[rval] += *v++ * x1;
1064       }
1065     }
1066     break;
1067   case 2:
1068     for ( i=0; i<mbs; i++ ) {
1069       n  = ii[1] - ii[0]; ii++;
1070       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1071       ib = idx + ai[i];
1072       for ( j=0; j<n; j++ ) {
1073         rval      = ib[j]*2;
1074         z[rval++] += v[0]*x1 + v[1]*x2;
1075         z[rval++] += v[2]*x1 + v[3]*x2;
1076         v += 4;
1077       }
1078     }
1079     break;
1080   case 3:
1081     for ( i=0; i<mbs; i++ ) {
1082       n  = ii[1] - ii[0]; ii++;
1083       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1084       ib = idx + ai[i];
1085       for ( j=0; j<n; j++ ) {
1086         rval      = ib[j]*3;
1087         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1088         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1089         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1090         v += 9;
1091       }
1092     }
1093     break;
1094   case 4:
1095     for ( i=0; i<mbs; i++ ) {
1096       n  = ii[1] - ii[0]; ii++;
1097       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1098       ib = idx + ai[i];
1099       for ( j=0; j<n; j++ ) {
1100         rval      = ib[j]*4;
1101         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1102         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1103         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1104         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1105         v += 16;
1106       }
1107     }
1108     break;
1109   case 5:
1110     for ( i=0; i<mbs; i++ ) {
1111       n  = ii[1] - ii[0]; ii++;
1112       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1113       x4 = xb[3];   x5 = xb[4];
1114       ib = idx + ai[i];
1115       for ( j=0; j<n; j++ ) {
1116         rval      = ib[j]*5;
1117         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1118         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1119         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1120         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1121         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1122         v += 25;
1123       }
1124     }
1125     break;
1126       /* block sizes larger then 5 by 5 are handled by BLAS */
1127     default: {
1128       int       ncols,k;
1129       Scalar    *work,*workt;
1130 
1131       if (!a->mult_work) {
1132         k = PetscMax(a->m,a->n);
1133         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1134       }
1135       work = a->mult_work;
1136       for ( i=0; i<mbs; i++ ) {
1137         n     = ii[1] - ii[0]; ii++;
1138         ncols = n*bs;
1139         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1140         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1141         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1142         v += n*bs2;
1143         x += bs;
1144         workt = work;
1145         for ( j=0; j<n; j++ ) {
1146           zb = z + bs*(*idx++);
1147           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1148           workt += bs;
1149         }
1150       }
1151     }
1152   }
1153   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1154   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1155   PLogFlops(2*a->nz*a->bs2);
1156   PetscFunctionReturn(0);
1157 }
1158 
1159 #undef __FUNC__
1160 #define __FUNC__ "MatScale_SeqBAIJ"
1161 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1162 {
1163   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1164   int         one = 1, totalnz = a->bs2*a->nz;
1165 
1166   PetscFunctionBegin;
1167   BLscal_( &totalnz, alpha, a->a, &one );
1168   PLogFlops(totalnz);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNC__
1173 #define __FUNC__ "MatNorm_SeqBAIJ"
1174 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1175 {
1176   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1177   MatScalar   *v = a->a;
1178   double      sum = 0.0;
1179   int         i,j,k,bs = a->bs, nz=a->nz,bs2=a->bs2;
1180 
1181   PetscFunctionBegin;
1182   if (type == NORM_FROBENIUS) {
1183     for (i=0; i< bs2*nz; i++ ) {
1184 #if defined(PETSC_USE_COMPLEX)
1185       sum += PetscReal(PetscConj(*v)*(*v)); v++;
1186 #else
1187       sum += (*v)*(*v); v++;
1188 #endif
1189     }
1190     *norm = sqrt(sum);
1191   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1192     *norm = 0.0;
1193     for ( k=0; k<bs; k++ ) {
1194       for ( j=0; j<a->m; j++ ) {
1195         v = a->a + bs2*a->i[j] + k;
1196         sum = 0.0;
1197         for ( i=0; i<a->i[j+1]-a->i[j]; i++ ) {
1198           sum += PetscAbsScalar(*v);
1199           v   += bs;
1200         }
1201         if (sum > *norm) *norm = sum;
1202       }
1203     }
1204   } else {
1205     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1206   }
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 
1211 #undef __FUNC__
1212 #define __FUNC__ "MatEqual_SeqBAIJ"
1213 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1214 {
1215   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1216   int         ierr;
1217 
1218   PetscFunctionBegin;
1219   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1220 
1221   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1222   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1223     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1224   }
1225 
1226   /* if the a->i are the same */
1227   ierr = PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1228   if (*flg == PETSC_FALSE) {
1229     PetscFunctionReturn(0);
1230   }
1231 
1232   /* if a->j are the same */
1233   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1234   if (*flg == PETSC_FALSE) {
1235     PetscFunctionReturn(0);
1236   }
1237   /* if a->a are the same */
1238   ierr = PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1239   PetscFunctionReturn(0);
1240 
1241 }
1242 
1243 #undef __FUNC__
1244 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1245 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1246 {
1247   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1248   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1249   Scalar      *x, zero = 0.0;
1250   MatScalar   *aa,*aa_j;
1251 
1252   PetscFunctionBegin;
1253   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1254   bs   = a->bs;
1255   aa   = a->a;
1256   ai   = a->i;
1257   aj   = a->j;
1258   ambs = a->mbs;
1259   bs2  = a->bs2;
1260 
1261   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1262   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1263   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1264   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1265   for ( i=0; i<ambs; i++ ) {
1266     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1267       if (aj[j] == i) {
1268         row  = i*bs;
1269         aa_j = aa+j*bs2;
1270         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1271         break;
1272       }
1273     }
1274   }
1275   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNC__
1280 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1281 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1282 {
1283   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1284   Scalar      *l,*r,x,*li,*ri;
1285   MatScalar   *aa,*v;
1286   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1287 
1288   PetscFunctionBegin;
1289   ai  = a->i;
1290   aj  = a->j;
1291   aa  = a->a;
1292   m   = a->m;
1293   n   = a->n;
1294   bs  = a->bs;
1295   mbs = a->mbs;
1296   bs2 = a->bs2;
1297   if (ll) {
1298     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1299     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1300     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1301     for ( i=0; i<mbs; i++ ) { /* for each block row */
1302       M  = ai[i+1] - ai[i];
1303       li = l + i*bs;
1304       v  = aa + bs2*ai[i];
1305       for ( j=0; j<M; j++ ) { /* for each block */
1306         for ( k=0; k<bs2; k++ ) {
1307           (*v++) *= li[k%bs];
1308         }
1309       }
1310     }
1311     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1312     PLogFlops(a->nz);
1313   }
1314 
1315   if (rr) {
1316     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1317     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1318     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1319     for ( i=0; i<mbs; i++ ) { /* for each block row */
1320       M  = ai[i+1] - ai[i];
1321       v  = aa + bs2*ai[i];
1322       for ( j=0; j<M; j++ ) { /* for each block */
1323         ri = r + bs*aj[ai[i]+j];
1324         for ( k=0; k<bs; k++ ) {
1325           x = ri[k];
1326           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1327         }
1328       }
1329     }
1330     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1331     PLogFlops(a->nz);
1332   }
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 
1337 #undef __FUNC__
1338 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1339 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1340 {
1341   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1342 
1343   PetscFunctionBegin;
1344   info->rows_global    = (double)a->m;
1345   info->columns_global = (double)a->n;
1346   info->rows_local     = (double)a->m;
1347   info->columns_local  = (double)a->n;
1348   info->block_size     = a->bs2;
1349   info->nz_allocated   = a->maxnz;
1350   info->nz_used        = a->bs2*a->nz;
1351   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1352   info->assemblies   = A->num_ass;
1353   info->mallocs      = a->reallocs;
1354   info->memory       = A->mem;
1355   if (A->factor) {
1356     info->fill_ratio_given  = A->info.fill_ratio_given;
1357     info->fill_ratio_needed = A->info.fill_ratio_needed;
1358     info->factor_mallocs    = A->info.factor_mallocs;
1359   } else {
1360     info->fill_ratio_given  = 0;
1361     info->fill_ratio_needed = 0;
1362     info->factor_mallocs    = 0;
1363   }
1364   PetscFunctionReturn(0);
1365 }
1366 
1367 
1368 #undef __FUNC__
1369 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1370 int MatZeroEntries_SeqBAIJ(Mat A)
1371 {
1372   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1373   int         ierr;
1374 
1375   PetscFunctionBegin;
1376   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379