xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision e1311b9049e89cb3452dcd306fde571f4b440ff2)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.24 1998/03/16 18:55:40 bsmith Exp bsmith $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "sys.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/spops.h"
10 #include "petsc.h"
11 #include "src/inline/bitarray.h"
12 
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
16 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
17 {
18   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
19   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
20   int         start, end, *ai, *aj,bs,*nidx2;
21   BT          table;
22 
23   PetscFunctionBegin;
24   m     = a->mbs;
25   ai    = a->i;
26   aj    = a->j;
27   bs    = a->bs;
28 
29   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
30 
31   ierr  = BTCreate(m,table); CHKERRQ(ierr);
32   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
33   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
34 
35   for ( i=0; i<is_max; i++ ) {
36     /* Initialise the two local arrays */
37     isz  = 0;
38     BTMemzero(m,table);
39 
40     /* Extract the indices, assume there can be duplicate entries */
41     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
42     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
43 
44     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
45     for ( j=0; j<n ; ++j){
46       ival = idx[j]/bs; /* convert the indices into block indices */
47       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
48       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
49     }
50     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
51     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
52 
53     k = 0;
54     for ( j=0; j<ov; j++){ /* for each overlap*/
55       n = isz;
56       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
57         row   = nidx[k];
58         start = ai[row];
59         end   = ai[row+1];
60         for ( l = start; l<end ; l++){
61           val = aj[l];
62           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
63         }
64       }
65     }
66     /* expand the Index Set */
67     for (j=0; j<isz; j++ ) {
68       for (k=0; k<bs; k++ )
69         nidx2[j*bs+k] = nidx[j]*bs+k;
70     }
71     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
72   }
73   BTDestroy(table);
74   PetscFree(nidx);
75   PetscFree(nidx2);
76   PetscFunctionReturn(0);
77 }
78 
79 #undef __FUNC__
80 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
81 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
82 {
83   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
84   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
85   int          row,mat_i,*mat_j,tcol,*mat_ilen;
86   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
87   int          *aj = a->j, *ai = a->i;
88   Scalar       *mat_a;
89   Mat          C;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,(PetscTruth*)&i);
93   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
97   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
98   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
99 
100   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
101   ssmap = smap;
102   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
103   PetscMemzero(smap,oldcols*sizeof(int));
104   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
105   /* determine lens of each row */
106   for (i=0; i<nrows; i++) {
107     kstart  = ai[irow[i]];
108     kend    = kstart + a->ilen[irow[i]];
109     lens[i] = 0;
110       for ( k=kstart; k<kend; k++ ) {
111         if (ssmap[aj[k]]) {
112           lens[i]++;
113         }
114       }
115     }
116   /* Create and fill new matrix */
117   if (scall == MAT_REUSE_MATRIX) {
118     c = (Mat_SeqBAIJ *)((*B)->data);
119 
120     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
121     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
122       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
123     }
124     PetscMemzero(c->ilen,c->mbs*sizeof(int));
125     C = *B;
126   } else {
127     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
128   }
129   c = (Mat_SeqBAIJ *)(C->data);
130   for (i=0; i<nrows; i++) {
131     row    = irow[i];
132     kstart = ai[row];
133     kend   = kstart + a->ilen[row];
134     mat_i  = c->i[i];
135     mat_j  = c->j + mat_i;
136     mat_a  = c->a + mat_i*bs2;
137     mat_ilen = c->ilen + i;
138     for ( k=kstart; k<kend; k++ ) {
139       if ((tcol=ssmap[a->j[k]])) {
140         *mat_j++ = tcol - 1;
141         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2;
142         (*mat_ilen)++;
143       }
144     }
145   }
146 
147   /* Free work space */
148   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
149   PetscFree(smap); PetscFree(lens);
150   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
151   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
152 
153   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
154   *B = C;
155   PetscFunctionReturn(0);
156 }
157 
158 #undef __FUNC__
159 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
160 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
161 {
162   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
163   IS          is1,is2;
164   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
165 
166   PetscFunctionBegin;
167   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
168   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
169   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
170   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
171 
172   /* Verify if the indices corespond to each element in a block
173    and form the IS with compressed IS */
174   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
175   iary = vary + a->mbs;
176   PetscMemzero(vary,(a->mbs)*sizeof(int));
177   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
178   count = 0;
179   for (i=0; i<a->mbs; i++) {
180     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
181     if (vary[i]==bs) iary[count++] = i;
182   }
183   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
184 
185   PetscMemzero(vary,(a->mbs)*sizeof(int));
186   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
187   count = 0;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
190     if (vary[i]==bs) iary[count++] = i;
191   }
192   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
193   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
194   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
195   PetscFree(vary);
196 
197   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr);
198   ISDestroy(is1);
199   ISDestroy(is2);
200   PetscFunctionReturn(0);
201 }
202 
203 #undef __FUNC__
204 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
205 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall,Mat **B)
206 {
207   int ierr,i;
208 
209   PetscFunctionBegin;
210   if (scall == MAT_INITIAL_MATRIX) {
211     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
212   }
213 
214   for ( i=0; i<n; i++ ) {
215     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
216   }
217   PetscFunctionReturn(0);
218 }
219 
220 
221 /* -------------------------------------------------------*/
222 /* Should check that shapes of vectors and matrices match */
223 /* -------------------------------------------------------*/
224 #include "pinclude/plapack.h"
225 
226 #undef __FUNC__
227 #define __FUNC__ "MatMult_SeqBAIJ_1"
228 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229 {
230   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
231    Scalar *x,*z,*v,sum;
232   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
233 
234   PetscFunctionBegin;
235   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
236   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
237 
238   idx   = a->j;
239   v     = a->a;
240   ii    = a->i;
241 
242   for ( i=0; i<mbs; i++ ) {
243     n    = ii[1] - ii[0]; ii++;
244     sum  = 0.0;
245     while (n--) sum += *v++ * x[*idx++];
246     z[i] = sum;
247   }
248   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
250   PLogFlops(2*a->nz - a->m);
251   PetscFunctionReturn(0);
252 }
253 
254 #undef __FUNC__
255 #define __FUNC__ "MatMult_SeqBAIJ_2"
256 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
257 {
258   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
259    Scalar *x,*z,*v,*xb,sum1,sum2;
260    Scalar x1,x2;
261   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
262 
263   PetscFunctionBegin;
264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
265   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
266 
267   idx   = a->j;
268   v     = a->a;
269   ii    = a->i;
270 
271   for ( i=0; i<mbs; i++ ) {
272     n  = ii[1] - ii[0]; ii++;
273     sum1 = 0.0; sum2 = 0.0;
274     for ( j=0; j<n; j++ ) {
275       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
276       sum1 += v[0]*x1 + v[2]*x2;
277       sum2 += v[1]*x1 + v[3]*x2;
278       v += 4;
279     }
280     z[0] = sum1; z[1] = sum2;
281     z += 2;
282   }
283   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
284   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
285   PLogFlops(4*a->nz - a->m);
286   PetscFunctionReturn(0);
287 }
288 
289 #undef __FUNC__
290 #define __FUNC__ "MatMult_SeqBAIJ_3"
291 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
292 {
293   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
294    Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
295   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
296 
297   PetscFunctionBegin;
298   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
299   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
300 
301   idx   = a->j;
302   v     = a->a;
303   ii    = a->i;
304 
305   for ( i=0; i<mbs; i++ ) {
306     n  = ii[1] - ii[0]; ii++;
307     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
308     for ( j=0; j<n; j++ ) {
309       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
310       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
311       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
312       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
313       v += 9;
314     }
315     z[0] = sum1; z[1] = sum2; z[2] = sum3;
316     z += 3;
317   }
318   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
319   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
320   PLogFlops(18*a->nz - a->m);
321   PetscFunctionReturn(0);
322 }
323 
324 #undef __FUNC__
325 #define __FUNC__ "MatMult_SeqBAIJ_4"
326 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
327 {
328   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
329    Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
330   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
331 
332   PetscFunctionBegin;
333   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
335 
336   idx   = a->j;
337   v     = a->a;
338   ii    = a->i;
339 
340   for ( i=0; i<mbs; i++ ) {
341     n  = ii[1] - ii[0]; ii++;
342     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
343     for ( j=0; j<n; j++ ) {
344       xb = x + 4*(*idx++);
345       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
346       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
347       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
348       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
349       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
350       v += 16;
351     }
352     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
353     z += 4;
354   }
355   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
356   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
357   PLogFlops(32*a->nz - a->m);
358   PetscFunctionReturn(0);
359 }
360 
361 #undef __FUNC__
362 #define __FUNC__ "MatMult_SeqBAIJ_5"
363 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
364 {
365   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
366   Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
367   Scalar *  v,*  xb,*  z, *  x;
368   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
369 
370   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
371   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
372 
373   idx   = a->j;
374   v     = a->a;
375   ii    = a->i;
376 
377   for ( i=0; i<mbs; i++ ) {
378     n  = ii[1] - ii[0]; ii++;
379     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
380     for ( j=0; j<n; j++ ) {
381       xb = x + 5*(*idx++);
382       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
383       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
384       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
385       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
386       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
387       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
388       v += 25;
389     }
390     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
391     z += 5;
392   }
393   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
395   PLogFlops(50*a->nz - a->m);
396   PetscFunctionReturn(0);
397 }
398 
399 #undef __FUNC__
400 #define __FUNC__ "MatMult_SeqBAIJ_7"
401 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
402 {
403   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
404    Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
405    Scalar x1,x2,x3,x4,x5,x6,x7;
406   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
407 
408   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
409   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
410 
411   idx   = a->j;
412   v     = a->a;
413   ii    = a->i;
414 
415   for ( i=0; i<mbs; i++ ) {
416     n  = ii[1] - ii[0]; ii++;
417     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
418     for ( j=0; j<n; j++ ) {
419       xb = x + 7*(*idx++);
420       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
421       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
422       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
423       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
424       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
425       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
426       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
427       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
428       v += 49;
429     }
430     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
431     z += 7;
432   }
433 
434   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
435   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
436   PLogFlops(98*a->nz - a->m);
437   PetscFunctionReturn(0);
438 }
439 
440 #undef __FUNC__
441 #define __FUNC__ "MatMult_SeqBAIJ_N"
442 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
443 {
444   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
445    Scalar *x,*z,*v,*xb;
446   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
447   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
448   int             _One = 1,ncols,k;
449 
450   PetscFunctionBegin;
451   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
452   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
453 
454   idx   = a->j;
455   v     = a->a;
456   ii    = a->i;
457 
458 
459   if (!a->mult_work) {
460     k = PetscMax(a->m,a->n);
461     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
462   }
463   work = a->mult_work;
464   for ( i=0; i<mbs; i++ ) {
465     n     = ii[1] - ii[0]; ii++;
466     ncols = n*bs;
467     workt = work;
468     for ( j=0; j<n; j++ ) {
469       xb = x + bs*(*idx++);
470       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
471       workt += bs;
472     }
473     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
474     v += n*bs2;
475     z += bs;
476   }
477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
478   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
479   PLogFlops(2*a->nz*bs2 - a->m);
480   PetscFunctionReturn(0);
481 }
482 
483 #undef __FUNC__
484 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
485 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
486 {
487   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
488    Scalar *x,*y,*z,*v,sum;
489   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
490 
491   PetscFunctionBegin;
492   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
493   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
494   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
495 
496   idx   = a->j;
497   v     = a->a;
498   ii    = a->i;
499 
500   for ( i=0; i<mbs; i++ ) {
501     n    = ii[1] - ii[0]; ii++;
502     sum  = y[i];
503     while (n--) sum += *v++ * x[*idx++];
504     z[i] = sum;
505   }
506   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
507   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
508   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
509   PLogFlops(2*a->nz);
510   PetscFunctionReturn(0);
511 }
512 
513 #undef __FUNC__
514 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
515 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
516 {
517   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
518    Scalar *x,*y,*z,*v,*xb,sum1,sum2;
519    Scalar x1,x2;
520   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
521 
522   PetscFunctionBegin;
523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
525   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
526 
527   idx   = a->j;
528   v     = a->a;
529   ii    = a->i;
530 
531   for ( i=0; i<mbs; i++ ) {
532     n  = ii[1] - ii[0]; ii++;
533     sum1 = y[0]; sum2 = y[1];
534     for ( j=0; j<n; j++ ) {
535       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
536       sum1 += v[0]*x1 + v[2]*x2;
537       sum2 += v[1]*x1 + v[3]*x2;
538       v += 4;
539     }
540     z[0] = sum1; z[1] = sum2;
541     z += 2; y += 2;
542   }
543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
544   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
545   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
546   PLogFlops(4*a->nz);
547   PetscFunctionReturn(0);
548 }
549 
550 #undef __FUNC__
551 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
552 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
553 {
554   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
555    Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
556   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
557 
558   PetscFunctionBegin;
559   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
560   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
561   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
562 
563   idx   = a->j;
564   v     = a->a;
565   ii    = a->i;
566 
567   for ( i=0; i<mbs; i++ ) {
568     n  = ii[1] - ii[0]; ii++;
569     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
570     for ( j=0; j<n; j++ ) {
571       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
572       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
573       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
574       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
575       v += 9;
576     }
577     z[0] = sum1; z[1] = sum2; z[2] = sum3;
578     z += 3; y += 3;
579   }
580   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
581   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
582   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
583   PLogFlops(18*a->nz);
584   PetscFunctionReturn(0);
585 }
586 
587 #undef __FUNC__
588 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
589 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
590 {
591   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
592    Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
593   int             ierr,mbs=a->mbs,i,*idx,*ii;
594   int             j,n;
595 
596   PetscFunctionBegin;
597   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
598   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
599   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
600 
601   idx   = a->j;
602   v     = a->a;
603   ii    = a->i;
604 
605   for ( i=0; i<mbs; i++ ) {
606     n  = ii[1] - ii[0]; ii++;
607     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
608     for ( j=0; j<n; j++ ) {
609       xb = x + 4*(*idx++);
610       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
611       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
612       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
613       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
614       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
615       v += 16;
616     }
617     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
618     z += 4; y += 4;
619   }
620   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
621   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
622   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
623   PLogFlops(32*a->nz);
624   PetscFunctionReturn(0);
625 }
626 
627 #undef __FUNC__
628 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
629 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
630 {
631   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
632    Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
633   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
634 
635   PetscFunctionBegin;
636   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
637   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
638   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
639 
640   idx   = a->j;
641   v     = a->a;
642   ii    = a->i;
643 
644   for ( i=0; i<mbs; i++ ) {
645     n  = ii[1] - ii[0]; ii++;
646     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
647     for ( j=0; j<n; j++ ) {
648       xb = x + 5*(*idx++);
649       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
650       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
651       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
652       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
653       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
654       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
655       v += 25;
656     }
657     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
658     z += 5; y += 5;
659   }
660   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
661   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
662   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
663   PLogFlops(50*a->nz);
664   PetscFunctionReturn(0);
665 }
666 
667 #undef __FUNC__
668 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
669 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
670 {
671   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
672   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
673   Scalar          x1,x2,x3,x4,x5,x6,x7;
674   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
675 
676   PetscFunctionBegin;
677   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
678   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
679   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
680 
681   idx   = a->j;
682   v     = a->a;
683   ii    = a->i;
684 
685   for ( i=0; i<mbs; i++ ) {
686     n  = ii[1] - ii[0]; ii++;
687     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
688     for ( j=0; j<n; j++ ) {
689       xb = x + 7*(*idx++);
690       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
691       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
692       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
693       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
694       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
695       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
696       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
697       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
698       v += 49;
699     }
700     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
701     z += 7; y += 7;
702   }
703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
704   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
705   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
706   PLogFlops(98*a->nz);
707   PetscFunctionReturn(0);
708 }
709 
710 
711 #undef __FUNC__
712 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
713 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
714 {
715   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
716   Scalar         *x,*z,*v,*xb;
717   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
718   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
719 
720   PetscFunctionBegin;
721   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
722 
723   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
724   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
725 
726   idx   = a->j;
727   v     = a->a;
728   ii    = a->i;
729 
730 
731   if (!a->mult_work) {
732     k = PetscMax(a->m,a->n);
733     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
734   }
735   work = a->mult_work;
736   for ( i=0; i<mbs; i++ ) {
737     n     = ii[1] - ii[0]; ii++;
738     ncols = n*bs;
739     workt = work;
740     for ( j=0; j<n; j++ ) {
741       xb = x + bs*(*idx++);
742       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
743       workt += bs;
744     }
745     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
746     v += n*bs2;
747     z += bs;
748   }
749   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
750   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
751   PLogFlops(2*a->nz*bs2 );
752   PetscFunctionReturn(0);
753 }
754 
755 #undef __FUNC__
756 #define __FUNC__ "MatMultTrans_SeqBAIJ"
757 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
758 {
759   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
760   Scalar          *xg,*zg,*zb;
761    Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
762   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
763   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
764 
765 
766   PetscFunctionBegin;
767   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
768   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
769   PetscMemzero(z,N*sizeof(Scalar));
770 
771   idx   = a->j;
772   v     = a->a;
773   ii    = a->i;
774 
775   switch (bs) {
776   case 1:
777     for ( i=0; i<mbs; i++ ) {
778       n  = ii[1] - ii[0]; ii++;
779       xb = x + i; x1 = xb[0];
780       ib = idx + ai[i];
781       for ( j=0; j<n; j++ ) {
782         rval    = ib[j];
783         z[rval] += *v++ * x1;
784       }
785     }
786     break;
787   case 2:
788     for ( i=0; i<mbs; i++ ) {
789       n  = ii[1] - ii[0]; ii++;
790       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
791       ib = idx + ai[i];
792       for ( j=0; j<n; j++ ) {
793         rval      = ib[j]*2;
794         z[rval++] += v[0]*x1 + v[1]*x2;
795         z[rval++] += v[2]*x1 + v[3]*x2;
796         v += 4;
797       }
798     }
799     break;
800   case 3:
801     for ( i=0; i<mbs; i++ ) {
802       n  = ii[1] - ii[0]; ii++;
803       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
804       ib = idx + ai[i];
805       for ( j=0; j<n; j++ ) {
806         rval      = ib[j]*3;
807         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
808         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
809         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
810         v += 9;
811       }
812     }
813     break;
814   case 4:
815     for ( i=0; i<mbs; i++ ) {
816       n  = ii[1] - ii[0]; ii++;
817       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
818       ib = idx + ai[i];
819       for ( j=0; j<n; j++ ) {
820         rval      = ib[j]*4;
821         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
822         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
823         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
824         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
825         v += 16;
826       }
827     }
828     break;
829   case 5:
830     for ( i=0; i<mbs; i++ ) {
831       n  = ii[1] - ii[0]; ii++;
832       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
833       x4 = xb[3];   x5 = xb[4];
834       ib = idx + ai[i];
835       for ( j=0; j<n; j++ ) {
836         rval      = ib[j]*5;
837         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
838         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
839         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
840         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
841         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
842         v += 25;
843       }
844     }
845     break;
846       /* block sizes larger then 5 by 5 are handled by BLAS */
847     default: {
848       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
849       if (!a->mult_work) {
850         k = PetscMax(a->m,a->n);
851         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
852         CHKPTRQ(a->mult_work);
853       }
854       work = a->mult_work;
855       for ( i=0; i<mbs; i++ ) {
856         n     = ii[1] - ii[0]; ii++;
857         ncols = n*bs;
858         PetscMemzero(work,ncols*sizeof(Scalar));
859         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
860         v += n*bs2;
861         x += bs;
862         workt = work;
863         for ( j=0; j<n; j++ ) {
864           zb = z + bs*(*idx++);
865           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
866           workt += bs;
867         }
868       }
869     }
870   }
871   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
872   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
873   PLogFlops(2*a->nz*a->bs2 - a->n);
874   PetscFunctionReturn(0);
875 }
876 
877 #undef __FUNC__
878 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
879 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
880 
881 {
882   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
883   Scalar          *xg,*zg,*zb;
884   Scalar          *x,*z,*v,*xb,x1,x2,x3,x4,x5;
885   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
886   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
887 
888   PetscFunctionBegin;
889   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
890   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
891 
892   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
893   else PetscMemzero(z,N*sizeof(Scalar));
894 
895   idx   = a->j;
896   v     = a->a;
897   ii    = a->i;
898 
899   switch (bs) {
900   case 1:
901     for ( i=0; i<mbs; i++ ) {
902       n  = ii[1] - ii[0]; ii++;
903       xb = x + i; x1 = xb[0];
904       ib = idx + ai[i];
905       for ( j=0; j<n; j++ ) {
906         rval    = ib[j];
907         z[rval] += *v++ * x1;
908       }
909     }
910     break;
911   case 2:
912     for ( i=0; i<mbs; i++ ) {
913       n  = ii[1] - ii[0]; ii++;
914       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
915       ib = idx + ai[i];
916       for ( j=0; j<n; j++ ) {
917         rval      = ib[j]*2;
918         z[rval++] += v[0]*x1 + v[1]*x2;
919         z[rval++] += v[2]*x1 + v[3]*x2;
920         v += 4;
921       }
922     }
923     break;
924   case 3:
925     for ( i=0; i<mbs; i++ ) {
926       n  = ii[1] - ii[0]; ii++;
927       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
928       ib = idx + ai[i];
929       for ( j=0; j<n; j++ ) {
930         rval      = ib[j]*3;
931         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
932         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
933         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
934         v += 9;
935       }
936     }
937     break;
938   case 4:
939     for ( i=0; i<mbs; i++ ) {
940       n  = ii[1] - ii[0]; ii++;
941       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
942       ib = idx + ai[i];
943       for ( j=0; j<n; j++ ) {
944         rval      = ib[j]*4;
945         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
946         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
947         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
948         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
949         v += 16;
950       }
951     }
952     break;
953   case 5:
954     for ( i=0; i<mbs; i++ ) {
955       n  = ii[1] - ii[0]; ii++;
956       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
957       x4 = xb[3];   x5 = xb[4];
958       ib = idx + ai[i];
959       for ( j=0; j<n; j++ ) {
960         rval      = ib[j]*5;
961         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
962         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
963         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
964         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
965         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
966         v += 25;
967       }
968     }
969     break;
970       /* block sizes larger then 5 by 5 are handled by BLAS */
971     default: {
972       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
973       if (!a->mult_work) {
974         k = PetscMax(a->m,a->n);
975         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
976         CHKPTRQ(a->mult_work);
977       }
978       work = a->mult_work;
979       for ( i=0; i<mbs; i++ ) {
980         n     = ii[1] - ii[0]; ii++;
981         ncols = n*bs;
982         PetscMemzero(work,ncols*sizeof(Scalar));
983         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
984         v += n*bs2;
985         x += bs;
986         workt = work;
987         for ( j=0; j<n; j++ ) {
988           zb = z + bs*(*idx++);
989           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
990           workt += bs;
991         }
992       }
993     }
994   }
995   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
996   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
997   PLogFlops(2*a->nz*a->bs2);
998   PetscFunctionReturn(0);
999 }
1000 
1001 #undef __FUNC__
1002 #define __FUNC__ "MatScale_SeqBAIJ"
1003 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1004 {
1005   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1006   int         one = 1, totalnz = a->bs2*a->nz;
1007 
1008   PetscFunctionBegin;
1009   BLscal_( &totalnz, alpha, a->a, &one );
1010   PLogFlops(totalnz);
1011   PetscFunctionReturn(0);
1012 }
1013 
1014 #undef __FUNC__
1015 #define __FUNC__ "MatNorm_SeqBAIJ"
1016 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1017 {
1018   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1019   Scalar      *v = a->a;
1020   double      sum = 0.0;
1021   int         i,nz=a->nz,bs2=a->bs2;
1022 
1023   PetscFunctionBegin;
1024   if (type == NORM_FROBENIUS) {
1025     for (i=0; i< bs2*nz; i++ ) {
1026 #if defined(USE_PETSC_COMPLEX)
1027       sum += real(conj(*v)*(*v)); v++;
1028 #else
1029       sum += (*v)*(*v); v++;
1030 #endif
1031     }
1032     *norm = sqrt(sum);
1033   }
1034   else {
1035     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1036   }
1037   PetscFunctionReturn(0);
1038 }
1039 
1040 
1041 #undef __FUNC__
1042 #define __FUNC__ "MatEqual_SeqBAIJ"
1043 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1044 {
1045   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1046 
1047   PetscFunctionBegin;
1048   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1049 
1050   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1051   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1052     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1053   }
1054 
1055   /* if the a->i are the same */
1056   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1057     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1058   }
1059 
1060   /* if a->j are the same */
1061   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1062     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1063   }
1064 
1065   /* if a->a are the same */
1066   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1067     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1068   }
1069   *flg = PETSC_TRUE;
1070   PetscFunctionReturn(0);
1071 
1072 }
1073 
1074 #undef __FUNC__
1075 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1076 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1077 {
1078   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1079   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1080   Scalar      *x, zero = 0.0,*aa,*aa_j;
1081 
1082   PetscFunctionBegin;
1083   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1084   bs   = a->bs;
1085   aa   = a->a;
1086   ai   = a->i;
1087   aj   = a->j;
1088   ambs = a->mbs;
1089   bs2  = a->bs2;
1090 
1091   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1092   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1093   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1094   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1095   for ( i=0; i<ambs; i++ ) {
1096     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1097       if (aj[j] == i) {
1098         row  = i*bs;
1099         aa_j = aa+j*bs2;
1100         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1101         break;
1102       }
1103     }
1104   }
1105   PetscFunctionReturn(0);
1106 }
1107 
1108 #undef __FUNC__
1109 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1110 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1111 {
1112   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1113   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1114   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1115 
1116   PetscFunctionBegin;
1117   ai  = a->i;
1118   aj  = a->j;
1119   aa  = a->a;
1120   m   = a->m;
1121   n   = a->n;
1122   bs  = a->bs;
1123   mbs = a->mbs;
1124   bs2 = a->bs2;
1125   if (ll) {
1126     ierr = VecGetArray(ll,&l); CHKERRQ(ierr);
1127     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1128     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1129     for ( i=0; i<mbs; i++ ) { /* for each block row */
1130       M  = ai[i+1] - ai[i];
1131       li = l + i*bs;
1132       v  = aa + bs2*ai[i];
1133       for ( j=0; j<M; j++ ) { /* for each block */
1134         for ( k=0; k<bs2; k++ ) {
1135           (*v++) *= li[k%bs];
1136         }
1137       }
1138     }
1139   }
1140 
1141   if (rr) {
1142     ierr = VecGetArray(rr,&r); CHKERRQ(ierr);
1143     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1144     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1145     for ( i=0; i<mbs; i++ ) { /* for each block row */
1146       M  = ai[i+1] - ai[i];
1147       v  = aa + bs2*ai[i];
1148       for ( j=0; j<M; j++ ) { /* for each block */
1149         ri = r + bs*aj[ai[i]+j];
1150         for ( k=0; k<bs; k++ ) {
1151           x = ri[k];
1152           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1153         }
1154       }
1155     }
1156   }
1157   PetscFunctionReturn(0);
1158 }
1159 
1160 
1161 #undef __FUNC__
1162 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1163 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1164 {
1165   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1166 
1167   PetscFunctionBegin;
1168   info->rows_global    = (double)a->m;
1169   info->columns_global = (double)a->n;
1170   info->rows_local     = (double)a->m;
1171   info->columns_local  = (double)a->n;
1172   info->block_size     = a->bs2;
1173   info->nz_allocated   = a->maxnz;
1174   info->nz_used        = a->bs2*a->nz;
1175   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1176   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1177     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1178   info->assemblies   = A->num_ass;
1179   info->mallocs      = a->reallocs;
1180   info->memory       = A->mem;
1181   if (A->factor) {
1182     info->fill_ratio_given  = A->info.fill_ratio_given;
1183     info->fill_ratio_needed = A->info.fill_ratio_needed;
1184     info->factor_mallocs    = A->info.factor_mallocs;
1185   } else {
1186     info->fill_ratio_given  = 0;
1187     info->fill_ratio_needed = 0;
1188     info->factor_mallocs    = 0;
1189   }
1190   PetscFunctionReturn(0);
1191 }
1192 
1193 
1194 #undef __FUNC__
1195 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1196 int MatZeroEntries_SeqBAIJ(Mat A)
1197 {
1198   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1199 
1200   PetscFunctionBegin;
1201   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
1202   PetscFunctionReturn(0);
1203 }
1204