xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 77ed534321f0a860738694ee6d0aa216f0623125)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.30 1998/10/09 19:22:47 bsmith Exp bsmith $";
3 #endif
4 
5 #include "sys.h"
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 #include "bitarray.h"
10 
11 #undef __FUNC__
12 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
13 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
14 {
15   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
16   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
17   int         start, end, *ai, *aj,bs,*nidx2;
18   BT          table;
19 
20   PetscFunctionBegin;
21   m     = a->mbs;
22   ai    = a->i;
23   aj    = a->j;
24   bs    = a->bs;
25 
26   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
27 
28   ierr  = BTCreate(m,table); CHKERRQ(ierr);
29   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
30   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
31 
32   for ( i=0; i<is_max; i++ ) {
33     /* Initialise the two local arrays */
34     isz  = 0;
35     BTMemzero(m,table);
36 
37     /* Extract the indices, assume there can be duplicate entries */
38     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
39     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
40 
41     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42     for ( j=0; j<n ; ++j){
43       ival = idx[j]/bs; /* convert the indices into block indices */
44       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
45       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
46     }
47     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
49 
50     k = 0;
51     for ( j=0; j<ov; j++){ /* for each overlap*/
52       n = isz;
53       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
54         row   = nidx[k];
55         start = ai[row];
56         end   = ai[row+1];
57         for ( l = start; l<end ; l++){
58           val = aj[l];
59           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
60         }
61       }
62     }
63     /* expand the Index Set */
64     for (j=0; j<isz; j++ ) {
65       for (k=0; k<bs; k++ )
66         nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
69   }
70   BTDestroy(table);
71   PetscFree(nidx);
72   PetscFree(nidx2);
73   PetscFunctionReturn(0);
74 }
75 
76 #undef __FUNC__
77 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
78 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
79 {
80   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
81   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
82   int          row,mat_i,*mat_j,tcol,*mat_ilen;
83   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
84   int          *aj = a->j, *ai = a->i;
85   Scalar       *mat_a;
86   Mat          C;
87 
88   PetscFunctionBegin;
89   ierr = ISSorted(iscol,(PetscTruth*)&i);
90   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
91 
92   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
93   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
94   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
95   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
96 
97   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
98   ssmap = smap;
99   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
100   PetscMemzero(smap,oldcols*sizeof(int));
101   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
102   /* determine lens of each row */
103   for (i=0; i<nrows; i++) {
104     kstart  = ai[irow[i]];
105     kend    = kstart + a->ilen[irow[i]];
106     lens[i] = 0;
107       for ( k=kstart; k<kend; k++ ) {
108         if (ssmap[aj[k]]) {
109           lens[i]++;
110         }
111       }
112     }
113   /* Create and fill new matrix */
114   if (scall == MAT_REUSE_MATRIX) {
115     c = (Mat_SeqBAIJ *)((*B)->data);
116 
117     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
118     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
119       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
120     }
121     PetscMemzero(c->ilen,c->mbs*sizeof(int));
122     C = *B;
123   } else {
124     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
125   }
126   c = (Mat_SeqBAIJ *)(C->data);
127   for (i=0; i<nrows; i++) {
128     row    = irow[i];
129     kstart = ai[row];
130     kend   = kstart + a->ilen[row];
131     mat_i  = c->i[i];
132     mat_j  = c->j + mat_i;
133     mat_a  = c->a + mat_i*bs2;
134     mat_ilen = c->ilen + i;
135     for ( k=kstart; k<kend; k++ ) {
136       if ((tcol=ssmap[a->j[k]])) {
137         *mat_j++ = tcol - 1;
138         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2;
139         (*mat_ilen)++;
140       }
141     }
142   }
143 
144   /* Free work space */
145   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
146   PetscFree(smap); PetscFree(lens);
147   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
148   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
149 
150   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
151   *B = C;
152   PetscFunctionReturn(0);
153 }
154 
155 #undef __FUNC__
156 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
157 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
158 {
159   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
160   IS          is1,is2;
161   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
162 
163   PetscFunctionBegin;
164   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
165   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
166   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
167   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
168 
169   /* Verify if the indices corespond to each element in a block
170    and form the IS with compressed IS */
171   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
172   iary = vary + a->mbs;
173   PetscMemzero(vary,(a->mbs)*sizeof(int));
174   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
175   count = 0;
176   for (i=0; i<a->mbs; i++) {
177     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
178     if (vary[i]==bs) iary[count++] = i;
179   }
180   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
181 
182   PetscMemzero(vary,(a->mbs)*sizeof(int));
183   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
184   count = 0;
185   for (i=0; i<a->mbs; i++) {
186     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
187     if (vary[i]==bs) iary[count++] = i;
188   }
189   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
190   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
191   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
192   PetscFree(vary);
193 
194   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr);
195   ISDestroy(is1);
196   ISDestroy(is2);
197   PetscFunctionReturn(0);
198 }
199 
200 #undef __FUNC__
201 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
202 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall,Mat **B)
203 {
204   int ierr,i;
205 
206   PetscFunctionBegin;
207   if (scall == MAT_INITIAL_MATRIX) {
208     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
209   }
210 
211   for ( i=0; i<n; i++ ) {
212     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
213   }
214   PetscFunctionReturn(0);
215 }
216 
217 
218 /* -------------------------------------------------------*/
219 /* Should check that shapes of vectors and matrices match */
220 /* -------------------------------------------------------*/
221 #include "pinclude/blaslapack.h"
222 
223 #undef __FUNC__
224 #define __FUNC__ "MatMult_SeqBAIJ_1"
225 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
226 {
227   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
228    Scalar *x,*z,*v,sum;
229   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
230 
231   PetscFunctionBegin;
232   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
233   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
234 
235   idx   = a->j;
236   v     = a->a;
237   ii    = a->i;
238 
239   for ( i=0; i<mbs; i++ ) {
240     n    = ii[1] - ii[0]; ii++;
241     sum  = 0.0;
242     while (n--) sum += *v++ * x[*idx++];
243     z[i] = sum;
244   }
245   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
246   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
247   PLogFlops(2*a->nz - a->m);
248   PetscFunctionReturn(0);
249 }
250 
251 #undef __FUNC__
252 #define __FUNC__ "MatMult_SeqBAIJ_2"
253 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
254 {
255   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
256    Scalar *x,*z,*v,*xb,sum1,sum2;
257    Scalar x1,x2;
258   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
259 
260   PetscFunctionBegin;
261   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
262   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
263 
264   idx   = a->j;
265   v     = a->a;
266   ii    = a->i;
267 
268   for ( i=0; i<mbs; i++ ) {
269     n  = ii[1] - ii[0]; ii++;
270     sum1 = 0.0; sum2 = 0.0;
271     for ( j=0; j<n; j++ ) {
272       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
273       sum1 += v[0]*x1 + v[2]*x2;
274       sum2 += v[1]*x1 + v[3]*x2;
275       v += 4;
276     }
277     z[0] = sum1; z[1] = sum2;
278     z += 2;
279   }
280   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
281   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
282   PLogFlops(4*a->nz - a->m);
283   PetscFunctionReturn(0);
284 }
285 
286 #undef __FUNC__
287 #define __FUNC__ "MatMult_SeqBAIJ_3"
288 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
289 {
290   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data;
291   Scalar       *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
292   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;
293 
294 #if defined(HAVE_PRAGMA_DISJOINT)
295 #pragma disjoint(*v,*z,*xb)
296 #endif
297 
298   PetscFunctionBegin;
299   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
300   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
301 
302   idx   = a->j;
303   v     = a->a;
304   ii    = a->i;
305 
306   for ( i=0; i<mbs; i++ ) {
307     n  = ii[1] - ii[0]; ii++;
308     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
309     for ( j=0; j<n; j++ ) {
310       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
311       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
312       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
313       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
314       v += 9;
315     }
316     z[0] = sum1; z[1] = sum2; z[2] = sum3;
317     z += 3;
318   }
319   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
320   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
321   PLogFlops(18*a->nz - a->m);
322   PetscFunctionReturn(0);
323 }
324 
325 #undef __FUNC__
326 #define __FUNC__ "MatMult_SeqBAIJ_4"
327 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
328 {
329   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
330    Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
331   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
332 
333   PetscFunctionBegin;
334   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
335   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
336 
337   idx   = a->j;
338   v     = a->a;
339   ii    = a->i;
340 
341   for ( i=0; i<mbs; i++ ) {
342     n  = ii[1] - ii[0]; ii++;
343     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
344     for ( j=0; j<n; j++ ) {
345       xb = x + 4*(*idx++);
346       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
347       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
348       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
349       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
350       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
351       v += 16;
352     }
353     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
354     z += 4;
355   }
356   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
357   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
358   PLogFlops(32*a->nz - a->m);
359   PetscFunctionReturn(0);
360 }
361 
362 #undef __FUNC__
363 #define __FUNC__ "MatMult_SeqBAIJ_5"
364 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
365 {
366   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
367   Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
368   Scalar *  v,*  xb,*  z, *  x;
369   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
370 
371   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
372   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
373 
374   idx   = a->j;
375   v     = a->a;
376   ii    = a->i;
377 
378   for ( i=0; i<mbs; i++ ) {
379     n  = ii[1] - ii[0]; ii++;
380     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
381     for ( j=0; j<n; j++ ) {
382       xb = x + 5*(*idx++);
383       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
384       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
385       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
386       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
387       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
388       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
389       v += 25;
390     }
391     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
392     z += 5;
393   }
394   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
395   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
396   PLogFlops(50*a->nz - a->m);
397   PetscFunctionReturn(0);
398 }
399 
400 #undef __FUNC__
401 #define __FUNC__ "MatMult_SeqBAIJ_7"
402 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
403 {
404   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
405    Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
406    Scalar x1,x2,x3,x4,x5,x6,x7;
407   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
408 
409   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411 
412   idx   = a->j;
413   v     = a->a;
414   ii    = a->i;
415 
416   for ( i=0; i<mbs; i++ ) {
417     n  = ii[1] - ii[0]; ii++;
418     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
419     for ( j=0; j<n; j++ ) {
420       xb = x + 7*(*idx++);
421       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
422       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
423       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
424       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
425       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
426       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
427       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
428       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
429       v += 49;
430     }
431     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
432     z += 7;
433   }
434 
435   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
436   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
437   PLogFlops(98*a->nz - a->m);
438   PetscFunctionReturn(0);
439 }
440 
441 #undef __FUNC__
442 #define __FUNC__ "MatMult_SeqBAIJ_N"
443 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
444 {
445   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
446    Scalar *x,*z,*v,*xb;
447   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
448   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
449   int             _One = 1,ncols,k;
450 
451   PetscFunctionBegin;
452   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
453   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
454 
455   idx   = a->j;
456   v     = a->a;
457   ii    = a->i;
458 
459 
460   if (!a->mult_work) {
461     k = PetscMax(a->m,a->n);
462     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
463   }
464   work = a->mult_work;
465   for ( i=0; i<mbs; i++ ) {
466     n     = ii[1] - ii[0]; ii++;
467     ncols = n*bs;
468     workt = work;
469     for ( j=0; j<n; j++ ) {
470       xb = x + bs*(*idx++);
471       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
472       workt += bs;
473     }
474     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
475     v += n*bs2;
476     z += bs;
477   }
478   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
479   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
480   PLogFlops(2*a->nz*bs2 - a->m);
481   PetscFunctionReturn(0);
482 }
483 
484 #undef __FUNC__
485 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
486 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
487 {
488   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
489   Scalar          *x,*y,*z,*v,sum;
490   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
491 
492   PetscFunctionBegin;
493   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
494   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
495   if (zz != yy) {
496     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
497   } else {
498     z = y;
499   }
500 
501   idx   = a->j;
502   v     = a->a;
503   ii    = a->i;
504 
505   for ( i=0; i<mbs; i++ ) {
506     n    = ii[1] - ii[0]; ii++;
507     sum  = y[i];
508     while (n--) sum += *v++ * x[*idx++];
509     z[i] = sum;
510   }
511   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
512   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
513   if (zz != yy) {
514     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
515   }
516   PLogFlops(2*a->nz);
517   PetscFunctionReturn(0);
518 }
519 
520 #undef __FUNC__
521 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
522 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
523 {
524   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
525   Scalar          *x,*y,*z,*v,*xb,sum1,sum2;
526   Scalar          x1,x2;
527   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
528 
529   PetscFunctionBegin;
530   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
531   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
532   if (zz != yy) {
533     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
534   } else {
535     z = y;
536   }
537 
538   idx   = a->j;
539   v     = a->a;
540   ii    = a->i;
541 
542   for ( i=0; i<mbs; i++ ) {
543     n  = ii[1] - ii[0]; ii++;
544     sum1 = y[0]; sum2 = y[1];
545     for ( j=0; j<n; j++ ) {
546       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
547       sum1 += v[0]*x1 + v[2]*x2;
548       sum2 += v[1]*x1 + v[3]*x2;
549       v += 4;
550     }
551     z[0] = sum1; z[1] = sum2;
552     z += 2; y += 2;
553   }
554   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
555   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
556   if (zz != yy) {
557     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
558   }
559   PLogFlops(4*a->nz);
560   PetscFunctionReturn(0);
561 }
562 
563 #undef __FUNC__
564 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
565 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
566 {
567   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
568   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
569   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
570 
571   PetscFunctionBegin;
572   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
573   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
574   if (zz != yy) {
575     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
576   } else {
577     z = y;
578   }
579 
580   idx   = a->j;
581   v     = a->a;
582   ii    = a->i;
583 
584   for ( i=0; i<mbs; i++ ) {
585     n  = ii[1] - ii[0]; ii++;
586     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
587     for ( j=0; j<n; j++ ) {
588       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
589       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
590       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
591       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
592       v += 9;
593     }
594     z[0] = sum1; z[1] = sum2; z[2] = sum3;
595     z += 3; y += 3;
596   }
597   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
598   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
599   if (zz != yy) {
600     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
601   }
602   PLogFlops(18*a->nz);
603   PetscFunctionReturn(0);
604 }
605 
606 #undef __FUNC__
607 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
608 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
609 {
610   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
611   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
612   int             ierr,mbs=a->mbs,i,*idx,*ii;
613   int             j,n;
614 
615   PetscFunctionBegin;
616   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
617   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
618   if (zz != yy) {
619     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
620   } else {
621     z = y;
622   }
623 
624   idx   = a->j;
625   v     = a->a;
626   ii    = a->i;
627 
628   for ( i=0; i<mbs; i++ ) {
629     n  = ii[1] - ii[0]; ii++;
630     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
631     for ( j=0; j<n; j++ ) {
632       xb = x + 4*(*idx++);
633       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
634       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
635       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
636       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
637       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
638       v += 16;
639     }
640     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
641     z += 4; y += 4;
642   }
643   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
644   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
645   if (zz != yy) {
646     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
647   }
648   PLogFlops(32*a->nz);
649   PetscFunctionReturn(0);
650 }
651 
652 #undef __FUNC__
653 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
654 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
655 {
656   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
657   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
658   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
659 
660   PetscFunctionBegin;
661   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
662   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
663   if (zz != yy) {
664     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
665   } else {
666     z = y;
667   }
668 
669   idx   = a->j;
670   v     = a->a;
671   ii    = a->i;
672 
673   for ( i=0; i<mbs; i++ ) {
674     n  = ii[1] - ii[0]; ii++;
675     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
676     for ( j=0; j<n; j++ ) {
677       xb = x + 5*(*idx++);
678       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
679       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
680       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
681       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
682       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
683       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
684       v += 25;
685     }
686     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
687     z += 5; y += 5;
688   }
689   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
690   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
691   if (zz != yy) {
692     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
693   }
694   PLogFlops(50*a->nz);
695   PetscFunctionReturn(0);
696 }
697 
698 #undef __FUNC__
699 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
700 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
701 {
702   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
703   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
704   Scalar          x1,x2,x3,x4,x5,x6,x7;
705   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
706 
707   PetscFunctionBegin;
708   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
709   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
710   if (zz != yy) {
711     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
712   } else {
713     z = y;
714   }
715 
716   idx   = a->j;
717   v     = a->a;
718   ii    = a->i;
719 
720   for ( i=0; i<mbs; i++ ) {
721     n  = ii[1] - ii[0]; ii++;
722     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
723     for ( j=0; j<n; j++ ) {
724       xb = x + 7*(*idx++);
725       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
726       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
727       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
728       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
729       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
730       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
731       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
732       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
733       v += 49;
734     }
735     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
736     z += 7; y += 7;
737   }
738   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
739   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
740   if (zz != yy) {
741     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
742   }
743   PLogFlops(98*a->nz);
744   PetscFunctionReturn(0);
745 }
746 
747 
748 #undef __FUNC__
749 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
750 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
751 {
752   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
753   Scalar         *x,*z,*v,*xb;
754   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
755   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
756 
757   PetscFunctionBegin;
758   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
759 
760   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
761   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
762 
763   idx   = a->j;
764   v     = a->a;
765   ii    = a->i;
766 
767 
768   if (!a->mult_work) {
769     k = PetscMax(a->m,a->n);
770     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
771   }
772   work = a->mult_work;
773   for ( i=0; i<mbs; i++ ) {
774     n     = ii[1] - ii[0]; ii++;
775     ncols = n*bs;
776     workt = work;
777     for ( j=0; j<n; j++ ) {
778       xb = x + bs*(*idx++);
779       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
780       workt += bs;
781     }
782     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
783     v += n*bs2;
784     z += bs;
785   }
786   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
787   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
788   PLogFlops(2*a->nz*bs2 );
789   PetscFunctionReturn(0);
790 }
791 
792 #undef __FUNC__
793 #define __FUNC__ "MatMultTrans_SeqBAIJ"
794 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
795 {
796   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
797   Scalar          *xg,*zg,*zb;
798    Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
799   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
800   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
801 
802 
803   PetscFunctionBegin;
804   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
805   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
806   PetscMemzero(z,N*sizeof(Scalar));
807 
808   idx   = a->j;
809   v     = a->a;
810   ii    = a->i;
811 
812   switch (bs) {
813   case 1:
814     for ( i=0; i<mbs; i++ ) {
815       n  = ii[1] - ii[0]; ii++;
816       xb = x + i; x1 = xb[0];
817       ib = idx + ai[i];
818       for ( j=0; j<n; j++ ) {
819         rval    = ib[j];
820         z[rval] += *v++ * x1;
821       }
822     }
823     break;
824   case 2:
825     for ( i=0; i<mbs; i++ ) {
826       n  = ii[1] - ii[0]; ii++;
827       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
828       ib = idx + ai[i];
829       for ( j=0; j<n; j++ ) {
830         rval      = ib[j]*2;
831         z[rval++] += v[0]*x1 + v[1]*x2;
832         z[rval++] += v[2]*x1 + v[3]*x2;
833         v += 4;
834       }
835     }
836     break;
837   case 3:
838     for ( i=0; i<mbs; i++ ) {
839       n  = ii[1] - ii[0]; ii++;
840       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
841       ib = idx + ai[i];
842       for ( j=0; j<n; j++ ) {
843         rval      = ib[j]*3;
844         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
845         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
846         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
847         v += 9;
848       }
849     }
850     break;
851   case 4:
852     for ( i=0; i<mbs; i++ ) {
853       n  = ii[1] - ii[0]; ii++;
854       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
855       ib = idx + ai[i];
856       for ( j=0; j<n; j++ ) {
857         rval      = ib[j]*4;
858         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
859         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
860         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
861         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
862         v += 16;
863       }
864     }
865     break;
866   case 5:
867     for ( i=0; i<mbs; i++ ) {
868       n  = ii[1] - ii[0]; ii++;
869       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
870       x4 = xb[3];   x5 = xb[4];
871       ib = idx + ai[i];
872       for ( j=0; j<n; j++ ) {
873         rval      = ib[j]*5;
874         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
875         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
876         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
877         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
878         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
879         v += 25;
880       }
881     }
882     break;
883       /* block sizes larger then 5 by 5 are handled by BLAS */
884     default: {
885       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
886       if (!a->mult_work) {
887         k = PetscMax(a->m,a->n);
888         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
889         CHKPTRQ(a->mult_work);
890       }
891       work = a->mult_work;
892       for ( i=0; i<mbs; i++ ) {
893         n     = ii[1] - ii[0]; ii++;
894         ncols = n*bs;
895         PetscMemzero(work,ncols*sizeof(Scalar));
896         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
897         v += n*bs2;
898         x += bs;
899         workt = work;
900         for ( j=0; j<n; j++ ) {
901           zb = z + bs*(*idx++);
902           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
903           workt += bs;
904         }
905       }
906     }
907   }
908   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
909   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
910   PLogFlops(2*a->nz*a->bs2 - a->n);
911   PetscFunctionReturn(0);
912 }
913 
914 #undef __FUNC__
915 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
916 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
917 
918 {
919   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
920   Scalar          *xg,*zg,*zb;
921   Scalar          *x,*z,*v,*xb,x1,x2,x3,x4,x5;
922   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
923   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
924 
925   PetscFunctionBegin;
926   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
927   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
928 
929   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
930   else PetscMemzero(z,N*sizeof(Scalar));
931 
932   idx   = a->j;
933   v     = a->a;
934   ii    = a->i;
935 
936   switch (bs) {
937   case 1:
938     for ( i=0; i<mbs; i++ ) {
939       n  = ii[1] - ii[0]; ii++;
940       xb = x + i; x1 = xb[0];
941       ib = idx + ai[i];
942       for ( j=0; j<n; j++ ) {
943         rval    = ib[j];
944         z[rval] += *v++ * x1;
945       }
946     }
947     break;
948   case 2:
949     for ( i=0; i<mbs; i++ ) {
950       n  = ii[1] - ii[0]; ii++;
951       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
952       ib = idx + ai[i];
953       for ( j=0; j<n; j++ ) {
954         rval      = ib[j]*2;
955         z[rval++] += v[0]*x1 + v[1]*x2;
956         z[rval++] += v[2]*x1 + v[3]*x2;
957         v += 4;
958       }
959     }
960     break;
961   case 3:
962     for ( i=0; i<mbs; i++ ) {
963       n  = ii[1] - ii[0]; ii++;
964       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
965       ib = idx + ai[i];
966       for ( j=0; j<n; j++ ) {
967         rval      = ib[j]*3;
968         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
969         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
970         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
971         v += 9;
972       }
973     }
974     break;
975   case 4:
976     for ( i=0; i<mbs; i++ ) {
977       n  = ii[1] - ii[0]; ii++;
978       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
979       ib = idx + ai[i];
980       for ( j=0; j<n; j++ ) {
981         rval      = ib[j]*4;
982         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
983         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
984         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
985         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
986         v += 16;
987       }
988     }
989     break;
990   case 5:
991     for ( i=0; i<mbs; i++ ) {
992       n  = ii[1] - ii[0]; ii++;
993       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
994       x4 = xb[3];   x5 = xb[4];
995       ib = idx + ai[i];
996       for ( j=0; j<n; j++ ) {
997         rval      = ib[j]*5;
998         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
999         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1000         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1001         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1002         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1003         v += 25;
1004       }
1005     }
1006     break;
1007       /* block sizes larger then 5 by 5 are handled by BLAS */
1008     default: {
1009       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1010       if (!a->mult_work) {
1011         k = PetscMax(a->m,a->n);
1012         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1013         CHKPTRQ(a->mult_work);
1014       }
1015       work = a->mult_work;
1016       for ( i=0; i<mbs; i++ ) {
1017         n     = ii[1] - ii[0]; ii++;
1018         ncols = n*bs;
1019         PetscMemzero(work,ncols*sizeof(Scalar));
1020         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1021         v += n*bs2;
1022         x += bs;
1023         workt = work;
1024         for ( j=0; j<n; j++ ) {
1025           zb = z + bs*(*idx++);
1026           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1027           workt += bs;
1028         }
1029       }
1030     }
1031   }
1032   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1033   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1034   PLogFlops(2*a->nz*a->bs2);
1035   PetscFunctionReturn(0);
1036 }
1037 
1038 #undef __FUNC__
1039 #define __FUNC__ "MatScale_SeqBAIJ"
1040 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1041 {
1042   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1043   int         one = 1, totalnz = a->bs2*a->nz;
1044 
1045   PetscFunctionBegin;
1046   BLscal_( &totalnz, alpha, a->a, &one );
1047   PLogFlops(totalnz);
1048   PetscFunctionReturn(0);
1049 }
1050 
1051 #undef __FUNC__
1052 #define __FUNC__ "MatNorm_SeqBAIJ"
1053 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1054 {
1055   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1056   Scalar      *v = a->a;
1057   double      sum = 0.0;
1058   int         i,nz=a->nz,bs2=a->bs2;
1059 
1060   PetscFunctionBegin;
1061   if (type == NORM_FROBENIUS) {
1062     for (i=0; i< bs2*nz; i++ ) {
1063 #if defined(USE_PETSC_COMPLEX)
1064       sum += PetscReal(PetscConj(*v)*(*v)); v++;
1065 #else
1066       sum += (*v)*(*v); v++;
1067 #endif
1068     }
1069     *norm = sqrt(sum);
1070   }
1071   else {
1072     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1073   }
1074   PetscFunctionReturn(0);
1075 }
1076 
1077 
1078 #undef __FUNC__
1079 #define __FUNC__ "MatEqual_SeqBAIJ"
1080 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1081 {
1082   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1083 
1084   PetscFunctionBegin;
1085   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1086 
1087   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1088   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1089     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1090   }
1091 
1092   /* if the a->i are the same */
1093   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1094     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1095   }
1096 
1097   /* if a->j are the same */
1098   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1099     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1100   }
1101 
1102   /* if a->a are the same */
1103   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1104     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1105   }
1106   *flg = PETSC_TRUE;
1107   PetscFunctionReturn(0);
1108 
1109 }
1110 
1111 #undef __FUNC__
1112 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1113 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1114 {
1115   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1116   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1117   Scalar      *x, zero = 0.0,*aa,*aa_j;
1118 
1119   PetscFunctionBegin;
1120   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1121   bs   = a->bs;
1122   aa   = a->a;
1123   ai   = a->i;
1124   aj   = a->j;
1125   ambs = a->mbs;
1126   bs2  = a->bs2;
1127 
1128   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1129   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1130   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1131   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1132   for ( i=0; i<ambs; i++ ) {
1133     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1134       if (aj[j] == i) {
1135         row  = i*bs;
1136         aa_j = aa+j*bs2;
1137         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1138         break;
1139       }
1140     }
1141   }
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 #undef __FUNC__
1146 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1147 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1148 {
1149   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1150   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1151   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1152 
1153   PetscFunctionBegin;
1154   ai  = a->i;
1155   aj  = a->j;
1156   aa  = a->a;
1157   m   = a->m;
1158   n   = a->n;
1159   bs  = a->bs;
1160   mbs = a->mbs;
1161   bs2 = a->bs2;
1162   if (ll) {
1163     ierr = VecGetArray(ll,&l); CHKERRQ(ierr);
1164     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1165     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1166     for ( i=0; i<mbs; i++ ) { /* for each block row */
1167       M  = ai[i+1] - ai[i];
1168       li = l + i*bs;
1169       v  = aa + bs2*ai[i];
1170       for ( j=0; j<M; j++ ) { /* for each block */
1171         for ( k=0; k<bs2; k++ ) {
1172           (*v++) *= li[k%bs];
1173         }
1174       }
1175     }
1176   }
1177 
1178   if (rr) {
1179     ierr = VecGetArray(rr,&r); CHKERRQ(ierr);
1180     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1181     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1182     for ( i=0; i<mbs; i++ ) { /* for each block row */
1183       M  = ai[i+1] - ai[i];
1184       v  = aa + bs2*ai[i];
1185       for ( j=0; j<M; j++ ) { /* for each block */
1186         ri = r + bs*aj[ai[i]+j];
1187         for ( k=0; k<bs; k++ ) {
1188           x = ri[k];
1189           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1190         }
1191       }
1192     }
1193   }
1194   PetscFunctionReturn(0);
1195 }
1196 
1197 
1198 #undef __FUNC__
1199 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1200 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1201 {
1202   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1203 
1204   PetscFunctionBegin;
1205   info->rows_global    = (double)a->m;
1206   info->columns_global = (double)a->n;
1207   info->rows_local     = (double)a->m;
1208   info->columns_local  = (double)a->n;
1209   info->block_size     = a->bs2;
1210   info->nz_allocated   = a->maxnz;
1211   info->nz_used        = a->bs2*a->nz;
1212   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1213   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1214     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1215   info->assemblies   = A->num_ass;
1216   info->mallocs      = a->reallocs;
1217   info->memory       = A->mem;
1218   if (A->factor) {
1219     info->fill_ratio_given  = A->info.fill_ratio_given;
1220     info->fill_ratio_needed = A->info.fill_ratio_needed;
1221     info->factor_mallocs    = A->info.factor_mallocs;
1222   } else {
1223     info->fill_ratio_given  = 0;
1224     info->fill_ratio_needed = 0;
1225     info->factor_mallocs    = 0;
1226   }
1227   PetscFunctionReturn(0);
1228 }
1229 
1230 
1231 #undef __FUNC__
1232 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1233 int MatZeroEntries_SeqBAIJ(Mat A)
1234 {
1235   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1236 
1237   PetscFunctionBegin;
1238   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
1239   PetscFunctionReturn(0);
1240 }
1241