xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 2e8a6d31cfff42dc61c2443ddfa186a411c27ca7)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.29 1998/08/03 14:59:21 balay Exp bsmith $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "sys.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/spops.h"
10 #include "petsc.h"
11 #include "bitarray.h"
12 
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
16 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
17 {
18   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
19   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
20   int         start, end, *ai, *aj,bs,*nidx2;
21   BT          table;
22 
23   PetscFunctionBegin;
24   m     = a->mbs;
25   ai    = a->i;
26   aj    = a->j;
27   bs    = a->bs;
28 
29   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
30 
31   ierr  = BTCreate(m,table); CHKERRQ(ierr);
32   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
33   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
34 
35   for ( i=0; i<is_max; i++ ) {
36     /* Initialise the two local arrays */
37     isz  = 0;
38     BTMemzero(m,table);
39 
40     /* Extract the indices, assume there can be duplicate entries */
41     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
42     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
43 
44     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
45     for ( j=0; j<n ; ++j){
46       ival = idx[j]/bs; /* convert the indices into block indices */
47       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
48       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
49     }
50     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
51     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
52 
53     k = 0;
54     for ( j=0; j<ov; j++){ /* for each overlap*/
55       n = isz;
56       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
57         row   = nidx[k];
58         start = ai[row];
59         end   = ai[row+1];
60         for ( l = start; l<end ; l++){
61           val = aj[l];
62           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
63         }
64       }
65     }
66     /* expand the Index Set */
67     for (j=0; j<isz; j++ ) {
68       for (k=0; k<bs; k++ )
69         nidx2[j*bs+k] = nidx[j]*bs+k;
70     }
71     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
72   }
73   BTDestroy(table);
74   PetscFree(nidx);
75   PetscFree(nidx2);
76   PetscFunctionReturn(0);
77 }
78 
79 #undef __FUNC__
80 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
81 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
82 {
83   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
84   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
85   int          row,mat_i,*mat_j,tcol,*mat_ilen;
86   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
87   int          *aj = a->j, *ai = a->i;
88   Scalar       *mat_a;
89   Mat          C;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,(PetscTruth*)&i);
93   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
97   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
98   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
99 
100   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
101   ssmap = smap;
102   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
103   PetscMemzero(smap,oldcols*sizeof(int));
104   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
105   /* determine lens of each row */
106   for (i=0; i<nrows; i++) {
107     kstart  = ai[irow[i]];
108     kend    = kstart + a->ilen[irow[i]];
109     lens[i] = 0;
110       for ( k=kstart; k<kend; k++ ) {
111         if (ssmap[aj[k]]) {
112           lens[i]++;
113         }
114       }
115     }
116   /* Create and fill new matrix */
117   if (scall == MAT_REUSE_MATRIX) {
118     c = (Mat_SeqBAIJ *)((*B)->data);
119 
120     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
121     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
122       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
123     }
124     PetscMemzero(c->ilen,c->mbs*sizeof(int));
125     C = *B;
126   } else {
127     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
128   }
129   c = (Mat_SeqBAIJ *)(C->data);
130   for (i=0; i<nrows; i++) {
131     row    = irow[i];
132     kstart = ai[row];
133     kend   = kstart + a->ilen[row];
134     mat_i  = c->i[i];
135     mat_j  = c->j + mat_i;
136     mat_a  = c->a + mat_i*bs2;
137     mat_ilen = c->ilen + i;
138     for ( k=kstart; k<kend; k++ ) {
139       if ((tcol=ssmap[a->j[k]])) {
140         *mat_j++ = tcol - 1;
141         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2;
142         (*mat_ilen)++;
143       }
144     }
145   }
146 
147   /* Free work space */
148   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
149   PetscFree(smap); PetscFree(lens);
150   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
151   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
152 
153   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
154   *B = C;
155   PetscFunctionReturn(0);
156 }
157 
158 #undef __FUNC__
159 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
160 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
161 {
162   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
163   IS          is1,is2;
164   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
165 
166   PetscFunctionBegin;
167   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
168   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
169   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
170   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
171 
172   /* Verify if the indices corespond to each element in a block
173    and form the IS with compressed IS */
174   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
175   iary = vary + a->mbs;
176   PetscMemzero(vary,(a->mbs)*sizeof(int));
177   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
178   count = 0;
179   for (i=0; i<a->mbs; i++) {
180     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
181     if (vary[i]==bs) iary[count++] = i;
182   }
183   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
184 
185   PetscMemzero(vary,(a->mbs)*sizeof(int));
186   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
187   count = 0;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
190     if (vary[i]==bs) iary[count++] = i;
191   }
192   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
193   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
194   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
195   PetscFree(vary);
196 
197   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr);
198   ISDestroy(is1);
199   ISDestroy(is2);
200   PetscFunctionReturn(0);
201 }
202 
203 #undef __FUNC__
204 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
205 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall,Mat **B)
206 {
207   int ierr,i;
208 
209   PetscFunctionBegin;
210   if (scall == MAT_INITIAL_MATRIX) {
211     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
212   }
213 
214   for ( i=0; i<n; i++ ) {
215     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
216   }
217   PetscFunctionReturn(0);
218 }
219 
220 
221 /* -------------------------------------------------------*/
222 /* Should check that shapes of vectors and matrices match */
223 /* -------------------------------------------------------*/
224 #include "pinclude/blaslapack.h"
225 
226 #undef __FUNC__
227 #define __FUNC__ "MatMult_SeqBAIJ_1"
228 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229 {
230   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
231    Scalar *x,*z,*v,sum;
232   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
233 
234   PetscFunctionBegin;
235   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
236   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
237 
238   idx   = a->j;
239   v     = a->a;
240   ii    = a->i;
241 
242   for ( i=0; i<mbs; i++ ) {
243     n    = ii[1] - ii[0]; ii++;
244     sum  = 0.0;
245     while (n--) sum += *v++ * x[*idx++];
246     z[i] = sum;
247   }
248   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
250   PLogFlops(2*a->nz - a->m);
251   PetscFunctionReturn(0);
252 }
253 
254 #undef __FUNC__
255 #define __FUNC__ "MatMult_SeqBAIJ_2"
256 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
257 {
258   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
259    Scalar *x,*z,*v,*xb,sum1,sum2;
260    Scalar x1,x2;
261   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
262 
263   PetscFunctionBegin;
264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
265   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
266 
267   idx   = a->j;
268   v     = a->a;
269   ii    = a->i;
270 
271   for ( i=0; i<mbs; i++ ) {
272     n  = ii[1] - ii[0]; ii++;
273     sum1 = 0.0; sum2 = 0.0;
274     for ( j=0; j<n; j++ ) {
275       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
276       sum1 += v[0]*x1 + v[2]*x2;
277       sum2 += v[1]*x1 + v[3]*x2;
278       v += 4;
279     }
280     z[0] = sum1; z[1] = sum2;
281     z += 2;
282   }
283   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
284   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
285   PLogFlops(4*a->nz - a->m);
286   PetscFunctionReturn(0);
287 }
288 
289 #undef __FUNC__
290 #define __FUNC__ "MatMult_SeqBAIJ_3"
291 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
292 {
293   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data;
294   Scalar       *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
295   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;
296 
297 #if defined(HAVE_PRAGMA_DISJOINT)
298 #pragma disjoint(*v,*z,*xb)
299 #endif
300 
301   PetscFunctionBegin;
302   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304 
305   idx   = a->j;
306   v     = a->a;
307   ii    = a->i;
308 
309   for ( i=0; i<mbs; i++ ) {
310     n  = ii[1] - ii[0]; ii++;
311     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
312     for ( j=0; j<n; j++ ) {
313       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
314       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
315       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
316       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
317       v += 9;
318     }
319     z[0] = sum1; z[1] = sum2; z[2] = sum3;
320     z += 3;
321   }
322   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
323   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
324   PLogFlops(18*a->nz - a->m);
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNC__
329 #define __FUNC__ "MatMult_SeqBAIJ_4"
330 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
333    Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
334   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
335 
336   PetscFunctionBegin;
337   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
339 
340   idx   = a->j;
341   v     = a->a;
342   ii    = a->i;
343 
344   for ( i=0; i<mbs; i++ ) {
345     n  = ii[1] - ii[0]; ii++;
346     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
347     for ( j=0; j<n; j++ ) {
348       xb = x + 4*(*idx++);
349       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
350       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
351       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
352       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
353       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
354       v += 16;
355     }
356     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
357     z += 4;
358   }
359   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
360   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
361   PLogFlops(32*a->nz - a->m);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNC__
366 #define __FUNC__ "MatMult_SeqBAIJ_5"
367 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
368 {
369   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
370   Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
371   Scalar *  v,*  xb,*  z, *  x;
372   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
373 
374   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
375   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
376 
377   idx   = a->j;
378   v     = a->a;
379   ii    = a->i;
380 
381   for ( i=0; i<mbs; i++ ) {
382     n  = ii[1] - ii[0]; ii++;
383     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
384     for ( j=0; j<n; j++ ) {
385       xb = x + 5*(*idx++);
386       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
387       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
388       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
389       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
390       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
391       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
392       v += 25;
393     }
394     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
395     z += 5;
396   }
397   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
398   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
399   PLogFlops(50*a->nz - a->m);
400   PetscFunctionReturn(0);
401 }
402 
403 #undef __FUNC__
404 #define __FUNC__ "MatMult_SeqBAIJ_7"
405 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
406 {
407   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
408    Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
409    Scalar x1,x2,x3,x4,x5,x6,x7;
410   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
411 
412   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
413   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
414 
415   idx   = a->j;
416   v     = a->a;
417   ii    = a->i;
418 
419   for ( i=0; i<mbs; i++ ) {
420     n  = ii[1] - ii[0]; ii++;
421     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
422     for ( j=0; j<n; j++ ) {
423       xb = x + 7*(*idx++);
424       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
425       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
426       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
427       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
428       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
429       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
430       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
431       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
432       v += 49;
433     }
434     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
435     z += 7;
436   }
437 
438   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
439   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
440   PLogFlops(98*a->nz - a->m);
441   PetscFunctionReturn(0);
442 }
443 
444 #undef __FUNC__
445 #define __FUNC__ "MatMult_SeqBAIJ_N"
446 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
447 {
448   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
449    Scalar *x,*z,*v,*xb;
450   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
451   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
452   int             _One = 1,ncols,k;
453 
454   PetscFunctionBegin;
455   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
456   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
457 
458   idx   = a->j;
459   v     = a->a;
460   ii    = a->i;
461 
462 
463   if (!a->mult_work) {
464     k = PetscMax(a->m,a->n);
465     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
466   }
467   work = a->mult_work;
468   for ( i=0; i<mbs; i++ ) {
469     n     = ii[1] - ii[0]; ii++;
470     ncols = n*bs;
471     workt = work;
472     for ( j=0; j<n; j++ ) {
473       xb = x + bs*(*idx++);
474       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
475       workt += bs;
476     }
477     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
478     v += n*bs2;
479     z += bs;
480   }
481   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
482   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
483   PLogFlops(2*a->nz*bs2 - a->m);
484   PetscFunctionReturn(0);
485 }
486 
487 #undef __FUNC__
488 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
489 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
490 {
491   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
492   Scalar          *x,*y,*z,*v,sum;
493   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
494 
495   PetscFunctionBegin;
496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
497   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
498   if (zz != yy) {
499     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
500   } else {
501     z = y;
502   }
503 
504   idx   = a->j;
505   v     = a->a;
506   ii    = a->i;
507 
508   for ( i=0; i<mbs; i++ ) {
509     n    = ii[1] - ii[0]; ii++;
510     sum  = y[i];
511     while (n--) sum += *v++ * x[*idx++];
512     z[i] = sum;
513   }
514   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
515   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
516   if (zz != yy) {
517     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
518   }
519   PLogFlops(2*a->nz);
520   PetscFunctionReturn(0);
521 }
522 
523 #undef __FUNC__
524 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
525 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
526 {
527   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
528   Scalar          *x,*y,*z,*v,*xb,sum1,sum2;
529   Scalar          x1,x2;
530   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
531 
532   PetscFunctionBegin;
533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
534   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
535   if (zz != yy) {
536     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
537   } else {
538     z = y;
539   }
540 
541   idx   = a->j;
542   v     = a->a;
543   ii    = a->i;
544 
545   for ( i=0; i<mbs; i++ ) {
546     n  = ii[1] - ii[0]; ii++;
547     sum1 = y[0]; sum2 = y[1];
548     for ( j=0; j<n; j++ ) {
549       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
550       sum1 += v[0]*x1 + v[2]*x2;
551       sum2 += v[1]*x1 + v[3]*x2;
552       v += 4;
553     }
554     z[0] = sum1; z[1] = sum2;
555     z += 2; y += 2;
556   }
557   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
558   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
559   if (zz != yy) {
560     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
561   }
562   PLogFlops(4*a->nz);
563   PetscFunctionReturn(0);
564 }
565 
566 #undef __FUNC__
567 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
568 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
569 {
570   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
571   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
572   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
573 
574   PetscFunctionBegin;
575   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
576   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
577   if (zz != yy) {
578     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
579   } else {
580     z = y;
581   }
582 
583   idx   = a->j;
584   v     = a->a;
585   ii    = a->i;
586 
587   for ( i=0; i<mbs; i++ ) {
588     n  = ii[1] - ii[0]; ii++;
589     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
590     for ( j=0; j<n; j++ ) {
591       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
592       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
593       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
594       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
595       v += 9;
596     }
597     z[0] = sum1; z[1] = sum2; z[2] = sum3;
598     z += 3; y += 3;
599   }
600   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
601   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
602   if (zz != yy) {
603     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
604   }
605   PLogFlops(18*a->nz);
606   PetscFunctionReturn(0);
607 }
608 
609 #undef __FUNC__
610 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
611 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
612 {
613   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
614   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
615   int             ierr,mbs=a->mbs,i,*idx,*ii;
616   int             j,n;
617 
618   PetscFunctionBegin;
619   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
620   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
621   if (zz != yy) {
622     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
623   } else {
624     z = y;
625   }
626 
627   idx   = a->j;
628   v     = a->a;
629   ii    = a->i;
630 
631   for ( i=0; i<mbs; i++ ) {
632     n  = ii[1] - ii[0]; ii++;
633     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
634     for ( j=0; j<n; j++ ) {
635       xb = x + 4*(*idx++);
636       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
637       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
638       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
639       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
640       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
641       v += 16;
642     }
643     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
644     z += 4; y += 4;
645   }
646   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
647   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
648   if (zz != yy) {
649     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
650   }
651   PLogFlops(32*a->nz);
652   PetscFunctionReturn(0);
653 }
654 
655 #undef __FUNC__
656 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
657 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
658 {
659   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
660   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
661   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
662 
663   PetscFunctionBegin;
664   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
665   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
666   if (zz != yy) {
667     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
668   } else {
669     z = y;
670   }
671 
672   idx   = a->j;
673   v     = a->a;
674   ii    = a->i;
675 
676   for ( i=0; i<mbs; i++ ) {
677     n  = ii[1] - ii[0]; ii++;
678     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
679     for ( j=0; j<n; j++ ) {
680       xb = x + 5*(*idx++);
681       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
682       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
683       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
684       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
685       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
686       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
687       v += 25;
688     }
689     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
690     z += 5; y += 5;
691   }
692   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
693   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
694   if (zz != yy) {
695     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
696   }
697   PLogFlops(50*a->nz);
698   PetscFunctionReturn(0);
699 }
700 
701 #undef __FUNC__
702 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
703 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
704 {
705   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
706   Scalar          *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
707   Scalar          x1,x2,x3,x4,x5,x6,x7;
708   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
709 
710   PetscFunctionBegin;
711   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
712   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
713   if (zz != yy) {
714     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
715   } else {
716     z = y;
717   }
718 
719   idx   = a->j;
720   v     = a->a;
721   ii    = a->i;
722 
723   for ( i=0; i<mbs; i++ ) {
724     n  = ii[1] - ii[0]; ii++;
725     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
726     for ( j=0; j<n; j++ ) {
727       xb = x + 7*(*idx++);
728       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
729       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
730       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
731       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
732       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
733       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
734       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
735       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
736       v += 49;
737     }
738     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
739     z += 7; y += 7;
740   }
741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
742   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
743   if (zz != yy) {
744     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
745   }
746   PLogFlops(98*a->nz);
747   PetscFunctionReturn(0);
748 }
749 
750 
751 #undef __FUNC__
752 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
753 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
754 {
755   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
756   Scalar         *x,*z,*v,*xb;
757   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
758   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
759 
760   PetscFunctionBegin;
761   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
762 
763   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
764   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
765 
766   idx   = a->j;
767   v     = a->a;
768   ii    = a->i;
769 
770 
771   if (!a->mult_work) {
772     k = PetscMax(a->m,a->n);
773     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
774   }
775   work = a->mult_work;
776   for ( i=0; i<mbs; i++ ) {
777     n     = ii[1] - ii[0]; ii++;
778     ncols = n*bs;
779     workt = work;
780     for ( j=0; j<n; j++ ) {
781       xb = x + bs*(*idx++);
782       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
783       workt += bs;
784     }
785     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
786     v += n*bs2;
787     z += bs;
788   }
789   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
790   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
791   PLogFlops(2*a->nz*bs2 );
792   PetscFunctionReturn(0);
793 }
794 
795 #undef __FUNC__
796 #define __FUNC__ "MatMultTrans_SeqBAIJ"
797 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
798 {
799   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
800   Scalar          *xg,*zg,*zb;
801    Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
802   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
803   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
804 
805 
806   PetscFunctionBegin;
807   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
808   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
809   PetscMemzero(z,N*sizeof(Scalar));
810 
811   idx   = a->j;
812   v     = a->a;
813   ii    = a->i;
814 
815   switch (bs) {
816   case 1:
817     for ( i=0; i<mbs; i++ ) {
818       n  = ii[1] - ii[0]; ii++;
819       xb = x + i; x1 = xb[0];
820       ib = idx + ai[i];
821       for ( j=0; j<n; j++ ) {
822         rval    = ib[j];
823         z[rval] += *v++ * x1;
824       }
825     }
826     break;
827   case 2:
828     for ( i=0; i<mbs; i++ ) {
829       n  = ii[1] - ii[0]; ii++;
830       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
831       ib = idx + ai[i];
832       for ( j=0; j<n; j++ ) {
833         rval      = ib[j]*2;
834         z[rval++] += v[0]*x1 + v[1]*x2;
835         z[rval++] += v[2]*x1 + v[3]*x2;
836         v += 4;
837       }
838     }
839     break;
840   case 3:
841     for ( i=0; i<mbs; i++ ) {
842       n  = ii[1] - ii[0]; ii++;
843       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
844       ib = idx + ai[i];
845       for ( j=0; j<n; j++ ) {
846         rval      = ib[j]*3;
847         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
848         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
849         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
850         v += 9;
851       }
852     }
853     break;
854   case 4:
855     for ( i=0; i<mbs; i++ ) {
856       n  = ii[1] - ii[0]; ii++;
857       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
858       ib = idx + ai[i];
859       for ( j=0; j<n; j++ ) {
860         rval      = ib[j]*4;
861         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
862         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
863         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
864         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
865         v += 16;
866       }
867     }
868     break;
869   case 5:
870     for ( i=0; i<mbs; i++ ) {
871       n  = ii[1] - ii[0]; ii++;
872       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
873       x4 = xb[3];   x5 = xb[4];
874       ib = idx + ai[i];
875       for ( j=0; j<n; j++ ) {
876         rval      = ib[j]*5;
877         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
878         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
879         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
880         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
881         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
882         v += 25;
883       }
884     }
885     break;
886       /* block sizes larger then 5 by 5 are handled by BLAS */
887     default: {
888       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
889       if (!a->mult_work) {
890         k = PetscMax(a->m,a->n);
891         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
892         CHKPTRQ(a->mult_work);
893       }
894       work = a->mult_work;
895       for ( i=0; i<mbs; i++ ) {
896         n     = ii[1] - ii[0]; ii++;
897         ncols = n*bs;
898         PetscMemzero(work,ncols*sizeof(Scalar));
899         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
900         v += n*bs2;
901         x += bs;
902         workt = work;
903         for ( j=0; j<n; j++ ) {
904           zb = z + bs*(*idx++);
905           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
906           workt += bs;
907         }
908       }
909     }
910   }
911   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
912   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
913   PLogFlops(2*a->nz*a->bs2 - a->n);
914   PetscFunctionReturn(0);
915 }
916 
917 #undef __FUNC__
918 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
919 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
920 
921 {
922   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
923   Scalar          *xg,*zg,*zb;
924   Scalar          *x,*z,*v,*xb,x1,x2,x3,x4,x5;
925   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
926   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
927 
928   PetscFunctionBegin;
929   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
930   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
931 
932   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
933   else PetscMemzero(z,N*sizeof(Scalar));
934 
935   idx   = a->j;
936   v     = a->a;
937   ii    = a->i;
938 
939   switch (bs) {
940   case 1:
941     for ( i=0; i<mbs; i++ ) {
942       n  = ii[1] - ii[0]; ii++;
943       xb = x + i; x1 = xb[0];
944       ib = idx + ai[i];
945       for ( j=0; j<n; j++ ) {
946         rval    = ib[j];
947         z[rval] += *v++ * x1;
948       }
949     }
950     break;
951   case 2:
952     for ( i=0; i<mbs; i++ ) {
953       n  = ii[1] - ii[0]; ii++;
954       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
955       ib = idx + ai[i];
956       for ( j=0; j<n; j++ ) {
957         rval      = ib[j]*2;
958         z[rval++] += v[0]*x1 + v[1]*x2;
959         z[rval++] += v[2]*x1 + v[3]*x2;
960         v += 4;
961       }
962     }
963     break;
964   case 3:
965     for ( i=0; i<mbs; i++ ) {
966       n  = ii[1] - ii[0]; ii++;
967       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
968       ib = idx + ai[i];
969       for ( j=0; j<n; j++ ) {
970         rval      = ib[j]*3;
971         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
972         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
973         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
974         v += 9;
975       }
976     }
977     break;
978   case 4:
979     for ( i=0; i<mbs; i++ ) {
980       n  = ii[1] - ii[0]; ii++;
981       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
982       ib = idx + ai[i];
983       for ( j=0; j<n; j++ ) {
984         rval      = ib[j]*4;
985         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
986         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
987         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
988         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
989         v += 16;
990       }
991     }
992     break;
993   case 5:
994     for ( i=0; i<mbs; i++ ) {
995       n  = ii[1] - ii[0]; ii++;
996       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
997       x4 = xb[3];   x5 = xb[4];
998       ib = idx + ai[i];
999       for ( j=0; j<n; j++ ) {
1000         rval      = ib[j]*5;
1001         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1002         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1003         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1004         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1005         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1006         v += 25;
1007       }
1008     }
1009     break;
1010       /* block sizes larger then 5 by 5 are handled by BLAS */
1011     default: {
1012       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1013       if (!a->mult_work) {
1014         k = PetscMax(a->m,a->n);
1015         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1016         CHKPTRQ(a->mult_work);
1017       }
1018       work = a->mult_work;
1019       for ( i=0; i<mbs; i++ ) {
1020         n     = ii[1] - ii[0]; ii++;
1021         ncols = n*bs;
1022         PetscMemzero(work,ncols*sizeof(Scalar));
1023         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1024         v += n*bs2;
1025         x += bs;
1026         workt = work;
1027         for ( j=0; j<n; j++ ) {
1028           zb = z + bs*(*idx++);
1029           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1030           workt += bs;
1031         }
1032       }
1033     }
1034   }
1035   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1036   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1037   PLogFlops(2*a->nz*a->bs2);
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 #undef __FUNC__
1042 #define __FUNC__ "MatScale_SeqBAIJ"
1043 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1044 {
1045   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1046   int         one = 1, totalnz = a->bs2*a->nz;
1047 
1048   PetscFunctionBegin;
1049   BLscal_( &totalnz, alpha, a->a, &one );
1050   PLogFlops(totalnz);
1051   PetscFunctionReturn(0);
1052 }
1053 
1054 #undef __FUNC__
1055 #define __FUNC__ "MatNorm_SeqBAIJ"
1056 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1057 {
1058   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1059   Scalar      *v = a->a;
1060   double      sum = 0.0;
1061   int         i,nz=a->nz,bs2=a->bs2;
1062 
1063   PetscFunctionBegin;
1064   if (type == NORM_FROBENIUS) {
1065     for (i=0; i< bs2*nz; i++ ) {
1066 #if defined(USE_PETSC_COMPLEX)
1067       sum += PetscReal(PetscConj(*v)*(*v)); v++;
1068 #else
1069       sum += (*v)*(*v); v++;
1070 #endif
1071     }
1072     *norm = sqrt(sum);
1073   }
1074   else {
1075     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1076   }
1077   PetscFunctionReturn(0);
1078 }
1079 
1080 
1081 #undef __FUNC__
1082 #define __FUNC__ "MatEqual_SeqBAIJ"
1083 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1084 {
1085   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1086 
1087   PetscFunctionBegin;
1088   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1089 
1090   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1091   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1092     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1093   }
1094 
1095   /* if the a->i are the same */
1096   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1097     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1098   }
1099 
1100   /* if a->j are the same */
1101   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1102     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1103   }
1104 
1105   /* if a->a are the same */
1106   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1107     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1108   }
1109   *flg = PETSC_TRUE;
1110   PetscFunctionReturn(0);
1111 
1112 }
1113 
1114 #undef __FUNC__
1115 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1116 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1117 {
1118   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1119   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1120   Scalar      *x, zero = 0.0,*aa,*aa_j;
1121 
1122   PetscFunctionBegin;
1123   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1124   bs   = a->bs;
1125   aa   = a->a;
1126   ai   = a->i;
1127   aj   = a->j;
1128   ambs = a->mbs;
1129   bs2  = a->bs2;
1130 
1131   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1132   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1133   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1134   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1135   for ( i=0; i<ambs; i++ ) {
1136     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1137       if (aj[j] == i) {
1138         row  = i*bs;
1139         aa_j = aa+j*bs2;
1140         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1141         break;
1142       }
1143     }
1144   }
1145   PetscFunctionReturn(0);
1146 }
1147 
1148 #undef __FUNC__
1149 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1150 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1151 {
1152   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1153   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1154   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1155 
1156   PetscFunctionBegin;
1157   ai  = a->i;
1158   aj  = a->j;
1159   aa  = a->a;
1160   m   = a->m;
1161   n   = a->n;
1162   bs  = a->bs;
1163   mbs = a->mbs;
1164   bs2 = a->bs2;
1165   if (ll) {
1166     ierr = VecGetArray(ll,&l); CHKERRQ(ierr);
1167     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1168     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1169     for ( i=0; i<mbs; i++ ) { /* for each block row */
1170       M  = ai[i+1] - ai[i];
1171       li = l + i*bs;
1172       v  = aa + bs2*ai[i];
1173       for ( j=0; j<M; j++ ) { /* for each block */
1174         for ( k=0; k<bs2; k++ ) {
1175           (*v++) *= li[k%bs];
1176         }
1177       }
1178     }
1179   }
1180 
1181   if (rr) {
1182     ierr = VecGetArray(rr,&r); CHKERRQ(ierr);
1183     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1184     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1185     for ( i=0; i<mbs; i++ ) { /* for each block row */
1186       M  = ai[i+1] - ai[i];
1187       v  = aa + bs2*ai[i];
1188       for ( j=0; j<M; j++ ) { /* for each block */
1189         ri = r + bs*aj[ai[i]+j];
1190         for ( k=0; k<bs; k++ ) {
1191           x = ri[k];
1192           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1193         }
1194       }
1195     }
1196   }
1197   PetscFunctionReturn(0);
1198 }
1199 
1200 
1201 #undef __FUNC__
1202 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1203 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1204 {
1205   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1206 
1207   PetscFunctionBegin;
1208   info->rows_global    = (double)a->m;
1209   info->columns_global = (double)a->n;
1210   info->rows_local     = (double)a->m;
1211   info->columns_local  = (double)a->n;
1212   info->block_size     = a->bs2;
1213   info->nz_allocated   = a->maxnz;
1214   info->nz_used        = a->bs2*a->nz;
1215   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1216   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1217     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1218   info->assemblies   = A->num_ass;
1219   info->mallocs      = a->reallocs;
1220   info->memory       = A->mem;
1221   if (A->factor) {
1222     info->fill_ratio_given  = A->info.fill_ratio_given;
1223     info->fill_ratio_needed = A->info.fill_ratio_needed;
1224     info->factor_mallocs    = A->info.factor_mallocs;
1225   } else {
1226     info->fill_ratio_given  = 0;
1227     info->fill_ratio_needed = 0;
1228     info->factor_mallocs    = 0;
1229   }
1230   PetscFunctionReturn(0);
1231 }
1232 
1233 
1234 #undef __FUNC__
1235 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1236 int MatZeroEntries_SeqBAIJ(Mat A)
1237 {
1238   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1239 
1240   PetscFunctionBegin;
1241   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
1242   PetscFunctionReturn(0);
1243 }
1244