xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision c31ba22a230fe2c23596c66917081dfec3ccf152)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.22 1998/01/28 21:02:12 bsmith Exp bsmith $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "sys.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/spops.h"
10 #include "petsc.h"
11 #include "src/inline/bitarray.h"
12 
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
16 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
17 {
18   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
19   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
20   int         start, end, *ai, *aj,bs,*nidx2;
21   BT          table;
22 
23   PetscFunctionBegin;
24   m     = a->mbs;
25   ai    = a->i;
26   aj    = a->j;
27   bs    = a->bs;
28 
29   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
30 
31   ierr  = BTCreate(m,table); CHKERRQ(ierr);
32   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
33   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
34 
35   for ( i=0; i<is_max; i++ ) {
36     /* Initialise the two local arrays */
37     isz  = 0;
38     BTMemzero(m,table);
39 
40     /* Extract the indices, assume there can be duplicate entries */
41     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
42     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
43 
44     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
45     for ( j=0; j<n ; ++j){
46       ival = idx[j]/bs; /* convert the indices into block indices */
47       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
48       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
49     }
50     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
51     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
52 
53     k = 0;
54     for ( j=0; j<ov; j++){ /* for each overlap*/
55       n = isz;
56       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
57         row   = nidx[k];
58         start = ai[row];
59         end   = ai[row+1];
60         for ( l = start; l<end ; l++){
61           val = aj[l];
62           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
63         }
64       }
65     }
66     /* expand the Index Set */
67     for (j=0; j<isz; j++ ) {
68       for (k=0; k<bs; k++ )
69         nidx2[j*bs+k] = nidx[j]*bs+k;
70     }
71     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
72   }
73   BTDestroy(table);
74   PetscFree(nidx);
75   PetscFree(nidx2);
76   PetscFunctionReturn(0);
77 }
78 
79 #undef __FUNC__
80 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
81 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
82 {
83   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
84   int          nznew, *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
85   int          row,mat_i,*mat_j,tcol,*mat_ilen;
86   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
87   int          *aj = a->j, *ai = a->i;
88   Scalar       *mat_a;
89   Mat          C;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,(PetscTruth*)&i);
93   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
97   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
98   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
99 
100   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
101   ssmap = smap;
102   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
103   PetscMemzero(smap,oldcols*sizeof(int));
104   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
105   /* determine lens of each row */
106   for (i=0; i<nrows; i++) {
107     kstart  = ai[irow[i]];
108     kend    = kstart + a->ilen[irow[i]];
109     lens[i] = 0;
110       for ( k=kstart; k<kend; k++ ) {
111         if (ssmap[aj[k]]) {
112           lens[i]++;
113         }
114       }
115     }
116   /* Create and fill new matrix */
117   if (scall == MAT_REUSE_MATRIX) {
118     c = (Mat_SeqBAIJ *)((*B)->data);
119 
120     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
121     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
122       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
123     }
124     PetscMemzero(c->ilen,c->mbs*sizeof(int));
125     C = *B;
126   } else {
127     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
128   }
129   c = (Mat_SeqBAIJ *)(C->data);
130   for (i=0; i<nrows; i++) {
131     row    = irow[i];
132     nznew  = 0;
133     kstart = ai[row];
134     kend   = kstart + a->ilen[row];
135     mat_i  = c->i[i];
136     mat_j  = c->j + mat_i;
137     mat_a  = c->a + mat_i*bs2;
138     mat_ilen = c->ilen + i;
139     for ( k=kstart; k<kend; k++ ) {
140       if ((tcol=ssmap[a->j[k]])) {
141         *mat_j++ = tcol - 1;
142         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2;
143         (*mat_ilen)++;
144       }
145     }
146   }
147 
148   /* Free work space */
149   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
150   PetscFree(smap); PetscFree(lens);
151   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
152   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
153 
154   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
155   *B = C;
156   PetscFunctionReturn(0);
157 }
158 
159 #undef __FUNC__
160 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
161 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatGetSubMatrixCall scall,Mat *B)
162 {
163   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
164   IS          is1,is2;
165   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
166 
167   PetscFunctionBegin;
168   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
169   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
170   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
171   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
172 
173   /* Verify if the indices corespond to each element in a block
174    and form the IS with compressed IS */
175   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
176   iary = vary + a->mbs;
177   PetscMemzero(vary,(a->mbs)*sizeof(int));
178   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
179   count = 0;
180   for (i=0; i<a->mbs; i++) {
181     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
182     if (vary[i]==bs) iary[count++] = i;
183   }
184   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
185 
186   PetscMemzero(vary,(a->mbs)*sizeof(int));
187   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
188   count = 0;
189   for (i=0; i<a->mbs; i++) {
190     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
191     if (vary[i]==bs) iary[count++] = i;
192   }
193   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
194   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
195   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
196   PetscFree(vary);
197 
198   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B); CHKERRQ(ierr);
199   ISDestroy(is1);
200   ISDestroy(is2);
201   PetscFunctionReturn(0);
202 }
203 
204 #undef __FUNC__
205 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
206 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall,Mat **B)
207 {
208   int ierr,i;
209 
210   PetscFunctionBegin;
211   if (scall == MAT_INITIAL_MATRIX) {
212     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
213   }
214 
215   for ( i=0; i<n; i++ ) {
216     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
217   }
218   PetscFunctionReturn(0);
219 }
220 
221 
222 /* -------------------------------------------------------*/
223 /* Should check that shapes of vectors and matrices match */
224 /* -------------------------------------------------------*/
225 #include "pinclude/plapack.h"
226 
227 #undef __FUNC__
228 #define __FUNC__ "MatMult_SeqBAIJ_1"
229 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
230 {
231   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
232   register Scalar *x,*z,*v,sum;
233   int             mbs=a->mbs,i,*idx,*ii,n;
234 
235   PetscFunctionBegin;
236   VecGetArray_Fast(xx,x);
237   VecGetArray_Fast(zz,z);
238 
239   idx   = a->j;
240   v     = a->a;
241   ii    = a->i;
242 
243   for ( i=0; i<mbs; i++ ) {
244     n    = ii[1] - ii[0]; ii++;
245     sum  = 0.0;
246     while (n--) sum += *v++ * x[*idx++];
247     z[i] = sum;
248   }
249   VecRestoreArray_Fast(xx,x);
250   VecRestoreArray_Fast(zz,z);
251   PLogFlops(2*a->nz - a->m);
252   PetscFunctionReturn(0);
253 }
254 
255 #undef __FUNC__
256 #define __FUNC__ "MatMult_SeqBAIJ_2"
257 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
258 {
259   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
260   register Scalar *x,*z,*v,*xb,sum1,sum2;
261   register Scalar x1,x2;
262   int             mbs=a->mbs,i,*idx,*ii,j,n;
263 
264   PetscFunctionBegin;
265   VecGetArray_Fast(xx,x);
266   VecGetArray_Fast(zz,z);
267 
268   idx   = a->j;
269   v     = a->a;
270   ii    = a->i;
271 
272   for ( i=0; i<mbs; i++ ) {
273     n  = ii[1] - ii[0]; ii++;
274     sum1 = 0.0; sum2 = 0.0;
275     for ( j=0; j<n; j++ ) {
276       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
277       sum1 += v[0]*x1 + v[2]*x2;
278       sum2 += v[1]*x1 + v[3]*x2;
279       v += 4;
280     }
281     z[0] = sum1; z[1] = sum2;
282     z += 2;
283   }
284   VecRestoreArray_Fast(xx,x);
285   VecRestoreArray_Fast(zz,z);
286   PLogFlops(4*a->nz - a->m);
287   PetscFunctionReturn(0);
288 }
289 
290 #undef __FUNC__
291 #define __FUNC__ "MatMult_SeqBAIJ_3"
292 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
293 {
294   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
295   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
296   int             mbs=a->mbs,i,*idx,*ii,j,n;
297 
298   PetscFunctionBegin;
299   VecGetArray_Fast(xx,x);
300   VecGetArray_Fast(zz,z);
301 
302   idx   = a->j;
303   v     = a->a;
304   ii    = a->i;
305 
306   for ( i=0; i<mbs; i++ ) {
307     n  = ii[1] - ii[0]; ii++;
308     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
309     for ( j=0; j<n; j++ ) {
310       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
311       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
312       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
313       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
314       v += 9;
315     }
316     z[0] = sum1; z[1] = sum2; z[2] = sum3;
317     z += 3;
318   }
319   VecRestoreArray_Fast(xx,x);
320   VecRestoreArray_Fast(zz,z);
321   PLogFlops(18*a->nz - a->m);
322   PetscFunctionReturn(0);
323 }
324 
325 #undef __FUNC__
326 #define __FUNC__ "MatMult_SeqBAIJ_4"
327 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
328 {
329   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
330   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
331   int             mbs=a->mbs,i,*idx,*ii,j,n;
332 
333   PetscFunctionBegin;
334   VecGetArray_Fast(xx,x);
335   VecGetArray_Fast(zz,z);
336 
337   idx   = a->j;
338   v     = a->a;
339   ii    = a->i;
340 
341   for ( i=0; i<mbs; i++ ) {
342     n  = ii[1] - ii[0]; ii++;
343     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
344     for ( j=0; j<n; j++ ) {
345       xb = x + 4*(*idx++);
346       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
347       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
348       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
349       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
350       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
351       v += 16;
352     }
353     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
354     z += 4;
355   }
356   VecRestoreArray_Fast(xx,x);
357   VecRestoreArray_Fast(zz,z);
358   PLogFlops(32*a->nz - a->m);
359   PetscFunctionReturn(0);
360 }
361 
362 #undef __FUNC__
363 #define __FUNC__ "MatMult_SeqBAIJ_5"
364 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
365 {
366   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
367   register Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
368   register Scalar * restrict v,* restrict xb,* restrict z, * restrict x;
369   int             mbs=a->mbs,i,*idx,*ii,j,n;
370 
371   VecGetArray_Fast(xx,x);
372   VecGetArray_Fast(zz,z);
373 
374   idx   = a->j;
375   v     = a->a;
376   ii    = a->i;
377 
378   for ( i=0; i<mbs; i++ ) {
379     n  = ii[1] - ii[0]; ii++;
380     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
381     for ( j=0; j<n; j++ ) {
382       xb = x + 5*(*idx++);
383       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
384       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
385       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
386       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
387       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
388       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
389       v += 25;
390     }
391     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
392     z += 5;
393   }
394   VecRestoreArray_Fast(xx,x);
395   VecRestoreArray_Fast(zz,z);
396   PLogFlops(50*a->nz - a->m);
397   PetscFunctionReturn(0);
398 }
399 
400 #undef __FUNC__
401 #define __FUNC__ "MatMult_SeqBAIJ_7"
402 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
403 {
404   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
405   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
406   register Scalar x1,x2,x3,x4,x5,x6,x7;
407   int             mbs=a->mbs,i,*idx,*ii,j,n;
408 
409   VecGetArray_Fast(xx,x);
410   VecGetArray_Fast(zz,z);
411 
412   idx   = a->j;
413   v     = a->a;
414   ii    = a->i;
415 
416   for ( i=0; i<mbs; i++ ) {
417     n  = ii[1] - ii[0]; ii++;
418     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
419     for ( j=0; j<n; j++ ) {
420       xb = x + 7*(*idx++);
421       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
422       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
423       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
424       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
425       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
426       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
427       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
428       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
429       v += 49;
430     }
431     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
432     z += 7;
433   }
434 
435   VecRestoreArray_Fast(xx,x);
436   VecRestoreArray_Fast(zz,z);
437   PLogFlops(98*a->nz - a->m);
438   PetscFunctionReturn(0);
439 }
440 
441 #undef __FUNC__
442 #define __FUNC__ "MatMult_SeqBAIJ_N"
443 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
444 {
445   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
446   register Scalar *x,*z,*v,*xb;
447   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
448   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
449   int             _One = 1,ncols,k;
450 
451   PetscFunctionBegin;
452   VecGetArray_Fast(xx,x);
453   VecGetArray_Fast(zz,z);
454 
455   idx   = a->j;
456   v     = a->a;
457   ii    = a->i;
458 
459 
460   if (!a->mult_work) {
461     k = PetscMax(a->m,a->n);
462     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
463   }
464   work = a->mult_work;
465   for ( i=0; i<mbs; i++ ) {
466     n     = ii[1] - ii[0]; ii++;
467     ncols = n*bs;
468     workt = work;
469     for ( j=0; j<n; j++ ) {
470       xb = x + bs*(*idx++);
471       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
472       workt += bs;
473     }
474     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
475     v += n*bs2;
476     z += bs;
477   }
478   VecRestoreArray_Fast(xx,x);
479   VecRestoreArray_Fast(zz,z);
480   PLogFlops(2*a->nz*bs2 - a->m);
481   PetscFunctionReturn(0);
482 }
483 
484 #undef __FUNC__
485 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
486 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
487 {
488   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
489   register Scalar *x,*y,*z,*v,sum;
490   int             mbs=a->mbs,i,*idx,*ii,n;
491 
492   PetscFunctionBegin;
493   VecGetArray_Fast(xx,x);
494   VecGetArray_Fast(yy,y);
495   VecGetArray_Fast(zz,z);
496 
497   idx   = a->j;
498   v     = a->a;
499   ii    = a->i;
500 
501   for ( i=0; i<mbs; i++ ) {
502     n    = ii[1] - ii[0]; ii++;
503     sum  = y[i];
504     while (n--) sum += *v++ * x[*idx++];
505     z[i] = sum;
506   }
507   VecRestoreArray_Fast(xx,x);
508   VecRestoreArray_Fast(yy,y);
509   VecRestoreArray_Fast(zz,z);
510   PLogFlops(2*a->nz);
511   PetscFunctionReturn(0);
512 }
513 
514 #undef __FUNC__
515 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
516 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
517 {
518   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
519   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
520   register Scalar x1,x2;
521   int             mbs=a->mbs,i,*idx,*ii,j,n;
522 
523   PetscFunctionBegin;
524   VecGetArray_Fast(xx,x);
525   VecGetArray_Fast(yy,y);
526   VecGetArray_Fast(zz,z);
527 
528   idx   = a->j;
529   v     = a->a;
530   ii    = a->i;
531 
532   for ( i=0; i<mbs; i++ ) {
533     n  = ii[1] - ii[0]; ii++;
534     sum1 = y[0]; sum2 = y[1];
535     for ( j=0; j<n; j++ ) {
536       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
537       sum1 += v[0]*x1 + v[2]*x2;
538       sum2 += v[1]*x1 + v[3]*x2;
539       v += 4;
540     }
541     z[0] = sum1; z[1] = sum2;
542     z += 2; y += 2;
543   }
544   VecRestoreArray_Fast(xx,x);
545   VecRestoreArray_Fast(yy,y);
546   VecRestoreArray_Fast(zz,z);
547   PLogFlops(4*a->nz);
548   PetscFunctionReturn(0);
549 }
550 
551 #undef __FUNC__
552 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
553 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
554 {
555   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
556   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
557   int             mbs=a->mbs,i,*idx,*ii,j,n;
558 
559   PetscFunctionBegin;
560   VecGetArray_Fast(xx,x);
561   VecGetArray_Fast(yy,y);
562   VecGetArray_Fast(zz,z);
563 
564   idx   = a->j;
565   v     = a->a;
566   ii    = a->i;
567 
568   for ( i=0; i<mbs; i++ ) {
569     n  = ii[1] - ii[0]; ii++;
570     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
571     for ( j=0; j<n; j++ ) {
572       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
573       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
574       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
575       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
576       v += 9;
577     }
578     z[0] = sum1; z[1] = sum2; z[2] = sum3;
579     z += 3; y += 3;
580   }
581   VecRestoreArray_Fast(xx,x);
582   VecRestoreArray_Fast(yy,y);
583   VecRestoreArray_Fast(zz,z);
584   PLogFlops(18*a->nz);
585   PetscFunctionReturn(0);
586 }
587 
588 #undef __FUNC__
589 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
590 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
591 {
592   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
593   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
594   int             mbs=a->mbs,i,*idx,*ii;
595   int             j,n;
596 
597   PetscFunctionBegin;
598   VecGetArray_Fast(xx,x);
599   VecGetArray_Fast(yy,y);
600   VecGetArray_Fast(zz,z);
601 
602   idx   = a->j;
603   v     = a->a;
604   ii    = a->i;
605 
606   for ( i=0; i<mbs; i++ ) {
607     n  = ii[1] - ii[0]; ii++;
608     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
609     for ( j=0; j<n; j++ ) {
610       xb = x + 4*(*idx++);
611       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
612       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
613       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
614       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
615       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
616       v += 16;
617     }
618     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
619     z += 4; y += 4;
620   }
621   VecRestoreArray_Fast(xx,x);
622   VecRestoreArray_Fast(yy,y);
623   VecRestoreArray_Fast(zz,z);
624   PLogFlops(32*a->nz);
625   PetscFunctionReturn(0);
626 }
627 
628 #undef __FUNC__
629 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
630 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
631 {
632   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
633   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
634   int             mbs=a->mbs,i,*idx,*ii,j,n;
635 
636   PetscFunctionBegin;
637   VecGetArray_Fast(xx,x);
638   VecGetArray_Fast(yy,y);
639   VecGetArray_Fast(zz,z);
640 
641   idx   = a->j;
642   v     = a->a;
643   ii    = a->i;
644 
645   for ( i=0; i<mbs; i++ ) {
646     n  = ii[1] - ii[0]; ii++;
647     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
648     for ( j=0; j<n; j++ ) {
649       xb = x + 5*(*idx++);
650       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
651       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
652       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
653       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
654       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
655       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
656       v += 25;
657     }
658     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
659     z += 5; y += 5;
660   }
661   VecRestoreArray_Fast(xx,x);
662   VecRestoreArray_Fast(yy,y);
663   VecRestoreArray_Fast(zz,z);
664   PLogFlops(50*a->nz);
665   PetscFunctionReturn(0);
666 }
667 
668 #undef __FUNC__
669 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
670 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
671 {
672   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
673   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
674   register Scalar x1,x2,x3,x4,x5,x6,x7;
675   int             mbs=a->mbs,i,*idx,*ii,j,n;
676 
677   PetscFunctionBegin;
678   VecGetArray_Fast(xx,x);
679   VecGetArray_Fast(yy,y);
680   VecGetArray_Fast(zz,z);
681 
682   idx   = a->j;
683   v     = a->a;
684   ii    = a->i;
685 
686   for ( i=0; i<mbs; i++ ) {
687     n  = ii[1] - ii[0]; ii++;
688     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
689     for ( j=0; j<n; j++ ) {
690       xb = x + 7*(*idx++);
691       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
692       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
693       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
694       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
695       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
696       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
697       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
698       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
699       v += 49;
700     }
701     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
702     z += 7; y += 7;
703   }
704   VecRestoreArray_Fast(xx,x);
705   VecRestoreArray_Fast(yy,y);
706   VecRestoreArray_Fast(zz,z);
707   PLogFlops(98*a->nz);
708   PetscFunctionReturn(0);
709 }
710 
711 
712 #undef __FUNC__
713 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
714 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
715 {
716   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
717   register Scalar *x,*z,*v,*xb;
718   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
719   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
720 
721   PetscFunctionBegin;
722   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
723 
724   VecGetArray_Fast(xx,x);
725   VecGetArray_Fast(zz,z);
726 
727   idx   = a->j;
728   v     = a->a;
729   ii    = a->i;
730 
731 
732   if (!a->mult_work) {
733     k = PetscMax(a->m,a->n);
734     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
735   }
736   work = a->mult_work;
737   for ( i=0; i<mbs; i++ ) {
738     n     = ii[1] - ii[0]; ii++;
739     ncols = n*bs;
740     workt = work;
741     for ( j=0; j<n; j++ ) {
742       xb = x + bs*(*idx++);
743       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
744       workt += bs;
745     }
746     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
747     v += n*bs2;
748     z += bs;
749   }
750   VecRestoreArray_Fast(xx,x);
751   VecRestoreArray_Fast(zz,z);
752   PLogFlops(2*a->nz*bs2 );
753   PetscFunctionReturn(0);
754 }
755 
756 #undef __FUNC__
757 #define __FUNC__ "MatMultTrans_SeqBAIJ"
758 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
759 {
760   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
761   Scalar          *xg,*zg,*zb;
762   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
763   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
764   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
765 
766 
767   PetscFunctionBegin;
768   VecGetArray_Fast(xx,xg); x = xg;
769   VecGetArray_Fast(zz,zg); z = zg;
770   PetscMemzero(z,N*sizeof(Scalar));
771 
772   idx   = a->j;
773   v     = a->a;
774   ii    = a->i;
775 
776   switch (bs) {
777   case 1:
778     for ( i=0; i<mbs; i++ ) {
779       n  = ii[1] - ii[0]; ii++;
780       xb = x + i; x1 = xb[0];
781       ib = idx + ai[i];
782       for ( j=0; j<n; j++ ) {
783         rval    = ib[j];
784         z[rval] += *v++ * x1;
785       }
786     }
787     break;
788   case 2:
789     for ( i=0; i<mbs; i++ ) {
790       n  = ii[1] - ii[0]; ii++;
791       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
792       ib = idx + ai[i];
793       for ( j=0; j<n; j++ ) {
794         rval      = ib[j]*2;
795         z[rval++] += v[0]*x1 + v[1]*x2;
796         z[rval++] += v[2]*x1 + v[3]*x2;
797         v += 4;
798       }
799     }
800     break;
801   case 3:
802     for ( i=0; i<mbs; i++ ) {
803       n  = ii[1] - ii[0]; ii++;
804       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
805       ib = idx + ai[i];
806       for ( j=0; j<n; j++ ) {
807         rval      = ib[j]*3;
808         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
809         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
810         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
811         v += 9;
812       }
813     }
814     break;
815   case 4:
816     for ( i=0; i<mbs; i++ ) {
817       n  = ii[1] - ii[0]; ii++;
818       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
819       ib = idx + ai[i];
820       for ( j=0; j<n; j++ ) {
821         rval      = ib[j]*4;
822         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
823         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
824         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
825         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
826         v += 16;
827       }
828     }
829     break;
830   case 5:
831     for ( i=0; i<mbs; i++ ) {
832       n  = ii[1] - ii[0]; ii++;
833       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
834       x4 = xb[3];   x5 = xb[4];
835       ib = idx + ai[i];
836       for ( j=0; j<n; j++ ) {
837         rval      = ib[j]*5;
838         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843         v += 25;
844       }
845     }
846     break;
847       /* block sizes larger then 5 by 5 are handled by BLAS */
848     default: {
849       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
850       if (!a->mult_work) {
851         k = PetscMax(a->m,a->n);
852         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
853         CHKPTRQ(a->mult_work);
854       }
855       work = a->mult_work;
856       for ( i=0; i<mbs; i++ ) {
857         n     = ii[1] - ii[0]; ii++;
858         ncols = n*bs;
859         PetscMemzero(work,ncols*sizeof(Scalar));
860         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
861         v += n*bs2;
862         x += bs;
863         workt = work;
864         for ( j=0; j<n; j++ ) {
865           zb = z + bs*(*idx++);
866           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
867           workt += bs;
868         }
869       }
870     }
871   }
872   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
873   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
874   PLogFlops(2*a->nz*a->bs2 - a->n);
875   PetscFunctionReturn(0);
876 }
877 
878 #undef __FUNC__
879 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
880 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
881 
882 {
883   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
884   Scalar          *xg,*zg,*zb;
885   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
886   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
887   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
888 
889   PetscFunctionBegin;
890   VecGetArray_Fast(xx,xg); x = xg;
891   VecGetArray_Fast(zz,zg); z = zg;
892 
893   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
894   else PetscMemzero(z,N*sizeof(Scalar));
895 
896   idx   = a->j;
897   v     = a->a;
898   ii    = a->i;
899 
900   switch (bs) {
901   case 1:
902     for ( i=0; i<mbs; i++ ) {
903       n  = ii[1] - ii[0]; ii++;
904       xb = x + i; x1 = xb[0];
905       ib = idx + ai[i];
906       for ( j=0; j<n; j++ ) {
907         rval    = ib[j];
908         z[rval] += *v++ * x1;
909       }
910     }
911     break;
912   case 2:
913     for ( i=0; i<mbs; i++ ) {
914       n  = ii[1] - ii[0]; ii++;
915       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
916       ib = idx + ai[i];
917       for ( j=0; j<n; j++ ) {
918         rval      = ib[j]*2;
919         z[rval++] += v[0]*x1 + v[1]*x2;
920         z[rval++] += v[2]*x1 + v[3]*x2;
921         v += 4;
922       }
923     }
924     break;
925   case 3:
926     for ( i=0; i<mbs; i++ ) {
927       n  = ii[1] - ii[0]; ii++;
928       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
929       ib = idx + ai[i];
930       for ( j=0; j<n; j++ ) {
931         rval      = ib[j]*3;
932         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
933         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
934         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
935         v += 9;
936       }
937     }
938     break;
939   case 4:
940     for ( i=0; i<mbs; i++ ) {
941       n  = ii[1] - ii[0]; ii++;
942       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
943       ib = idx + ai[i];
944       for ( j=0; j<n; j++ ) {
945         rval      = ib[j]*4;
946         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
947         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
948         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
949         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
950         v += 16;
951       }
952     }
953     break;
954   case 5:
955     for ( i=0; i<mbs; i++ ) {
956       n  = ii[1] - ii[0]; ii++;
957       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
958       x4 = xb[3];   x5 = xb[4];
959       ib = idx + ai[i];
960       for ( j=0; j<n; j++ ) {
961         rval      = ib[j]*5;
962         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
963         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
964         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
965         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
966         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
967         v += 25;
968       }
969     }
970     break;
971       /* block sizes larger then 5 by 5 are handled by BLAS */
972     default: {
973       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
974       if (!a->mult_work) {
975         k = PetscMax(a->m,a->n);
976         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
977         CHKPTRQ(a->mult_work);
978       }
979       work = a->mult_work;
980       for ( i=0; i<mbs; i++ ) {
981         n     = ii[1] - ii[0]; ii++;
982         ncols = n*bs;
983         PetscMemzero(work,ncols*sizeof(Scalar));
984         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
985         v += n*bs2;
986         x += bs;
987         workt = work;
988         for ( j=0; j<n; j++ ) {
989           zb = z + bs*(*idx++);
990           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
991           workt += bs;
992         }
993       }
994     }
995   }
996   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
997   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
998   PLogFlops(2*a->nz*a->bs2);
999   PetscFunctionReturn(0);
1000 }
1001 
1002 #undef __FUNC__
1003 #define __FUNC__ "MatScale_SeqBAIJ"
1004 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1005 {
1006   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1007   int         one = 1, totalnz = a->bs2*a->nz;
1008 
1009   PetscFunctionBegin;
1010   BLscal_( &totalnz, alpha, a->a, &one );
1011   PLogFlops(totalnz);
1012   PetscFunctionReturn(0);
1013 }
1014 
1015 #undef __FUNC__
1016 #define __FUNC__ "MatNorm_SeqBAIJ"
1017 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1018 {
1019   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1020   Scalar      *v = a->a;
1021   double      sum = 0.0;
1022   int         i,nz=a->nz,bs2=a->bs2;
1023 
1024   PetscFunctionBegin;
1025   if (type == NORM_FROBENIUS) {
1026     for (i=0; i< bs2*nz; i++ ) {
1027 #if defined(USE_PETSC_COMPLEX)
1028       sum += real(conj(*v)*(*v)); v++;
1029 #else
1030       sum += (*v)*(*v); v++;
1031 #endif
1032     }
1033     *norm = sqrt(sum);
1034   }
1035   else {
1036     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1037   }
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 
1042 #undef __FUNC__
1043 #define __FUNC__ "MatEqual_SeqBAIJ"
1044 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1045 {
1046   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1047 
1048   PetscFunctionBegin;
1049   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1050 
1051   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1052   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1053     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1054   }
1055 
1056   /* if the a->i are the same */
1057   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1058     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1059   }
1060 
1061   /* if a->j are the same */
1062   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1063     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1064   }
1065 
1066   /* if a->a are the same */
1067   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1068     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1069   }
1070   *flg = PETSC_TRUE;
1071   PetscFunctionReturn(0);
1072 
1073 }
1074 
1075 #undef __FUNC__
1076 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1077 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1078 {
1079   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1080   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1081   Scalar      *x, zero = 0.0,*aa,*aa_j;
1082 
1083   PetscFunctionBegin;
1084   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1085   bs   = a->bs;
1086   aa   = a->a;
1087   ai   = a->i;
1088   aj   = a->j;
1089   ambs = a->mbs;
1090   bs2  = a->bs2;
1091 
1092   VecSet(&zero,v);
1093   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1094   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1095   for ( i=0; i<ambs; i++ ) {
1096     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1097       if (aj[j] == i) {
1098         row  = i*bs;
1099         aa_j = aa+j*bs2;
1100         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1101         break;
1102       }
1103     }
1104   }
1105   PetscFunctionReturn(0);
1106 }
1107 
1108 #undef __FUNC__
1109 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1110 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1111 {
1112   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1113   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1114   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1115 
1116   PetscFunctionBegin;
1117   ai  = a->i;
1118   aj  = a->j;
1119   aa  = a->a;
1120   m   = a->m;
1121   n   = a->n;
1122   bs  = a->bs;
1123   mbs = a->mbs;
1124   bs2 = a->bs2;
1125   if (ll) {
1126     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1127     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1128     for ( i=0; i<mbs; i++ ) { /* for each block row */
1129       M  = ai[i+1] - ai[i];
1130       li = l + i*bs;
1131       v  = aa + bs2*ai[i];
1132       for ( j=0; j<M; j++ ) { /* for each block */
1133         for ( k=0; k<bs2; k++ ) {
1134           (*v++) *= li[k%bs];
1135         }
1136       }
1137     }
1138   }
1139 
1140   if (rr) {
1141     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1142     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1143     for ( i=0; i<mbs; i++ ) { /* for each block row */
1144       M  = ai[i+1] - ai[i];
1145       v  = aa + bs2*ai[i];
1146       for ( j=0; j<M; j++ ) { /* for each block */
1147         ri = r + bs*aj[ai[i]+j];
1148         for ( k=0; k<bs; k++ ) {
1149           x = ri[k];
1150           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1151         }
1152       }
1153     }
1154   }
1155   PetscFunctionReturn(0);
1156 }
1157 
1158 
1159 #undef __FUNC__
1160 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1161 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1162 {
1163   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1164 
1165   PetscFunctionBegin;
1166   info->rows_global    = (double)a->m;
1167   info->columns_global = (double)a->n;
1168   info->rows_local     = (double)a->m;
1169   info->columns_local  = (double)a->n;
1170   info->block_size     = a->bs2;
1171   info->nz_allocated   = a->maxnz;
1172   info->nz_used        = a->bs2*a->nz;
1173   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1174   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1175     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1176   info->assemblies   = A->num_ass;
1177   info->mallocs      = a->reallocs;
1178   info->memory       = A->mem;
1179   if (A->factor) {
1180     info->fill_ratio_given  = A->info.fill_ratio_given;
1181     info->fill_ratio_needed = A->info.fill_ratio_needed;
1182     info->factor_mallocs    = A->info.factor_mallocs;
1183   } else {
1184     info->fill_ratio_given  = 0;
1185     info->fill_ratio_needed = 0;
1186     info->factor_mallocs    = 0;
1187   }
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 
1192 #undef __FUNC__
1193 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1194 int MatZeroEntries_SeqBAIJ(Mat A)
1195 {
1196   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1197 
1198   PetscFunctionBegin;
1199   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
1200   PetscFunctionReturn(0);
1201 }
1202