xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 4482741e5b2e2bbc854fb1f8dba65221386520f2)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
12 {
13   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
14   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,
15                start,end,*ai,*aj,bs,*nidx2;
16   PetscBT      table;
17   PetscBT      table0;
18 
19   PetscFunctionBegin;
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = a->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
30   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
31 
32   for (i=0; i<is_max; i++) { /* for each is */
33     isz  = 0;
34     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
41     for (j=0; j<n ; ++j){
42       brow = idx[j]/bs; /* convert the indices into block indices */
43       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,brow)) { nidx[isz++] = brow;}
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap */
51       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
53       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
54 
55       n = isz;  /* length of the updated is[i] */
56       for (brow=0; brow<mbs; brow++){
57         start = ai[brow]; end   = ai[brow+1];
58         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
59           for (l = start; l<end ; l++){
60             bcol = aj[l];
61             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
62           }
63           k++;
64           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
65         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
66           for (l = start; l<end ; l++){
67             bcol = aj[l];
68             if (PetscBTLookup(table0,bcol)){
69               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
70               break; /* for l = start; l<end ; l++) */
71             }
72           }
73         }
74       }
75     } /* for each overlap */
76 
77     /* expand the Index Set */
78     for (j=0; j<isz; j++) {
79       for (k=0; k<bs; k++)
80         nidx2[j*bs+k] = nidx[j]*bs+k;
81     }
82     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
83   }
84   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
85   ierr = PetscFree(nidx);CHKERRQ(ierr);
86   ierr = PetscFree(nidx2);CHKERRQ(ierr);
87   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
88   PetscFunctionReturn(0);
89 }
90 
91 #undef __FUNCT__
92 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
93 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
94 {
95   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
96   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
97   int          row,mat_i,*mat_j,tcol,*mat_ilen;
98   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
99   int          *aj = a->j,*ai = a->i;
100   MatScalar    *mat_a;
101   Mat          C;
102   PetscTruth   flag;
103 
104   PetscFunctionBegin;
105 
106   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
107   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
108   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
109 
110   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
111   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
112 
113   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
114   ssmap = smap;
115   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
116   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
117   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
118   /* determine lens of each row */
119   for (i=0; i<nrows; i++) {
120     kstart  = ai[irow[i]];
121     kend    = kstart + a->ilen[irow[i]];
122     lens[i] = 0;
123       for (k=kstart; k<kend; k++) {
124         if (ssmap[aj[k]]) {
125           lens[i]++;
126         }
127       }
128     }
129   /* Create and fill new matrix */
130   if (scall == MAT_REUSE_MATRIX) {
131     c = (Mat_SeqSBAIJ *)((*B)->data);
132 
133     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
134     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
135     if (flag == PETSC_FALSE) {
136       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
137     }
138     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
139     C = *B;
140   } else {
141     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
142     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
143     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
144   }
145   c = (Mat_SeqSBAIJ *)(C->data);
146   for (i=0; i<nrows; i++) {
147     row    = irow[i];
148     kstart = ai[row];
149     kend   = kstart + a->ilen[row];
150     mat_i  = c->i[i];
151     mat_j  = c->j + mat_i;
152     mat_a  = c->a + mat_i*bs2;
153     mat_ilen = c->ilen + i;
154     for (k=kstart; k<kend; k++) {
155       if ((tcol=ssmap[a->j[k]])) {
156         *mat_j++ = tcol - 1;
157         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
158         mat_a   += bs2;
159         (*mat_ilen)++;
160       }
161     }
162   }
163 
164   /* Free work space */
165   ierr = PetscFree(smap);CHKERRQ(ierr);
166   ierr = PetscFree(lens);CHKERRQ(ierr);
167   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
168   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169 
170   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
171   *B = C;
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
177 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
178 {
179   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
180   IS          is1;
181   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
182 
183   PetscFunctionBegin;
184   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
185 
186   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
187   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
188 
189   /* Verify if the indices corespond to each element in a block
190    and form the IS with compressed IS */
191   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
192   iary = vary + a->mbs;
193   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
194   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
195 
196   count = 0;
197   for (i=0; i<a->mbs; i++) {
198     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
199     if (vary[i]==bs) iary[count++] = i;
200   }
201   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
202 
203   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
204   ierr = PetscFree(vary);CHKERRQ(ierr);
205 
206   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
207   ISDestroy(is1);
208   PetscFunctionReturn(0);
209 }
210 
211 #undef __FUNCT__
212 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
213 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
214 {
215   int ierr,i;
216 
217   PetscFunctionBegin;
218   if (scall == MAT_INITIAL_MATRIX) {
219     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
220   }
221 
222   for (i=0; i<n; i++) {
223     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
224   }
225   PetscFunctionReturn(0);
226 }
227 
228 /* -------------------------------------------------------*/
229 /* Should check that shapes of vectors and matrices match */
230 /* -------------------------------------------------------*/
231 #include "petscblaslapack.h"
232 
233 #undef __FUNCT__
234 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
235 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
236 {
237   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
238   PetscScalar     *x,*z,*xb,x1,zero=0.0;
239   MatScalar       *v;
240   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
241 
242   PetscFunctionBegin;
243   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
244   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
245   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
246 
247   v  = a->a;
248   xb = x;
249 
250   for (i=0; i<mbs; i++) {
251     n  = ai[1] - ai[0];  /* length of i_th row of A */
252     x1 = xb[0];
253     ib = aj + *ai;
254     jmin = 0;
255     if (*ib == i) {      /* (diag of A)*x */
256       z[i] += *v++ * x[*ib++];
257       jmin++;
258     }
259     for (j=jmin; j<n; j++) {
260       cval    = *ib;
261       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
262       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
263     }
264     xb++; ai++;
265   }
266 
267   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
268   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
269   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
270   PetscFunctionReturn(0);
271 }
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
275 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
276 {
277   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
278   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
279   MatScalar        *v;
280   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
281 
282 
283   PetscFunctionBegin;
284   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
285   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
286   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
287 
288   v     = a->a;
289   xb = x;
290 
291   for (i=0; i<mbs; i++) {
292     n  = ai[1] - ai[0]; /* length of i_th block row of A */
293     x1 = xb[0]; x2 = xb[1];
294     ib = aj + *ai;
295     jmin = 0;
296     if (*ib == i){     /* (diag of A)*x */
297       z[2*i]   += v[0]*x1 + v[2]*x2;
298       z[2*i+1] += v[2]*x1 + v[3]*x2;
299       v += 4; jmin++;
300     }
301     for (j=jmin; j<n; j++) {
302       /* (strict lower triangular part of A)*x  */
303       cval       = ib[j]*2;
304       z[cval]     += v[0]*x1 + v[1]*x2;
305       z[cval+1]   += v[2]*x1 + v[3]*x2;
306       /* (strict upper triangular part of A)*x  */
307       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
308       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
309       v  += 4;
310     }
311     xb +=2; ai++;
312   }
313 
314   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
315   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
316   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
317   PetscFunctionReturn(0);
318 }
319 
320 #undef __FUNCT__
321 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
322 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
323 {
324   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
325   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
326   MatScalar     *v;
327   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
328 
329 
330   PetscFunctionBegin;
331   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
332   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
333   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
334 
335   v     = a->a;
336   xb = x;
337 
338   for (i=0; i<mbs; i++) {
339     n  = ai[1] - ai[0]; /* length of i_th block row of A */
340     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
341     ib = aj + *ai;
342     jmin = 0;
343     if (*ib == i){     /* (diag of A)*x */
344       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
345       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
346       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
347       v += 9; jmin++;
348     }
349     for (j=jmin; j<n; j++) {
350       /* (strict lower triangular part of A)*x  */
351       cval       = ib[j]*3;
352       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
353       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
354       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
355       /* (strict upper triangular part of A)*x  */
356       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
357       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
358       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
359       v  += 9;
360     }
361     xb +=3; ai++;
362   }
363 
364   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
365   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
366   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
367   PetscFunctionReturn(0);
368 }
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
372 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
373 {
374   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
375   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
376   MatScalar        *v;
377   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
378 
379   PetscFunctionBegin;
380   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
383 
384   v     = a->a;
385   xb = x;
386 
387   for (i=0; i<mbs; i++) {
388     n  = ai[1] - ai[0]; /* length of i_th block row of A */
389     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
390     ib = aj + *ai;
391     jmin = 0;
392     if (*ib == i){     /* (diag of A)*x */
393       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
394       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
395       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
396       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
397       v += 16; jmin++;
398     }
399     for (j=jmin; j<n; j++) {
400       /* (strict lower triangular part of A)*x  */
401       cval       = ib[j]*4;
402       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
403       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
404       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
405       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
406       /* (strict upper triangular part of A)*x  */
407       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
408       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
409       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
410       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
411       v  += 16;
412     }
413     xb +=4; ai++;
414   }
415 
416   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
417   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
418   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
419   PetscFunctionReturn(0);
420 }
421 
422 #undef __FUNCT__
423 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
424 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
425 {
426   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
427   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
428   MatScalar       *v;
429   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
430 
431   PetscFunctionBegin;
432   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
433   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
434   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
435 
436   v     = a->a;
437   xb = x;
438 
439   for (i=0; i<mbs; i++) {
440     n  = ai[1] - ai[0]; /* length of i_th block row of A */
441     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
442     ib = aj + *ai;
443     jmin = 0;
444     if (*ib == i){      /* (diag of A)*x */
445       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
446       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
447       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
448       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
449       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
450       v += 25; jmin++;
451     }
452     for (j=jmin; j<n; j++) {
453       /* (strict lower triangular part of A)*x  */
454       cval       = ib[j]*5;
455       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
456       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
457       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
458       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
459       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
460       /* (strict upper triangular part of A)*x  */
461       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
462       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
463       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
464       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
465       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
466       v  += 25;
467     }
468     xb +=5; ai++;
469   }
470 
471   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
472   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
473   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
474   PetscFunctionReturn(0);
475 }
476 
477 
478 #undef __FUNCT__
479 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
480 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
481 {
482   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
483   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
484   MatScalar       *v;
485   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
486 
487   PetscFunctionBegin;
488   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
489   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
490   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
491 
492   v     = a->a;
493   xb = x;
494 
495   for (i=0; i<mbs; i++) {
496     n  = ai[1] - ai[0]; /* length of i_th block row of A */
497     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
498     ib = aj + *ai;
499     jmin = 0;
500     if (*ib == i){      /* (diag of A)*x */
501       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
502       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
503       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
504       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
505       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
506       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
507       v += 36; jmin++;
508     }
509     for (j=jmin; j<n; j++) {
510       /* (strict lower triangular part of A)*x  */
511       cval       = ib[j]*6;
512       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
513       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
514       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
515       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
516       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
517       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
518       /* (strict upper triangular part of A)*x  */
519       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
520       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
521       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
522       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
523       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
524       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
525       v  += 36;
526     }
527     xb +=6; ai++;
528   }
529 
530   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
531   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
532   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
533   PetscFunctionReturn(0);
534 }
535 #undef __FUNCT__
536 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
537 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
538 {
539   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
540   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
541   MatScalar       *v;
542   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
543 
544   PetscFunctionBegin;
545   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
546   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
547   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
548 
549   v     = a->a;
550   xb = x;
551 
552   for (i=0; i<mbs; i++) {
553     n  = ai[1] - ai[0]; /* length of i_th block row of A */
554     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
555     ib = aj + *ai;
556     jmin = 0;
557     if (*ib == i){      /* (diag of A)*x */
558       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
559       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
560       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
561       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
562       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
563       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
564       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
565       v += 49; jmin++;
566     }
567     for (j=jmin; j<n; j++) {
568       /* (strict lower triangular part of A)*x  */
569       cval       = ib[j]*7;
570       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
571       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
572       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
573       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
574       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
575       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
576       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
577       /* (strict upper triangular part of A)*x  */
578       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
579       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
580       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
581       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
582       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
583       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
584       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
585       v  += 49;
586     }
587     xb +=7; ai++;
588   }
589   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
590   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
591   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
592   PetscFunctionReturn(0);
593 }
594 
595 /*
596     This will not work with MatScalar == float because it calls the BLAS
597 */
598 #undef __FUNCT__
599 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
600 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
601 {
602   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
603   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
604   MatScalar       *v;
605   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
606   int             ncols,k;
607 
608   PetscFunctionBegin;
609   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
610   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
611   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
612 
613   aj   = a->j;
614   v    = a->a;
615   ii   = a->i;
616 
617   if (!a->mult_work) {
618     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
619   }
620   work = a->mult_work;
621 
622   for (i=0; i<mbs; i++) {
623     n     = ii[1] - ii[0]; ncols = n*bs;
624     workt = work; idx=aj+ii[0];
625 
626     /* upper triangular part */
627     for (j=0; j<n; j++) {
628       xb = x_ptr + bs*(*idx++);
629       for (k=0; k<bs; k++) workt[k] = xb[k];
630       workt += bs;
631     }
632     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
633     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
634 
635     /* strict lower triangular part */
636     idx = aj+ii[0];
637     if (*idx == i){
638       ncols -= bs; v += bs2; idx++; n--;
639     }
640 
641     if (ncols > 0){
642       workt = work;
643       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
644       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
645       for (j=0; j<n; j++) {
646         zb = z_ptr + bs*(*idx++);
647         for (k=0; k<bs; k++) zb[k] += workt[k] ;
648         workt += bs;
649       }
650     }
651     x += bs; v += n*bs2; z += bs; ii++;
652   }
653 
654   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
655   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
656   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
657   PetscFunctionReturn(0);
658 }
659 
660 #undef __FUNCT__
661 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
662 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
663 {
664   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
665   PetscScalar     *x,*y,*z,*xb,x1;
666   MatScalar       *v;
667   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
668 
669   PetscFunctionBegin;
670   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
671   if (yy != xx) {
672     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
673   } else {
674     y = x;
675   }
676   if (zz != yy) {
677     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
678     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
679     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
680   } else {
681     z = y;
682   }
683 
684   v  = a->a;
685   xb = x;
686 
687   for (i=0; i<mbs; i++) {
688     n  = ai[1] - ai[0];  /* length of i_th row of A */
689     x1 = xb[0];
690     ib = aj + *ai;
691     jmin = 0;
692     if (*ib == i) {            /* (diag of A)*x */
693       z[i] += *v++ * x[*ib++]; jmin++;
694     }
695     for (j=jmin; j<n; j++) {
696       cval    = *ib;
697       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
698       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
699     }
700     xb++; ai++;
701   }
702 
703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
704   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
705   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
706 
707   PetscLogFlops(2*(a->nz*2 - A->m));
708   PetscFunctionReturn(0);
709 }
710 
711 #undef __FUNCT__
712 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
713 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
714 {
715   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
716   PetscScalar     *x,*y,*z,*xb,x1,x2;
717   MatScalar       *v;
718   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
719 
720   PetscFunctionBegin;
721   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
722   if (yy != xx) {
723     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
724   } else {
725     y = x;
726   }
727   if (zz != yy) {
728     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
729     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
730     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
731   } else {
732     z = y;
733   }
734 
735   v     = a->a;
736   xb = x;
737 
738   for (i=0; i<mbs; i++) {
739     n  = ai[1] - ai[0]; /* length of i_th block row of A */
740     x1 = xb[0]; x2 = xb[1];
741     ib = aj + *ai;
742     jmin = 0;
743     if (*ib == i){      /* (diag of A)*x */
744       z[2*i]   += v[0]*x1 + v[2]*x2;
745       z[2*i+1] += v[2]*x1 + v[3]*x2;
746       v += 4; jmin++;
747     }
748     for (j=jmin; j<n; j++) {
749       /* (strict lower triangular part of A)*x  */
750       cval       = ib[j]*2;
751       z[cval]     += v[0]*x1 + v[1]*x2;
752       z[cval+1]   += v[2]*x1 + v[3]*x2;
753       /* (strict upper triangular part of A)*x  */
754       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
755       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
756       v  += 4;
757     }
758     xb +=2; ai++;
759   }
760 
761   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
762   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
763   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
764 
765   PetscLogFlops(4*(a->nz*2 - A->m));
766   PetscFunctionReturn(0);
767 }
768 
769 #undef __FUNCT__
770 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
771 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
772 {
773   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
774   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
775   MatScalar       *v;
776   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
777 
778   PetscFunctionBegin;
779   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
780   if (yy != xx) {
781     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
782   } else {
783     y = x;
784   }
785   if (zz != yy) {
786     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
787     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
788     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
789   } else {
790     z = y;
791   }
792 
793   v     = a->a;
794   xb = x;
795 
796   for (i=0; i<mbs; i++) {
797     n  = ai[1] - ai[0]; /* length of i_th block row of A */
798     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
799     ib = aj + *ai;
800     jmin = 0;
801     if (*ib == i){     /* (diag of A)*x */
802      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
803      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
804      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
805      v += 9; jmin++;
806     }
807     for (j=jmin; j<n; j++) {
808       /* (strict lower triangular part of A)*x  */
809       cval       = ib[j]*3;
810       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
811       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
812       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
813       /* (strict upper triangular part of A)*x  */
814       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
815       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
816       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
817       v  += 9;
818     }
819     xb +=3; ai++;
820   }
821 
822   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
823   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
824   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
825 
826   PetscLogFlops(18*(a->nz*2 - A->m));
827   PetscFunctionReturn(0);
828 }
829 
830 #undef __FUNCT__
831 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
832 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
833 {
834   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
835   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
836   MatScalar       *v;
837   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
838 
839   PetscFunctionBegin;
840   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
841   if (yy != xx) {
842     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
843   } else {
844     y = x;
845   }
846   if (zz != yy) {
847     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
848     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
849     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
850   } else {
851     z = y;
852   }
853 
854   v     = a->a;
855   xb = x;
856 
857   for (i=0; i<mbs; i++) {
858     n  = ai[1] - ai[0]; /* length of i_th block row of A */
859     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
860     ib = aj + *ai;
861     jmin = 0;
862     if (*ib == i){      /* (diag of A)*x */
863       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
864       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
865       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
866       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
867       v += 16; jmin++;
868     }
869     for (j=jmin; j<n; j++) {
870       /* (strict lower triangular part of A)*x  */
871       cval       = ib[j]*4;
872       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
873       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
874       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
875       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
876       /* (strict upper triangular part of A)*x  */
877       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
878       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
879       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
880       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
881       v  += 16;
882     }
883     xb +=4; ai++;
884   }
885 
886   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
887   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
888   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
889 
890   PetscLogFlops(32*(a->nz*2 - A->m));
891   PetscFunctionReturn(0);
892 }
893 
894 #undef __FUNCT__
895 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
896 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
897 {
898   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
899   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
900   MatScalar       *v;
901   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
902 
903   PetscFunctionBegin;
904   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
905   if (yy != xx) {
906     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
907   } else {
908     y = x;
909   }
910   if (zz != yy) {
911     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
912     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
913     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
914   } else {
915     z = y;
916   }
917 
918   v     = a->a;
919   xb = x;
920 
921   for (i=0; i<mbs; i++) {
922     n  = ai[1] - ai[0]; /* length of i_th block row of A */
923     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
924     ib = aj + *ai;
925     jmin = 0;
926     if (*ib == i){      /* (diag of A)*x */
927       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
928       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
929       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
930       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
931       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
932       v += 25; jmin++;
933     }
934     for (j=jmin; j<n; j++) {
935       /* (strict lower triangular part of A)*x  */
936       cval       = ib[j]*5;
937       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
938       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
939       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
940       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
941       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
942       /* (strict upper triangular part of A)*x  */
943       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
944       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
945       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
946       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
947       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
948       v  += 25;
949     }
950     xb +=5; ai++;
951   }
952 
953   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
954   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
955   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
956 
957   PetscLogFlops(50*(a->nz*2 - A->m));
958   PetscFunctionReturn(0);
959 }
960 #undef __FUNCT__
961 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
962 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
963 {
964   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
965   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
966   MatScalar       *v;
967   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
968 
969   PetscFunctionBegin;
970   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
971   if (yy != xx) {
972     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
973   } else {
974     y = x;
975   }
976   if (zz != yy) {
977     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
978     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
979     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
980   } else {
981     z = y;
982   }
983 
984   v     = a->a;
985   xb = x;
986 
987   for (i=0; i<mbs; i++) {
988     n  = ai[1] - ai[0]; /* length of i_th block row of A */
989     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
990     ib = aj + *ai;
991     jmin = 0;
992     if (*ib == i){     /* (diag of A)*x */
993       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
994       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
995       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
996       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
997       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
998       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
999       v += 36; jmin++;
1000     }
1001     for (j=jmin; j<n; j++) {
1002       /* (strict lower triangular part of A)*x  */
1003       cval       = ib[j]*6;
1004       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1005       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1006       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1007       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1008       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1009       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1010       /* (strict upper triangular part of A)*x  */
1011       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1012       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1013       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1014       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1015       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1016       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1017       v  += 36;
1018     }
1019     xb +=6; ai++;
1020   }
1021 
1022   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1023   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1024   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1025 
1026   PetscLogFlops(72*(a->nz*2 - A->m));
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 #undef __FUNCT__
1031 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1032 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1033 {
1034   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1035   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1036   MatScalar       *v;
1037   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1038 
1039   PetscFunctionBegin;
1040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1041   if (yy != xx) {
1042     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1043   } else {
1044     y = x;
1045   }
1046   if (zz != yy) {
1047     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1048     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1049     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1050   } else {
1051     z = y;
1052   }
1053 
1054   v     = a->a;
1055   xb = x;
1056 
1057   for (i=0; i<mbs; i++) {
1058     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1059     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1060     ib = aj + *ai;
1061     jmin = 0;
1062     if (*ib == i){     /* (diag of A)*x */
1063       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1064       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1065       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1066       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1067       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1068       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1069       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1070       v += 49; jmin++;
1071     }
1072     for (j=jmin; j<n; j++) {
1073       /* (strict lower triangular part of A)*x  */
1074       cval       = ib[j]*7;
1075       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1076       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1077       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1078       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1079       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1080       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1081       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1082       /* (strict upper triangular part of A)*x  */
1083       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1084       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1085       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1086       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1087       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1088       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1089       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1090       v  += 49;
1091     }
1092     xb +=7; ai++;
1093   }
1094 
1095   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1096   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1097   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1098 
1099   PetscLogFlops(98*(a->nz*2 - A->m));
1100   PetscFunctionReturn(0);
1101 }
1102 
1103 #undef __FUNCT__
1104 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1105 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1106 {
1107   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1108   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1109   MatScalar       *v;
1110   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1111   int             ncols,k;
1112 
1113   PetscFunctionBegin;
1114   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1115   if (yy != xx) {
1116     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1117   } else {
1118     y = x;
1119   }
1120   if (zz != yy) {
1121     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1122     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1123     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1124   } else {
1125     z = y;
1126   }
1127 
1128   aj   = a->j;
1129   v    = a->a;
1130   ii   = a->i;
1131 
1132   if (!a->mult_work) {
1133     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1134   }
1135   work = a->mult_work;
1136 
1137 
1138   for (i=0; i<mbs; i++) {
1139     n     = ii[1] - ii[0]; ncols = n*bs;
1140     workt = work; idx=aj+ii[0];
1141 
1142     /* upper triangular part */
1143     for (j=0; j<n; j++) {
1144       xb = x_ptr + bs*(*idx++);
1145       for (k=0; k<bs; k++) workt[k] = xb[k];
1146       workt += bs;
1147     }
1148     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1149     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1150 
1151     /* strict lower triangular part */
1152     idx = aj+ii[0];
1153     if (*idx == i){
1154       ncols -= bs; v += bs2; idx++; n--;
1155     }
1156     if (ncols > 0){
1157       workt = work;
1158       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1159       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1160       for (j=0; j<n; j++) {
1161         zb = z_ptr + bs*(*idx++);
1162         /* idx++; */
1163         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1164         workt += bs;
1165       }
1166     }
1167 
1168     x += bs; v += n*bs2; z += bs; ii++;
1169   }
1170 
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1173   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1174 
1175   PetscLogFlops(2*(a->nz*2 - A->m));
1176   PetscFunctionReturn(0);
1177 }
1178 
1179 #undef __FUNCT__
1180 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1181 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1182 {
1183   int ierr;
1184 
1185   PetscFunctionBegin;
1186   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 #undef __FUNCT__
1191 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1192 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1193 
1194 {
1195   int ierr;
1196 
1197   PetscFunctionBegin;
1198   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 #undef __FUNCT__
1203 #define __FUNCT__ "MatScale_SeqSBAIJ"
1204 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1205 {
1206   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1207   int         one = 1,totalnz = a->bs2*a->nz;
1208 
1209   PetscFunctionBegin;
1210   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1211   PetscLogFlops(totalnz);
1212   PetscFunctionReturn(0);
1213 }
1214 
1215 #undef __FUNCT__
1216 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1217 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1218 {
1219   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1220   MatScalar   *v = a->a;
1221   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1222   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1223   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1224 
1225   PetscFunctionBegin;
1226   if (type == NORM_FROBENIUS) {
1227     for (k=0; k<mbs; k++){
1228       jmin = a->i[k]; jmax = a->i[k+1];
1229       col  = aj + jmin;
1230       if (*col == k){         /* diagonal block */
1231         for (i=0; i<bs2; i++){
1232 #if defined(PETSC_USE_COMPLEX)
1233           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1234 #else
1235           sum_diag += (*v)*(*v); v++;
1236 #endif
1237         }
1238         jmin++;
1239       }
1240       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1241         for (i=0; i<bs2; i++){
1242 #if defined(PETSC_USE_COMPLEX)
1243           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1244 #else
1245           sum_off += (*v)*(*v); v++;
1246 #endif
1247         }
1248       }
1249     }
1250     *norm = sqrt(sum_diag + 2*sum_off);
1251 
1252   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1253     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1254     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1255     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1256     for (i=0; i<mbs; i++) {
1257       jl[i] = mbs; il[0] = 0;
1258     }
1259 
1260     *norm = 0.0;
1261     for (k=0; k<mbs; k++) { /* k_th block row */
1262       for (j=0; j<bs; j++) sum[j]=0.0;
1263 
1264       /*-- col sum --*/
1265       i = jl[k]; /* first |A(i,k)| to be added */
1266       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1267                   at step k */
1268       while (i<mbs){
1269         nexti = jl[i];  /* next block row to be added */
1270         ik    = il[i];  /* block index of A(i,k) in the array a */
1271         for (j=0; j<bs; j++){
1272           v = a->a + ik*bs2 + j*bs;
1273           for (k1=0; k1<bs; k1++) {
1274             sum[j] += PetscAbsScalar(*v); v++;
1275           }
1276         }
1277         /* update il, jl */
1278         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1279         jmax = a->i[i+1];
1280         if (jmin < jmax){
1281           il[i] = jmin;
1282           j   = a->j[jmin];
1283           jl[i] = jl[j]; jl[j]=i;
1284         }
1285         i = nexti;
1286       }
1287 
1288       /*-- row sum --*/
1289       jmin = a->i[k]; jmax = a->i[k+1];
1290       for (i=jmin; i<jmax; i++) {
1291         for (j=0; j<bs; j++){
1292           v = a->a + i*bs2 + j;
1293           for (k1=0; k1<bs; k1++){
1294             sum[j] += PetscAbsScalar(*v);
1295             v   += bs;
1296           }
1297         }
1298       }
1299       /* add k_th block row to il, jl */
1300       col = aj+jmin;
1301       if (*col == k) jmin++;
1302       if (jmin < jmax){
1303         il[k] = jmin;
1304         j   = a->j[jmin];
1305         jl[k] = jl[j]; jl[j] = k;
1306       }
1307       for (j=0; j<bs; j++){
1308         if (sum[j] > *norm) *norm = sum[j];
1309       }
1310     }
1311     ierr = PetscFree(il);CHKERRQ(ierr);
1312     ierr = PetscFree(jl);CHKERRQ(ierr);
1313     ierr = PetscFree(sum);CHKERRQ(ierr);
1314   } else {
1315     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1316   }
1317   PetscFunctionReturn(0);
1318 }
1319 
1320 #undef __FUNCT__
1321 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1322 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1323 {
1324   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1325   int          ierr;
1326 
1327   PetscFunctionBegin;
1328 
1329   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1330   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1331     *flg = PETSC_FALSE;
1332     PetscFunctionReturn(0);
1333   }
1334 
1335   /* if the a->i are the same */
1336   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1337   if (*flg == PETSC_FALSE) {
1338     PetscFunctionReturn(0);
1339   }
1340 
1341   /* if a->j are the same */
1342   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1343   if (*flg == PETSC_FALSE) {
1344     PetscFunctionReturn(0);
1345   }
1346   /* if a->a are the same */
1347   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1348 
1349   PetscFunctionReturn(0);
1350 }
1351 
1352 #undef __FUNCT__
1353 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1354 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1355 {
1356   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1357   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1358   PetscScalar  *x,zero = 0.0;
1359   MatScalar    *aa,*aa_j;
1360 
1361   PetscFunctionBegin;
1362   bs   = a->bs;
1363   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1364 
1365   aa   = a->a;
1366   ai   = a->i;
1367   aj   = a->j;
1368   ambs = a->mbs;
1369   bs2  = a->bs2;
1370 
1371   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1372   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1373   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1374   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1375   for (i=0; i<ambs; i++) {
1376     j=ai[i];
1377     if (aj[j] == i) {             /* if this is a diagonal element */
1378       row  = i*bs;
1379       aa_j = aa + j*bs2;
1380       if (A->factor && bs==1){
1381         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1382       } else {
1383         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1384       }
1385     }
1386   }
1387 
1388   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1389   PetscFunctionReturn(0);
1390 }
1391 
1392 #undef __FUNCT__
1393 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1394 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1395 {
1396   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1397   PetscScalar  *l,*r,x,*li,*ri;
1398   MatScalar    *aa,*v;
1399   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1400 
1401   PetscFunctionBegin;
1402   ai  = a->i;
1403   aj  = a->j;
1404   aa  = a->a;
1405   m   = A->m;
1406   bs  = a->bs;
1407   mbs = a->mbs;
1408   bs2 = a->bs2;
1409 
1410   if (ll != rr) {
1411     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1412   }
1413   if (ll) {
1414     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1415     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1416     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1417     for (i=0; i<mbs; i++) { /* for each block row */
1418       M  = ai[i+1] - ai[i];
1419       li = l + i*bs;
1420       v  = aa + bs2*ai[i];
1421       for (j=0; j<M; j++) { /* for each block */
1422         for (k=0; k<bs2; k++) {
1423           (*v++) *= li[k%bs];
1424         }
1425 #ifdef CONT
1426         /* will be used to replace the above loop */
1427         ri = l + bs*aj[ai[i]+j];
1428         for (k=0; k<bs; k++) { /* column value */
1429           x = ri[k];
1430           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1431         }
1432 #endif
1433 
1434       }
1435     }
1436     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1437     PetscLogFlops(2*a->nz);
1438   }
1439   /* will be deleted */
1440   if (rr) {
1441     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1442     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1443     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1444     for (i=0; i<mbs; i++) { /* for each block row */
1445       M  = ai[i+1] - ai[i];
1446       v  = aa + bs2*ai[i];
1447       for (j=0; j<M; j++) { /* for each block */
1448         ri = r + bs*aj[ai[i]+j];
1449         for (k=0; k<bs; k++) {
1450           x = ri[k];
1451           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1452         }
1453       }
1454     }
1455     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1456     PetscLogFlops(a->nz);
1457   }
1458   PetscFunctionReturn(0);
1459 }
1460 
1461 #undef __FUNCT__
1462 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1463 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1464 {
1465   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1466 
1467   PetscFunctionBegin;
1468   info->rows_global    = (double)A->m;
1469   info->columns_global = (double)A->m;
1470   info->rows_local     = (double)A->m;
1471   info->columns_local  = (double)A->m;
1472   info->block_size     = a->bs2;
1473   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1474   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1475   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1476   info->assemblies   = A->num_ass;
1477   info->mallocs      = a->reallocs;
1478   info->memory       = A->mem;
1479   if (A->factor) {
1480     info->fill_ratio_given  = A->info.fill_ratio_given;
1481     info->fill_ratio_needed = A->info.fill_ratio_needed;
1482     info->factor_mallocs    = A->info.factor_mallocs;
1483   } else {
1484     info->fill_ratio_given  = 0;
1485     info->fill_ratio_needed = 0;
1486     info->factor_mallocs    = 0;
1487   }
1488   PetscFunctionReturn(0);
1489 }
1490 
1491 
1492 #undef __FUNCT__
1493 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1494 int MatZeroEntries_SeqSBAIJ(Mat A)
1495 {
1496   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1497   int         ierr;
1498 
1499   PetscFunctionBegin;
1500   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1501   PetscFunctionReturn(0);
1502 }
1503 
1504 #undef __FUNCT__
1505 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1506 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1507 {
1508   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1509   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1510   PetscReal    atmp;
1511   MatScalar    *aa;
1512   PetscScalar  zero = 0.0,*x;
1513   int          ncols,brow,bcol,krow,kcol;
1514 
1515   PetscFunctionBegin;
1516   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1517   bs   = a->bs;
1518   aa   = a->a;
1519   ai   = a->i;
1520   aj   = a->j;
1521   mbs = a->mbs;
1522 
1523   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1524   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1525   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1526   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1527   for (i=0; i<mbs; i++) {
1528     ncols = ai[1] - ai[0]; ai++;
1529     brow  = bs*i;
1530     for (j=0; j<ncols; j++){
1531       bcol = bs*(*aj);
1532       for (kcol=0; kcol<bs; kcol++){
1533         col = bcol + kcol;      /* col index */
1534         for (krow=0; krow<bs; krow++){
1535           atmp = PetscAbsScalar(*aa); aa++;
1536           row = brow + krow;    /* row index */
1537           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1538           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1539           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1540         }
1541       }
1542       aj++;
1543     }
1544   }
1545   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1546   PetscFunctionReturn(0);
1547 }
1548