xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision b3bf805bc5fa6ada2d57fa222e3d33ae005e5770)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/inline/spops.h"
4 #include "src/inline/ilu.h"
5 #include "petscbt.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   PetscBT        table,table0;
16 
17   PetscFunctionBegin;
18   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19   mbs = a->mbs;
20   ai  = a->i;
21   aj  = a->j;
22   bs  = A->bs;
23   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
24   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
25   ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
26   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) { /* for each is */
29     isz  = 0;
30     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
31 
32     /* Extract the indices, assume there can be duplicate entries */
33     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35 
36     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
37     bcol_max = 0;
38     for (j=0; j<n ; ++j){
39       brow = idx[j]/bs; /* convert the indices into block indices */
40       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if(!PetscBTLookupSet(table,brow)) {
42         nidx[isz++] = brow;
43         if (bcol_max < brow) bcol_max = brow;
44       }
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap */
51       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
53       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
54 
55       n = isz;  /* length of the updated is[i] */
56       for (brow=0; brow<mbs; brow++){
57         start = ai[brow]; end   = ai[brow+1];
58         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
59           for (l = start; l<end ; l++){
60             bcol = aj[l];
61             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
62           }
63           k++;
64           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
65         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
66           for (l = start; l<end ; l++){
67             bcol = aj[l];
68             if (bcol > bcol_max) break;
69             if (PetscBTLookup(table0,bcol)){
70               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
71               break; /* for l = start; l<end ; l++) */
72             }
73           }
74         }
75       }
76     } /* for each overlap */
77 
78     /* expand the Index Set */
79     for (j=0; j<isz; j++) {
80       for (k=0; k<bs; k++)
81         nidx2[j*bs+k] = nidx[j]*bs+k;
82     }
83     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
84   }
85   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
86   ierr = PetscFree(nidx);CHKERRQ(ierr);
87   ierr = PetscFree(nidx2);CHKERRQ(ierr);
88   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
89   PetscFunctionReturn(0);
90 }
91 
92 #undef __FUNCT__
93 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
94 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
95 {
96   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
97   PetscErrorCode ierr;
98   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
99   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
100   PetscInt       *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2;
101   PetscInt       *aj = a->j,*ai = a->i;
102   MatScalar      *mat_a;
103   Mat            C;
104   PetscTruth     flag;
105 
106   PetscFunctionBegin;
107   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
108   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
109   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
110 
111   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
112   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
113 
114   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
115   ssmap = smap;
116   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
117   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
118   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
119   /* determine lens of each row */
120   for (i=0; i<nrows; i++) {
121     kstart  = ai[irow[i]];
122     kend    = kstart + a->ilen[irow[i]];
123     lens[i] = 0;
124       for (k=kstart; k<kend; k++) {
125         if (ssmap[aj[k]]) {
126           lens[i]++;
127         }
128       }
129     }
130   /* Create and fill new matrix */
131   if (scall == MAT_REUSE_MATRIX) {
132     c = (Mat_SeqSBAIJ *)((*B)->data);
133 
134     if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
135     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
136     if (!flag) {
137       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
138     }
139     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
140     C = *B;
141   } else {
142     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
143     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
144     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
145   }
146   c = (Mat_SeqSBAIJ *)(C->data);
147   for (i=0; i<nrows; i++) {
148     row    = irow[i];
149     kstart = ai[row];
150     kend   = kstart + a->ilen[row];
151     mat_i  = c->i[i];
152     mat_j  = c->j + mat_i;
153     mat_a  = c->a + mat_i*bs2;
154     mat_ilen = c->ilen + i;
155     for (k=kstart; k<kend; k++) {
156       if ((tcol=ssmap[a->j[k]])) {
157         *mat_j++ = tcol - 1;
158         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
159         mat_a   += bs2;
160         (*mat_ilen)++;
161       }
162     }
163   }
164 
165   /* Free work space */
166   ierr = PetscFree(smap);CHKERRQ(ierr);
167   ierr = PetscFree(lens);CHKERRQ(ierr);
168   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
170 
171   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
172   *B = C;
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
179 {
180   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
181   IS             is1;
182   PetscErrorCode ierr;
183   PetscInt       *vary,*iary,*irow,nrows,i,bs=A->bs,count;
184 
185   PetscFunctionBegin;
186   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
187 
188   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
189   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
190 
191   /* Verify if the indices corespond to each element in a block
192    and form the IS with compressed IS */
193   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
194   iary = vary + a->mbs;
195   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
196   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
197 
198   count = 0;
199   for (i=0; i<a->mbs; i++) {
200     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
201     if (vary[i]==bs) iary[count++] = i;
202   }
203   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
204 
205   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
206   ierr = PetscFree(vary);CHKERRQ(ierr);
207 
208   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
209   ISDestroy(is1);
210   PetscFunctionReturn(0);
211 }
212 
213 #undef __FUNCT__
214 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
215 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
216 {
217   PetscErrorCode ierr;
218   PetscInt       i;
219 
220   PetscFunctionBegin;
221   if (scall == MAT_INITIAL_MATRIX) {
222     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
223   }
224 
225   for (i=0; i<n; i++) {
226     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
227   }
228   PetscFunctionReturn(0);
229 }
230 
231 /* -------------------------------------------------------*/
232 /* Should check that shapes of vectors and matrices match */
233 /* -------------------------------------------------------*/
234 #include "petscblaslapack.h"
235 
236 #undef __FUNCT__
237 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
238 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
239 {
240   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
241   PetscScalar    *x,*z,*xb,x1,zero=0.0;
242   MatScalar      *v;
243   PetscErrorCode ierr;
244   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
245 
246   PetscFunctionBegin;
247   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
250 
251   v  = a->a;
252   xb = x;
253 
254   for (i=0; i<mbs; i++) {
255     n  = ai[1] - ai[0];  /* length of i_th row of A */
256     x1 = xb[0];
257     ib = aj + *ai;
258     jmin = 0;
259     if (*ib == i) {      /* (diag of A)*x */
260       z[i] += *v++ * x[*ib++];
261       jmin++;
262     }
263     for (j=jmin; j<n; j++) {
264       cval    = *ib;
265       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
266       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
267     }
268     xb++; ai++;
269   }
270 
271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
272   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
273   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
279 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
282   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
283   MatScalar      *v;
284   PetscErrorCode ierr;
285   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
286 
287   PetscFunctionBegin;
288   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
289   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
290   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
291 
292   v     = a->a;
293   xb = x;
294 
295   for (i=0; i<mbs; i++) {
296     n  = ai[1] - ai[0]; /* length of i_th block row of A */
297     x1 = xb[0]; x2 = xb[1];
298     ib = aj + *ai;
299     jmin = 0;
300     if (*ib == i){     /* (diag of A)*x */
301       z[2*i]   += v[0]*x1 + v[2]*x2;
302       z[2*i+1] += v[2]*x1 + v[3]*x2;
303       v += 4; jmin++;
304     }
305     for (j=jmin; j<n; j++) {
306       /* (strict lower triangular part of A)*x  */
307       cval       = ib[j]*2;
308       z[cval]     += v[0]*x1 + v[1]*x2;
309       z[cval+1]   += v[2]*x1 + v[3]*x2;
310       /* (strict upper triangular part of A)*x  */
311       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
312       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
313       v  += 4;
314     }
315     xb +=2; ai++;
316   }
317 
318   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
319   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
320   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
321   PetscFunctionReturn(0);
322 }
323 
324 #undef __FUNCT__
325 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
326 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
327 {
328   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
329   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
330   MatScalar      *v;
331   PetscErrorCode ierr;
332   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
333 
334   PetscFunctionBegin;
335   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
336   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
337   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
338 
339   v    = a->a;
340   xb   = x;
341 
342   for (i=0; i<mbs; i++) {
343     n  = ai[1] - ai[0]; /* length of i_th block row of A */
344     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
345     ib = aj + *ai;
346     jmin = 0;
347     if (*ib == i){     /* (diag of A)*x */
348       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
349       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
350       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
351       v += 9; jmin++;
352     }
353     for (j=jmin; j<n; j++) {
354       /* (strict lower triangular part of A)*x  */
355       cval       = ib[j]*3;
356       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
357       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
358       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
359       /* (strict upper triangular part of A)*x  */
360       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
361       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
362       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
363       v  += 9;
364     }
365     xb +=3; ai++;
366   }
367 
368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
369   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
370   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
371   PetscFunctionReturn(0);
372 }
373 
374 #undef __FUNCT__
375 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
376 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
377 {
378   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
379   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
380   MatScalar      *v;
381   PetscErrorCode ierr;
382   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
383 
384   PetscFunctionBegin;
385   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
386   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
387   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
388 
389   v     = a->a;
390   xb = x;
391 
392   for (i=0; i<mbs; i++) {
393     n  = ai[1] - ai[0]; /* length of i_th block row of A */
394     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
395     ib = aj + *ai;
396     jmin = 0;
397     if (*ib == i){     /* (diag of A)*x */
398       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
399       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
400       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
401       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
402       v += 16; jmin++;
403     }
404     for (j=jmin; j<n; j++) {
405       /* (strict lower triangular part of A)*x  */
406       cval       = ib[j]*4;
407       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
408       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
409       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
410       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
411       /* (strict upper triangular part of A)*x  */
412       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
413       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
414       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
415       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
416       v  += 16;
417     }
418     xb +=4; ai++;
419   }
420 
421   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
422   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
423   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
424   PetscFunctionReturn(0);
425 }
426 
427 #undef __FUNCT__
428 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
429 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
430 {
431   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
432   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
433   MatScalar      *v;
434   PetscErrorCode ierr;
435   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
436 
437   PetscFunctionBegin;
438   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
439   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
440   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
441 
442   v     = a->a;
443   xb = x;
444 
445   for (i=0; i<mbs; i++) {
446     n  = ai[1] - ai[0]; /* length of i_th block row of A */
447     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
448     ib = aj + *ai;
449     jmin = 0;
450     if (*ib == i){      /* (diag of A)*x */
451       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
452       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
453       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
454       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
455       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
456       v += 25; jmin++;
457     }
458     for (j=jmin; j<n; j++) {
459       /* (strict lower triangular part of A)*x  */
460       cval       = ib[j]*5;
461       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
462       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
463       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
464       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
465       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
466       /* (strict upper triangular part of A)*x  */
467       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
468       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
469       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
470       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
471       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
472       v  += 25;
473     }
474     xb +=5; ai++;
475   }
476 
477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
478   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
479   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
480   PetscFunctionReturn(0);
481 }
482 
483 
484 #undef __FUNCT__
485 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
486 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
487 {
488   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
489   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
490   MatScalar      *v;
491   PetscErrorCode ierr;
492   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
493 
494   PetscFunctionBegin;
495   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
497   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
498 
499   v     = a->a;
500   xb = x;
501 
502   for (i=0; i<mbs; i++) {
503     n  = ai[1] - ai[0]; /* length of i_th block row of A */
504     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
505     ib = aj + *ai;
506     jmin = 0;
507     if (*ib == i){      /* (diag of A)*x */
508       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
509       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
510       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
511       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
512       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
513       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
514       v += 36; jmin++;
515     }
516     for (j=jmin; j<n; j++) {
517       /* (strict lower triangular part of A)*x  */
518       cval       = ib[j]*6;
519       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
520       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
521       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
522       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
523       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
524       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
525       /* (strict upper triangular part of A)*x  */
526       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
527       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
528       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
529       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
530       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
531       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
532       v  += 36;
533     }
534     xb +=6; ai++;
535   }
536 
537   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
538   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
539   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
540   PetscFunctionReturn(0);
541 }
542 #undef __FUNCT__
543 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
544 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
545 {
546   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
547   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
548   MatScalar      *v;
549   PetscErrorCode ierr;
550   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
551 
552   PetscFunctionBegin;
553   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
554   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
555   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
556 
557   v     = a->a;
558   xb = x;
559 
560   for (i=0; i<mbs; i++) {
561     n  = ai[1] - ai[0]; /* length of i_th block row of A */
562     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
563     ib = aj + *ai;
564     jmin = 0;
565     if (*ib == i){      /* (diag of A)*x */
566       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
567       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
568       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
569       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
570       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
571       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
572       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
573       v += 49; jmin++;
574     }
575     for (j=jmin; j<n; j++) {
576       /* (strict lower triangular part of A)*x  */
577       cval       = ib[j]*7;
578       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
579       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
580       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
581       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
582       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
583       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
584       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
585       /* (strict upper triangular part of A)*x  */
586       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
587       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
588       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
589       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
590       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
591       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
592       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
593       v  += 49;
594     }
595     xb +=7; ai++;
596   }
597   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
598   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
599   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
600   PetscFunctionReturn(0);
601 }
602 
603 /*
604     This will not work with MatScalar == float because it calls the BLAS
605 */
606 #undef __FUNCT__
607 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
608 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
609 {
610   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
611   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
612   MatScalar      *v;
613   PetscErrorCode ierr;
614   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
615 
616   PetscFunctionBegin;
617   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
618   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
619   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
620 
621   aj   = a->j;
622   v    = a->a;
623   ii   = a->i;
624 
625   if (!a->mult_work) {
626     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
627   }
628   work = a->mult_work;
629 
630   for (i=0; i<mbs; i++) {
631     n     = ii[1] - ii[0]; ncols = n*bs;
632     workt = work; idx=aj+ii[0];
633 
634     /* upper triangular part */
635     for (j=0; j<n; j++) {
636       xb = x_ptr + bs*(*idx++);
637       for (k=0; k<bs; k++) workt[k] = xb[k];
638       workt += bs;
639     }
640     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
641     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
642 
643     /* strict lower triangular part */
644     idx = aj+ii[0];
645     if (*idx == i){
646       ncols -= bs; v += bs2; idx++; n--;
647     }
648 
649     if (ncols > 0){
650       workt = work;
651       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
652       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
653       for (j=0; j<n; j++) {
654         zb = z_ptr + bs*(*idx++);
655         for (k=0; k<bs; k++) zb[k] += workt[k] ;
656         workt += bs;
657       }
658     }
659     x += bs; v += n*bs2; z += bs; ii++;
660   }
661 
662   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
663   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
664   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
665   PetscFunctionReturn(0);
666 }
667 
668 #undef __FUNCT__
669 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
670 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
671 {
672   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
673   PetscScalar    *x,*y,*z,*xb,x1;
674   MatScalar      *v;
675   PetscErrorCode ierr;
676   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
677 
678   PetscFunctionBegin;
679   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
680   if (yy != xx) {
681     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
682   } else {
683     y = x;
684   }
685   if (zz != yy) {
686     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
687     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
688     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
689   } else {
690     z = y;
691   }
692 
693   v  = a->a;
694   xb = x;
695 
696   for (i=0; i<mbs; i++) {
697     n  = ai[1] - ai[0];  /* length of i_th row of A */
698     x1 = xb[0];
699     ib = aj + *ai;
700     jmin = 0;
701     if (*ib == i) {            /* (diag of A)*x */
702       z[i] += *v++ * x[*ib++]; jmin++;
703     }
704     for (j=jmin; j<n; j++) {
705       cval    = *ib;
706       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
707       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
708     }
709     xb++; ai++;
710   }
711 
712   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
713   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
714   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
715 
716   PetscLogFlops(2*(a->nz*2 - A->m));
717   PetscFunctionReturn(0);
718 }
719 
720 #undef __FUNCT__
721 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
722 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
723 {
724   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
725   PetscScalar    *x,*y,*z,*xb,x1,x2;
726   MatScalar      *v;
727   PetscErrorCode ierr;
728   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
729 
730   PetscFunctionBegin;
731   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
732   if (yy != xx) {
733     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
734   } else {
735     y = x;
736   }
737   if (zz != yy) {
738     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
739     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
740     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
741   } else {
742     z = y;
743   }
744 
745   v     = a->a;
746   xb = x;
747 
748   for (i=0; i<mbs; i++) {
749     n  = ai[1] - ai[0]; /* length of i_th block row of A */
750     x1 = xb[0]; x2 = xb[1];
751     ib = aj + *ai;
752     jmin = 0;
753     if (*ib == i){      /* (diag of A)*x */
754       z[2*i]   += v[0]*x1 + v[2]*x2;
755       z[2*i+1] += v[2]*x1 + v[3]*x2;
756       v += 4; jmin++;
757     }
758     for (j=jmin; j<n; j++) {
759       /* (strict lower triangular part of A)*x  */
760       cval       = ib[j]*2;
761       z[cval]     += v[0]*x1 + v[1]*x2;
762       z[cval+1]   += v[2]*x1 + v[3]*x2;
763       /* (strict upper triangular part of A)*x  */
764       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
765       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
766       v  += 4;
767     }
768     xb +=2; ai++;
769   }
770 
771   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
772   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
773   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
774 
775   PetscLogFlops(4*(a->nz*2 - A->m));
776   PetscFunctionReturn(0);
777 }
778 
779 #undef __FUNCT__
780 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
781 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
782 {
783   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
784   PetscScalar    *x,*y,*z,*xb,x1,x2,x3;
785   MatScalar      *v;
786   PetscErrorCode ierr;
787   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
788 
789   PetscFunctionBegin;
790   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
791   if (yy != xx) {
792     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
793   } else {
794     y = x;
795   }
796   if (zz != yy) {
797     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
798     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
799     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
800   } else {
801     z = y;
802   }
803 
804   v     = a->a;
805   xb = x;
806 
807   for (i=0; i<mbs; i++) {
808     n  = ai[1] - ai[0]; /* length of i_th block row of A */
809     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
810     ib = aj + *ai;
811     jmin = 0;
812     if (*ib == i){     /* (diag of A)*x */
813      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
814      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
815      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
816      v += 9; jmin++;
817     }
818     for (j=jmin; j<n; j++) {
819       /* (strict lower triangular part of A)*x  */
820       cval       = ib[j]*3;
821       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
822       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
823       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
824       /* (strict upper triangular part of A)*x  */
825       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
826       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
827       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
828       v  += 9;
829     }
830     xb +=3; ai++;
831   }
832 
833   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
834   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
835   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
836 
837   PetscLogFlops(18*(a->nz*2 - A->m));
838   PetscFunctionReturn(0);
839 }
840 
841 #undef __FUNCT__
842 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
843 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
844 {
845   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
846   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4;
847   MatScalar      *v;
848   PetscErrorCode ierr;
849   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
850 
851   PetscFunctionBegin;
852   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
853   if (yy != xx) {
854     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
855   } else {
856     y = x;
857   }
858   if (zz != yy) {
859     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
860     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
861     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
862   } else {
863     z = y;
864   }
865 
866   v     = a->a;
867   xb = x;
868 
869   for (i=0; i<mbs; i++) {
870     n  = ai[1] - ai[0]; /* length of i_th block row of A */
871     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
872     ib = aj + *ai;
873     jmin = 0;
874     if (*ib == i){      /* (diag of A)*x */
875       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
876       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
877       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
878       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
879       v += 16; jmin++;
880     }
881     for (j=jmin; j<n; j++) {
882       /* (strict lower triangular part of A)*x  */
883       cval       = ib[j]*4;
884       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
885       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
886       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
887       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
888       /* (strict upper triangular part of A)*x  */
889       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
890       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
891       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
892       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
893       v  += 16;
894     }
895     xb +=4; ai++;
896   }
897 
898   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
899   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
900   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
901 
902   PetscLogFlops(32*(a->nz*2 - A->m));
903   PetscFunctionReturn(0);
904 }
905 
906 #undef __FUNCT__
907 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
908 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
909 {
910   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
911   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5;
912   MatScalar      *v;
913   PetscErrorCode ierr;
914   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
915 
916   PetscFunctionBegin;
917   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
918   if (yy != xx) {
919     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
920   } else {
921     y = x;
922   }
923   if (zz != yy) {
924     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
925     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
926     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
927   } else {
928     z = y;
929   }
930 
931   v     = a->a;
932   xb = x;
933 
934   for (i=0; i<mbs; i++) {
935     n  = ai[1] - ai[0]; /* length of i_th block row of A */
936     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
937     ib = aj + *ai;
938     jmin = 0;
939     if (*ib == i){      /* (diag of A)*x */
940       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
941       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
942       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
943       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
944       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
945       v += 25; jmin++;
946     }
947     for (j=jmin; j<n; j++) {
948       /* (strict lower triangular part of A)*x  */
949       cval       = ib[j]*5;
950       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
951       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
952       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
953       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
954       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
955       /* (strict upper triangular part of A)*x  */
956       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
957       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
958       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
959       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
960       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
961       v  += 25;
962     }
963     xb +=5; ai++;
964   }
965 
966   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
967   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
968   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
969 
970   PetscLogFlops(50*(a->nz*2 - A->m));
971   PetscFunctionReturn(0);
972 }
973 #undef __FUNCT__
974 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
975 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
976 {
977   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
978   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
979   MatScalar      *v;
980   PetscErrorCode ierr;
981   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
982 
983   PetscFunctionBegin;
984   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
985   if (yy != xx) {
986     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
987   } else {
988     y = x;
989   }
990   if (zz != yy) {
991     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
992     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
993   } else {
994     z = y;
995   }
996 
997   v     = a->a;
998   xb = x;
999 
1000   for (i=0; i<mbs; i++) {
1001     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1002     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1003     ib = aj + *ai;
1004     jmin = 0;
1005     if (*ib == i){     /* (diag of A)*x */
1006       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1007       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1008       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1009       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1010       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1011       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1012       v += 36; jmin++;
1013     }
1014     for (j=jmin; j<n; j++) {
1015       /* (strict lower triangular part of A)*x  */
1016       cval       = ib[j]*6;
1017       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1018       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1019       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1020       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1021       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1022       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1023       /* (strict upper triangular part of A)*x  */
1024       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1025       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1026       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1027       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1028       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1029       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1030       v  += 36;
1031     }
1032     xb +=6; ai++;
1033   }
1034 
1035   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1036   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1037   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1038 
1039   PetscLogFlops(72*(a->nz*2 - A->m));
1040   PetscFunctionReturn(0);
1041 }
1042 
1043 #undef __FUNCT__
1044 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1045 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1046 {
1047   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1048   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1049   MatScalar      *v;
1050   PetscErrorCode ierr;
1051   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1052 
1053   PetscFunctionBegin;
1054   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1055   if (yy != xx) {
1056     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1057   } else {
1058     y = x;
1059   }
1060   if (zz != yy) {
1061     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1062     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1063   } else {
1064     z = y;
1065   }
1066 
1067   v     = a->a;
1068   xb = x;
1069 
1070   for (i=0; i<mbs; i++) {
1071     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1072     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1073     ib = aj + *ai;
1074     jmin = 0;
1075     if (*ib == i){     /* (diag of A)*x */
1076       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1077       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1078       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1079       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1080       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1081       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1082       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1083       v += 49; jmin++;
1084     }
1085     for (j=jmin; j<n; j++) {
1086       /* (strict lower triangular part of A)*x  */
1087       cval       = ib[j]*7;
1088       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1089       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1090       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1091       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1092       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1093       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1094       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1095       /* (strict upper triangular part of A)*x  */
1096       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1097       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1098       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1099       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1100       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1101       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1102       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1103       v  += 49;
1104     }
1105     xb +=7; ai++;
1106   }
1107 
1108   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1109   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1110   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1111 
1112   PetscLogFlops(98*(a->nz*2 - A->m));
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 #undef __FUNCT__
1117 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1118 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1119 {
1120   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1121   PetscScalar    *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1122   MatScalar      *v;
1123   PetscErrorCode ierr;
1124   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
1125 
1126   PetscFunctionBegin;
1127   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1128   if (yy != xx) {
1129     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1130   } else {
1131     y = x;
1132   }
1133   if (zz != yy) {
1134     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1135     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1136   } else {
1137     z = y;
1138   }
1139 
1140   aj   = a->j;
1141   v    = a->a;
1142   ii   = a->i;
1143 
1144   if (!a->mult_work) {
1145     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1146   }
1147   work = a->mult_work;
1148 
1149 
1150   for (i=0; i<mbs; i++) {
1151     n     = ii[1] - ii[0]; ncols = n*bs;
1152     workt = work; idx=aj+ii[0];
1153 
1154     /* upper triangular part */
1155     for (j=0; j<n; j++) {
1156       xb = x_ptr + bs*(*idx++);
1157       for (k=0; k<bs; k++) workt[k] = xb[k];
1158       workt += bs;
1159     }
1160     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1161     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1162 
1163     /* strict lower triangular part */
1164     idx = aj+ii[0];
1165     if (*idx == i){
1166       ncols -= bs; v += bs2; idx++; n--;
1167     }
1168     if (ncols > 0){
1169       workt = work;
1170       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1171       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1172       for (j=0; j<n; j++) {
1173         zb = z_ptr + bs*(*idx++);
1174         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1175         workt += bs;
1176       }
1177     }
1178 
1179     x += bs; v += n*bs2; z += bs; ii++;
1180   }
1181 
1182   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1183   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1184   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1185 
1186   PetscLogFlops(2*(a->nz*2 - A->m));
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 #undef __FUNCT__
1191 #define __FUNCT__ "MatScale_SeqSBAIJ"
1192 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1193 {
1194   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1195   PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz;
1196 
1197   PetscFunctionBegin;
1198   BLASscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1199   PetscLogFlops(totalnz);
1200   PetscFunctionReturn(0);
1201 }
1202 
1203 #undef __FUNCT__
1204 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1205 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1206 {
1207   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1208   MatScalar      *v = a->a;
1209   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1210   PetscInt       i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1211   PetscErrorCode ierr;
1212   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1213 
1214   PetscFunctionBegin;
1215   if (type == NORM_FROBENIUS) {
1216     for (k=0; k<mbs; k++){
1217       jmin = a->i[k]; jmax = a->i[k+1];
1218       col  = aj + jmin;
1219       if (*col == k){         /* diagonal block */
1220         for (i=0; i<bs2; i++){
1221 #if defined(PETSC_USE_COMPLEX)
1222           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1223 #else
1224           sum_diag += (*v)*(*v); v++;
1225 #endif
1226         }
1227         jmin++;
1228       }
1229       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1230         for (i=0; i<bs2; i++){
1231 #if defined(PETSC_USE_COMPLEX)
1232           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1233 #else
1234           sum_off += (*v)*(*v); v++;
1235 #endif
1236         }
1237       }
1238     }
1239     *norm = sqrt(sum_diag + 2*sum_off);
1240 
1241   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1242     ierr = PetscMalloc(mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1243     ierr = PetscMalloc(mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr);
1244     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1245     for (i=0; i<mbs; i++) {
1246       jl[i] = mbs; il[0] = 0;
1247     }
1248 
1249     *norm = 0.0;
1250     for (k=0; k<mbs; k++) { /* k_th block row */
1251       for (j=0; j<bs; j++) sum[j]=0.0;
1252 
1253       /*-- col sum --*/
1254       i = jl[k]; /* first |A(i,k)| to be added */
1255       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1256                   at step k */
1257       while (i<mbs){
1258         nexti = jl[i];  /* next block row to be added */
1259         ik    = il[i];  /* block index of A(i,k) in the array a */
1260         for (j=0; j<bs; j++){
1261           v = a->a + ik*bs2 + j*bs;
1262           for (k1=0; k1<bs; k1++) {
1263             sum[j] += PetscAbsScalar(*v); v++;
1264           }
1265         }
1266         /* update il, jl */
1267         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1268         jmax = a->i[i+1];
1269         if (jmin < jmax){
1270           il[i] = jmin;
1271           j   = a->j[jmin];
1272           jl[i] = jl[j]; jl[j]=i;
1273         }
1274         i = nexti;
1275       }
1276 
1277       /*-- row sum --*/
1278       jmin = a->i[k]; jmax = a->i[k+1];
1279       for (i=jmin; i<jmax; i++) {
1280         for (j=0; j<bs; j++){
1281           v = a->a + i*bs2 + j;
1282           for (k1=0; k1<bs; k1++){
1283             sum[j] += PetscAbsScalar(*v);
1284             v   += bs;
1285           }
1286         }
1287       }
1288       /* add k_th block row to il, jl */
1289       col = aj+jmin;
1290       if (*col == k) jmin++;
1291       if (jmin < jmax){
1292         il[k] = jmin;
1293         j   = a->j[jmin];
1294         jl[k] = jl[j]; jl[j] = k;
1295       }
1296       for (j=0; j<bs; j++){
1297         if (sum[j] > *norm) *norm = sum[j];
1298       }
1299     }
1300     ierr = PetscFree(il);CHKERRQ(ierr);
1301     ierr = PetscFree(jl);CHKERRQ(ierr);
1302     ierr = PetscFree(sum);CHKERRQ(ierr);
1303   } else {
1304     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1305   }
1306   PetscFunctionReturn(0);
1307 }
1308 
1309 #undef __FUNCT__
1310 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1311 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1312 {
1313   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1314   PetscErrorCode ierr;
1315 
1316   PetscFunctionBegin;
1317 
1318   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1319   if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1320     *flg = PETSC_FALSE;
1321     PetscFunctionReturn(0);
1322   }
1323 
1324   /* if the a->i are the same */
1325   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1326   if (!*flg) {
1327     PetscFunctionReturn(0);
1328   }
1329 
1330   /* if a->j are the same */
1331   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1332   if (!*flg) {
1333     PetscFunctionReturn(0);
1334   }
1335   /* if a->a are the same */
1336   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #undef __FUNCT__
1341 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1342 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1343 {
1344   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1345   PetscErrorCode ierr;
1346   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1347   PetscScalar    *x,zero = 0.0;
1348   MatScalar      *aa,*aa_j;
1349 
1350   PetscFunctionBegin;
1351   bs   = A->bs;
1352   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1353 
1354   aa   = a->a;
1355   ai   = a->i;
1356   aj   = a->j;
1357   ambs = a->mbs;
1358   bs2  = a->bs2;
1359 
1360   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1361   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1362   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1363   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1364   for (i=0; i<ambs; i++) {
1365     j=ai[i];
1366     if (aj[j] == i) {             /* if this is a diagonal element */
1367       row  = i*bs;
1368       aa_j = aa + j*bs2;
1369       if (A->factor && bs==1){
1370         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1371       } else {
1372         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1373       }
1374     }
1375   }
1376 
1377   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 #undef __FUNCT__
1382 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1383 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1384 {
1385   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1386   PetscScalar    *l,x,*li,*ri;
1387   MatScalar      *aa,*v;
1388   PetscErrorCode ierr;
1389   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1390   PetscTruth     flg;
1391 
1392   PetscFunctionBegin;
1393   if (ll != rr){
1394     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1395     if (!flg)
1396       SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1397   }
1398   if (!ll) PetscFunctionReturn(0);
1399   ai  = a->i;
1400   aj  = a->j;
1401   aa  = a->a;
1402   m   = A->m;
1403   bs  = A->bs;
1404   mbs = a->mbs;
1405   bs2 = a->bs2;
1406 
1407   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1408   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1409   if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1410   for (i=0; i<mbs; i++) { /* for each block row */
1411     M  = ai[i+1] - ai[i];
1412     li = l + i*bs;
1413     v  = aa + bs2*ai[i];
1414     for (j=0; j<M; j++) { /* for each block */
1415       ri = l + bs*aj[ai[i]+j];
1416       for (k=0; k<bs; k++) {
1417         x = ri[k];
1418         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1419       }
1420     }
1421   }
1422   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1423   PetscLogFlops(2*a->nz);
1424   PetscFunctionReturn(0);
1425 }
1426 
1427 #undef __FUNCT__
1428 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1429 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1430 {
1431   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1432 
1433   PetscFunctionBegin;
1434   info->rows_global    = (double)A->m;
1435   info->columns_global = (double)A->m;
1436   info->rows_local     = (double)A->m;
1437   info->columns_local  = (double)A->m;
1438   info->block_size     = a->bs2;
1439   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1440   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1441   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1442   info->assemblies   = A->num_ass;
1443   info->mallocs      = a->reallocs;
1444   info->memory       = A->mem;
1445   if (A->factor) {
1446     info->fill_ratio_given  = A->info.fill_ratio_given;
1447     info->fill_ratio_needed = A->info.fill_ratio_needed;
1448     info->factor_mallocs    = A->info.factor_mallocs;
1449   } else {
1450     info->fill_ratio_given  = 0;
1451     info->fill_ratio_needed = 0;
1452     info->factor_mallocs    = 0;
1453   }
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 
1458 #undef __FUNCT__
1459 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1460 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1461 {
1462   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1463   PetscErrorCode ierr;
1464 
1465   PetscFunctionBegin;
1466   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1467   PetscFunctionReturn(0);
1468 }
1469 
1470 #undef __FUNCT__
1471 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1472 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1473 {
1474   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1475   PetscErrorCode ierr;
1476   PetscInt i,j,n,row,col,bs,*ai,*aj,mbs;
1477   PetscReal    atmp;
1478   MatScalar    *aa;
1479   PetscScalar  zero = 0.0,*x;
1480   PetscInt          ncols,brow,bcol,krow,kcol;
1481 
1482   PetscFunctionBegin;
1483   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1484   bs   = A->bs;
1485   aa   = a->a;
1486   ai   = a->i;
1487   aj   = a->j;
1488   mbs = a->mbs;
1489 
1490   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1491   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1492   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1493   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1494   for (i=0; i<mbs; i++) {
1495     ncols = ai[1] - ai[0]; ai++;
1496     brow  = bs*i;
1497     for (j=0; j<ncols; j++){
1498       bcol = bs*(*aj);
1499       for (kcol=0; kcol<bs; kcol++){
1500         col = bcol + kcol;      /* col index */
1501         for (krow=0; krow<bs; krow++){
1502           atmp = PetscAbsScalar(*aa); aa++;
1503           row = brow + krow;    /* row index */
1504           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1505           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1506         }
1507       }
1508       aj++;
1509     }
1510   }
1511   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1512   PetscFunctionReturn(0);
1513 }
1514