xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 9c30b7d2697335155d7490a7e085415ee7b4a02a)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/inline/spops.h"
4 #include "src/inline/ilu.h"
5 #include "petscbt.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
11 {
12   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
13   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,bcol_max,
14                start,end,*ai,*aj,bs,*nidx2;
15   PetscBT      table;
16   PetscBT      table0;
17 
18   PetscFunctionBegin;
19   mbs = a->mbs;
20   ai  = a->i;
21   aj  = a->j;
22   bs  = a->bs;
23 
24   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
27   ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
28   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
29   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
30 
31   for (i=0; i<is_max; i++) { /* for each is */
32     isz  = 0;
33     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
34 
35     /* Extract the indices, assume there can be duplicate entries */
36     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
37     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
38 
39     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
40     bcol_max = 0;
41     for (j=0; j<n ; ++j){
42       brow = idx[j]/bs; /* convert the indices into block indices */
43       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,brow)) {
45         nidx[isz++] = brow;
46         if (bcol_max < brow) bcol_max = brow;
47       }
48     }
49     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
50     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
51 
52     k = 0;
53     for (j=0; j<ov; j++){ /* for each overlap */
54       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
55       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
56       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
57 
58       n = isz;  /* length of the updated is[i] */
59       for (brow=0; brow<mbs; brow++){
60         start = ai[brow]; end   = ai[brow+1];
61         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
62           for (l = start; l<end ; l++){
63             bcol = aj[l];
64             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
65           }
66           k++;
67           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69           for (l = start; l<end ; l++){
70             bcol = aj[l];
71             if (bcol > bcol_max) break;
72             if (PetscBTLookup(table0,bcol)){
73               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
74               break; /* for l = start; l<end ; l++) */
75             }
76           }
77         }
78       }
79     } /* for each overlap */
80 
81     /* expand the Index Set */
82     for (j=0; j<isz; j++) {
83       for (k=0; k<bs; k++)
84         nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
98 {
99   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
100   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
101   int          row,mat_i,*mat_j,tcol,*mat_ilen;
102   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
103   int          *aj = a->j,*ai = a->i;
104   MatScalar    *mat_a;
105   Mat          C;
106   PetscTruth   flag;
107 
108   PetscFunctionBegin;
109 
110   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
111   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
112   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
113 
114   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
115   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
116 
117   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
120   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
122   /* determine lens of each row */
123   for (i=0; i<nrows; i++) {
124     kstart  = ai[irow[i]];
125     kend    = kstart + a->ilen[irow[i]];
126     lens[i] = 0;
127       for (k=kstart; k<kend; k++) {
128         if (ssmap[aj[k]]) {
129           lens[i]++;
130         }
131       }
132     }
133   /* Create and fill new matrix */
134   if (scall == MAT_REUSE_MATRIX) {
135     c = (Mat_SeqSBAIJ *)((*B)->data);
136 
137     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
138     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
139     if (flag == PETSC_FALSE) {
140       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
141     }
142     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
143     C = *B;
144   } else {
145     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
146     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
147     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
148   }
149   c = (Mat_SeqSBAIJ *)(C->data);
150   for (i=0; i<nrows; i++) {
151     row    = irow[i];
152     kstart = ai[row];
153     kend   = kstart + a->ilen[row];
154     mat_i  = c->i[i];
155     mat_j  = c->j + mat_i;
156     mat_a  = c->a + mat_i*bs2;
157     mat_ilen = c->ilen + i;
158     for (k=kstart; k<kend; k++) {
159       if ((tcol=ssmap[a->j[k]])) {
160         *mat_j++ = tcol - 1;
161         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162         mat_a   += bs2;
163         (*mat_ilen)++;
164       }
165     }
166   }
167 
168   /* Free work space */
169   ierr = PetscFree(smap);CHKERRQ(ierr);
170   ierr = PetscFree(lens);CHKERRQ(ierr);
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173 
174   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
175   *B = C;
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
181 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
182 {
183   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
184   IS          is1;
185   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
186 
187   PetscFunctionBegin;
188   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
189 
190   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
192 
193   /* Verify if the indices corespond to each element in a block
194    and form the IS with compressed IS */
195   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
196   iary = vary + a->mbs;
197   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
198   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
199 
200   count = 0;
201   for (i=0; i<a->mbs; i++) {
202     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
203     if (vary[i]==bs) iary[count++] = i;
204   }
205   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
206 
207   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
208   ierr = PetscFree(vary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
211   ISDestroy(is1);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   int ierr,i;
220 
221   PetscFunctionBegin;
222   if (scall == MAT_INITIAL_MATRIX) {
223     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
224   }
225 
226   for (i=0; i<n; i++) {
227     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
228   }
229   PetscFunctionReturn(0);
230 }
231 
232 /* -------------------------------------------------------*/
233 /* Should check that shapes of vectors and matrices match */
234 /* -------------------------------------------------------*/
235 #include "petscblaslapack.h"
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
239 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
240 {
241   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
242   PetscScalar     *x,*z,*xb,x1,zero=0.0;
243   MatScalar       *v;
244   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
245 
246   PetscFunctionBegin;
247   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
248   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
250 
251   v  = a->a;
252   xb = x;
253 
254   for (i=0; i<mbs; i++) {
255     n  = ai[1] - ai[0];  /* length of i_th row of A */
256     x1 = xb[0];
257     ib = aj + *ai;
258     jmin = 0;
259     if (*ib == i) {      /* (diag of A)*x */
260       z[i] += *v++ * x[*ib++];
261       jmin++;
262     }
263     for (j=jmin; j<n; j++) {
264       cval    = *ib;
265       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
266       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
267     }
268     xb++; ai++;
269   }
270 
271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
272   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
273   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
279 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
282   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
283   MatScalar        *v;
284   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
285 
286 
287   PetscFunctionBegin;
288   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
289   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
290   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
291 
292   v     = a->a;
293   xb = x;
294 
295   for (i=0; i<mbs; i++) {
296     n  = ai[1] - ai[0]; /* length of i_th block row of A */
297     x1 = xb[0]; x2 = xb[1];
298     ib = aj + *ai;
299     jmin = 0;
300     if (*ib == i){     /* (diag of A)*x */
301       z[2*i]   += v[0]*x1 + v[2]*x2;
302       z[2*i+1] += v[2]*x1 + v[3]*x2;
303       v += 4; jmin++;
304     }
305     for (j=jmin; j<n; j++) {
306       /* (strict lower triangular part of A)*x  */
307       cval       = ib[j]*2;
308       z[cval]     += v[0]*x1 + v[1]*x2;
309       z[cval+1]   += v[2]*x1 + v[3]*x2;
310       /* (strict upper triangular part of A)*x  */
311       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
312       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
313       v  += 4;
314     }
315     xb +=2; ai++;
316   }
317 
318   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
319   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
320   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
321   PetscFunctionReturn(0);
322 }
323 
324 #undef __FUNCT__
325 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
326 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
327 {
328   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
329   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
330   MatScalar     *v;
331   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
332 
333 
334   PetscFunctionBegin;
335   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
336   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
337   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
338 
339   v     = a->a;
340   xb = x;
341 
342   for (i=0; i<mbs; i++) {
343     n  = ai[1] - ai[0]; /* length of i_th block row of A */
344     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
345     ib = aj + *ai;
346     jmin = 0;
347     if (*ib == i){     /* (diag of A)*x */
348       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
349       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
350       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
351       v += 9; jmin++;
352     }
353     for (j=jmin; j<n; j++) {
354       /* (strict lower triangular part of A)*x  */
355       cval       = ib[j]*3;
356       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
357       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
358       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
359       /* (strict upper triangular part of A)*x  */
360       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
361       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
362       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
363       v  += 9;
364     }
365     xb +=3; ai++;
366   }
367 
368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
369   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
370   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
371   PetscFunctionReturn(0);
372 }
373 
374 #undef __FUNCT__
375 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
376 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
377 {
378   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
379   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
380   MatScalar        *v;
381   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
382 
383   PetscFunctionBegin;
384   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
385   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
386   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
387 
388   v     = a->a;
389   xb = x;
390 
391   for (i=0; i<mbs; i++) {
392     n  = ai[1] - ai[0]; /* length of i_th block row of A */
393     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
394     ib = aj + *ai;
395     jmin = 0;
396     if (*ib == i){     /* (diag of A)*x */
397       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
398       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
399       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
400       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
401       v += 16; jmin++;
402     }
403     for (j=jmin; j<n; j++) {
404       /* (strict lower triangular part of A)*x  */
405       cval       = ib[j]*4;
406       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
407       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
408       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
409       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
410       /* (strict upper triangular part of A)*x  */
411       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
412       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
413       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
414       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
415       v  += 16;
416     }
417     xb +=4; ai++;
418   }
419 
420   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
421   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
422   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
423   PetscFunctionReturn(0);
424 }
425 
426 #undef __FUNCT__
427 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
428 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
429 {
430   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
431   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
432   MatScalar       *v;
433   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
434 
435   PetscFunctionBegin;
436   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
437   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
438   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
439 
440   v     = a->a;
441   xb = x;
442 
443   for (i=0; i<mbs; i++) {
444     n  = ai[1] - ai[0]; /* length of i_th block row of A */
445     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
446     ib = aj + *ai;
447     jmin = 0;
448     if (*ib == i){      /* (diag of A)*x */
449       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
450       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
451       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
452       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
453       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
454       v += 25; jmin++;
455     }
456     for (j=jmin; j<n; j++) {
457       /* (strict lower triangular part of A)*x  */
458       cval       = ib[j]*5;
459       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
460       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
461       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
462       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
463       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
464       /* (strict upper triangular part of A)*x  */
465       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
466       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
467       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
468       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
469       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
470       v  += 25;
471     }
472     xb +=5; ai++;
473   }
474 
475   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
476   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
477   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
478   PetscFunctionReturn(0);
479 }
480 
481 
482 #undef __FUNCT__
483 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
484 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
485 {
486   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
487   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
488   MatScalar       *v;
489   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
490 
491   PetscFunctionBegin;
492   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
493   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
494   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
495 
496   v     = a->a;
497   xb = x;
498 
499   for (i=0; i<mbs; i++) {
500     n  = ai[1] - ai[0]; /* length of i_th block row of A */
501     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
502     ib = aj + *ai;
503     jmin = 0;
504     if (*ib == i){      /* (diag of A)*x */
505       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
506       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
507       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
508       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
509       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
510       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
511       v += 36; jmin++;
512     }
513     for (j=jmin; j<n; j++) {
514       /* (strict lower triangular part of A)*x  */
515       cval       = ib[j]*6;
516       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
517       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
518       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
519       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
520       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
521       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
522       /* (strict upper triangular part of A)*x  */
523       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
524       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
525       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
526       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
527       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
528       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
529       v  += 36;
530     }
531     xb +=6; ai++;
532   }
533 
534   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
535   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
536   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
537   PetscFunctionReturn(0);
538 }
539 #undef __FUNCT__
540 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
541 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
542 {
543   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
544   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
545   MatScalar       *v;
546   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
547 
548   PetscFunctionBegin;
549   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
550   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
551   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
552 
553   v     = a->a;
554   xb = x;
555 
556   for (i=0; i<mbs; i++) {
557     n  = ai[1] - ai[0]; /* length of i_th block row of A */
558     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
559     ib = aj + *ai;
560     jmin = 0;
561     if (*ib == i){      /* (diag of A)*x */
562       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
563       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
564       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
565       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
566       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
567       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
568       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
569       v += 49; jmin++;
570     }
571     for (j=jmin; j<n; j++) {
572       /* (strict lower triangular part of A)*x  */
573       cval       = ib[j]*7;
574       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
575       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
576       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
577       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
578       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
579       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
580       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
581       /* (strict upper triangular part of A)*x  */
582       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
583       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
584       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
585       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
586       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
587       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
588       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
589       v  += 49;
590     }
591     xb +=7; ai++;
592   }
593   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
594   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
595   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
596   PetscFunctionReturn(0);
597 }
598 
599 /*
600     This will not work with MatScalar == float because it calls the BLAS
601 */
602 #undef __FUNCT__
603 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
604 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
605 {
606   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
607   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
608   MatScalar       *v;
609   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
610   int             ncols,k;
611 
612   PetscFunctionBegin;
613   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
614   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
615   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
616 
617   aj   = a->j;
618   v    = a->a;
619   ii   = a->i;
620 
621   if (!a->mult_work) {
622     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
623   }
624   work = a->mult_work;
625 
626   for (i=0; i<mbs; i++) {
627     n     = ii[1] - ii[0]; ncols = n*bs;
628     workt = work; idx=aj+ii[0];
629 
630     /* upper triangular part */
631     for (j=0; j<n; j++) {
632       xb = x_ptr + bs*(*idx++);
633       for (k=0; k<bs; k++) workt[k] = xb[k];
634       workt += bs;
635     }
636     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
637     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
638 
639     /* strict lower triangular part */
640     idx = aj+ii[0];
641     if (*idx == i){
642       ncols -= bs; v += bs2; idx++; n--;
643     }
644 
645     if (ncols > 0){
646       workt = work;
647       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
648       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
649       for (j=0; j<n; j++) {
650         zb = z_ptr + bs*(*idx++);
651         for (k=0; k<bs; k++) zb[k] += workt[k] ;
652         workt += bs;
653       }
654     }
655     x += bs; v += n*bs2; z += bs; ii++;
656   }
657 
658   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
659   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
660   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
661   PetscFunctionReturn(0);
662 }
663 
664 #undef __FUNCT__
665 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
666 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
667 {
668   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
669   PetscScalar     *x,*y,*z,*xb,x1;
670   MatScalar       *v;
671   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
672 
673   PetscFunctionBegin;
674   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
675   if (yy != xx) {
676     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
677   } else {
678     y = x;
679   }
680   if (zz != yy) {
681     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
682     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
683     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
684   } else {
685     z = y;
686   }
687 
688   v  = a->a;
689   xb = x;
690 
691   for (i=0; i<mbs; i++) {
692     n  = ai[1] - ai[0];  /* length of i_th row of A */
693     x1 = xb[0];
694     ib = aj + *ai;
695     jmin = 0;
696     if (*ib == i) {            /* (diag of A)*x */
697       z[i] += *v++ * x[*ib++]; jmin++;
698     }
699     for (j=jmin; j<n; j++) {
700       cval    = *ib;
701       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
702       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
703     }
704     xb++; ai++;
705   }
706 
707   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
708   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
709   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
710 
711   PetscLogFlops(2*(a->nz*2 - A->m));
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
717 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
718 {
719   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
720   PetscScalar     *x,*y,*z,*xb,x1,x2;
721   MatScalar       *v;
722   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
723 
724   PetscFunctionBegin;
725   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
726   if (yy != xx) {
727     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
728   } else {
729     y = x;
730   }
731   if (zz != yy) {
732     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
733     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
734     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
735   } else {
736     z = y;
737   }
738 
739   v     = a->a;
740   xb = x;
741 
742   for (i=0; i<mbs; i++) {
743     n  = ai[1] - ai[0]; /* length of i_th block row of A */
744     x1 = xb[0]; x2 = xb[1];
745     ib = aj + *ai;
746     jmin = 0;
747     if (*ib == i){      /* (diag of A)*x */
748       z[2*i]   += v[0]*x1 + v[2]*x2;
749       z[2*i+1] += v[2]*x1 + v[3]*x2;
750       v += 4; jmin++;
751     }
752     for (j=jmin; j<n; j++) {
753       /* (strict lower triangular part of A)*x  */
754       cval       = ib[j]*2;
755       z[cval]     += v[0]*x1 + v[1]*x2;
756       z[cval+1]   += v[2]*x1 + v[3]*x2;
757       /* (strict upper triangular part of A)*x  */
758       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
759       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
760       v  += 4;
761     }
762     xb +=2; ai++;
763   }
764 
765   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
766   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
767   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
768 
769   PetscLogFlops(4*(a->nz*2 - A->m));
770   PetscFunctionReturn(0);
771 }
772 
773 #undef __FUNCT__
774 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
775 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
776 {
777   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
778   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
779   MatScalar       *v;
780   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
781 
782   PetscFunctionBegin;
783   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
784   if (yy != xx) {
785     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
786   } else {
787     y = x;
788   }
789   if (zz != yy) {
790     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
791     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
792     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
793   } else {
794     z = y;
795   }
796 
797   v     = a->a;
798   xb = x;
799 
800   for (i=0; i<mbs; i++) {
801     n  = ai[1] - ai[0]; /* length of i_th block row of A */
802     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
803     ib = aj + *ai;
804     jmin = 0;
805     if (*ib == i){     /* (diag of A)*x */
806      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
807      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
808      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
809      v += 9; jmin++;
810     }
811     for (j=jmin; j<n; j++) {
812       /* (strict lower triangular part of A)*x  */
813       cval       = ib[j]*3;
814       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
815       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
816       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
817       /* (strict upper triangular part of A)*x  */
818       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
819       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
820       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
821       v  += 9;
822     }
823     xb +=3; ai++;
824   }
825 
826   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
827   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
828   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
829 
830   PetscLogFlops(18*(a->nz*2 - A->m));
831   PetscFunctionReturn(0);
832 }
833 
834 #undef __FUNCT__
835 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
836 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
837 {
838   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
839   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
840   MatScalar       *v;
841   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
842 
843   PetscFunctionBegin;
844   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
845   if (yy != xx) {
846     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
847   } else {
848     y = x;
849   }
850   if (zz != yy) {
851     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
852     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
853     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
854   } else {
855     z = y;
856   }
857 
858   v     = a->a;
859   xb = x;
860 
861   for (i=0; i<mbs; i++) {
862     n  = ai[1] - ai[0]; /* length of i_th block row of A */
863     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
864     ib = aj + *ai;
865     jmin = 0;
866     if (*ib == i){      /* (diag of A)*x */
867       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
868       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
869       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
870       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
871       v += 16; jmin++;
872     }
873     for (j=jmin; j<n; j++) {
874       /* (strict lower triangular part of A)*x  */
875       cval       = ib[j]*4;
876       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
877       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
878       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
879       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
880       /* (strict upper triangular part of A)*x  */
881       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
882       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
883       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
884       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
885       v  += 16;
886     }
887     xb +=4; ai++;
888   }
889 
890   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
891   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
892   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
893 
894   PetscLogFlops(32*(a->nz*2 - A->m));
895   PetscFunctionReturn(0);
896 }
897 
898 #undef __FUNCT__
899 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
900 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
901 {
902   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
903   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
904   MatScalar       *v;
905   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
906 
907   PetscFunctionBegin;
908   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
909   if (yy != xx) {
910     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
911   } else {
912     y = x;
913   }
914   if (zz != yy) {
915     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
916     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
917     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
918   } else {
919     z = y;
920   }
921 
922   v     = a->a;
923   xb = x;
924 
925   for (i=0; i<mbs; i++) {
926     n  = ai[1] - ai[0]; /* length of i_th block row of A */
927     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
928     ib = aj + *ai;
929     jmin = 0;
930     if (*ib == i){      /* (diag of A)*x */
931       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
932       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
933       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
934       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
935       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
936       v += 25; jmin++;
937     }
938     for (j=jmin; j<n; j++) {
939       /* (strict lower triangular part of A)*x  */
940       cval       = ib[j]*5;
941       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
942       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
943       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
944       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
945       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
946       /* (strict upper triangular part of A)*x  */
947       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
948       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
949       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
950       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
951       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
952       v  += 25;
953     }
954     xb +=5; ai++;
955   }
956 
957   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
958   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
959   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
960 
961   PetscLogFlops(50*(a->nz*2 - A->m));
962   PetscFunctionReturn(0);
963 }
964 #undef __FUNCT__
965 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
966 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
967 {
968   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
969   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
970   MatScalar       *v;
971   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
972 
973   PetscFunctionBegin;
974   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
975   if (yy != xx) {
976     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
977   } else {
978     y = x;
979   }
980   if (zz != yy) {
981     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
982     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
983     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
984   } else {
985     z = y;
986   }
987 
988   v     = a->a;
989   xb = x;
990 
991   for (i=0; i<mbs; i++) {
992     n  = ai[1] - ai[0]; /* length of i_th block row of A */
993     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
994     ib = aj + *ai;
995     jmin = 0;
996     if (*ib == i){     /* (diag of A)*x */
997       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
998       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
999       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1000       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1001       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1002       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1003       v += 36; jmin++;
1004     }
1005     for (j=jmin; j<n; j++) {
1006       /* (strict lower triangular part of A)*x  */
1007       cval       = ib[j]*6;
1008       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1009       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1010       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1011       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1012       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1013       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1014       /* (strict upper triangular part of A)*x  */
1015       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1016       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1017       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1018       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1019       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1020       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1021       v  += 36;
1022     }
1023     xb +=6; ai++;
1024   }
1025 
1026   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1027   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1028   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1029 
1030   PetscLogFlops(72*(a->nz*2 - A->m));
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 #undef __FUNCT__
1035 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1036 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1037 {
1038   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1039   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1040   MatScalar       *v;
1041   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1042 
1043   PetscFunctionBegin;
1044   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1045   if (yy != xx) {
1046     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1047   } else {
1048     y = x;
1049   }
1050   if (zz != yy) {
1051     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1052     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1053     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1054   } else {
1055     z = y;
1056   }
1057 
1058   v     = a->a;
1059   xb = x;
1060 
1061   for (i=0; i<mbs; i++) {
1062     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1063     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1064     ib = aj + *ai;
1065     jmin = 0;
1066     if (*ib == i){     /* (diag of A)*x */
1067       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1068       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1069       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1070       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1071       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1072       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1073       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1074       v += 49; jmin++;
1075     }
1076     for (j=jmin; j<n; j++) {
1077       /* (strict lower triangular part of A)*x  */
1078       cval       = ib[j]*7;
1079       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1080       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1081       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1082       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1083       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1084       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1085       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1086       /* (strict upper triangular part of A)*x  */
1087       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1088       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1089       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1090       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1091       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1092       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1093       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1094       v  += 49;
1095     }
1096     xb +=7; ai++;
1097   }
1098 
1099   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1100   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1101   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1102 
1103   PetscLogFlops(98*(a->nz*2 - A->m));
1104   PetscFunctionReturn(0);
1105 }
1106 
1107 #undef __FUNCT__
1108 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1109 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1110 {
1111   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1112   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1113   MatScalar       *v;
1114   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1115   int             ncols,k;
1116 
1117   PetscFunctionBegin;
1118   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1119   if (yy != xx) {
1120     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1121   } else {
1122     y = x;
1123   }
1124   if (zz != yy) {
1125     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1126     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1127     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1128   } else {
1129     z = y;
1130   }
1131 
1132   aj   = a->j;
1133   v    = a->a;
1134   ii   = a->i;
1135 
1136   if (!a->mult_work) {
1137     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1138   }
1139   work = a->mult_work;
1140 
1141 
1142   for (i=0; i<mbs; i++) {
1143     n     = ii[1] - ii[0]; ncols = n*bs;
1144     workt = work; idx=aj+ii[0];
1145 
1146     /* upper triangular part */
1147     for (j=0; j<n; j++) {
1148       xb = x_ptr + bs*(*idx++);
1149       for (k=0; k<bs; k++) workt[k] = xb[k];
1150       workt += bs;
1151     }
1152     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1153     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1154 
1155     /* strict lower triangular part */
1156     idx = aj+ii[0];
1157     if (*idx == i){
1158       ncols -= bs; v += bs2; idx++; n--;
1159     }
1160     if (ncols > 0){
1161       workt = work;
1162       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1163       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1164       for (j=0; j<n; j++) {
1165         zb = z_ptr + bs*(*idx++);
1166         /* idx++; */
1167         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1168         workt += bs;
1169       }
1170     }
1171 
1172     x += bs; v += n*bs2; z += bs; ii++;
1173   }
1174 
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1177   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1178 
1179   PetscLogFlops(2*(a->nz*2 - A->m));
1180   PetscFunctionReturn(0);
1181 }
1182 
1183 #undef __FUNCT__
1184 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1185 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1186 {
1187   int ierr;
1188 
1189   PetscFunctionBegin;
1190   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1191   PetscFunctionReturn(0);
1192 }
1193 
1194 #undef __FUNCT__
1195 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1196 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1197 
1198 {
1199   int ierr;
1200 
1201   PetscFunctionBegin;
1202   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1203   PetscFunctionReturn(0);
1204 }
1205 
1206 #undef __FUNCT__
1207 #define __FUNCT__ "MatScale_SeqSBAIJ"
1208 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1209 {
1210   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1211   int         one = 1,totalnz = a->bs2*a->nz;
1212 
1213   PetscFunctionBegin;
1214   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1215   PetscLogFlops(totalnz);
1216   PetscFunctionReturn(0);
1217 }
1218 
1219 #undef __FUNCT__
1220 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1221 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1222 {
1223   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1224   MatScalar   *v = a->a;
1225   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1226   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1227   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1228 
1229   PetscFunctionBegin;
1230   if (type == NORM_FROBENIUS) {
1231     for (k=0; k<mbs; k++){
1232       jmin = a->i[k]; jmax = a->i[k+1];
1233       col  = aj + jmin;
1234       if (*col == k){         /* diagonal block */
1235         for (i=0; i<bs2; i++){
1236 #if defined(PETSC_USE_COMPLEX)
1237           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1238 #else
1239           sum_diag += (*v)*(*v); v++;
1240 #endif
1241         }
1242         jmin++;
1243       }
1244       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1245         for (i=0; i<bs2; i++){
1246 #if defined(PETSC_USE_COMPLEX)
1247           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1248 #else
1249           sum_off += (*v)*(*v); v++;
1250 #endif
1251         }
1252       }
1253     }
1254     *norm = sqrt(sum_diag + 2*sum_off);
1255 
1256   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1257     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1258     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1259     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1260     for (i=0; i<mbs; i++) {
1261       jl[i] = mbs; il[0] = 0;
1262     }
1263 
1264     *norm = 0.0;
1265     for (k=0; k<mbs; k++) { /* k_th block row */
1266       for (j=0; j<bs; j++) sum[j]=0.0;
1267 
1268       /*-- col sum --*/
1269       i = jl[k]; /* first |A(i,k)| to be added */
1270       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1271                   at step k */
1272       while (i<mbs){
1273         nexti = jl[i];  /* next block row to be added */
1274         ik    = il[i];  /* block index of A(i,k) in the array a */
1275         for (j=0; j<bs; j++){
1276           v = a->a + ik*bs2 + j*bs;
1277           for (k1=0; k1<bs; k1++) {
1278             sum[j] += PetscAbsScalar(*v); v++;
1279           }
1280         }
1281         /* update il, jl */
1282         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1283         jmax = a->i[i+1];
1284         if (jmin < jmax){
1285           il[i] = jmin;
1286           j   = a->j[jmin];
1287           jl[i] = jl[j]; jl[j]=i;
1288         }
1289         i = nexti;
1290       }
1291 
1292       /*-- row sum --*/
1293       jmin = a->i[k]; jmax = a->i[k+1];
1294       for (i=jmin; i<jmax; i++) {
1295         for (j=0; j<bs; j++){
1296           v = a->a + i*bs2 + j;
1297           for (k1=0; k1<bs; k1++){
1298             sum[j] += PetscAbsScalar(*v);
1299             v   += bs;
1300           }
1301         }
1302       }
1303       /* add k_th block row to il, jl */
1304       col = aj+jmin;
1305       if (*col == k) jmin++;
1306       if (jmin < jmax){
1307         il[k] = jmin;
1308         j   = a->j[jmin];
1309         jl[k] = jl[j]; jl[j] = k;
1310       }
1311       for (j=0; j<bs; j++){
1312         if (sum[j] > *norm) *norm = sum[j];
1313       }
1314     }
1315     ierr = PetscFree(il);CHKERRQ(ierr);
1316     ierr = PetscFree(jl);CHKERRQ(ierr);
1317     ierr = PetscFree(sum);CHKERRQ(ierr);
1318   } else {
1319     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1320   }
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 #undef __FUNCT__
1325 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1326 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1327 {
1328   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1329   int          ierr;
1330 
1331   PetscFunctionBegin;
1332 
1333   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1334   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1335     *flg = PETSC_FALSE;
1336     PetscFunctionReturn(0);
1337   }
1338 
1339   /* if the a->i are the same */
1340   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1341   if (*flg == PETSC_FALSE) {
1342     PetscFunctionReturn(0);
1343   }
1344 
1345   /* if a->j are the same */
1346   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1347   if (*flg == PETSC_FALSE) {
1348     PetscFunctionReturn(0);
1349   }
1350   /* if a->a are the same */
1351   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1352 
1353   PetscFunctionReturn(0);
1354 }
1355 
1356 #undef __FUNCT__
1357 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1358 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1359 {
1360   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1361   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1362   PetscScalar  *x,zero = 0.0;
1363   MatScalar    *aa,*aa_j;
1364 
1365   PetscFunctionBegin;
1366   bs   = a->bs;
1367   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1368 
1369   aa   = a->a;
1370   ai   = a->i;
1371   aj   = a->j;
1372   ambs = a->mbs;
1373   bs2  = a->bs2;
1374 
1375   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1376   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1377   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1378   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1379   for (i=0; i<ambs; i++) {
1380     j=ai[i];
1381     if (aj[j] == i) {             /* if this is a diagonal element */
1382       row  = i*bs;
1383       aa_j = aa + j*bs2;
1384       if (A->factor && bs==1){
1385         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1386       } else {
1387         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1388       }
1389     }
1390   }
1391 
1392   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1393   PetscFunctionReturn(0);
1394 }
1395 
1396 #undef __FUNCT__
1397 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1398 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1399 {
1400   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1401   PetscScalar  *l,*r,x,*li,*ri;
1402   MatScalar    *aa,*v;
1403   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1404 
1405   PetscFunctionBegin;
1406   ai  = a->i;
1407   aj  = a->j;
1408   aa  = a->a;
1409   m   = A->m;
1410   bs  = a->bs;
1411   mbs = a->mbs;
1412   bs2 = a->bs2;
1413 
1414   if (ll != rr) {
1415     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1416   }
1417   if (ll) {
1418     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1419     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1420     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1421     for (i=0; i<mbs; i++) { /* for each block row */
1422       M  = ai[i+1] - ai[i];
1423       li = l + i*bs;
1424       v  = aa + bs2*ai[i];
1425       for (j=0; j<M; j++) { /* for each block */
1426         for (k=0; k<bs2; k++) {
1427           (*v++) *= li[k%bs];
1428         }
1429 #ifdef CONT
1430         /* will be used to replace the above loop */
1431         ri = l + bs*aj[ai[i]+j];
1432         for (k=0; k<bs; k++) { /* column value */
1433           x = ri[k];
1434           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1435         }
1436 #endif
1437 
1438       }
1439     }
1440     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1441     PetscLogFlops(2*a->nz);
1442   }
1443   /* will be deleted */
1444   if (rr) {
1445     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1446     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1447     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1448     for (i=0; i<mbs; i++) { /* for each block row */
1449       M  = ai[i+1] - ai[i];
1450       v  = aa + bs2*ai[i];
1451       for (j=0; j<M; j++) { /* for each block */
1452         ri = r + bs*aj[ai[i]+j];
1453         for (k=0; k<bs; k++) {
1454           x = ri[k];
1455           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1456         }
1457       }
1458     }
1459     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1460     PetscLogFlops(a->nz);
1461   }
1462   PetscFunctionReturn(0);
1463 }
1464 
1465 #undef __FUNCT__
1466 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1467 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1468 {
1469   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1470 
1471   PetscFunctionBegin;
1472   info->rows_global    = (double)A->m;
1473   info->columns_global = (double)A->m;
1474   info->rows_local     = (double)A->m;
1475   info->columns_local  = (double)A->m;
1476   info->block_size     = a->bs2;
1477   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1478   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1479   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1480   info->assemblies   = A->num_ass;
1481   info->mallocs      = a->reallocs;
1482   info->memory       = A->mem;
1483   if (A->factor) {
1484     info->fill_ratio_given  = A->info.fill_ratio_given;
1485     info->fill_ratio_needed = A->info.fill_ratio_needed;
1486     info->factor_mallocs    = A->info.factor_mallocs;
1487   } else {
1488     info->fill_ratio_given  = 0;
1489     info->fill_ratio_needed = 0;
1490     info->factor_mallocs    = 0;
1491   }
1492   PetscFunctionReturn(0);
1493 }
1494 
1495 
1496 #undef __FUNCT__
1497 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1498 int MatZeroEntries_SeqSBAIJ(Mat A)
1499 {
1500   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1501   int         ierr;
1502 
1503   PetscFunctionBegin;
1504   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1505   PetscFunctionReturn(0);
1506 }
1507 
1508 #undef __FUNCT__
1509 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1510 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1511 {
1512   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1513   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1514   PetscReal    atmp;
1515   MatScalar    *aa;
1516   PetscScalar  zero = 0.0,*x;
1517   int          ncols,brow,bcol,krow,kcol;
1518 
1519   PetscFunctionBegin;
1520   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1521   bs   = a->bs;
1522   aa   = a->a;
1523   ai   = a->i;
1524   aj   = a->j;
1525   mbs = a->mbs;
1526 
1527   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1528   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1529   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1530   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1531   for (i=0; i<mbs; i++) {
1532     ncols = ai[1] - ai[0]; ai++;
1533     brow  = bs*i;
1534     for (j=0; j<ncols; j++){
1535       bcol = bs*(*aj);
1536       for (kcol=0; kcol<bs; kcol++){
1537         col = bcol + kcol;      /* col index */
1538         for (krow=0; krow<bs; krow++){
1539           atmp = PetscAbsScalar(*aa); aa++;
1540           row = brow + krow;    /* row index */
1541           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1542           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1543           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1544         }
1545       }
1546       aj++;
1547     }
1548   }
1549   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1550   PetscFunctionReturn(0);
1551 }
1552