xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 5eee224d2838b018c45d75ab7aa92e5d8b61a5ea)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 /* MatIncreaseOverlap_SeqSBAIJ() is vertually same as MatIncreaseOverlap_SeqBAIJ()
10    except in the line 1: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
11    and is slightly polished.
12    Should the two function be combined into a single piece of code? */
13 #undef __FUNCT__
14 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
15 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
16 {
17   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
18   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol;
19   int          start,end,*ai,*aj,bs,*nidx2;
20   PetscBT      table;
21 
22   PetscFunctionBegin;
23   mbs = a->mbs;
24   ai  = a->i;
25   aj  = a->j;
26   bs  = a->bs;
27 
28   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29 
30   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
31   ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
32   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
33 
34   for (i=0; i<is_max; i++) {
35     /* Initialise the two local arrays */
36     isz  = 0;
37     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
38 
39     /* Extract the indices, assume there can be duplicate entries */
40     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
41     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
42 
43     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
44     for (j=0; j<n ; ++j){
45       bcol = idx[j]/bs; /* convert the indices into block indices */
46       if (bcol >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
47       if(!PetscBTLookupSet(table,bcol)) { nidx[isz++] = bcol;}
48     }
49     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
50     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
51 
52     k = 0;
53     for (j=0; j<ov; j++){ /* for each overlap*/
54       n = isz;
55       for (; k<n ; k++){ /* do only those brows in nidx[k], which are not done yet */
56         brow   = nidx[k];
57         start = ai[brow];
58         end   = ai[brow+1];
59         for (l = start; l<end ; l++){
60           bcol = aj[l];
61           if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
62         }
63       }
64     }
65     /* expand the Index Set */
66     for (j=0; j<isz; j++) {
67       for (k=0; k<bs; k++)
68         nidx2[j*bs+k] = nidx[j]*bs+k;
69     }
70     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
71   }
72   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
73   ierr = PetscFree(nidx);CHKERRQ(ierr);
74   ierr = PetscFree(nidx2);CHKERRQ(ierr);
75   PetscFunctionReturn(0);
76 }
77 
78 #undef __FUNCT__
79 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
80 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
81 {
82   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
83   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
84   int          row,mat_i,*mat_j,tcol,*mat_ilen;
85   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
86   int          *aj = a->j,*ai = a->i;
87   MatScalar    *mat_a;
88   Mat          C;
89   PetscTruth   flag;
90 
91   PetscFunctionBegin;
92 
93   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
94   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
95   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
96 
97   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
98   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
99 
100   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
101   ssmap = smap;
102   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
103   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
104   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
105   /* determine lens of each row */
106   for (i=0; i<nrows; i++) {
107     kstart  = ai[irow[i]];
108     kend    = kstart + a->ilen[irow[i]];
109     lens[i] = 0;
110       for (k=kstart; k<kend; k++) {
111         if (ssmap[aj[k]]) {
112           lens[i]++;
113         }
114       }
115     }
116   /* Create and fill new matrix */
117   if (scall == MAT_REUSE_MATRIX) {
118     c = (Mat_SeqSBAIJ *)((*B)->data);
119 
120     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
121     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
122     if (flag == PETSC_FALSE) {
123       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
124     }
125     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
126     C = *B;
127   } else {
128     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
129     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
130     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
131   }
132   c = (Mat_SeqSBAIJ *)(C->data);
133   for (i=0; i<nrows; i++) {
134     row    = irow[i];
135     kstart = ai[row];
136     kend   = kstart + a->ilen[row];
137     mat_i  = c->i[i];
138     mat_j  = c->j + mat_i;
139     mat_a  = c->a + mat_i*bs2;
140     mat_ilen = c->ilen + i;
141     for (k=kstart; k<kend; k++) {
142       if ((tcol=ssmap[a->j[k]])) {
143         *mat_j++ = tcol - 1;
144         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
145         mat_a   += bs2;
146         (*mat_ilen)++;
147       }
148     }
149   }
150 
151   /* Free work space */
152   ierr = PetscFree(smap);CHKERRQ(ierr);
153   ierr = PetscFree(lens);CHKERRQ(ierr);
154   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
155   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156 
157   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
158   *B = C;
159   PetscFunctionReturn(0);
160 }
161 
162 #undef __FUNCT__
163 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
164 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
165 {
166   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
167   IS          is1;
168   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
169 
170   PetscFunctionBegin;
171   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
172 
173   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
174   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
175 
176   /* Verify if the indices corespond to each element in a block
177    and form the IS with compressed IS */
178   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
179   iary = vary + a->mbs;
180   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
181   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182 
183   count = 0;
184   for (i=0; i<a->mbs; i++) {
185     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
186     if (vary[i]==bs) iary[count++] = i;
187   }
188   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
189 
190   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
191   ierr = PetscFree(vary);CHKERRQ(ierr);
192 
193   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
194   ISDestroy(is1);
195   PetscFunctionReturn(0);
196 }
197 
198 #undef __FUNCT__
199 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
200 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
201 {
202   int ierr,i;
203 
204   PetscFunctionBegin;
205   if (scall == MAT_INITIAL_MATRIX) {
206     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
207   }
208 
209   for (i=0; i<n; i++) {
210     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
211   }
212   PetscFunctionReturn(0);
213 }
214 
215 /* -------------------------------------------------------*/
216 /* Should check that shapes of vectors and matrices match */
217 /* -------------------------------------------------------*/
218 #include "petscblaslapack.h"
219 
220 #undef __FUNCT__
221 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
222 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
223 {
224   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
225   PetscScalar     *x,*z,*xb,x1,zero=0.0;
226   MatScalar       *v;
227   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
228 
229   PetscFunctionBegin;
230   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
231   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
232   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
233 
234   v  = a->a;
235   xb = x;
236 
237   for (i=0; i<mbs; i++) {
238     n  = ai[1] - ai[0];  /* length of i_th row of A */
239     x1 = xb[0];
240     ib = aj + *ai;
241     jmin = 0;
242     if (*ib == i) {      /* (diag of A)*x */
243       z[i] += *v++ * x[*ib++];
244       jmin++;
245     }
246     for (j=jmin; j<n; j++) {
247       cval    = *ib;
248       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
249       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
250     }
251     xb++; ai++;
252   }
253 
254   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
255   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
256   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
257   PetscFunctionReturn(0);
258 }
259 
260 #undef __FUNCT__
261 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
262 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
263 {
264   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
265   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
266   MatScalar        *v;
267   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
268 
269 
270   PetscFunctionBegin;
271   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
272   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
273   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
274 
275   v     = a->a;
276   xb = x;
277 
278   for (i=0; i<mbs; i++) {
279     n  = ai[1] - ai[0]; /* length of i_th block row of A */
280     x1 = xb[0]; x2 = xb[1];
281     ib = aj + *ai;
282     jmin = 0;
283     if (*ib == i){     /* (diag of A)*x */
284       z[2*i]   += v[0]*x1 + v[2]*x2;
285       z[2*i+1] += v[2]*x1 + v[3]*x2;
286       v += 4; jmin++;
287     }
288     for (j=jmin; j<n; j++) {
289       /* (strict lower triangular part of A)*x  */
290       cval       = ib[j]*2;
291       z[cval]     += v[0]*x1 + v[1]*x2;
292       z[cval+1]   += v[2]*x1 + v[3]*x2;
293       /* (strict upper triangular part of A)*x  */
294       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
295       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
296       v  += 4;
297     }
298     xb +=2; ai++;
299   }
300 
301   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
302   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
303   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
304   PetscFunctionReturn(0);
305 }
306 
307 #undef __FUNCT__
308 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
309 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
310 {
311   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
312   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
313   MatScalar     *v;
314   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
315 
316 
317   PetscFunctionBegin;
318   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
319   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
320   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
321 
322   v     = a->a;
323   xb = x;
324 
325   for (i=0; i<mbs; i++) {
326     n  = ai[1] - ai[0]; /* length of i_th block row of A */
327     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
328     ib = aj + *ai;
329     jmin = 0;
330     if (*ib == i){     /* (diag of A)*x */
331       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
332       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
333       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
334       v += 9; jmin++;
335     }
336     for (j=jmin; j<n; j++) {
337       /* (strict lower triangular part of A)*x  */
338       cval       = ib[j]*3;
339       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
340       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
341       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
342       /* (strict upper triangular part of A)*x  */
343       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
344       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
345       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
346       v  += 9;
347     }
348     xb +=3; ai++;
349   }
350 
351   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
352   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
353   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
354   PetscFunctionReturn(0);
355 }
356 
357 #undef __FUNCT__
358 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
359 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
360 {
361   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
362   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
363   MatScalar        *v;
364   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
365 
366   PetscFunctionBegin;
367   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
368   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
369   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
370 
371   v     = a->a;
372   xb = x;
373 
374   for (i=0; i<mbs; i++) {
375     n  = ai[1] - ai[0]; /* length of i_th block row of A */
376     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
377     ib = aj + *ai;
378     jmin = 0;
379     if (*ib == i){     /* (diag of A)*x */
380       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
381       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
382       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
383       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
384       v += 16; jmin++;
385     }
386     for (j=jmin; j<n; j++) {
387       /* (strict lower triangular part of A)*x  */
388       cval       = ib[j]*4;
389       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
390       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
391       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
392       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
393       /* (strict upper triangular part of A)*x  */
394       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
395       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
396       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
397       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
398       v  += 16;
399     }
400     xb +=4; ai++;
401   }
402 
403   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
404   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
405   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
406   PetscFunctionReturn(0);
407 }
408 
409 #undef __FUNCT__
410 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
411 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
412 {
413   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
414   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
415   MatScalar       *v;
416   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
417 
418   PetscFunctionBegin;
419   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
420   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
421   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
422 
423   v     = a->a;
424   xb = x;
425 
426   for (i=0; i<mbs; i++) {
427     n  = ai[1] - ai[0]; /* length of i_th block row of A */
428     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
429     ib = aj + *ai;
430     jmin = 0;
431     if (*ib == i){      /* (diag of A)*x */
432       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
433       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
434       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
435       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
436       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
437       v += 25; jmin++;
438     }
439     for (j=jmin; j<n; j++) {
440       /* (strict lower triangular part of A)*x  */
441       cval       = ib[j]*5;
442       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
443       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
444       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
445       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
446       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
447       /* (strict upper triangular part of A)*x  */
448       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
449       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
450       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
451       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
452       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
453       v  += 25;
454     }
455     xb +=5; ai++;
456   }
457 
458   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
459   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
460   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
461   PetscFunctionReturn(0);
462 }
463 
464 
465 #undef __FUNCT__
466 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
467 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
468 {
469   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
470   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
471   MatScalar       *v;
472   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
473 
474   PetscFunctionBegin;
475   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
476   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
477   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
478 
479   v     = a->a;
480   xb = x;
481 
482   for (i=0; i<mbs; i++) {
483     n  = ai[1] - ai[0]; /* length of i_th block row of A */
484     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
485     ib = aj + *ai;
486     jmin = 0;
487     if (*ib == i){      /* (diag of A)*x */
488       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
489       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
490       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
491       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
492       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
493       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
494       v += 36; jmin++;
495     }
496     for (j=jmin; j<n; j++) {
497       /* (strict lower triangular part of A)*x  */
498       cval       = ib[j]*6;
499       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
500       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
501       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
502       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
503       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
504       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
505       /* (strict upper triangular part of A)*x  */
506       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
507       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
508       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
509       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
510       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
511       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
512       v  += 36;
513     }
514     xb +=6; ai++;
515   }
516 
517   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
518   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
519   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
520   PetscFunctionReturn(0);
521 }
522 #undef __FUNCT__
523 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
524 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
525 {
526   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
527   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
528   MatScalar       *v;
529   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
530 
531   PetscFunctionBegin;
532   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
533   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
534   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
535 
536   v     = a->a;
537   xb = x;
538 
539   for (i=0; i<mbs; i++) {
540     n  = ai[1] - ai[0]; /* length of i_th block row of A */
541     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
542     ib = aj + *ai;
543     jmin = 0;
544     if (*ib == i){      /* (diag of A)*x */
545       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
546       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
547       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
548       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
549       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
550       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
551       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
552       v += 49; jmin++;
553     }
554     for (j=jmin; j<n; j++) {
555       /* (strict lower triangular part of A)*x  */
556       cval       = ib[j]*7;
557       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
558       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
559       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
560       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
561       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
562       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
563       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
564       /* (strict upper triangular part of A)*x  */
565       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
566       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
567       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
568       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
569       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
570       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
571       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
572       v  += 49;
573     }
574     xb +=7; ai++;
575   }
576   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
577   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
578   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
579   PetscFunctionReturn(0);
580 }
581 
582 /*
583     This will not work with MatScalar == float because it calls the BLAS
584 */
585 #undef __FUNCT__
586 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
587 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
588 {
589   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
590   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
591   MatScalar       *v;
592   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
593   int             ncols,k;
594 
595   PetscFunctionBegin;
596   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
597   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
598   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
599 
600   aj   = a->j;
601   v    = a->a;
602   ii   = a->i;
603 
604   if (!a->mult_work) {
605     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
606   }
607   work = a->mult_work;
608 
609   for (i=0; i<mbs; i++) {
610     n     = ii[1] - ii[0]; ncols = n*bs;
611     workt = work; idx=aj+ii[0];
612 
613     /* upper triangular part */
614     for (j=0; j<n; j++) {
615       xb = x_ptr + bs*(*idx++);
616       for (k=0; k<bs; k++) workt[k] = xb[k];
617       workt += bs;
618     }
619     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
620     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
621 
622     /* strict lower triangular part */
623     idx = aj+ii[0];
624     if (*idx == i){
625       ncols -= bs; v += bs2; idx++; n--;
626     }
627 
628     if (ncols > 0){
629       workt = work;
630       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
631       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
632       for (j=0; j<n; j++) {
633         zb = z_ptr + bs*(*idx++);
634         for (k=0; k<bs; k++) zb[k] += workt[k] ;
635         workt += bs;
636       }
637     }
638     x += bs; v += n*bs2; z += bs; ii++;
639   }
640 
641   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
642   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
643   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
644   PetscFunctionReturn(0);
645 }
646 
647 #undef __FUNCT__
648 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
649 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
650 {
651   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
652   PetscScalar     *x,*y,*z,*xb,x1;
653   MatScalar       *v;
654   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
655 
656   PetscFunctionBegin;
657   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
658   if (yy != xx) {
659     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
660   } else {
661     y = x;
662   }
663   if (zz != yy) {
664     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
665     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
666     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
667   } else {
668     z = y;
669   }
670 
671   v  = a->a;
672   xb = x;
673 
674   for (i=0; i<mbs; i++) {
675     n  = ai[1] - ai[0];  /* length of i_th row of A */
676     x1 = xb[0];
677     ib = aj + *ai;
678     jmin = 0;
679     if (*ib == i) {            /* (diag of A)*x */
680       z[i] += *v++ * x[*ib++]; jmin++;
681     }
682     for (j=jmin; j<n; j++) {
683       cval    = *ib;
684       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
685       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
686     }
687     xb++; ai++;
688   }
689 
690   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
691   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
692   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
693 
694   PetscLogFlops(2*(a->nz*2 - A->m));
695   PetscFunctionReturn(0);
696 }
697 
698 #undef __FUNCT__
699 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
700 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
701 {
702   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
703   PetscScalar     *x,*y,*z,*xb,x1,x2;
704   MatScalar       *v;
705   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
706 
707   PetscFunctionBegin;
708   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
709   if (yy != xx) {
710     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
711   } else {
712     y = x;
713   }
714   if (zz != yy) {
715     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
716     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
717     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
718   } else {
719     z = y;
720   }
721 
722   v     = a->a;
723   xb = x;
724 
725   for (i=0; i<mbs; i++) {
726     n  = ai[1] - ai[0]; /* length of i_th block row of A */
727     x1 = xb[0]; x2 = xb[1];
728     ib = aj + *ai;
729     jmin = 0;
730     if (*ib == i){      /* (diag of A)*x */
731       z[2*i]   += v[0]*x1 + v[2]*x2;
732       z[2*i+1] += v[2]*x1 + v[3]*x2;
733       v += 4; jmin++;
734     }
735     for (j=jmin; j<n; j++) {
736       /* (strict lower triangular part of A)*x  */
737       cval       = ib[j]*2;
738       z[cval]     += v[0]*x1 + v[1]*x2;
739       z[cval+1]   += v[2]*x1 + v[3]*x2;
740       /* (strict upper triangular part of A)*x  */
741       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
742       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
743       v  += 4;
744     }
745     xb +=2; ai++;
746   }
747 
748   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
749   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
750   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
751 
752   PetscLogFlops(4*(a->nz*2 - A->m));
753   PetscFunctionReturn(0);
754 }
755 
756 #undef __FUNCT__
757 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
758 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
759 {
760   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
761   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
762   MatScalar       *v;
763   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
764 
765   PetscFunctionBegin;
766   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
767   if (yy != xx) {
768     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
769   } else {
770     y = x;
771   }
772   if (zz != yy) {
773     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
774     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
775     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
776   } else {
777     z = y;
778   }
779 
780   v     = a->a;
781   xb = x;
782 
783   for (i=0; i<mbs; i++) {
784     n  = ai[1] - ai[0]; /* length of i_th block row of A */
785     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
786     ib = aj + *ai;
787     jmin = 0;
788     if (*ib == i){     /* (diag of A)*x */
789      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
790      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
791      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
792      v += 9; jmin++;
793     }
794     for (j=jmin; j<n; j++) {
795       /* (strict lower triangular part of A)*x  */
796       cval       = ib[j]*3;
797       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
798       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
799       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
800       /* (strict upper triangular part of A)*x  */
801       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
802       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
803       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
804       v  += 9;
805     }
806     xb +=3; ai++;
807   }
808 
809   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
810   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
811   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
812 
813   PetscLogFlops(18*(a->nz*2 - A->m));
814   PetscFunctionReturn(0);
815 }
816 
817 #undef __FUNCT__
818 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
819 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
820 {
821   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
822   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
823   MatScalar       *v;
824   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
825 
826   PetscFunctionBegin;
827   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
828   if (yy != xx) {
829     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
830   } else {
831     y = x;
832   }
833   if (zz != yy) {
834     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
835     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
836     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
837   } else {
838     z = y;
839   }
840 
841   v     = a->a;
842   xb = x;
843 
844   for (i=0; i<mbs; i++) {
845     n  = ai[1] - ai[0]; /* length of i_th block row of A */
846     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
847     ib = aj + *ai;
848     jmin = 0;
849     if (*ib == i){      /* (diag of A)*x */
850       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
851       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
852       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
853       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
854       v += 16; jmin++;
855     }
856     for (j=jmin; j<n; j++) {
857       /* (strict lower triangular part of A)*x  */
858       cval       = ib[j]*4;
859       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
860       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
861       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
862       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
863       /* (strict upper triangular part of A)*x  */
864       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
865       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
866       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
867       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
868       v  += 16;
869     }
870     xb +=4; ai++;
871   }
872 
873   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
874   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
875   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
876 
877   PetscLogFlops(32*(a->nz*2 - A->m));
878   PetscFunctionReturn(0);
879 }
880 
881 #undef __FUNCT__
882 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
883 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
884 {
885   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
886   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
887   MatScalar       *v;
888   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
889 
890   PetscFunctionBegin;
891   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
892   if (yy != xx) {
893     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
894   } else {
895     y = x;
896   }
897   if (zz != yy) {
898     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
899     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
900     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
901   } else {
902     z = y;
903   }
904 
905   v     = a->a;
906   xb = x;
907 
908   for (i=0; i<mbs; i++) {
909     n  = ai[1] - ai[0]; /* length of i_th block row of A */
910     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
911     ib = aj + *ai;
912     jmin = 0;
913     if (*ib == i){      /* (diag of A)*x */
914       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
915       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
916       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
917       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
918       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
919       v += 25; jmin++;
920     }
921     for (j=jmin; j<n; j++) {
922       /* (strict lower triangular part of A)*x  */
923       cval       = ib[j]*5;
924       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
925       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
926       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
927       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
928       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
929       /* (strict upper triangular part of A)*x  */
930       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
931       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
932       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
933       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
934       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
935       v  += 25;
936     }
937     xb +=5; ai++;
938   }
939 
940   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
941   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
942   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
943 
944   PetscLogFlops(50*(a->nz*2 - A->m));
945   PetscFunctionReturn(0);
946 }
947 #undef __FUNCT__
948 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
949 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
950 {
951   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
952   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
953   MatScalar       *v;
954   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
955 
956   PetscFunctionBegin;
957   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
958   if (yy != xx) {
959     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
960   } else {
961     y = x;
962   }
963   if (zz != yy) {
964     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
965     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
966     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
967   } else {
968     z = y;
969   }
970 
971   v     = a->a;
972   xb = x;
973 
974   for (i=0; i<mbs; i++) {
975     n  = ai[1] - ai[0]; /* length of i_th block row of A */
976     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
977     ib = aj + *ai;
978     jmin = 0;
979     if (*ib == i){     /* (diag of A)*x */
980       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
981       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
982       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
983       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
984       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
985       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
986       v += 36; jmin++;
987     }
988     for (j=jmin; j<n; j++) {
989       /* (strict lower triangular part of A)*x  */
990       cval       = ib[j]*6;
991       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
992       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
993       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
994       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
995       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
996       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
997       /* (strict upper triangular part of A)*x  */
998       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
999       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1000       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1001       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1002       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1003       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1004       v  += 36;
1005     }
1006     xb +=6; ai++;
1007   }
1008 
1009   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1010   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1011   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1012 
1013   PetscLogFlops(72*(a->nz*2 - A->m));
1014   PetscFunctionReturn(0);
1015 }
1016 
1017 #undef __FUNCT__
1018 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1019 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1020 {
1021   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1022   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1023   MatScalar       *v;
1024   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1025 
1026   PetscFunctionBegin;
1027   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1028   if (yy != xx) {
1029     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1030   } else {
1031     y = x;
1032   }
1033   if (zz != yy) {
1034     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1035     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
1036     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1037   } else {
1038     z = y;
1039   }
1040 
1041   v     = a->a;
1042   xb = x;
1043 
1044   for (i=0; i<mbs; i++) {
1045     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1046     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1047     ib = aj + *ai;
1048     jmin = 0;
1049     if (*ib == i){     /* (diag of A)*x */
1050       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1051       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1052       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1053       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1054       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1055       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1056       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1057       v += 49; jmin++;
1058     }
1059     for (j=jmin; j<n; j++) {
1060       /* (strict lower triangular part of A)*x  */
1061       cval       = ib[j]*7;
1062       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1063       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1064       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1065       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1066       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1067       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1068       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1069       /* (strict upper triangular part of A)*x  */
1070       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1071       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1072       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1073       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1074       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1075       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1076       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1077       v  += 49;
1078     }
1079     xb +=7; ai++;
1080   }
1081 
1082   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1083   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1084   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1085 
1086   PetscLogFlops(98*(a->nz*2 - A->m));
1087   PetscFunctionReturn(0);
1088 }
1089 
1090 #undef __FUNCT__
1091 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1092 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1093 {
1094   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1095   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1096   MatScalar       *v;
1097   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1098   int             ncols,k;
1099 
1100   PetscFunctionBegin;
1101   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
1102   if (yy != xx) {
1103     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1104   } else {
1105     y = x;
1106   }
1107   if (zz != yy) {
1108     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1109     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
1110     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1111   } else {
1112     z = y;
1113   }
1114 
1115   aj   = a->j;
1116   v    = a->a;
1117   ii   = a->i;
1118 
1119   if (!a->mult_work) {
1120     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1121   }
1122   work = a->mult_work;
1123 
1124 
1125   for (i=0; i<mbs; i++) {
1126     n     = ii[1] - ii[0]; ncols = n*bs;
1127     workt = work; idx=aj+ii[0];
1128 
1129     /* upper triangular part */
1130     for (j=0; j<n; j++) {
1131       xb = x_ptr + bs*(*idx++);
1132       for (k=0; k<bs; k++) workt[k] = xb[k];
1133       workt += bs;
1134     }
1135     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1136     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1137 
1138     /* strict lower triangular part */
1139     idx = aj+ii[0];
1140     if (*idx == i){
1141       ncols -= bs; v += bs2; idx++; n--;
1142     }
1143     if (ncols > 0){
1144       workt = work;
1145       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1146       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1147       for (j=0; j<n; j++) {
1148         zb = z_ptr + bs*(*idx++);
1149         /* idx++; */
1150         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1151         workt += bs;
1152       }
1153     }
1154 
1155     x += bs; v += n*bs2; z += bs; ii++;
1156   }
1157 
1158   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1159   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1160   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1161 
1162   PetscLogFlops(2*(a->nz*2 - A->m));
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNCT__
1167 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1168 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1169 {
1170   int ierr;
1171 
1172   PetscFunctionBegin;
1173   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 #undef __FUNCT__
1178 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1179 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1180 
1181 {
1182   int ierr;
1183 
1184   PetscFunctionBegin;
1185   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1186   PetscFunctionReturn(0);
1187 }
1188 
1189 #undef __FUNCT__
1190 #define __FUNCT__ "MatScale_SeqSBAIJ"
1191 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1192 {
1193   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1194   int         one = 1,totalnz = a->bs2*a->nz;
1195 
1196   PetscFunctionBegin;
1197   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1198   PetscLogFlops(totalnz);
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 #undef __FUNCT__
1203 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1204 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1205 {
1206   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1207   MatScalar   *v = a->a;
1208   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1209   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1210   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1211 
1212   PetscFunctionBegin;
1213   if (type == NORM_FROBENIUS) {
1214     for (k=0; k<mbs; k++){
1215       jmin = a->i[k]; jmax = a->i[k+1];
1216       col  = aj + jmin;
1217       if (*col == k){         /* diagonal block */
1218         for (i=0; i<bs2; i++){
1219 #if defined(PETSC_USE_COMPLEX)
1220           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1221 #else
1222           sum_diag += (*v)*(*v); v++;
1223 #endif
1224         }
1225         jmin++;
1226       }
1227       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1228         for (i=0; i<bs2; i++){
1229 #if defined(PETSC_USE_COMPLEX)
1230           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1231 #else
1232           sum_off += (*v)*(*v); v++;
1233 #endif
1234         }
1235       }
1236     }
1237     *norm = sqrt(sum_diag + 2*sum_off);
1238 
1239   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1240     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1241     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1242     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1243     for (i=0; i<mbs; i++) {
1244       jl[i] = mbs; il[0] = 0;
1245     }
1246 
1247     *norm = 0.0;
1248     for (k=0; k<mbs; k++) { /* k_th block row */
1249       for (j=0; j<bs; j++) sum[j]=0.0;
1250 
1251       /*-- col sum --*/
1252       i = jl[k]; /* first |A(i,k)| to be added */
1253       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1254                   at step k */
1255       while (i<mbs){
1256         nexti = jl[i];  /* next block row to be added */
1257         ik    = il[i];  /* block index of A(i,k) in the array a */
1258         for (j=0; j<bs; j++){
1259           v = a->a + ik*bs2 + j*bs;
1260           for (k1=0; k1<bs; k1++) {
1261             sum[j] += PetscAbsScalar(*v); v++;
1262           }
1263         }
1264         /* update il, jl */
1265         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1266         jmax = a->i[i+1];
1267         if (jmin < jmax){
1268           il[i] = jmin;
1269           j   = a->j[jmin];
1270           jl[i] = jl[j]; jl[j]=i;
1271         }
1272         i = nexti;
1273       }
1274 
1275       /*-- row sum --*/
1276       jmin = a->i[k]; jmax = a->i[k+1];
1277       for (i=jmin; i<jmax; i++) {
1278         for (j=0; j<bs; j++){
1279           v = a->a + i*bs2 + j;
1280           for (k1=0; k1<bs; k1++){
1281             sum[j] += PetscAbsScalar(*v);
1282             v   += bs;
1283           }
1284         }
1285       }
1286       /* add k_th block row to il, jl */
1287       col = aj+jmin;
1288       if (*col == k) jmin++;
1289       if (jmin < jmax){
1290         il[k] = jmin;
1291         j   = a->j[jmin];
1292         jl[k] = jl[j]; jl[j] = k;
1293       }
1294       for (j=0; j<bs; j++){
1295         if (sum[j] > *norm) *norm = sum[j];
1296       }
1297     }
1298     ierr = PetscFree(il);CHKERRQ(ierr);
1299     ierr = PetscFree(jl);CHKERRQ(ierr);
1300     ierr = PetscFree(sum);CHKERRQ(ierr);
1301   } else {
1302     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1303   }
1304   PetscFunctionReturn(0);
1305 }
1306 
1307 #undef __FUNCT__
1308 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1309 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1310 {
1311   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1312   int          ierr;
1313 
1314   PetscFunctionBegin;
1315 
1316   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1317   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1318     *flg = PETSC_FALSE;
1319     PetscFunctionReturn(0);
1320   }
1321 
1322   /* if the a->i are the same */
1323   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1324   if (*flg == PETSC_FALSE) {
1325     PetscFunctionReturn(0);
1326   }
1327 
1328   /* if a->j are the same */
1329   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1330   if (*flg == PETSC_FALSE) {
1331     PetscFunctionReturn(0);
1332   }
1333   /* if a->a are the same */
1334   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1335 
1336   PetscFunctionReturn(0);
1337 }
1338 
1339 #undef __FUNCT__
1340 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1341 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1342 {
1343   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1344   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1345   PetscScalar  *x,zero = 0.0;
1346   MatScalar    *aa,*aa_j;
1347 
1348   PetscFunctionBegin;
1349   bs   = a->bs;
1350   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1351 
1352   aa   = a->a;
1353   ai   = a->i;
1354   aj   = a->j;
1355   ambs = a->mbs;
1356   bs2  = a->bs2;
1357 
1358   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1359   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1360   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1361   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1362   for (i=0; i<ambs; i++) {
1363     j=ai[i];
1364     if (aj[j] == i) {             /* if this is a diagonal element */
1365       row  = i*bs;
1366       aa_j = aa + j*bs2;
1367       if (A->factor && bs==1){
1368         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1369       } else {
1370         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1371       }
1372     }
1373   }
1374 
1375   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1376   PetscFunctionReturn(0);
1377 }
1378 
1379 #undef __FUNCT__
1380 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1381 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1382 {
1383   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1384   PetscScalar  *l,*r,x,*li,*ri;
1385   MatScalar    *aa,*v;
1386   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1387 
1388   PetscFunctionBegin;
1389   ai  = a->i;
1390   aj  = a->j;
1391   aa  = a->a;
1392   m   = A->m;
1393   bs  = a->bs;
1394   mbs = a->mbs;
1395   bs2 = a->bs2;
1396 
1397   if (ll != rr) {
1398     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1399   }
1400   if (ll) {
1401     ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr);
1402     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1403     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1404     for (i=0; i<mbs; i++) { /* for each block row */
1405       M  = ai[i+1] - ai[i];
1406       li = l + i*bs;
1407       v  = aa + bs2*ai[i];
1408       for (j=0; j<M; j++) { /* for each block */
1409         for (k=0; k<bs2; k++) {
1410           (*v++) *= li[k%bs];
1411         }
1412 #ifdef CONT
1413         /* will be used to replace the above loop */
1414         ri = l + bs*aj[ai[i]+j];
1415         for (k=0; k<bs; k++) { /* column value */
1416           x = ri[k];
1417           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1418         }
1419 #endif
1420 
1421       }
1422     }
1423     ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr);
1424     PetscLogFlops(2*a->nz);
1425   }
1426   /* will be deleted */
1427   if (rr) {
1428     ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr);
1429     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1430     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1431     for (i=0; i<mbs; i++) { /* for each block row */
1432       M  = ai[i+1] - ai[i];
1433       v  = aa + bs2*ai[i];
1434       for (j=0; j<M; j++) { /* for each block */
1435         ri = r + bs*aj[ai[i]+j];
1436         for (k=0; k<bs; k++) {
1437           x = ri[k];
1438           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1439         }
1440       }
1441     }
1442     ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr);
1443     PetscLogFlops(a->nz);
1444   }
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 #undef __FUNCT__
1449 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1450 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1451 {
1452   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1453 
1454   PetscFunctionBegin;
1455   info->rows_global    = (double)A->m;
1456   info->columns_global = (double)A->m;
1457   info->rows_local     = (double)A->m;
1458   info->columns_local  = (double)A->m;
1459   info->block_size     = a->bs2;
1460   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1461   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1462   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1463   info->assemblies   = A->num_ass;
1464   info->mallocs      = a->reallocs;
1465   info->memory       = A->mem;
1466   if (A->factor) {
1467     info->fill_ratio_given  = A->info.fill_ratio_given;
1468     info->fill_ratio_needed = A->info.fill_ratio_needed;
1469     info->factor_mallocs    = A->info.factor_mallocs;
1470   } else {
1471     info->fill_ratio_given  = 0;
1472     info->fill_ratio_needed = 0;
1473     info->factor_mallocs    = 0;
1474   }
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 
1479 #undef __FUNCT__
1480 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1481 int MatZeroEntries_SeqSBAIJ(Mat A)
1482 {
1483   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1484   int         ierr;
1485 
1486   PetscFunctionBegin;
1487   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1488   PetscFunctionReturn(0);
1489 }
1490 
1491 #undef __FUNCT__
1492 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1493 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1494 {
1495   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1496   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1497   PetscReal    atmp;
1498   MatScalar    *aa;
1499   PetscScalar  zero = 0.0,*x;
1500   int          ncols,brow,bcol,krow,kcol;
1501 
1502   PetscFunctionBegin;
1503   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1504   bs   = a->bs;
1505   aa   = a->a;
1506   ai   = a->i;
1507   aj   = a->j;
1508   mbs = a->mbs;
1509 
1510   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1511   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1512   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1513   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1514   for (i=0; i<mbs; i++) {
1515     ncols = ai[1] - ai[0]; ai++;
1516     brow  = bs*i;
1517     for (j=0; j<ncols; j++){
1518       bcol = bs*(*aj);
1519       for (kcol=0; kcol<bs; kcol++){
1520         col = bcol + kcol;      /* col index */
1521         for (krow=0; krow<bs; krow++){
1522           atmp = PetscAbsScalar(*aa); aa++;
1523           row = brow + krow;    /* row index */
1524           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1525           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1526           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1527         }
1528       }
1529       aj++;
1530     }
1531   }
1532   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1533   PetscFunctionReturn(0);
1534 }
1535