xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 4cb7fc73c5648a2ebbfd9f6fbb092565e0a4b3bb)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
12 {
13   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
14   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,
15                start,end,*ai,*aj,bs,*nidx2,*colsearch;
16   PetscBT      table;
17 
18   PetscFunctionBegin;
19   mbs = a->mbs;
20   ai  = a->i;
21   aj  = a->j;
22   bs  = a->bs;
23 
24   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25 
26   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
27   ierr = PetscMalloc((2*mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
28   colsearch = nidx + mbs + 1;
29   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
30 
31   for (i=0; i<is_max; i++) {
32 
33     /* Initialise the two local arrays */
34     isz  = 0;
35     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
36 
37     /* Extract the indices, assume there can be duplicate entries */
38     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
39     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
40 
41     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
42     for (j=0; j<n ; ++j){
43       bcol = idx[j]/bs; /* convert the indices into block indices */
44       if (bcol >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45       if(!PetscBTLookupSet(table,bcol)) { nidx[isz++] = bcol;}
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++){ /* for each overlap */
52       /* initialize colsearch -
53          colsearch[i] points to the possible non-zero (i,active_col)-th entry in the array aj */
54       for (brow=0; brow<mbs; brow++) colsearch[brow] = ai[brow];
55 
56       /* sort nidx[k:isz-1] - needed by col search */
57       ierr = PetscSortInt(isz-k,nidx+k);
58 
59       n = isz;
60       for (; k<n ; k++){ /* do only those brows in nidx[k], which are not done yet */
61         brow   = nidx[k];
62         /* search block column */
63         for (l=0; l<brow; l++){
64           if (colsearch[l]<ai[l+1]){
65             bcol = aj[colsearch[l]];
66             while (bcol < brow ) {
67               colsearch[l]++;
68               if (colsearch[l]<ai[l+1]){
69                 bcol = aj[colsearch[l]];
70               } else {
71                 bcol = mbs; break;
72               }
73             }
74             if (bcol==brow){
75               if (!PetscBTLookupSet(table,l)) {nidx[isz++] = l;}
76               colsearch[l]++;
77             }
78           }
79         }
80         /* search block row */
81         start = ai[brow];
82         end   = ai[brow+1];
83         for (l = start; l<end ; l++){
84           bcol = aj[l];
85           if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
86         }
87       }
88     } /* for each overlap */
89 
90     /* expand the Index Set */
91     for (j=0; j<isz; j++) {
92       for (k=0; k<bs; k++)
93         nidx2[j*bs+k] = nidx[j]*bs+k;
94     }
95     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
96   }
97   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
98   ierr = PetscFree(nidx);CHKERRQ(ierr);
99   ierr = PetscFree(nidx2);CHKERRQ(ierr);
100   PetscFunctionReturn(0);
101 }
102 
103 #undef __FUNCT__
104 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
105 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
106 {
107   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
108   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
109   int          row,mat_i,*mat_j,tcol,*mat_ilen;
110   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
111   int          *aj = a->j,*ai = a->i;
112   MatScalar    *mat_a;
113   Mat          C;
114   PetscTruth   flag;
115 
116   PetscFunctionBegin;
117 
118   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
119   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
120   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
121 
122   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
123   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
124 
125   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
126   ssmap = smap;
127   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
128   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
129   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
130   /* determine lens of each row */
131   for (i=0; i<nrows; i++) {
132     kstart  = ai[irow[i]];
133     kend    = kstart + a->ilen[irow[i]];
134     lens[i] = 0;
135       for (k=kstart; k<kend; k++) {
136         if (ssmap[aj[k]]) {
137           lens[i]++;
138         }
139       }
140     }
141   /* Create and fill new matrix */
142   if (scall == MAT_REUSE_MATRIX) {
143     c = (Mat_SeqSBAIJ *)((*B)->data);
144 
145     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
146     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
147     if (flag == PETSC_FALSE) {
148       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
149     }
150     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
151     C = *B;
152   } else {
153     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
154     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
155     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
156   }
157   c = (Mat_SeqSBAIJ *)(C->data);
158   for (i=0; i<nrows; i++) {
159     row    = irow[i];
160     kstart = ai[row];
161     kend   = kstart + a->ilen[row];
162     mat_i  = c->i[i];
163     mat_j  = c->j + mat_i;
164     mat_a  = c->a + mat_i*bs2;
165     mat_ilen = c->ilen + i;
166     for (k=kstart; k<kend; k++) {
167       if ((tcol=ssmap[a->j[k]])) {
168         *mat_j++ = tcol - 1;
169         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
170         mat_a   += bs2;
171         (*mat_ilen)++;
172       }
173     }
174   }
175 
176   /* Free work space */
177   ierr = PetscFree(smap);CHKERRQ(ierr);
178   ierr = PetscFree(lens);CHKERRQ(ierr);
179   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
180   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
181 
182   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
183   *B = C;
184   PetscFunctionReturn(0);
185 }
186 
187 #undef __FUNCT__
188 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
189 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
190 {
191   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
192   IS          is1;
193   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
194 
195   PetscFunctionBegin;
196   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
197 
198   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
199   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
200 
201   /* Verify if the indices corespond to each element in a block
202    and form the IS with compressed IS */
203   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
204   iary = vary + a->mbs;
205   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
206   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
207 
208   count = 0;
209   for (i=0; i<a->mbs; i++) {
210     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
211     if (vary[i]==bs) iary[count++] = i;
212   }
213   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
214 
215   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
216   ierr = PetscFree(vary);CHKERRQ(ierr);
217 
218   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
219   ISDestroy(is1);
220   PetscFunctionReturn(0);
221 }
222 
223 #undef __FUNCT__
224 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
225 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
226 {
227   int ierr,i;
228 
229   PetscFunctionBegin;
230   if (scall == MAT_INITIAL_MATRIX) {
231     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
232   }
233 
234   for (i=0; i<n; i++) {
235     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
236   }
237   PetscFunctionReturn(0);
238 }
239 
240 /* -------------------------------------------------------*/
241 /* Should check that shapes of vectors and matrices match */
242 /* -------------------------------------------------------*/
243 #include "petscblaslapack.h"
244 
245 #undef __FUNCT__
246 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
247 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
248 {
249   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
250   PetscScalar     *x,*z,*xb,x1,zero=0.0;
251   MatScalar       *v;
252   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
253 
254   PetscFunctionBegin;
255   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
256   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
257   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
258 
259   v  = a->a;
260   xb = x;
261 
262   for (i=0; i<mbs; i++) {
263     n  = ai[1] - ai[0];  /* length of i_th row of A */
264     x1 = xb[0];
265     ib = aj + *ai;
266     jmin = 0;
267     if (*ib == i) {      /* (diag of A)*x */
268       z[i] += *v++ * x[*ib++];
269       jmin++;
270     }
271     for (j=jmin; j<n; j++) {
272       cval    = *ib;
273       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
274       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
275     }
276     xb++; ai++;
277   }
278 
279   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
280   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
281   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
282   PetscFunctionReturn(0);
283 }
284 
285 #undef __FUNCT__
286 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
287 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
288 {
289   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
290   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
291   MatScalar        *v;
292   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
293 
294 
295   PetscFunctionBegin;
296   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
297   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
298   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
299 
300   v     = a->a;
301   xb = x;
302 
303   for (i=0; i<mbs; i++) {
304     n  = ai[1] - ai[0]; /* length of i_th block row of A */
305     x1 = xb[0]; x2 = xb[1];
306     ib = aj + *ai;
307     jmin = 0;
308     if (*ib == i){     /* (diag of A)*x */
309       z[2*i]   += v[0]*x1 + v[2]*x2;
310       z[2*i+1] += v[2]*x1 + v[3]*x2;
311       v += 4; jmin++;
312     }
313     for (j=jmin; j<n; j++) {
314       /* (strict lower triangular part of A)*x  */
315       cval       = ib[j]*2;
316       z[cval]     += v[0]*x1 + v[1]*x2;
317       z[cval+1]   += v[2]*x1 + v[3]*x2;
318       /* (strict upper triangular part of A)*x  */
319       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
320       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
321       v  += 4;
322     }
323     xb +=2; ai++;
324   }
325 
326   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
327   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
328   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
329   PetscFunctionReturn(0);
330 }
331 
332 #undef __FUNCT__
333 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
334 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
335 {
336   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
337   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
338   MatScalar     *v;
339   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
340 
341 
342   PetscFunctionBegin;
343   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
344   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
345   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
346 
347   v     = a->a;
348   xb = x;
349 
350   for (i=0; i<mbs; i++) {
351     n  = ai[1] - ai[0]; /* length of i_th block row of A */
352     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
353     ib = aj + *ai;
354     jmin = 0;
355     if (*ib == i){     /* (diag of A)*x */
356       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
357       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
358       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
359       v += 9; jmin++;
360     }
361     for (j=jmin; j<n; j++) {
362       /* (strict lower triangular part of A)*x  */
363       cval       = ib[j]*3;
364       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
365       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
366       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
367       /* (strict upper triangular part of A)*x  */
368       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
369       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
370       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
371       v  += 9;
372     }
373     xb +=3; ai++;
374   }
375 
376   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
377   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
378   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
379   PetscFunctionReturn(0);
380 }
381 
382 #undef __FUNCT__
383 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
384 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
385 {
386   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
387   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
388   MatScalar        *v;
389   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
390 
391   PetscFunctionBegin;
392   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
393   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
394   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
395 
396   v     = a->a;
397   xb = x;
398 
399   for (i=0; i<mbs; i++) {
400     n  = ai[1] - ai[0]; /* length of i_th block row of A */
401     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
402     ib = aj + *ai;
403     jmin = 0;
404     if (*ib == i){     /* (diag of A)*x */
405       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
406       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
407       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
408       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
409       v += 16; jmin++;
410     }
411     for (j=jmin; j<n; j++) {
412       /* (strict lower triangular part of A)*x  */
413       cval       = ib[j]*4;
414       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
415       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
416       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
417       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
418       /* (strict upper triangular part of A)*x  */
419       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
420       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
421       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
422       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
423       v  += 16;
424     }
425     xb +=4; ai++;
426   }
427 
428   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
429   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
430   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
431   PetscFunctionReturn(0);
432 }
433 
434 #undef __FUNCT__
435 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
436 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
437 {
438   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
439   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
440   MatScalar       *v;
441   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
442 
443   PetscFunctionBegin;
444   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
445   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
446   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
447 
448   v     = a->a;
449   xb = x;
450 
451   for (i=0; i<mbs; i++) {
452     n  = ai[1] - ai[0]; /* length of i_th block row of A */
453     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
454     ib = aj + *ai;
455     jmin = 0;
456     if (*ib == i){      /* (diag of A)*x */
457       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
458       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
459       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
460       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
461       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
462       v += 25; jmin++;
463     }
464     for (j=jmin; j<n; j++) {
465       /* (strict lower triangular part of A)*x  */
466       cval       = ib[j]*5;
467       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
468       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
469       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
470       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
471       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
472       /* (strict upper triangular part of A)*x  */
473       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
474       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
475       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
476       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
477       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
478       v  += 25;
479     }
480     xb +=5; ai++;
481   }
482 
483   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
484   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
485   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
486   PetscFunctionReturn(0);
487 }
488 
489 
490 #undef __FUNCT__
491 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
492 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
493 {
494   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
495   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
496   MatScalar       *v;
497   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
498 
499   PetscFunctionBegin;
500   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
501   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
502   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
503 
504   v     = a->a;
505   xb = x;
506 
507   for (i=0; i<mbs; i++) {
508     n  = ai[1] - ai[0]; /* length of i_th block row of A */
509     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
510     ib = aj + *ai;
511     jmin = 0;
512     if (*ib == i){      /* (diag of A)*x */
513       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
514       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
515       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
516       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
517       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
518       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
519       v += 36; jmin++;
520     }
521     for (j=jmin; j<n; j++) {
522       /* (strict lower triangular part of A)*x  */
523       cval       = ib[j]*6;
524       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
525       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
526       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
527       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
528       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
529       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
530       /* (strict upper triangular part of A)*x  */
531       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
532       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
533       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
534       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
535       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
536       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
537       v  += 36;
538     }
539     xb +=6; ai++;
540   }
541 
542   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
543   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
544   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
545   PetscFunctionReturn(0);
546 }
547 #undef __FUNCT__
548 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
549 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
550 {
551   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
552   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
553   MatScalar       *v;
554   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
555 
556   PetscFunctionBegin;
557   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
558   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
559   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
560 
561   v     = a->a;
562   xb = x;
563 
564   for (i=0; i<mbs; i++) {
565     n  = ai[1] - ai[0]; /* length of i_th block row of A */
566     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
567     ib = aj + *ai;
568     jmin = 0;
569     if (*ib == i){      /* (diag of A)*x */
570       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
571       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
572       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
573       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
574       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
575       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
576       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
577       v += 49; jmin++;
578     }
579     for (j=jmin; j<n; j++) {
580       /* (strict lower triangular part of A)*x  */
581       cval       = ib[j]*7;
582       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
583       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
584       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
585       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
586       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
587       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
588       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
589       /* (strict upper triangular part of A)*x  */
590       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
591       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
592       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
593       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
594       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
595       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
596       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
597       v  += 49;
598     }
599     xb +=7; ai++;
600   }
601   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
602   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
603   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
604   PetscFunctionReturn(0);
605 }
606 
607 /*
608     This will not work with MatScalar == float because it calls the BLAS
609 */
610 #undef __FUNCT__
611 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
612 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
613 {
614   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
615   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
616   MatScalar       *v;
617   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
618   int             ncols,k;
619 
620   PetscFunctionBegin;
621   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
622   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
623   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
624 
625   aj   = a->j;
626   v    = a->a;
627   ii   = a->i;
628 
629   if (!a->mult_work) {
630     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
631   }
632   work = a->mult_work;
633 
634   for (i=0; i<mbs; i++) {
635     n     = ii[1] - ii[0]; ncols = n*bs;
636     workt = work; idx=aj+ii[0];
637 
638     /* upper triangular part */
639     for (j=0; j<n; j++) {
640       xb = x_ptr + bs*(*idx++);
641       for (k=0; k<bs; k++) workt[k] = xb[k];
642       workt += bs;
643     }
644     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
645     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
646 
647     /* strict lower triangular part */
648     idx = aj+ii[0];
649     if (*idx == i){
650       ncols -= bs; v += bs2; idx++; n--;
651     }
652 
653     if (ncols > 0){
654       workt = work;
655       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
656       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
657       for (j=0; j<n; j++) {
658         zb = z_ptr + bs*(*idx++);
659         for (k=0; k<bs; k++) zb[k] += workt[k] ;
660         workt += bs;
661       }
662     }
663     x += bs; v += n*bs2; z += bs; ii++;
664   }
665 
666   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
667   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
668   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
669   PetscFunctionReturn(0);
670 }
671 
672 #undef __FUNCT__
673 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
674 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
675 {
676   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
677   PetscScalar     *x,*y,*z,*xb,x1;
678   MatScalar       *v;
679   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
680 
681   PetscFunctionBegin;
682   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
683   if (yy != xx) {
684     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
685   } else {
686     y = x;
687   }
688   if (zz != yy) {
689     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
690     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
691     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
692   } else {
693     z = y;
694   }
695 
696   v  = a->a;
697   xb = x;
698 
699   for (i=0; i<mbs; i++) {
700     n  = ai[1] - ai[0];  /* length of i_th row of A */
701     x1 = xb[0];
702     ib = aj + *ai;
703     jmin = 0;
704     if (*ib == i) {            /* (diag of A)*x */
705       z[i] += *v++ * x[*ib++]; jmin++;
706     }
707     for (j=jmin; j<n; j++) {
708       cval    = *ib;
709       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
710       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
711     }
712     xb++; ai++;
713   }
714 
715   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
716   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
717   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
718 
719   PetscLogFlops(2*(a->nz*2 - A->m));
720   PetscFunctionReturn(0);
721 }
722 
723 #undef __FUNCT__
724 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
725 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
726 {
727   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
728   PetscScalar     *x,*y,*z,*xb,x1,x2;
729   MatScalar       *v;
730   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
731 
732   PetscFunctionBegin;
733   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
734   if (yy != xx) {
735     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
736   } else {
737     y = x;
738   }
739   if (zz != yy) {
740     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
741     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
742     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
743   } else {
744     z = y;
745   }
746 
747   v     = a->a;
748   xb = x;
749 
750   for (i=0; i<mbs; i++) {
751     n  = ai[1] - ai[0]; /* length of i_th block row of A */
752     x1 = xb[0]; x2 = xb[1];
753     ib = aj + *ai;
754     jmin = 0;
755     if (*ib == i){      /* (diag of A)*x */
756       z[2*i]   += v[0]*x1 + v[2]*x2;
757       z[2*i+1] += v[2]*x1 + v[3]*x2;
758       v += 4; jmin++;
759     }
760     for (j=jmin; j<n; j++) {
761       /* (strict lower triangular part of A)*x  */
762       cval       = ib[j]*2;
763       z[cval]     += v[0]*x1 + v[1]*x2;
764       z[cval+1]   += v[2]*x1 + v[3]*x2;
765       /* (strict upper triangular part of A)*x  */
766       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
767       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
768       v  += 4;
769     }
770     xb +=2; ai++;
771   }
772 
773   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
774   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
775   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
776 
777   PetscLogFlops(4*(a->nz*2 - A->m));
778   PetscFunctionReturn(0);
779 }
780 
781 #undef __FUNCT__
782 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
783 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
784 {
785   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
786   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
787   MatScalar       *v;
788   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
789 
790   PetscFunctionBegin;
791   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
792   if (yy != xx) {
793     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
794   } else {
795     y = x;
796   }
797   if (zz != yy) {
798     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
799     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
800     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
801   } else {
802     z = y;
803   }
804 
805   v     = a->a;
806   xb = x;
807 
808   for (i=0; i<mbs; i++) {
809     n  = ai[1] - ai[0]; /* length of i_th block row of A */
810     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
811     ib = aj + *ai;
812     jmin = 0;
813     if (*ib == i){     /* (diag of A)*x */
814      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
815      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
816      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
817      v += 9; jmin++;
818     }
819     for (j=jmin; j<n; j++) {
820       /* (strict lower triangular part of A)*x  */
821       cval       = ib[j]*3;
822       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
823       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
824       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
825       /* (strict upper triangular part of A)*x  */
826       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
827       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
828       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
829       v  += 9;
830     }
831     xb +=3; ai++;
832   }
833 
834   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
835   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
836   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
837 
838   PetscLogFlops(18*(a->nz*2 - A->m));
839   PetscFunctionReturn(0);
840 }
841 
842 #undef __FUNCT__
843 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
844 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
845 {
846   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
847   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
848   MatScalar       *v;
849   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
850 
851   PetscFunctionBegin;
852   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
853   if (yy != xx) {
854     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
855   } else {
856     y = x;
857   }
858   if (zz != yy) {
859     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
860     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
861     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
862   } else {
863     z = y;
864   }
865 
866   v     = a->a;
867   xb = x;
868 
869   for (i=0; i<mbs; i++) {
870     n  = ai[1] - ai[0]; /* length of i_th block row of A */
871     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
872     ib = aj + *ai;
873     jmin = 0;
874     if (*ib == i){      /* (diag of A)*x */
875       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
876       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
877       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
878       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
879       v += 16; jmin++;
880     }
881     for (j=jmin; j<n; j++) {
882       /* (strict lower triangular part of A)*x  */
883       cval       = ib[j]*4;
884       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
885       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
886       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
887       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
888       /* (strict upper triangular part of A)*x  */
889       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
890       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
891       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
892       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
893       v  += 16;
894     }
895     xb +=4; ai++;
896   }
897 
898   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
899   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
900   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
901 
902   PetscLogFlops(32*(a->nz*2 - A->m));
903   PetscFunctionReturn(0);
904 }
905 
906 #undef __FUNCT__
907 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
908 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
909 {
910   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
911   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
912   MatScalar       *v;
913   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
914 
915   PetscFunctionBegin;
916   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
917   if (yy != xx) {
918     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
919   } else {
920     y = x;
921   }
922   if (zz != yy) {
923     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
924     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
925     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
926   } else {
927     z = y;
928   }
929 
930   v     = a->a;
931   xb = x;
932 
933   for (i=0; i<mbs; i++) {
934     n  = ai[1] - ai[0]; /* length of i_th block row of A */
935     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
936     ib = aj + *ai;
937     jmin = 0;
938     if (*ib == i){      /* (diag of A)*x */
939       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
940       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
941       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
942       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
943       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
944       v += 25; jmin++;
945     }
946     for (j=jmin; j<n; j++) {
947       /* (strict lower triangular part of A)*x  */
948       cval       = ib[j]*5;
949       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
950       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
951       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
952       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
953       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
954       /* (strict upper triangular part of A)*x  */
955       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
956       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
957       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
958       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
959       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
960       v  += 25;
961     }
962     xb +=5; ai++;
963   }
964 
965   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
966   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
967   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
968 
969   PetscLogFlops(50*(a->nz*2 - A->m));
970   PetscFunctionReturn(0);
971 }
972 #undef __FUNCT__
973 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
974 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
975 {
976   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
977   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
978   MatScalar       *v;
979   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
980 
981   PetscFunctionBegin;
982   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
983   if (yy != xx) {
984     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
985   } else {
986     y = x;
987   }
988   if (zz != yy) {
989     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
990     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
991     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
992   } else {
993     z = y;
994   }
995 
996   v     = a->a;
997   xb = x;
998 
999   for (i=0; i<mbs; i++) {
1000     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1001     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1002     ib = aj + *ai;
1003     jmin = 0;
1004     if (*ib == i){     /* (diag of A)*x */
1005       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1006       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1007       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1008       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1009       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1010       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1011       v += 36; jmin++;
1012     }
1013     for (j=jmin; j<n; j++) {
1014       /* (strict lower triangular part of A)*x  */
1015       cval       = ib[j]*6;
1016       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1017       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1018       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1019       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1020       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1021       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1022       /* (strict upper triangular part of A)*x  */
1023       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1024       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1025       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1026       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1027       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1028       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1029       v  += 36;
1030     }
1031     xb +=6; ai++;
1032   }
1033 
1034   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1035   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1036   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1037 
1038   PetscLogFlops(72*(a->nz*2 - A->m));
1039   PetscFunctionReturn(0);
1040 }
1041 
1042 #undef __FUNCT__
1043 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1044 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1045 {
1046   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1047   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1048   MatScalar       *v;
1049   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1050 
1051   PetscFunctionBegin;
1052   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1053   if (yy != xx) {
1054     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1055   } else {
1056     y = x;
1057   }
1058   if (zz != yy) {
1059     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1060     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
1061     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1062   } else {
1063     z = y;
1064   }
1065 
1066   v     = a->a;
1067   xb = x;
1068 
1069   for (i=0; i<mbs; i++) {
1070     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1071     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1072     ib = aj + *ai;
1073     jmin = 0;
1074     if (*ib == i){     /* (diag of A)*x */
1075       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1076       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1077       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1078       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1079       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1080       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1081       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1082       v += 49; jmin++;
1083     }
1084     for (j=jmin; j<n; j++) {
1085       /* (strict lower triangular part of A)*x  */
1086       cval       = ib[j]*7;
1087       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1088       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1089       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1090       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1091       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1092       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1093       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1094       /* (strict upper triangular part of A)*x  */
1095       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1096       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1097       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1098       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1099       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1100       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1101       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1102       v  += 49;
1103     }
1104     xb +=7; ai++;
1105   }
1106 
1107   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1108   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1109   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1110 
1111   PetscLogFlops(98*(a->nz*2 - A->m));
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 #undef __FUNCT__
1116 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1117 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1118 {
1119   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1120   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1121   MatScalar       *v;
1122   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1123   int             ncols,k;
1124 
1125   PetscFunctionBegin;
1126   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
1127   if (yy != xx) {
1128     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1129   } else {
1130     y = x;
1131   }
1132   if (zz != yy) {
1133     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1134     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
1135     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1136   } else {
1137     z = y;
1138   }
1139 
1140   aj   = a->j;
1141   v    = a->a;
1142   ii   = a->i;
1143 
1144   if (!a->mult_work) {
1145     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1146   }
1147   work = a->mult_work;
1148 
1149 
1150   for (i=0; i<mbs; i++) {
1151     n     = ii[1] - ii[0]; ncols = n*bs;
1152     workt = work; idx=aj+ii[0];
1153 
1154     /* upper triangular part */
1155     for (j=0; j<n; j++) {
1156       xb = x_ptr + bs*(*idx++);
1157       for (k=0; k<bs; k++) workt[k] = xb[k];
1158       workt += bs;
1159     }
1160     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1161     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1162 
1163     /* strict lower triangular part */
1164     idx = aj+ii[0];
1165     if (*idx == i){
1166       ncols -= bs; v += bs2; idx++; n--;
1167     }
1168     if (ncols > 0){
1169       workt = work;
1170       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1171       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1172       for (j=0; j<n; j++) {
1173         zb = z_ptr + bs*(*idx++);
1174         /* idx++; */
1175         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1176         workt += bs;
1177       }
1178     }
1179 
1180     x += bs; v += n*bs2; z += bs; ii++;
1181   }
1182 
1183   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1184   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1185   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1186 
1187   PetscLogFlops(2*(a->nz*2 - A->m));
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 #undef __FUNCT__
1192 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1193 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1194 {
1195   int ierr;
1196 
1197   PetscFunctionBegin;
1198   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 #undef __FUNCT__
1203 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1204 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1205 
1206 {
1207   int ierr;
1208 
1209   PetscFunctionBegin;
1210   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1211   PetscFunctionReturn(0);
1212 }
1213 
1214 #undef __FUNCT__
1215 #define __FUNCT__ "MatScale_SeqSBAIJ"
1216 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1217 {
1218   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1219   int         one = 1,totalnz = a->bs2*a->nz;
1220 
1221   PetscFunctionBegin;
1222   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1223   PetscLogFlops(totalnz);
1224   PetscFunctionReturn(0);
1225 }
1226 
1227 #undef __FUNCT__
1228 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1229 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1230 {
1231   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1232   MatScalar   *v = a->a;
1233   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1234   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1235   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1236 
1237   PetscFunctionBegin;
1238   if (type == NORM_FROBENIUS) {
1239     for (k=0; k<mbs; k++){
1240       jmin = a->i[k]; jmax = a->i[k+1];
1241       col  = aj + jmin;
1242       if (*col == k){         /* diagonal block */
1243         for (i=0; i<bs2; i++){
1244 #if defined(PETSC_USE_COMPLEX)
1245           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1246 #else
1247           sum_diag += (*v)*(*v); v++;
1248 #endif
1249         }
1250         jmin++;
1251       }
1252       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1253         for (i=0; i<bs2; i++){
1254 #if defined(PETSC_USE_COMPLEX)
1255           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1256 #else
1257           sum_off += (*v)*(*v); v++;
1258 #endif
1259         }
1260       }
1261     }
1262     *norm = sqrt(sum_diag + 2*sum_off);
1263 
1264   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1265     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1266     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1267     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1268     for (i=0; i<mbs; i++) {
1269       jl[i] = mbs; il[0] = 0;
1270     }
1271 
1272     *norm = 0.0;
1273     for (k=0; k<mbs; k++) { /* k_th block row */
1274       for (j=0; j<bs; j++) sum[j]=0.0;
1275 
1276       /*-- col sum --*/
1277       i = jl[k]; /* first |A(i,k)| to be added */
1278       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1279                   at step k */
1280       while (i<mbs){
1281         nexti = jl[i];  /* next block row to be added */
1282         ik    = il[i];  /* block index of A(i,k) in the array a */
1283         for (j=0; j<bs; j++){
1284           v = a->a + ik*bs2 + j*bs;
1285           for (k1=0; k1<bs; k1++) {
1286             sum[j] += PetscAbsScalar(*v); v++;
1287           }
1288         }
1289         /* update il, jl */
1290         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1291         jmax = a->i[i+1];
1292         if (jmin < jmax){
1293           il[i] = jmin;
1294           j   = a->j[jmin];
1295           jl[i] = jl[j]; jl[j]=i;
1296         }
1297         i = nexti;
1298       }
1299 
1300       /*-- row sum --*/
1301       jmin = a->i[k]; jmax = a->i[k+1];
1302       for (i=jmin; i<jmax; i++) {
1303         for (j=0; j<bs; j++){
1304           v = a->a + i*bs2 + j;
1305           for (k1=0; k1<bs; k1++){
1306             sum[j] += PetscAbsScalar(*v);
1307             v   += bs;
1308           }
1309         }
1310       }
1311       /* add k_th block row to il, jl */
1312       col = aj+jmin;
1313       if (*col == k) jmin++;
1314       if (jmin < jmax){
1315         il[k] = jmin;
1316         j   = a->j[jmin];
1317         jl[k] = jl[j]; jl[j] = k;
1318       }
1319       for (j=0; j<bs; j++){
1320         if (sum[j] > *norm) *norm = sum[j];
1321       }
1322     }
1323     ierr = PetscFree(il);CHKERRQ(ierr);
1324     ierr = PetscFree(jl);CHKERRQ(ierr);
1325     ierr = PetscFree(sum);CHKERRQ(ierr);
1326   } else {
1327     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1328   }
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 #undef __FUNCT__
1333 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1334 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1335 {
1336   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1337   int          ierr;
1338 
1339   PetscFunctionBegin;
1340 
1341   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1342   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1343     *flg = PETSC_FALSE;
1344     PetscFunctionReturn(0);
1345   }
1346 
1347   /* if the a->i are the same */
1348   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1349   if (*flg == PETSC_FALSE) {
1350     PetscFunctionReturn(0);
1351   }
1352 
1353   /* if a->j are the same */
1354   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1355   if (*flg == PETSC_FALSE) {
1356     PetscFunctionReturn(0);
1357   }
1358   /* if a->a are the same */
1359   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1360 
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 #undef __FUNCT__
1365 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1366 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1367 {
1368   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1369   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1370   PetscScalar  *x,zero = 0.0;
1371   MatScalar    *aa,*aa_j;
1372 
1373   PetscFunctionBegin;
1374   bs   = a->bs;
1375   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1376 
1377   aa   = a->a;
1378   ai   = a->i;
1379   aj   = a->j;
1380   ambs = a->mbs;
1381   bs2  = a->bs2;
1382 
1383   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1384   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1385   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1386   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1387   for (i=0; i<ambs; i++) {
1388     j=ai[i];
1389     if (aj[j] == i) {             /* if this is a diagonal element */
1390       row  = i*bs;
1391       aa_j = aa + j*bs2;
1392       if (A->factor && bs==1){
1393         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1394       } else {
1395         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1396       }
1397     }
1398   }
1399 
1400   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1401   PetscFunctionReturn(0);
1402 }
1403 
1404 #undef __FUNCT__
1405 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1406 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1407 {
1408   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1409   PetscScalar  *l,*r,x,*li,*ri;
1410   MatScalar    *aa,*v;
1411   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1412 
1413   PetscFunctionBegin;
1414   ai  = a->i;
1415   aj  = a->j;
1416   aa  = a->a;
1417   m   = A->m;
1418   bs  = a->bs;
1419   mbs = a->mbs;
1420   bs2 = a->bs2;
1421 
1422   if (ll != rr) {
1423     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1424   }
1425   if (ll) {
1426     ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr);
1427     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1428     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1429     for (i=0; i<mbs; i++) { /* for each block row */
1430       M  = ai[i+1] - ai[i];
1431       li = l + i*bs;
1432       v  = aa + bs2*ai[i];
1433       for (j=0; j<M; j++) { /* for each block */
1434         for (k=0; k<bs2; k++) {
1435           (*v++) *= li[k%bs];
1436         }
1437 #ifdef CONT
1438         /* will be used to replace the above loop */
1439         ri = l + bs*aj[ai[i]+j];
1440         for (k=0; k<bs; k++) { /* column value */
1441           x = ri[k];
1442           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1443         }
1444 #endif
1445 
1446       }
1447     }
1448     ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr);
1449     PetscLogFlops(2*a->nz);
1450   }
1451   /* will be deleted */
1452   if (rr) {
1453     ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr);
1454     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1455     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1456     for (i=0; i<mbs; i++) { /* for each block row */
1457       M  = ai[i+1] - ai[i];
1458       v  = aa + bs2*ai[i];
1459       for (j=0; j<M; j++) { /* for each block */
1460         ri = r + bs*aj[ai[i]+j];
1461         for (k=0; k<bs; k++) {
1462           x = ri[k];
1463           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1464         }
1465       }
1466     }
1467     ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr);
1468     PetscLogFlops(a->nz);
1469   }
1470   PetscFunctionReturn(0);
1471 }
1472 
1473 #undef __FUNCT__
1474 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1475 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1476 {
1477   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1478 
1479   PetscFunctionBegin;
1480   info->rows_global    = (double)A->m;
1481   info->columns_global = (double)A->m;
1482   info->rows_local     = (double)A->m;
1483   info->columns_local  = (double)A->m;
1484   info->block_size     = a->bs2;
1485   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1486   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1487   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1488   info->assemblies   = A->num_ass;
1489   info->mallocs      = a->reallocs;
1490   info->memory       = A->mem;
1491   if (A->factor) {
1492     info->fill_ratio_given  = A->info.fill_ratio_given;
1493     info->fill_ratio_needed = A->info.fill_ratio_needed;
1494     info->factor_mallocs    = A->info.factor_mallocs;
1495   } else {
1496     info->fill_ratio_given  = 0;
1497     info->fill_ratio_needed = 0;
1498     info->factor_mallocs    = 0;
1499   }
1500   PetscFunctionReturn(0);
1501 }
1502 
1503 
1504 #undef __FUNCT__
1505 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1506 int MatZeroEntries_SeqSBAIJ(Mat A)
1507 {
1508   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1509   int         ierr;
1510 
1511   PetscFunctionBegin;
1512   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1513   PetscFunctionReturn(0);
1514 }
1515 
1516 #undef __FUNCT__
1517 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1518 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1519 {
1520   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1521   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1522   PetscReal    atmp;
1523   MatScalar    *aa;
1524   PetscScalar  zero = 0.0,*x;
1525   int          ncols,brow,bcol,krow,kcol;
1526 
1527   PetscFunctionBegin;
1528   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1529   bs   = a->bs;
1530   aa   = a->a;
1531   ai   = a->i;
1532   aj   = a->j;
1533   mbs = a->mbs;
1534 
1535   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1536   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1537   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1538   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1539   for (i=0; i<mbs; i++) {
1540     ncols = ai[1] - ai[0]; ai++;
1541     brow  = bs*i;
1542     for (j=0; j<ncols; j++){
1543       bcol = bs*(*aj);
1544       for (kcol=0; kcol<bs; kcol++){
1545         col = bcol + kcol;      /* col index */
1546         for (krow=0; krow<bs; krow++){
1547           atmp = PetscAbsScalar(*aa); aa++;
1548           row = brow + krow;    /* row index */
1549           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1550           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1551           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1552         }
1553       }
1554       aj++;
1555     }
1556   }
1557   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1558   PetscFunctionReturn(0);
1559 }
1560