xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d44834fb5aa6ab55eaa2e18f09dfd00e40aa4639)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/inline/spops.h"
4 #include "src/inline/ilu.h"
5 #include "petscbt.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   PetscBT        table,table0;
16 
17   PetscFunctionBegin;
18   mbs = a->mbs;
19   ai  = a->i;
20   aj  = a->j;
21   bs  = A->bs;
22 
23   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24 
25   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
26   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
27   ierr = PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
28   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
29 
30   for (i=0; i<is_max; i++) { /* for each is */
31     isz  = 0;
32     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
33 
34     /* Extract the indices, assume there can be duplicate entries */
35     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
36     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
37 
38     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
39     bcol_max = 0;
40     for (j=0; j<n ; ++j){
41       brow = idx[j]/bs; /* convert the indices into block indices */
42       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43       if(!PetscBTLookupSet(table,brow)) {
44         nidx[isz++] = brow;
45         if (bcol_max < brow) bcol_max = brow;
46       }
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++){ /* for each overlap */
53       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
54       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
55       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
56 
57       n = isz;  /* length of the updated is[i] */
58       for (brow=0; brow<mbs; brow++){
59         start = ai[brow]; end   = ai[brow+1];
60         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
61           for (l = start; l<end ; l++){
62             bcol = aj[l];
63             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end ; l++){
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table0,bcol)){
72               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++)
83         nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 #undef __FUNCT__
95 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
96 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
97 {
98   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
99   PetscErrorCode ierr;
100   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
101   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
102   PetscInt       *irow,nrows,*ssmap,bs=A->bs,bs2=a->bs2;
103   PetscInt       *aj = a->j,*ai = a->i;
104   MatScalar      *mat_a;
105   Mat            C;
106   PetscTruth     flag;
107 
108   PetscFunctionBegin;
109 
110   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
111   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
112   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
113 
114   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
115   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
116 
117   ierr  = PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
120   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
122   /* determine lens of each row */
123   for (i=0; i<nrows; i++) {
124     kstart  = ai[irow[i]];
125     kend    = kstart + a->ilen[irow[i]];
126     lens[i] = 0;
127       for (k=kstart; k<kend; k++) {
128         if (ssmap[aj[k]]) {
129           lens[i]++;
130         }
131       }
132     }
133   /* Create and fill new matrix */
134   if (scall == MAT_REUSE_MATRIX) {
135     c = (Mat_SeqSBAIJ *)((*B)->data);
136 
137     if (c->mbs!=nrows || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
138     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
139     if (flag == PETSC_FALSE) {
140       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
141     }
142     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
143     C = *B;
144   } else {
145     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
146     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
147     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
148   }
149   c = (Mat_SeqSBAIJ *)(C->data);
150   for (i=0; i<nrows; i++) {
151     row    = irow[i];
152     kstart = ai[row];
153     kend   = kstart + a->ilen[row];
154     mat_i  = c->i[i];
155     mat_j  = c->j + mat_i;
156     mat_a  = c->a + mat_i*bs2;
157     mat_ilen = c->ilen + i;
158     for (k=kstart; k<kend; k++) {
159       if ((tcol=ssmap[a->j[k]])) {
160         *mat_j++ = tcol - 1;
161         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162         mat_a   += bs2;
163         (*mat_ilen)++;
164       }
165     }
166   }
167 
168   /* Free work space */
169   ierr = PetscFree(smap);CHKERRQ(ierr);
170   ierr = PetscFree(lens);CHKERRQ(ierr);
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173 
174   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
175   *B = C;
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
182 {
183   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
184   IS             is1;
185   PetscErrorCode ierr;
186   PetscInt       *vary,*iary,*irow,nrows,i,bs=A->bs,count;
187 
188   PetscFunctionBegin;
189   if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
190 
191   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
192   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
193 
194   /* Verify if the indices corespond to each element in a block
195    and form the IS with compressed IS */
196   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);CHKERRQ(ierr);
197   iary = vary + a->mbs;
198   ierr = PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
207 
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = PetscFree(vary);CHKERRQ(ierr);
210 
211   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
212   ISDestroy(is1);
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
219 {
220   PetscErrorCode ierr;
221   PetscInt       i;
222 
223   PetscFunctionBegin;
224   if (scall == MAT_INITIAL_MATRIX) {
225     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<n; i++) {
229     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
230   }
231   PetscFunctionReturn(0);
232 }
233 
234 /* -------------------------------------------------------*/
235 /* Should check that shapes of vectors and matrices match */
236 /* -------------------------------------------------------*/
237 #include "petscblaslapack.h"
238 
239 #undef __FUNCT__
240 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
241 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
242 {
243   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
244   PetscScalar    *x,*z,*xb,x1,zero=0.0;
245   MatScalar      *v;
246   PetscErrorCode ierr;
247   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
248 
249   PetscFunctionBegin;
250   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
251   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
252   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
253 
254   v  = a->a;
255   xb = x;
256 
257   for (i=0; i<mbs; i++) {
258     n  = ai[1] - ai[0];  /* length of i_th row of A */
259     x1 = xb[0];
260     ib = aj + *ai;
261     jmin = 0;
262     if (*ib == i) {      /* (diag of A)*x */
263       z[i] += *v++ * x[*ib++];
264       jmin++;
265     }
266     for (j=jmin; j<n; j++) {
267       cval    = *ib;
268       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
269       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
270     }
271     xb++; ai++;
272   }
273 
274   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
275   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
276   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
277   PetscFunctionReturn(0);
278 }
279 
280 #undef __FUNCT__
281 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
282 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
283 {
284   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
285   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
286   MatScalar      *v;
287   PetscErrorCode ierr;
288   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
289 
290 
291   PetscFunctionBegin;
292   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
293   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
294   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
295 
296   v     = a->a;
297   xb = x;
298 
299   for (i=0; i<mbs; i++) {
300     n  = ai[1] - ai[0]; /* length of i_th block row of A */
301     x1 = xb[0]; x2 = xb[1];
302     ib = aj + *ai;
303     jmin = 0;
304     if (*ib == i){     /* (diag of A)*x */
305       z[2*i]   += v[0]*x1 + v[2]*x2;
306       z[2*i+1] += v[2]*x1 + v[3]*x2;
307       v += 4; jmin++;
308     }
309     for (j=jmin; j<n; j++) {
310       /* (strict lower triangular part of A)*x  */
311       cval       = ib[j]*2;
312       z[cval]     += v[0]*x1 + v[1]*x2;
313       z[cval+1]   += v[2]*x1 + v[3]*x2;
314       /* (strict upper triangular part of A)*x  */
315       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
316       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
317       v  += 4;
318     }
319     xb +=2; ai++;
320   }
321 
322   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
323   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
324   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
330 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
333   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
334   MatScalar      *v;
335   PetscErrorCode ierr;
336   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
337 
338 
339   PetscFunctionBegin;
340   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
341   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
342   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
343 
344   v     = a->a;
345   xb = x;
346 
347   for (i=0; i<mbs; i++) {
348     n  = ai[1] - ai[0]; /* length of i_th block row of A */
349     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
350     ib = aj + *ai;
351     jmin = 0;
352     if (*ib == i){     /* (diag of A)*x */
353       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
354       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
355       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
356       v += 9; jmin++;
357     }
358     for (j=jmin; j<n; j++) {
359       /* (strict lower triangular part of A)*x  */
360       cval       = ib[j]*3;
361       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
362       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
363       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
364       /* (strict upper triangular part of A)*x  */
365       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
366       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
367       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
368       v  += 9;
369     }
370     xb +=3; ai++;
371   }
372 
373   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
375   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
376   PetscFunctionReturn(0);
377 }
378 
379 #undef __FUNCT__
380 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
381 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
382 {
383   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
384   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
385   MatScalar      *v;
386   PetscErrorCode ierr;
387   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
388 
389   PetscFunctionBegin;
390   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
391   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
392   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
393 
394   v     = a->a;
395   xb = x;
396 
397   for (i=0; i<mbs; i++) {
398     n  = ai[1] - ai[0]; /* length of i_th block row of A */
399     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
400     ib = aj + *ai;
401     jmin = 0;
402     if (*ib == i){     /* (diag of A)*x */
403       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
404       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
405       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
406       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
407       v += 16; jmin++;
408     }
409     for (j=jmin; j<n; j++) {
410       /* (strict lower triangular part of A)*x  */
411       cval       = ib[j]*4;
412       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
413       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
414       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
415       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
416       /* (strict upper triangular part of A)*x  */
417       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
418       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
419       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
420       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
421       v  += 16;
422     }
423     xb +=4; ai++;
424   }
425 
426   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
427   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
428   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
429   PetscFunctionReturn(0);
430 }
431 
432 #undef __FUNCT__
433 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
434 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
435 {
436   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
437   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
438   MatScalar      *v;
439   PetscErrorCode ierr;
440   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
441 
442   PetscFunctionBegin;
443   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
444   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
445   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
446 
447   v     = a->a;
448   xb = x;
449 
450   for (i=0; i<mbs; i++) {
451     n  = ai[1] - ai[0]; /* length of i_th block row of A */
452     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
453     ib = aj + *ai;
454     jmin = 0;
455     if (*ib == i){      /* (diag of A)*x */
456       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
457       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
458       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
459       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
460       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
461       v += 25; jmin++;
462     }
463     for (j=jmin; j<n; j++) {
464       /* (strict lower triangular part of A)*x  */
465       cval       = ib[j]*5;
466       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
467       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
468       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
469       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
470       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
471       /* (strict upper triangular part of A)*x  */
472       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
473       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
474       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
475       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
476       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
477       v  += 25;
478     }
479     xb +=5; ai++;
480   }
481 
482   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
483   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
484   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
485   PetscFunctionReturn(0);
486 }
487 
488 
489 #undef __FUNCT__
490 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
491 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
492 {
493   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
494   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
495   MatScalar      *v;
496   PetscErrorCode ierr;
497   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
498 
499   PetscFunctionBegin;
500   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
501   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
502   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
503 
504   v     = a->a;
505   xb = x;
506 
507   for (i=0; i<mbs; i++) {
508     n  = ai[1] - ai[0]; /* length of i_th block row of A */
509     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
510     ib = aj + *ai;
511     jmin = 0;
512     if (*ib == i){      /* (diag of A)*x */
513       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
514       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
515       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
516       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
517       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
518       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
519       v += 36; jmin++;
520     }
521     for (j=jmin; j<n; j++) {
522       /* (strict lower triangular part of A)*x  */
523       cval       = ib[j]*6;
524       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
525       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
526       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
527       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
528       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
529       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
530       /* (strict upper triangular part of A)*x  */
531       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
532       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
533       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
534       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
535       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
536       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
537       v  += 36;
538     }
539     xb +=6; ai++;
540   }
541 
542   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
543   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
544   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
545   PetscFunctionReturn(0);
546 }
547 #undef __FUNCT__
548 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
549 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
550 {
551   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
552   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
553   MatScalar      *v;
554   PetscErrorCode ierr;
555   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
556 
557   PetscFunctionBegin;
558   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
559   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
560   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
561 
562   v     = a->a;
563   xb = x;
564 
565   for (i=0; i<mbs; i++) {
566     n  = ai[1] - ai[0]; /* length of i_th block row of A */
567     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
568     ib = aj + *ai;
569     jmin = 0;
570     if (*ib == i){      /* (diag of A)*x */
571       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
572       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
573       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
574       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
575       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
576       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
577       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
578       v += 49; jmin++;
579     }
580     for (j=jmin; j<n; j++) {
581       /* (strict lower triangular part of A)*x  */
582       cval       = ib[j]*7;
583       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
584       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
585       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
586       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
587       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
588       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
589       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
590       /* (strict upper triangular part of A)*x  */
591       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
592       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
593       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
594       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
595       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
596       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
597       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
598       v  += 49;
599     }
600     xb +=7; ai++;
601   }
602   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
603   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
604   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
605   PetscFunctionReturn(0);
606 }
607 
608 /*
609     This will not work with MatScalar == float because it calls the BLAS
610 */
611 #undef __FUNCT__
612 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
613 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
614 {
615   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
616   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
617   MatScalar      *v;
618   PetscErrorCode ierr;
619   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
620 
621   PetscFunctionBegin;
622   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
623   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
624   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
625 
626   aj   = a->j;
627   v    = a->a;
628   ii   = a->i;
629 
630   if (!a->mult_work) {
631     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
632   }
633   work = a->mult_work;
634 
635   for (i=0; i<mbs; i++) {
636     n     = ii[1] - ii[0]; ncols = n*bs;
637     workt = work; idx=aj+ii[0];
638 
639     /* upper triangular part */
640     for (j=0; j<n; j++) {
641       xb = x_ptr + bs*(*idx++);
642       for (k=0; k<bs; k++) workt[k] = xb[k];
643       workt += bs;
644     }
645     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
646     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
647 
648     /* strict lower triangular part */
649     idx = aj+ii[0];
650     if (*idx == i){
651       ncols -= bs; v += bs2; idx++; n--;
652     }
653 
654     if (ncols > 0){
655       workt = work;
656       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
657       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
658       for (j=0; j<n; j++) {
659         zb = z_ptr + bs*(*idx++);
660         for (k=0; k<bs; k++) zb[k] += workt[k] ;
661         workt += bs;
662       }
663     }
664     x += bs; v += n*bs2; z += bs; ii++;
665   }
666 
667   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
668   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
669   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
670   PetscFunctionReturn(0);
671 }
672 
673 #undef __FUNCT__
674 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
675 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
676 {
677   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
678   PetscScalar    *x,*y,*z,*xb,x1;
679   MatScalar      *v;
680   PetscErrorCode ierr;
681   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
682 
683   PetscFunctionBegin;
684   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
685   if (yy != xx) {
686     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
687   } else {
688     y = x;
689   }
690   if (zz != yy) {
691     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
692     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
693     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
694   } else {
695     z = y;
696   }
697 
698   v  = a->a;
699   xb = x;
700 
701   for (i=0; i<mbs; i++) {
702     n  = ai[1] - ai[0];  /* length of i_th row of A */
703     x1 = xb[0];
704     ib = aj + *ai;
705     jmin = 0;
706     if (*ib == i) {            /* (diag of A)*x */
707       z[i] += *v++ * x[*ib++]; jmin++;
708     }
709     for (j=jmin; j<n; j++) {
710       cval    = *ib;
711       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
712       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
713     }
714     xb++; ai++;
715   }
716 
717   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
718   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
719   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
720 
721   PetscLogFlops(2*(a->nz*2 - A->m));
722   PetscFunctionReturn(0);
723 }
724 
725 #undef __FUNCT__
726 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
727 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
728 {
729   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
730   PetscScalar    *x,*y,*z,*xb,x1,x2;
731   MatScalar      *v;
732   PetscErrorCode ierr;
733   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
734 
735   PetscFunctionBegin;
736   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
737   if (yy != xx) {
738     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
739   } else {
740     y = x;
741   }
742   if (zz != yy) {
743     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
744     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
745     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
746   } else {
747     z = y;
748   }
749 
750   v     = a->a;
751   xb = x;
752 
753   for (i=0; i<mbs; i++) {
754     n  = ai[1] - ai[0]; /* length of i_th block row of A */
755     x1 = xb[0]; x2 = xb[1];
756     ib = aj + *ai;
757     jmin = 0;
758     if (*ib == i){      /* (diag of A)*x */
759       z[2*i]   += v[0]*x1 + v[2]*x2;
760       z[2*i+1] += v[2]*x1 + v[3]*x2;
761       v += 4; jmin++;
762     }
763     for (j=jmin; j<n; j++) {
764       /* (strict lower triangular part of A)*x  */
765       cval       = ib[j]*2;
766       z[cval]     += v[0]*x1 + v[1]*x2;
767       z[cval+1]   += v[2]*x1 + v[3]*x2;
768       /* (strict upper triangular part of A)*x  */
769       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
770       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
771       v  += 4;
772     }
773     xb +=2; ai++;
774   }
775 
776   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
777   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
778   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
779 
780   PetscLogFlops(4*(a->nz*2 - A->m));
781   PetscFunctionReturn(0);
782 }
783 
784 #undef __FUNCT__
785 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
786 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
787 {
788   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
789   PetscScalar    *x,*y,*z,*xb,x1,x2,x3;
790   MatScalar      *v;
791   PetscErrorCode ierr;
792   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
793 
794   PetscFunctionBegin;
795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
796   if (yy != xx) {
797     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
798   } else {
799     y = x;
800   }
801   if (zz != yy) {
802     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
803     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
804     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
805   } else {
806     z = y;
807   }
808 
809   v     = a->a;
810   xb = x;
811 
812   for (i=0; i<mbs; i++) {
813     n  = ai[1] - ai[0]; /* length of i_th block row of A */
814     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
815     ib = aj + *ai;
816     jmin = 0;
817     if (*ib == i){     /* (diag of A)*x */
818      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
819      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
820      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
821      v += 9; jmin++;
822     }
823     for (j=jmin; j<n; j++) {
824       /* (strict lower triangular part of A)*x  */
825       cval       = ib[j]*3;
826       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
827       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
828       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
829       /* (strict upper triangular part of A)*x  */
830       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
831       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
832       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
833       v  += 9;
834     }
835     xb +=3; ai++;
836   }
837 
838   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
839   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
840   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
841 
842   PetscLogFlops(18*(a->nz*2 - A->m));
843   PetscFunctionReturn(0);
844 }
845 
846 #undef __FUNCT__
847 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
848 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
849 {
850   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
851   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4;
852   MatScalar      *v;
853   PetscErrorCode ierr;
854   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
855 
856   PetscFunctionBegin;
857   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
858   if (yy != xx) {
859     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
860   } else {
861     y = x;
862   }
863   if (zz != yy) {
864     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
865     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
866     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
867   } else {
868     z = y;
869   }
870 
871   v     = a->a;
872   xb = x;
873 
874   for (i=0; i<mbs; i++) {
875     n  = ai[1] - ai[0]; /* length of i_th block row of A */
876     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
877     ib = aj + *ai;
878     jmin = 0;
879     if (*ib == i){      /* (diag of A)*x */
880       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
881       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
882       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
883       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
884       v += 16; jmin++;
885     }
886     for (j=jmin; j<n; j++) {
887       /* (strict lower triangular part of A)*x  */
888       cval       = ib[j]*4;
889       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
890       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
891       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
892       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
893       /* (strict upper triangular part of A)*x  */
894       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
895       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
896       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
897       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
898       v  += 16;
899     }
900     xb +=4; ai++;
901   }
902 
903   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
904   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
905   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
906 
907   PetscLogFlops(32*(a->nz*2 - A->m));
908   PetscFunctionReturn(0);
909 }
910 
911 #undef __FUNCT__
912 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
913 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
914 {
915   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
916   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5;
917   MatScalar      *v;
918   PetscErrorCode ierr;
919   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
920 
921   PetscFunctionBegin;
922   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
923   if (yy != xx) {
924     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
925   } else {
926     y = x;
927   }
928   if (zz != yy) {
929     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
930     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
931     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
932   } else {
933     z = y;
934   }
935 
936   v     = a->a;
937   xb = x;
938 
939   for (i=0; i<mbs; i++) {
940     n  = ai[1] - ai[0]; /* length of i_th block row of A */
941     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
942     ib = aj + *ai;
943     jmin = 0;
944     if (*ib == i){      /* (diag of A)*x */
945       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
946       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
947       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
948       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
949       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
950       v += 25; jmin++;
951     }
952     for (j=jmin; j<n; j++) {
953       /* (strict lower triangular part of A)*x  */
954       cval       = ib[j]*5;
955       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
956       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
957       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
958       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
959       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
960       /* (strict upper triangular part of A)*x  */
961       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
962       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
963       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
964       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
965       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
966       v  += 25;
967     }
968     xb +=5; ai++;
969   }
970 
971   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
972   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
973   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
974 
975   PetscLogFlops(50*(a->nz*2 - A->m));
976   PetscFunctionReturn(0);
977 }
978 #undef __FUNCT__
979 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
980 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
981 {
982   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
983   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
984   MatScalar      *v;
985   PetscErrorCode ierr;
986   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
987 
988   PetscFunctionBegin;
989   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
990   if (yy != xx) {
991     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
992   } else {
993     y = x;
994   }
995   if (zz != yy) {
996     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
997     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
998     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
999   } else {
1000     z = y;
1001   }
1002 
1003   v     = a->a;
1004   xb = x;
1005 
1006   for (i=0; i<mbs; i++) {
1007     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1008     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1009     ib = aj + *ai;
1010     jmin = 0;
1011     if (*ib == i){     /* (diag of A)*x */
1012       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1013       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1014       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1015       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1016       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1017       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1018       v += 36; jmin++;
1019     }
1020     for (j=jmin; j<n; j++) {
1021       /* (strict lower triangular part of A)*x  */
1022       cval       = ib[j]*6;
1023       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1024       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1025       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1026       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1027       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1028       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1029       /* (strict upper triangular part of A)*x  */
1030       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1031       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1032       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1033       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1034       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1035       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1036       v  += 36;
1037     }
1038     xb +=6; ai++;
1039   }
1040 
1041   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1042   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1043   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1044 
1045   PetscLogFlops(72*(a->nz*2 - A->m));
1046   PetscFunctionReturn(0);
1047 }
1048 
1049 #undef __FUNCT__
1050 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1051 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1052 {
1053   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1054   PetscScalar    *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1055   MatScalar      *v;
1056   PetscErrorCode ierr;
1057   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
1058 
1059   PetscFunctionBegin;
1060   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1061   if (yy != xx) {
1062     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1063   } else {
1064     y = x;
1065   }
1066   if (zz != yy) {
1067     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1068     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1069     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1070   } else {
1071     z = y;
1072   }
1073 
1074   v     = a->a;
1075   xb = x;
1076 
1077   for (i=0; i<mbs; i++) {
1078     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1079     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1080     ib = aj + *ai;
1081     jmin = 0;
1082     if (*ib == i){     /* (diag of A)*x */
1083       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1084       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1085       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1086       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1087       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1088       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1089       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1090       v += 49; jmin++;
1091     }
1092     for (j=jmin; j<n; j++) {
1093       /* (strict lower triangular part of A)*x  */
1094       cval       = ib[j]*7;
1095       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1096       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1097       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1098       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1099       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1100       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1101       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1102       /* (strict upper triangular part of A)*x  */
1103       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1104       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1105       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1106       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1107       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1108       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1109       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1110       v  += 49;
1111     }
1112     xb +=7; ai++;
1113   }
1114 
1115   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1116   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1117   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1118 
1119   PetscLogFlops(98*(a->nz*2 - A->m));
1120   PetscFunctionReturn(0);
1121 }
1122 
1123 #undef __FUNCT__
1124 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1125 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1126 {
1127   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1128   PetscScalar    *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1129   MatScalar      *v;
1130   PetscErrorCode ierr;
1131   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->bs,j,n,bs2=a->bs2,ncols,k;
1132 
1133   PetscFunctionBegin;
1134   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1135   if (yy != xx) {
1136     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1137   } else {
1138     y = x;
1139   }
1140   if (zz != yy) {
1141     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1142     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1143     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1144   } else {
1145     z = y;
1146   }
1147 
1148   aj   = a->j;
1149   v    = a->a;
1150   ii   = a->i;
1151 
1152   if (!a->mult_work) {
1153     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1154   }
1155   work = a->mult_work;
1156 
1157 
1158   for (i=0; i<mbs; i++) {
1159     n     = ii[1] - ii[0]; ncols = n*bs;
1160     workt = work; idx=aj+ii[0];
1161 
1162     /* upper triangular part */
1163     for (j=0; j<n; j++) {
1164       xb = x_ptr + bs*(*idx++);
1165       for (k=0; k<bs; k++) workt[k] = xb[k];
1166       workt += bs;
1167     }
1168     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1169     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1170 
1171     /* strict lower triangular part */
1172     idx = aj+ii[0];
1173     if (*idx == i){
1174       ncols -= bs; v += bs2; idx++; n--;
1175     }
1176     if (ncols > 0){
1177       workt = work;
1178       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1179       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1180       for (j=0; j<n; j++) {
1181         zb = z_ptr + bs*(*idx++);
1182         /* idx++; */
1183         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1184         workt += bs;
1185       }
1186     }
1187 
1188     x += bs; v += n*bs2; z += bs; ii++;
1189   }
1190 
1191   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1192   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1193   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1194 
1195   PetscLogFlops(2*(a->nz*2 - A->m));
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 #undef __FUNCT__
1200 #define __FUNCT__ "MatScale_SeqSBAIJ"
1201 PetscErrorCode MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1202 {
1203   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1204   PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz;
1205 
1206   PetscFunctionBegin;
1207   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1208   PetscLogFlops(totalnz);
1209   PetscFunctionReturn(0);
1210 }
1211 
1212 #undef __FUNCT__
1213 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1214 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1215 {
1216   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1217   MatScalar      *v = a->a;
1218   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1219   PetscInt       i,j,k,bs = A->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1220   PetscErrorCode ierr;
1221   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1222 
1223   PetscFunctionBegin;
1224   if (type == NORM_FROBENIUS) {
1225     for (k=0; k<mbs; k++){
1226       jmin = a->i[k]; jmax = a->i[k+1];
1227       col  = aj + jmin;
1228       if (*col == k){         /* diagonal block */
1229         for (i=0; i<bs2; i++){
1230 #if defined(PETSC_USE_COMPLEX)
1231           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1232 #else
1233           sum_diag += (*v)*(*v); v++;
1234 #endif
1235         }
1236         jmin++;
1237       }
1238       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1239         for (i=0; i<bs2; i++){
1240 #if defined(PETSC_USE_COMPLEX)
1241           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1242 #else
1243           sum_off += (*v)*(*v); v++;
1244 #endif
1245         }
1246       }
1247     }
1248     *norm = sqrt(sum_diag + 2*sum_off);
1249 
1250   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1251     ierr = PetscMalloc(mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1252     ierr = PetscMalloc(mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr);
1253     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1254     for (i=0; i<mbs; i++) {
1255       jl[i] = mbs; il[0] = 0;
1256     }
1257 
1258     *norm = 0.0;
1259     for (k=0; k<mbs; k++) { /* k_th block row */
1260       for (j=0; j<bs; j++) sum[j]=0.0;
1261 
1262       /*-- col sum --*/
1263       i = jl[k]; /* first |A(i,k)| to be added */
1264       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1265                   at step k */
1266       while (i<mbs){
1267         nexti = jl[i];  /* next block row to be added */
1268         ik    = il[i];  /* block index of A(i,k) in the array a */
1269         for (j=0; j<bs; j++){
1270           v = a->a + ik*bs2 + j*bs;
1271           for (k1=0; k1<bs; k1++) {
1272             sum[j] += PetscAbsScalar(*v); v++;
1273           }
1274         }
1275         /* update il, jl */
1276         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1277         jmax = a->i[i+1];
1278         if (jmin < jmax){
1279           il[i] = jmin;
1280           j   = a->j[jmin];
1281           jl[i] = jl[j]; jl[j]=i;
1282         }
1283         i = nexti;
1284       }
1285 
1286       /*-- row sum --*/
1287       jmin = a->i[k]; jmax = a->i[k+1];
1288       for (i=jmin; i<jmax; i++) {
1289         for (j=0; j<bs; j++){
1290           v = a->a + i*bs2 + j;
1291           for (k1=0; k1<bs; k1++){
1292             sum[j] += PetscAbsScalar(*v);
1293             v   += bs;
1294           }
1295         }
1296       }
1297       /* add k_th block row to il, jl */
1298       col = aj+jmin;
1299       if (*col == k) jmin++;
1300       if (jmin < jmax){
1301         il[k] = jmin;
1302         j   = a->j[jmin];
1303         jl[k] = jl[j]; jl[j] = k;
1304       }
1305       for (j=0; j<bs; j++){
1306         if (sum[j] > *norm) *norm = sum[j];
1307       }
1308     }
1309     ierr = PetscFree(il);CHKERRQ(ierr);
1310     ierr = PetscFree(jl);CHKERRQ(ierr);
1311     ierr = PetscFree(sum);CHKERRQ(ierr);
1312   } else {
1313     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1314   }
1315   PetscFunctionReturn(0);
1316 }
1317 
1318 #undef __FUNCT__
1319 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1320 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1321 {
1322   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1323   PetscErrorCode ierr;
1324 
1325   PetscFunctionBegin;
1326 
1327   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1328   if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1329     *flg = PETSC_FALSE;
1330     PetscFunctionReturn(0);
1331   }
1332 
1333   /* if the a->i are the same */
1334   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1335   if (*flg == PETSC_FALSE) {
1336     PetscFunctionReturn(0);
1337   }
1338 
1339   /* if a->j are the same */
1340   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1341   if (*flg == PETSC_FALSE) {
1342     PetscFunctionReturn(0);
1343   }
1344   /* if a->a are the same */
1345   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(A->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1346 
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 #undef __FUNCT__
1351 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1352 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1353 {
1354   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1355   PetscErrorCode ierr;
1356   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1357   PetscScalar    *x,zero = 0.0;
1358   MatScalar      *aa,*aa_j;
1359 
1360   PetscFunctionBegin;
1361   bs   = A->bs;
1362   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1363 
1364   aa   = a->a;
1365   ai   = a->i;
1366   aj   = a->j;
1367   ambs = a->mbs;
1368   bs2  = a->bs2;
1369 
1370   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1371   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1372   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1373   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1374   for (i=0; i<ambs; i++) {
1375     j=ai[i];
1376     if (aj[j] == i) {             /* if this is a diagonal element */
1377       row  = i*bs;
1378       aa_j = aa + j*bs2;
1379       if (A->factor && bs==1){
1380         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1381       } else {
1382         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1383       }
1384     }
1385   }
1386 
1387   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1388   PetscFunctionReturn(0);
1389 }
1390 
1391 #undef __FUNCT__
1392 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1393 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1394 {
1395   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1396   PetscScalar    *l,*r,x,*li,*ri;
1397   MatScalar      *aa,*v;
1398   PetscErrorCode ierr;
1399   PetscInt       i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1400 
1401   PetscFunctionBegin;
1402   ai  = a->i;
1403   aj  = a->j;
1404   aa  = a->a;
1405   m   = A->m;
1406   bs  = A->bs;
1407   mbs = a->mbs;
1408   bs2 = a->bs2;
1409 
1410   if (ll != rr) {
1411     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1412   }
1413   if (ll) {
1414     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1415     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1416     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1417     for (i=0; i<mbs; i++) { /* for each block row */
1418       M  = ai[i+1] - ai[i];
1419       li = l + i*bs;
1420       v  = aa + bs2*ai[i];
1421       for (j=0; j<M; j++) { /* for each block */
1422         for (k=0; k<bs2; k++) {
1423           (*v++) *= li[k%bs];
1424         }
1425 #ifdef CONT
1426         /* will be used to replace the above loop */
1427         ri = l + bs*aj[ai[i]+j];
1428         for (k=0; k<bs; k++) { /* column value */
1429           x = ri[k];
1430           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1431         }
1432 #endif
1433 
1434       }
1435     }
1436     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1437     PetscLogFlops(2*a->nz);
1438   }
1439   /* will be deleted */
1440   if (rr) {
1441     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1442     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1443     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1444     for (i=0; i<mbs; i++) { /* for each block row */
1445       M  = ai[i+1] - ai[i];
1446       v  = aa + bs2*ai[i];
1447       for (j=0; j<M; j++) { /* for each block */
1448         ri = r + bs*aj[ai[i]+j];
1449         for (k=0; k<bs; k++) {
1450           x = ri[k];
1451           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1452         }
1453       }
1454     }
1455     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1456     PetscLogFlops(a->nz);
1457   }
1458   PetscFunctionReturn(0);
1459 }
1460 
1461 #undef __FUNCT__
1462 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1463 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1464 {
1465   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1466 
1467   PetscFunctionBegin;
1468   info->rows_global    = (double)A->m;
1469   info->columns_global = (double)A->m;
1470   info->rows_local     = (double)A->m;
1471   info->columns_local  = (double)A->m;
1472   info->block_size     = a->bs2;
1473   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1474   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1475   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1476   info->assemblies   = A->num_ass;
1477   info->mallocs      = a->reallocs;
1478   info->memory       = A->mem;
1479   if (A->factor) {
1480     info->fill_ratio_given  = A->info.fill_ratio_given;
1481     info->fill_ratio_needed = A->info.fill_ratio_needed;
1482     info->factor_mallocs    = A->info.factor_mallocs;
1483   } else {
1484     info->fill_ratio_given  = 0;
1485     info->fill_ratio_needed = 0;
1486     info->factor_mallocs    = 0;
1487   }
1488   PetscFunctionReturn(0);
1489 }
1490 
1491 
1492 #undef __FUNCT__
1493 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1494 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1495 {
1496   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1497   PetscErrorCode ierr;
1498 
1499   PetscFunctionBegin;
1500   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1501   PetscFunctionReturn(0);
1502 }
1503 
1504 #undef __FUNCT__
1505 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1506 PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1507 {
1508   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1509   PetscErrorCode ierr;
1510   PetscInt i,j,n,row,col,bs,*ai,*aj,mbs;
1511   PetscReal    atmp;
1512   MatScalar    *aa;
1513   PetscScalar  zero = 0.0,*x;
1514   PetscInt          ncols,brow,bcol,krow,kcol;
1515 
1516   PetscFunctionBegin;
1517   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1518   bs   = A->bs;
1519   aa   = a->a;
1520   ai   = a->i;
1521   aj   = a->j;
1522   mbs = a->mbs;
1523 
1524   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1525   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1526   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1527   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1528   for (i=0; i<mbs; i++) {
1529     ncols = ai[1] - ai[0]; ai++;
1530     brow  = bs*i;
1531     for (j=0; j<ncols; j++){
1532       bcol = bs*(*aj);
1533       for (kcol=0; kcol<bs; kcol++){
1534         col = bcol + kcol;      /* col index */
1535         for (krow=0; krow<bs; krow++){
1536           atmp = PetscAbsScalar(*aa); aa++;
1537           row = brow + krow;    /* row index */
1538           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1539           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1540         }
1541       }
1542       aj++;
1543     }
1544   }
1545   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1546   PetscFunctionReturn(0);
1547 }
1548