xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 4be2ffe04cc5be453f1066995c69d4da4e55196e)
1 /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/inline/spops.h"
5 #include "src/inline/ilu.h"
6 #include "petscbt.h"
7 #include "src/mat/impls/sbaij/seq/sbaij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
11 int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
12 {
13   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
14   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,
15                start,end,*ai,*aj,bs,*nidx2;
16   PetscBT      table;
17   PetscBT      table0;
18 
19   PetscFunctionBegin;
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = a->bs;
24 
25   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26 
27   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
28   ierr = PetscMalloc((mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
29   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
30   ierr = PetscBTCreate(mbs,table0);CHKERRQ(ierr);
31 
32   for (i=0; i<is_max; i++) { /* for each is */
33     isz  = 0;
34     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
35 
36     /* Extract the indices, assume there can be duplicate entries */
37     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
38     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
39 
40     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
41     for (j=0; j<n ; ++j){
42       brow = idx[j]/bs; /* convert the indices into block indices */
43       if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44       if(!PetscBTLookupSet(table,brow)) { nidx[isz++] = brow;}
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++){ /* for each overlap */
51       /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52       ierr = PetscBTMemzero(mbs,table0);CHKERRQ(ierr);
53       for (l=k; l<isz; l++) PetscBTSet(table0,nidx[l]);
54 
55       n = isz;  /* length of the updated is[i] */
56       for (brow=0; brow<mbs; brow++){
57         start = ai[brow]; end   = ai[brow+1];
58         if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
59           for (l = start; l<end ; l++){
60             bcol = aj[l];
61             if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
62           }
63           k++;
64           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
65         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
66           for (l = start; l<end ; l++){
67             bcol = aj[l];
68             if (PetscBTLookup(table0,bcol)){
69               if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
70               break; /* for l = start; l<end ; l++) */
71             }
72           }
73         }
74       }
75     } /* for each overlap */
76 
77     /* expand the Index Set */
78     for (j=0; j<isz; j++) {
79       for (k=0; k<bs; k++)
80         nidx2[j*bs+k] = nidx[j]*bs+k;
81     }
82     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
83   }
84   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
85   ierr = PetscFree(nidx);CHKERRQ(ierr);
86   ierr = PetscFree(nidx2);CHKERRQ(ierr);
87   ierr = PetscBTDestroy(table0);CHKERRQ(ierr);
88 #ifdef VERSION_1
89   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
90   int          brow,i,j,k,l,mbs,n,*idx,ierr,*nidx,isz,bcol,
91                start,end,*ai,*aj,bs,*nidx2,*colsearch;
92   PetscBT      table;
93 
94   PetscFunctionBegin;
95   mbs = a->mbs;
96   ai  = a->i;
97   aj  = a->j;
98   bs  = a->bs;
99 
100   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
101 
102   ierr = PetscBTCreate(mbs,table);CHKERRQ(ierr);
103   ierr = PetscMalloc((2*mbs+1)*sizeof(int),&nidx);CHKERRQ(ierr);
104   colsearch = nidx + mbs + 1;
105   ierr = PetscMalloc((A->m+1)*sizeof(int),&nidx2);CHKERRQ(ierr);
106 
107   for (i=0; i<is_max; i++) {
108 
109     /* Initialise the two local arrays */
110     isz  = 0;
111     ierr = PetscBTMemzero(mbs,table);CHKERRQ(ierr);
112 
113     /* Extract the indices, assume there can be duplicate entries */
114     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
115     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
116 
117     /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
118     for (j=0; j<n ; ++j){
119       bcol = idx[j]/bs; /* convert the indices into block indices */
120       if (bcol >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
121       if(!PetscBTLookupSet(table,bcol)) { nidx[isz++] = bcol;}
122     }
123     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
124     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
125 
126     k = 0;
127     for (j=0; j<ov; j++){ /* for each overlap */
128       /* initialize colsearch -
129          colsearch[i] points to the possible non-zero (i,active_col)-th entry in the array aj */
130       for (brow=0; brow<mbs; brow++) colsearch[brow] = ai[brow];
131 
132       /* sort nidx[k:isz-1] - needed by col search */
133       ierr = PetscSortInt(isz-k,nidx+k);CHKERRQ(ierr);
134 
135       n = isz;
136       for (; k<n ; k++){ /* do only those brows in nidx[k], which are not done yet */
137         brow   = nidx[k];
138         /* search block column */
139         for (l=0; l<brow; l++){
140           if (colsearch[l]<ai[l+1]){
141             bcol = aj[colsearch[l]];
142             while (bcol < brow ) {
143               colsearch[l]++;
144               if (colsearch[l]<ai[l+1]){
145                 bcol = aj[colsearch[l]];
146               } else {
147                 bcol = mbs; break;
148               }
149             }
150             if (bcol==brow){
151               if (!PetscBTLookupSet(table,l)) {nidx[isz++] = l;}
152               colsearch[l]++;
153             }
154           }
155         }
156         /* search block row */
157         start = ai[brow];
158         end   = ai[brow+1];
159         for (l = start; l<end ; l++){
160           bcol = aj[l];
161           if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
162         }
163       }
164     } /* for each overlap */
165 
166     /* expand the Index Set */
167     for (j=0; j<isz; j++) {
168       for (k=0; k<bs; k++)
169         nidx2[j*bs+k] = nidx[j]*bs+k;
170     }
171     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
172   }
173   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
174   ierr = PetscFree(nidx);CHKERRQ(ierr);
175   ierr = PetscFree(nidx2);CHKERRQ(ierr);
176 #endif /* VERSION_1 */
177   PetscFunctionReturn(0);
178 }
179 
180 #undef __FUNCT__
181 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
182 int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
183 {
184   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
185   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
186   int          row,mat_i,*mat_j,tcol,*mat_ilen;
187   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
188   int          *aj = a->j,*ai = a->i;
189   MatScalar    *mat_a;
190   Mat          C;
191   PetscTruth   flag;
192 
193   PetscFunctionBegin;
194 
195   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
196   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
197   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
198 
199   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
200   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
201 
202   ierr  = PetscMalloc((1+oldcols)*sizeof(int),&smap);CHKERRQ(ierr);
203   ssmap = smap;
204   ierr  = PetscMalloc((1+nrows)*sizeof(int),&lens);CHKERRQ(ierr);
205   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
206   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
207   /* determine lens of each row */
208   for (i=0; i<nrows; i++) {
209     kstart  = ai[irow[i]];
210     kend    = kstart + a->ilen[irow[i]];
211     lens[i] = 0;
212       for (k=kstart; k<kend; k++) {
213         if (ssmap[aj[k]]) {
214           lens[i]++;
215         }
216       }
217     }
218   /* Create and fill new matrix */
219   if (scall == MAT_REUSE_MATRIX) {
220     c = (Mat_SeqSBAIJ *)((*B)->data);
221 
222     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
223     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
224     if (flag == PETSC_FALSE) {
225       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
226     }
227     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
228     C = *B;
229   } else {
230     ierr = MatCreate(A->comm,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);CHKERRQ(ierr);
231     ierr = MatSetType(C,A->type_name);CHKERRQ(ierr);
232     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
233   }
234   c = (Mat_SeqSBAIJ *)(C->data);
235   for (i=0; i<nrows; i++) {
236     row    = irow[i];
237     kstart = ai[row];
238     kend   = kstart + a->ilen[row];
239     mat_i  = c->i[i];
240     mat_j  = c->j + mat_i;
241     mat_a  = c->a + mat_i*bs2;
242     mat_ilen = c->ilen + i;
243     for (k=kstart; k<kend; k++) {
244       if ((tcol=ssmap[a->j[k]])) {
245         *mat_j++ = tcol - 1;
246         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
247         mat_a   += bs2;
248         (*mat_ilen)++;
249       }
250     }
251   }
252 
253   /* Free work space */
254   ierr = PetscFree(smap);CHKERRQ(ierr);
255   ierr = PetscFree(lens);CHKERRQ(ierr);
256   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
257   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
258 
259   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
260   *B = C;
261   PetscFunctionReturn(0);
262 }
263 
264 #undef __FUNCT__
265 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
266 int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
267 {
268   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
269   IS          is1;
270   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
271 
272   PetscFunctionBegin;
273   if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
274 
275   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
276   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
277 
278   /* Verify if the indices corespond to each element in a block
279    and form the IS with compressed IS */
280   ierr = PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);CHKERRQ(ierr);
281   iary = vary + a->mbs;
282   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
283   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
284 
285   count = 0;
286   for (i=0; i<a->mbs; i++) {
287     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
288     if (vary[i]==bs) iary[count++] = i;
289   }
290   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
291 
292   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
293   ierr = PetscFree(vary);CHKERRQ(ierr);
294 
295   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
296   ISDestroy(is1);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
302 int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
303 {
304   int ierr,i;
305 
306   PetscFunctionBegin;
307   if (scall == MAT_INITIAL_MATRIX) {
308     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
309   }
310 
311   for (i=0; i<n; i++) {
312     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
313   }
314   PetscFunctionReturn(0);
315 }
316 
317 /* -------------------------------------------------------*/
318 /* Should check that shapes of vectors and matrices match */
319 /* -------------------------------------------------------*/
320 #include "petscblaslapack.h"
321 
322 #undef __FUNCT__
323 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
324 int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
325 {
326   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
327   PetscScalar     *x,*z,*xb,x1,zero=0.0;
328   MatScalar       *v;
329   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
330 
331   PetscFunctionBegin;
332   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
333   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
335 
336   v  = a->a;
337   xb = x;
338 
339   for (i=0; i<mbs; i++) {
340     n  = ai[1] - ai[0];  /* length of i_th row of A */
341     x1 = xb[0];
342     ib = aj + *ai;
343     jmin = 0;
344     if (*ib == i) {      /* (diag of A)*x */
345       z[i] += *v++ * x[*ib++];
346       jmin++;
347     }
348     for (j=jmin; j<n; j++) {
349       cval    = *ib;
350       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
351       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
352     }
353     xb++; ai++;
354   }
355 
356   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
357   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
358   PetscLogFlops(2*(a->nz*2 - A->m) - A->m);  /* nz = (nz+m)/2 */
359   PetscFunctionReturn(0);
360 }
361 
362 #undef __FUNCT__
363 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
364 int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
365 {
366   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
367   PetscScalar      *x,*z,*xb,x1,x2,zero=0.0;
368   MatScalar        *v;
369   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
370 
371 
372   PetscFunctionBegin;
373   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
374   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
375   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
376 
377   v     = a->a;
378   xb = x;
379 
380   for (i=0; i<mbs; i++) {
381     n  = ai[1] - ai[0]; /* length of i_th block row of A */
382     x1 = xb[0]; x2 = xb[1];
383     ib = aj + *ai;
384     jmin = 0;
385     if (*ib == i){     /* (diag of A)*x */
386       z[2*i]   += v[0]*x1 + v[2]*x2;
387       z[2*i+1] += v[2]*x1 + v[3]*x2;
388       v += 4; jmin++;
389     }
390     for (j=jmin; j<n; j++) {
391       /* (strict lower triangular part of A)*x  */
392       cval       = ib[j]*2;
393       z[cval]     += v[0]*x1 + v[1]*x2;
394       z[cval+1]   += v[2]*x1 + v[3]*x2;
395       /* (strict upper triangular part of A)*x  */
396       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
397       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
398       v  += 4;
399     }
400     xb +=2; ai++;
401   }
402 
403   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
404   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
405   PetscLogFlops(8*(a->nz*2 - A->m) - A->m);
406   PetscFunctionReturn(0);
407 }
408 
409 #undef __FUNCT__
410 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
411 int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
412 {
413   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
414   PetscScalar   *x,*z,*xb,x1,x2,x3,zero=0.0;
415   MatScalar     *v;
416   int           mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
417 
418 
419   PetscFunctionBegin;
420   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
421   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
422   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
423 
424   v     = a->a;
425   xb = x;
426 
427   for (i=0; i<mbs; i++) {
428     n  = ai[1] - ai[0]; /* length of i_th block row of A */
429     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
430     ib = aj + *ai;
431     jmin = 0;
432     if (*ib == i){     /* (diag of A)*x */
433       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
434       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
435       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
436       v += 9; jmin++;
437     }
438     for (j=jmin; j<n; j++) {
439       /* (strict lower triangular part of A)*x  */
440       cval       = ib[j]*3;
441       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
442       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
443       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
444       /* (strict upper triangular part of A)*x  */
445       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
446       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
447       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
448       v  += 9;
449     }
450     xb +=3; ai++;
451   }
452 
453   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
454   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
455   PetscLogFlops(18*(a->nz*2 - A->m) - A->m);
456   PetscFunctionReturn(0);
457 }
458 
459 #undef __FUNCT__
460 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
461 int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
462 {
463   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
464   PetscScalar      *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
465   MatScalar        *v;
466   int              mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
467 
468   PetscFunctionBegin;
469   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
470   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
471   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
472 
473   v     = a->a;
474   xb = x;
475 
476   for (i=0; i<mbs; i++) {
477     n  = ai[1] - ai[0]; /* length of i_th block row of A */
478     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
479     ib = aj + *ai;
480     jmin = 0;
481     if (*ib == i){     /* (diag of A)*x */
482       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
483       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
484       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
485       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
486       v += 16; jmin++;
487     }
488     for (j=jmin; j<n; j++) {
489       /* (strict lower triangular part of A)*x  */
490       cval       = ib[j]*4;
491       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
492       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
493       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
494       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
495       /* (strict upper triangular part of A)*x  */
496       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
497       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
498       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
499       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
500       v  += 16;
501     }
502     xb +=4; ai++;
503   }
504 
505   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
506   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
507   PetscLogFlops(32*(a->nz*2 - A->m) - A->m);
508   PetscFunctionReturn(0);
509 }
510 
511 #undef __FUNCT__
512 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
513 int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
514 {
515   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
516   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
517   MatScalar       *v;
518   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
519 
520   PetscFunctionBegin;
521   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
522   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
523   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
524 
525   v     = a->a;
526   xb = x;
527 
528   for (i=0; i<mbs; i++) {
529     n  = ai[1] - ai[0]; /* length of i_th block row of A */
530     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
531     ib = aj + *ai;
532     jmin = 0;
533     if (*ib == i){      /* (diag of A)*x */
534       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
535       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
536       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
537       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
538       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
539       v += 25; jmin++;
540     }
541     for (j=jmin; j<n; j++) {
542       /* (strict lower triangular part of A)*x  */
543       cval       = ib[j]*5;
544       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
545       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
546       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
547       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
548       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
549       /* (strict upper triangular part of A)*x  */
550       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
551       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
552       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
553       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
554       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
555       v  += 25;
556     }
557     xb +=5; ai++;
558   }
559 
560   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
561   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
562   PetscLogFlops(50*(a->nz*2 - A->m) - A->m);
563   PetscFunctionReturn(0);
564 }
565 
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
569 int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
570 {
571   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
572   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
573   MatScalar       *v;
574   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
575 
576   PetscFunctionBegin;
577   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
578   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
579   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
580 
581   v     = a->a;
582   xb = x;
583 
584   for (i=0; i<mbs; i++) {
585     n  = ai[1] - ai[0]; /* length of i_th block row of A */
586     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
587     ib = aj + *ai;
588     jmin = 0;
589     if (*ib == i){      /* (diag of A)*x */
590       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
591       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
592       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
593       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
594       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
595       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
596       v += 36; jmin++;
597     }
598     for (j=jmin; j<n; j++) {
599       /* (strict lower triangular part of A)*x  */
600       cval       = ib[j]*6;
601       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
602       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
603       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
604       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
605       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
606       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
607       /* (strict upper triangular part of A)*x  */
608       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
609       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
610       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
611       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
612       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
613       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
614       v  += 36;
615     }
616     xb +=6; ai++;
617   }
618 
619   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
620   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
621   PetscLogFlops(72*(a->nz*2 - A->m) - A->m);
622   PetscFunctionReturn(0);
623 }
624 #undef __FUNCT__
625 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
626 int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
627 {
628   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
629   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
630   MatScalar       *v;
631   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
632 
633   PetscFunctionBegin;
634   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
635   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
636   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
637 
638   v     = a->a;
639   xb = x;
640 
641   for (i=0; i<mbs; i++) {
642     n  = ai[1] - ai[0]; /* length of i_th block row of A */
643     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
644     ib = aj + *ai;
645     jmin = 0;
646     if (*ib == i){      /* (diag of A)*x */
647       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
648       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
649       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
650       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
651       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
652       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
653       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
654       v += 49; jmin++;
655     }
656     for (j=jmin; j<n; j++) {
657       /* (strict lower triangular part of A)*x  */
658       cval       = ib[j]*7;
659       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
660       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
661       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
662       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
663       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
664       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
665       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
666       /* (strict upper triangular part of A)*x  */
667       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
668       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
669       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
670       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
671       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
672       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
673       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
674       v  += 49;
675     }
676     xb +=7; ai++;
677   }
678   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
679   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
680   PetscLogFlops(98*(a->nz*2 - A->m) - A->m);
681   PetscFunctionReturn(0);
682 }
683 
684 /*
685     This will not work with MatScalar == float because it calls the BLAS
686 */
687 #undef __FUNCT__
688 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
689 int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
690 {
691   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
692   PetscScalar     *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
693   MatScalar       *v;
694   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
695   int             ncols,k;
696 
697   PetscFunctionBegin;
698   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
699   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
700   ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
701 
702   aj   = a->j;
703   v    = a->a;
704   ii   = a->i;
705 
706   if (!a->mult_work) {
707     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
708   }
709   work = a->mult_work;
710 
711   for (i=0; i<mbs; i++) {
712     n     = ii[1] - ii[0]; ncols = n*bs;
713     workt = work; idx=aj+ii[0];
714 
715     /* upper triangular part */
716     for (j=0; j<n; j++) {
717       xb = x_ptr + bs*(*idx++);
718       for (k=0; k<bs; k++) workt[k] = xb[k];
719       workt += bs;
720     }
721     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
722     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
723 
724     /* strict lower triangular part */
725     idx = aj+ii[0];
726     if (*idx == i){
727       ncols -= bs; v += bs2; idx++; n--;
728     }
729 
730     if (ncols > 0){
731       workt = work;
732       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
733       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
734       for (j=0; j<n; j++) {
735         zb = z_ptr + bs*(*idx++);
736         for (k=0; k<bs; k++) zb[k] += workt[k] ;
737         workt += bs;
738       }
739     }
740     x += bs; v += n*bs2; z += bs; ii++;
741   }
742 
743   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
744   ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
745   PetscLogFlops(2*(a->nz*2 - A->m)*bs2 - A->m);
746   PetscFunctionReturn(0);
747 }
748 
749 #undef __FUNCT__
750 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
751 int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
752 {
753   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
754   PetscScalar     *x,*y,*z,*xb,x1;
755   MatScalar       *v;
756   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
757 
758   PetscFunctionBegin;
759   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
760   if (yy != xx) {
761     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
762   } else {
763     y = x;
764   }
765   if (zz != yy) {
766     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
767     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
768     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
769   } else {
770     z = y;
771   }
772 
773   v  = a->a;
774   xb = x;
775 
776   for (i=0; i<mbs; i++) {
777     n  = ai[1] - ai[0];  /* length of i_th row of A */
778     x1 = xb[0];
779     ib = aj + *ai;
780     jmin = 0;
781     if (*ib == i) {            /* (diag of A)*x */
782       z[i] += *v++ * x[*ib++]; jmin++;
783     }
784     for (j=jmin; j<n; j++) {
785       cval    = *ib;
786       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
787       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
788     }
789     xb++; ai++;
790   }
791 
792   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
793   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
794   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
795 
796   PetscLogFlops(2*(a->nz*2 - A->m));
797   PetscFunctionReturn(0);
798 }
799 
800 #undef __FUNCT__
801 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
802 int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
803 {
804   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
805   PetscScalar     *x,*y,*z,*xb,x1,x2;
806   MatScalar       *v;
807   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
808 
809   PetscFunctionBegin;
810   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
811   if (yy != xx) {
812     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
813   } else {
814     y = x;
815   }
816   if (zz != yy) {
817     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
818     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
819     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
820   } else {
821     z = y;
822   }
823 
824   v     = a->a;
825   xb = x;
826 
827   for (i=0; i<mbs; i++) {
828     n  = ai[1] - ai[0]; /* length of i_th block row of A */
829     x1 = xb[0]; x2 = xb[1];
830     ib = aj + *ai;
831     jmin = 0;
832     if (*ib == i){      /* (diag of A)*x */
833       z[2*i]   += v[0]*x1 + v[2]*x2;
834       z[2*i+1] += v[2]*x1 + v[3]*x2;
835       v += 4; jmin++;
836     }
837     for (j=jmin; j<n; j++) {
838       /* (strict lower triangular part of A)*x  */
839       cval       = ib[j]*2;
840       z[cval]     += v[0]*x1 + v[1]*x2;
841       z[cval+1]   += v[2]*x1 + v[3]*x2;
842       /* (strict upper triangular part of A)*x  */
843       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
844       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
845       v  += 4;
846     }
847     xb +=2; ai++;
848   }
849 
850   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
851   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
852   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
853 
854   PetscLogFlops(4*(a->nz*2 - A->m));
855   PetscFunctionReturn(0);
856 }
857 
858 #undef __FUNCT__
859 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
860 int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
861 {
862   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
863   PetscScalar     *x,*y,*z,*xb,x1,x2,x3;
864   MatScalar       *v;
865   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
866 
867   PetscFunctionBegin;
868   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
869   if (yy != xx) {
870     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
871   } else {
872     y = x;
873   }
874   if (zz != yy) {
875     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
876     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
877     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
878   } else {
879     z = y;
880   }
881 
882   v     = a->a;
883   xb = x;
884 
885   for (i=0; i<mbs; i++) {
886     n  = ai[1] - ai[0]; /* length of i_th block row of A */
887     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
888     ib = aj + *ai;
889     jmin = 0;
890     if (*ib == i){     /* (diag of A)*x */
891      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
892      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
893      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
894      v += 9; jmin++;
895     }
896     for (j=jmin; j<n; j++) {
897       /* (strict lower triangular part of A)*x  */
898       cval       = ib[j]*3;
899       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
900       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
901       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
902       /* (strict upper triangular part of A)*x  */
903       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
904       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
905       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
906       v  += 9;
907     }
908     xb +=3; ai++;
909   }
910 
911   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
912   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
913   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
914 
915   PetscLogFlops(18*(a->nz*2 - A->m));
916   PetscFunctionReturn(0);
917 }
918 
919 #undef __FUNCT__
920 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
921 int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
922 {
923   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
924   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4;
925   MatScalar       *v;
926   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
927 
928   PetscFunctionBegin;
929   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
930   if (yy != xx) {
931     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
932   } else {
933     y = x;
934   }
935   if (zz != yy) {
936     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
937     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
938     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
939   } else {
940     z = y;
941   }
942 
943   v     = a->a;
944   xb = x;
945 
946   for (i=0; i<mbs; i++) {
947     n  = ai[1] - ai[0]; /* length of i_th block row of A */
948     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
949     ib = aj + *ai;
950     jmin = 0;
951     if (*ib == i){      /* (diag of A)*x */
952       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
953       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
954       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
955       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
956       v += 16; jmin++;
957     }
958     for (j=jmin; j<n; j++) {
959       /* (strict lower triangular part of A)*x  */
960       cval       = ib[j]*4;
961       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
962       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
963       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
964       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
965       /* (strict upper triangular part of A)*x  */
966       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
967       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
968       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
969       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
970       v  += 16;
971     }
972     xb +=4; ai++;
973   }
974 
975   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
976   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
977   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
978 
979   PetscLogFlops(32*(a->nz*2 - A->m));
980   PetscFunctionReturn(0);
981 }
982 
983 #undef __FUNCT__
984 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
985 int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
986 {
987   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
988   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5;
989   MatScalar       *v;
990   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
991 
992   PetscFunctionBegin;
993   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
994   if (yy != xx) {
995     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
996   } else {
997     y = x;
998   }
999   if (zz != yy) {
1000     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1001     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
1002     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1003   } else {
1004     z = y;
1005   }
1006 
1007   v     = a->a;
1008   xb = x;
1009 
1010   for (i=0; i<mbs; i++) {
1011     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1012     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
1013     ib = aj + *ai;
1014     jmin = 0;
1015     if (*ib == i){      /* (diag of A)*x */
1016       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
1017       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
1018       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
1019       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
1020       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
1021       v += 25; jmin++;
1022     }
1023     for (j=jmin; j<n; j++) {
1024       /* (strict lower triangular part of A)*x  */
1025       cval       = ib[j]*5;
1026       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1027       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1028       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
1029       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
1030       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
1031       /* (strict upper triangular part of A)*x  */
1032       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
1033       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
1034       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
1035       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
1036       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
1037       v  += 25;
1038     }
1039     xb +=5; ai++;
1040   }
1041 
1042   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1043   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1044   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1045 
1046   PetscLogFlops(50*(a->nz*2 - A->m));
1047   PetscFunctionReturn(0);
1048 }
1049 #undef __FUNCT__
1050 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
1051 int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1052 {
1053   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1054   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
1055   MatScalar       *v;
1056   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1057 
1058   PetscFunctionBegin;
1059   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1060   if (yy != xx) {
1061     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1062   } else {
1063     y = x;
1064   }
1065   if (zz != yy) {
1066     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1067     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
1068     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1069   } else {
1070     z = y;
1071   }
1072 
1073   v     = a->a;
1074   xb = x;
1075 
1076   for (i=0; i<mbs; i++) {
1077     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1078     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1079     ib = aj + *ai;
1080     jmin = 0;
1081     if (*ib == i){     /* (diag of A)*x */
1082       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1083       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1084       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1085       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1086       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1087       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1088       v += 36; jmin++;
1089     }
1090     for (j=jmin; j<n; j++) {
1091       /* (strict lower triangular part of A)*x  */
1092       cval       = ib[j]*6;
1093       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1094       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1095       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1096       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1097       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1098       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1099       /* (strict upper triangular part of A)*x  */
1100       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1101       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1102       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1103       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1104       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1105       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1106       v  += 36;
1107     }
1108     xb +=6; ai++;
1109   }
1110 
1111   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1112   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1113   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1114 
1115   PetscLogFlops(72*(a->nz*2 - A->m));
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 #undef __FUNCT__
1120 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1121 int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1122 {
1123   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1124   PetscScalar     *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1125   MatScalar       *v;
1126   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
1127 
1128   PetscFunctionBegin;
1129   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1130   if (yy != xx) {
1131     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1132   } else {
1133     y = x;
1134   }
1135   if (zz != yy) {
1136     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1137     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr);
1138     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1139   } else {
1140     z = y;
1141   }
1142 
1143   v     = a->a;
1144   xb = x;
1145 
1146   for (i=0; i<mbs; i++) {
1147     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1148     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1149     ib = aj + *ai;
1150     jmin = 0;
1151     if (*ib == i){     /* (diag of A)*x */
1152       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1153       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1154       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1155       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1156       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1157       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1158       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1159       v += 49; jmin++;
1160     }
1161     for (j=jmin; j<n; j++) {
1162       /* (strict lower triangular part of A)*x  */
1163       cval       = ib[j]*7;
1164       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1165       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1166       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1167       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1168       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1169       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1170       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1171       /* (strict upper triangular part of A)*x  */
1172       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1173       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1174       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1175       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1176       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1177       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1178       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1179       v  += 49;
1180     }
1181     xb +=7; ai++;
1182   }
1183 
1184   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1185   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1186   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1187 
1188   PetscLogFlops(98*(a->nz*2 - A->m));
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 #undef __FUNCT__
1193 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1194 int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1195 {
1196   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1197   PetscScalar     *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1198   MatScalar       *v;
1199   int             ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1200   int             ncols,k;
1201 
1202   PetscFunctionBegin;
1203   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); x_ptr=x;
1204   if (yy != xx) {
1205     ierr = VecGetArrayFast(yy,&y);CHKERRQ(ierr);
1206   } else {
1207     y = x;
1208   }
1209   if (zz != yy) {
1210     /* ierr = VecCopy(yy,zz);CHKERRQ(ierr); */
1211     ierr = VecGetArrayFast(zz,&z);CHKERRQ(ierr); z_ptr=z;
1212     ierr = PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));CHKERRQ(ierr);
1213   } else {
1214     z = y;
1215   }
1216 
1217   aj   = a->j;
1218   v    = a->a;
1219   ii   = a->i;
1220 
1221   if (!a->mult_work) {
1222     ierr = PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1223   }
1224   work = a->mult_work;
1225 
1226 
1227   for (i=0; i<mbs; i++) {
1228     n     = ii[1] - ii[0]; ncols = n*bs;
1229     workt = work; idx=aj+ii[0];
1230 
1231     /* upper triangular part */
1232     for (j=0; j<n; j++) {
1233       xb = x_ptr + bs*(*idx++);
1234       for (k=0; k<bs; k++) workt[k] = xb[k];
1235       workt += bs;
1236     }
1237     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1238     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1239 
1240     /* strict lower triangular part */
1241     idx = aj+ii[0];
1242     if (*idx == i){
1243       ncols -= bs; v += bs2; idx++; n--;
1244     }
1245     if (ncols > 0){
1246       workt = work;
1247       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1248       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1249       for (j=0; j<n; j++) {
1250         zb = z_ptr + bs*(*idx++);
1251         /* idx++; */
1252         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1253         workt += bs;
1254       }
1255     }
1256 
1257     x += bs; v += n*bs2; z += bs; ii++;
1258   }
1259 
1260   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1261   if (yy != xx) ierr = VecRestoreArrayFast(yy,&y);CHKERRQ(ierr);
1262   if (zz != yy) ierr = VecRestoreArrayFast(zz,&z);CHKERRQ(ierr);
1263 
1264   PetscLogFlops(2*(a->nz*2 - A->m));
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatMultTranspose_SeqSBAIJ"
1270 int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1271 {
1272   int ierr;
1273 
1274   PetscFunctionBegin;
1275   ierr = MatMult(A,xx,zz);CHKERRQ(ierr);
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 #undef __FUNCT__
1280 #define __FUNCT__ "MatMultTransposeAdd_SeqSBAIJ"
1281 int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1282 
1283 {
1284   int ierr;
1285 
1286   PetscFunctionBegin;
1287   ierr = MatMultAdd(A,xx,yy,zz);CHKERRQ(ierr);
1288   PetscFunctionReturn(0);
1289 }
1290 
1291 #undef __FUNCT__
1292 #define __FUNCT__ "MatScale_SeqSBAIJ"
1293 int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1294 {
1295   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1296   int         one = 1,totalnz = a->bs2*a->nz;
1297 
1298   PetscFunctionBegin;
1299   BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1300   PetscLogFlops(totalnz);
1301   PetscFunctionReturn(0);
1302 }
1303 
1304 #undef __FUNCT__
1305 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1306 int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1307 {
1308   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1309   MatScalar   *v = a->a;
1310   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1311   int         i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1312   int         *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1313 
1314   PetscFunctionBegin;
1315   if (type == NORM_FROBENIUS) {
1316     for (k=0; k<mbs; k++){
1317       jmin = a->i[k]; jmax = a->i[k+1];
1318       col  = aj + jmin;
1319       if (*col == k){         /* diagonal block */
1320         for (i=0; i<bs2; i++){
1321 #if defined(PETSC_USE_COMPLEX)
1322           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1323 #else
1324           sum_diag += (*v)*(*v); v++;
1325 #endif
1326         }
1327         jmin++;
1328       }
1329       for (j=jmin; j<jmax; j++){  /* off-diagonal blocks */
1330         for (i=0; i<bs2; i++){
1331 #if defined(PETSC_USE_COMPLEX)
1332           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1333 #else
1334           sum_off += (*v)*(*v); v++;
1335 #endif
1336         }
1337       }
1338     }
1339     *norm = sqrt(sum_diag + 2*sum_off);
1340 
1341   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1342     ierr = PetscMalloc(mbs*sizeof(int),&il);CHKERRQ(ierr);
1343     ierr = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1344     ierr = PetscMalloc(bs*sizeof(PetscReal),&sum);CHKERRQ(ierr);
1345     for (i=0; i<mbs; i++) {
1346       jl[i] = mbs; il[0] = 0;
1347     }
1348 
1349     *norm = 0.0;
1350     for (k=0; k<mbs; k++) { /* k_th block row */
1351       for (j=0; j<bs; j++) sum[j]=0.0;
1352 
1353       /*-- col sum --*/
1354       i = jl[k]; /* first |A(i,k)| to be added */
1355       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1356                   at step k */
1357       while (i<mbs){
1358         nexti = jl[i];  /* next block row to be added */
1359         ik    = il[i];  /* block index of A(i,k) in the array a */
1360         for (j=0; j<bs; j++){
1361           v = a->a + ik*bs2 + j*bs;
1362           for (k1=0; k1<bs; k1++) {
1363             sum[j] += PetscAbsScalar(*v); v++;
1364           }
1365         }
1366         /* update il, jl */
1367         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1368         jmax = a->i[i+1];
1369         if (jmin < jmax){
1370           il[i] = jmin;
1371           j   = a->j[jmin];
1372           jl[i] = jl[j]; jl[j]=i;
1373         }
1374         i = nexti;
1375       }
1376 
1377       /*-- row sum --*/
1378       jmin = a->i[k]; jmax = a->i[k+1];
1379       for (i=jmin; i<jmax; i++) {
1380         for (j=0; j<bs; j++){
1381           v = a->a + i*bs2 + j;
1382           for (k1=0; k1<bs; k1++){
1383             sum[j] += PetscAbsScalar(*v);
1384             v   += bs;
1385           }
1386         }
1387       }
1388       /* add k_th block row to il, jl */
1389       col = aj+jmin;
1390       if (*col == k) jmin++;
1391       if (jmin < jmax){
1392         il[k] = jmin;
1393         j   = a->j[jmin];
1394         jl[k] = jl[j]; jl[j] = k;
1395       }
1396       for (j=0; j<bs; j++){
1397         if (sum[j] > *norm) *norm = sum[j];
1398       }
1399     }
1400     ierr = PetscFree(il);CHKERRQ(ierr);
1401     ierr = PetscFree(jl);CHKERRQ(ierr);
1402     ierr = PetscFree(sum);CHKERRQ(ierr);
1403   } else {
1404     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1405   }
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 #undef __FUNCT__
1410 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1411 int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1412 {
1413   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1414   int          ierr;
1415 
1416   PetscFunctionBegin;
1417 
1418   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1419   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1420     *flg = PETSC_FALSE;
1421     PetscFunctionReturn(0);
1422   }
1423 
1424   /* if the a->i are the same */
1425   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1426   if (*flg == PETSC_FALSE) {
1427     PetscFunctionReturn(0);
1428   }
1429 
1430   /* if a->j are the same */
1431   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);CHKERRQ(ierr);
1432   if (*flg == PETSC_FALSE) {
1433     PetscFunctionReturn(0);
1434   }
1435   /* if a->a are the same */
1436   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1437 
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 #undef __FUNCT__
1442 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1443 int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1444 {
1445   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1446   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1447   PetscScalar  *x,zero = 0.0;
1448   MatScalar    *aa,*aa_j;
1449 
1450   PetscFunctionBegin;
1451   bs   = a->bs;
1452   if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1453 
1454   aa   = a->a;
1455   ai   = a->i;
1456   aj   = a->j;
1457   ambs = a->mbs;
1458   bs2  = a->bs2;
1459 
1460   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1461   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1462   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1463   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1464   for (i=0; i<ambs; i++) {
1465     j=ai[i];
1466     if (aj[j] == i) {             /* if this is a diagonal element */
1467       row  = i*bs;
1468       aa_j = aa + j*bs2;
1469       if (A->factor && bs==1){
1470         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1471       } else {
1472         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1473       }
1474     }
1475   }
1476 
1477   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1478   PetscFunctionReturn(0);
1479 }
1480 
1481 #undef __FUNCT__
1482 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1483 int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1484 {
1485   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1486   PetscScalar  *l,*r,x,*li,*ri;
1487   MatScalar    *aa,*v;
1488   int          ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1489 
1490   PetscFunctionBegin;
1491   ai  = a->i;
1492   aj  = a->j;
1493   aa  = a->a;
1494   m   = A->m;
1495   bs  = a->bs;
1496   mbs = a->mbs;
1497   bs2 = a->bs2;
1498 
1499   if (ll != rr) {
1500     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1501   }
1502   if (ll) {
1503     ierr = VecGetArrayFast(ll,&l);CHKERRQ(ierr);
1504     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1505     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1506     for (i=0; i<mbs; i++) { /* for each block row */
1507       M  = ai[i+1] - ai[i];
1508       li = l + i*bs;
1509       v  = aa + bs2*ai[i];
1510       for (j=0; j<M; j++) { /* for each block */
1511         for (k=0; k<bs2; k++) {
1512           (*v++) *= li[k%bs];
1513         }
1514 #ifdef CONT
1515         /* will be used to replace the above loop */
1516         ri = l + bs*aj[ai[i]+j];
1517         for (k=0; k<bs; k++) { /* column value */
1518           x = ri[k];
1519           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1520         }
1521 #endif
1522 
1523       }
1524     }
1525     ierr = VecRestoreArrayFast(ll,&l);CHKERRQ(ierr);
1526     PetscLogFlops(2*a->nz);
1527   }
1528   /* will be deleted */
1529   if (rr) {
1530     ierr = VecGetArrayFast(rr,&r);CHKERRQ(ierr);
1531     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1532     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1533     for (i=0; i<mbs; i++) { /* for each block row */
1534       M  = ai[i+1] - ai[i];
1535       v  = aa + bs2*ai[i];
1536       for (j=0; j<M; j++) { /* for each block */
1537         ri = r + bs*aj[ai[i]+j];
1538         for (k=0; k<bs; k++) {
1539           x = ri[k];
1540           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1541         }
1542       }
1543     }
1544     ierr = VecRestoreArrayFast(rr,&r);CHKERRQ(ierr);
1545     PetscLogFlops(a->nz);
1546   }
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 #undef __FUNCT__
1551 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1552 int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1553 {
1554   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1555 
1556   PetscFunctionBegin;
1557   info->rows_global    = (double)A->m;
1558   info->columns_global = (double)A->m;
1559   info->rows_local     = (double)A->m;
1560   info->columns_local  = (double)A->m;
1561   info->block_size     = a->bs2;
1562   info->nz_allocated   = a->maxnz; /*num. of nonzeros in upper triangular part */
1563   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1564   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1565   info->assemblies   = A->num_ass;
1566   info->mallocs      = a->reallocs;
1567   info->memory       = A->mem;
1568   if (A->factor) {
1569     info->fill_ratio_given  = A->info.fill_ratio_given;
1570     info->fill_ratio_needed = A->info.fill_ratio_needed;
1571     info->factor_mallocs    = A->info.factor_mallocs;
1572   } else {
1573     info->fill_ratio_given  = 0;
1574     info->fill_ratio_needed = 0;
1575     info->factor_mallocs    = 0;
1576   }
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 
1581 #undef __FUNCT__
1582 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1583 int MatZeroEntries_SeqSBAIJ(Mat A)
1584 {
1585   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1586   int         ierr;
1587 
1588   PetscFunctionBegin;
1589   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 #undef __FUNCT__
1594 #define __FUNCT__ "MatGetRowMax_SeqSBAIJ"
1595 int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1596 {
1597   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1598   int          ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1599   PetscReal    atmp;
1600   MatScalar    *aa;
1601   PetscScalar  zero = 0.0,*x;
1602   int          ncols,brow,bcol,krow,kcol;
1603 
1604   PetscFunctionBegin;
1605   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1606   bs   = a->bs;
1607   aa   = a->a;
1608   ai   = a->i;
1609   aj   = a->j;
1610   mbs = a->mbs;
1611 
1612   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1613   ierr = VecGetArrayFast(v,&x);CHKERRQ(ierr);
1614   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1615   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1616   for (i=0; i<mbs; i++) {
1617     ncols = ai[1] - ai[0]; ai++;
1618     brow  = bs*i;
1619     for (j=0; j<ncols; j++){
1620       bcol = bs*(*aj);
1621       for (kcol=0; kcol<bs; kcol++){
1622         col = bcol + kcol;      /* col index */
1623         for (krow=0; krow<bs; krow++){
1624           atmp = PetscAbsScalar(*aa); aa++;
1625           row = brow + krow;    /* row index */
1626           /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1627           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1628           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1629         }
1630       }
1631       aj++;
1632     }
1633   }
1634   ierr = VecRestoreArrayFast(v,&x);CHKERRQ(ierr);
1635   PetscFunctionReturn(0);
1636 }
1637