xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 49b5e25f194e2f2348d54770c8c9996e6dec7aec)
1*49b5e25fSSatish Balay /*$Id: SBAIJ2.c,v 1.55 2000/01/11 21:00:52 bsmith Exp $*/
2*49b5e25fSSatish Balay 
3*49b5e25fSSatish Balay #include "petscsys.h"
4*49b5e25fSSatish Balay #include "src/mat/impls/baij/seq/baij.h"
5*49b5e25fSSatish Balay #include "src/vec/vecimpl.h"
6*49b5e25fSSatish Balay #include "src/inline/spops.h"
7*49b5e25fSSatish Balay #include "src/inline/ilu.h"
8*49b5e25fSSatish Balay #include "petscbt.h"
9*49b5e25fSSatish Balay #include "sbaij.h"
10*49b5e25fSSatish Balay 
11*49b5e25fSSatish Balay #undef __FUNC__
12*49b5e25fSSatish Balay #define __FUNC__ "MatIncreaseOverlap_SeqSBAIJ"
13*49b5e25fSSatish Balay int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
14*49b5e25fSSatish Balay {
15*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16*49b5e25fSSatish Balay   int         row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
17*49b5e25fSSatish Balay   int         start,end,*ai,*aj,bs,*nidx2;
18*49b5e25fSSatish Balay   PetscBT     table;
19*49b5e25fSSatish Balay 
20*49b5e25fSSatish Balay   PetscFunctionBegin;
21*49b5e25fSSatish Balay   m     = a->mbs;
22*49b5e25fSSatish Balay   ai    = a->i;
23*49b5e25fSSatish Balay   aj    = a->j;
24*49b5e25fSSatish Balay   bs    = a->bs;
25*49b5e25fSSatish Balay 
26*49b5e25fSSatish Balay   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
27*49b5e25fSSatish Balay 
28*49b5e25fSSatish Balay   ierr  = PetscBTCreate(m,table);CHKERRQ(ierr);
29*49b5e25fSSatish Balay   nidx  = (int*)PetscMalloc((m+1)*sizeof(int));CHKPTRQ(nidx);
30*49b5e25fSSatish Balay   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int));CHKPTRQ(nidx2);
31*49b5e25fSSatish Balay 
32*49b5e25fSSatish Balay   for (i=0; i<is_max; i++) {
33*49b5e25fSSatish Balay     /* Initialise the two local arrays */
34*49b5e25fSSatish Balay     isz  = 0;
35*49b5e25fSSatish Balay     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
36*49b5e25fSSatish Balay 
37*49b5e25fSSatish Balay     /* Extract the indices, assume there can be duplicate entries */
38*49b5e25fSSatish Balay     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
39*49b5e25fSSatish Balay     ierr = ISGetSize(is[i],&n);CHKERRQ(ierr);
40*49b5e25fSSatish Balay 
41*49b5e25fSSatish Balay     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42*49b5e25fSSatish Balay     for (j=0; j<n ; ++j){
43*49b5e25fSSatish Balay       ival = idx[j]/bs; /* convert the indices into block indices */
44*49b5e25fSSatish Balay       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
45*49b5e25fSSatish Balay       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
46*49b5e25fSSatish Balay     }
47*49b5e25fSSatish Balay     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48*49b5e25fSSatish Balay     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
49*49b5e25fSSatish Balay     k = 0;
50*49b5e25fSSatish Balay     for (j=0; j<ov; j++){ /* for each overlap*/
51*49b5e25fSSatish Balay       n = isz;
52*49b5e25fSSatish Balay       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53*49b5e25fSSatish Balay         row   = nidx[k];
54*49b5e25fSSatish Balay         start = ai[row];
55*49b5e25fSSatish Balay         end   = ai[row+1];
56*49b5e25fSSatish Balay         for (l = start; l<end ; l++){
57*49b5e25fSSatish Balay           val = aj[l];
58*49b5e25fSSatish Balay           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59*49b5e25fSSatish Balay         }
60*49b5e25fSSatish Balay       }
61*49b5e25fSSatish Balay     }
62*49b5e25fSSatish Balay     /* expand the Index Set */
63*49b5e25fSSatish Balay     for (j=0; j<isz; j++) {
64*49b5e25fSSatish Balay       for (k=0; k<bs; k++)
65*49b5e25fSSatish Balay         nidx2[j*bs+k] = nidx[j]*bs+k;
66*49b5e25fSSatish Balay     }
67*49b5e25fSSatish Balay     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);CHKERRQ(ierr);
68*49b5e25fSSatish Balay   }
69*49b5e25fSSatish Balay   ierr = PetscBTDestroy(table);CHKERRQ(ierr);
70*49b5e25fSSatish Balay   ierr = PetscFree(nidx);CHKERRQ(ierr);
71*49b5e25fSSatish Balay   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72*49b5e25fSSatish Balay   PetscFunctionReturn(0);
73*49b5e25fSSatish Balay }
74*49b5e25fSSatish Balay 
75*49b5e25fSSatish Balay #undef __FUNC__
76*49b5e25fSSatish Balay #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ_Private"
77*49b5e25fSSatish Balay int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
78*49b5e25fSSatish Balay {
79*49b5e25fSSatish Balay   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
80*49b5e25fSSatish Balay   int          *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
81*49b5e25fSSatish Balay   int          row,mat_i,*mat_j,tcol,*mat_ilen;
82*49b5e25fSSatish Balay   int          *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
83*49b5e25fSSatish Balay   int          *aj = a->j,*ai = a->i;
84*49b5e25fSSatish Balay   MatScalar    *mat_a;
85*49b5e25fSSatish Balay   Mat          C;
86*49b5e25fSSatish Balay   PetscTruth   flag;
87*49b5e25fSSatish Balay 
88*49b5e25fSSatish Balay   PetscFunctionBegin;
89*49b5e25fSSatish Balay 
90*49b5e25fSSatish Balay   if ( isrow != iscol ) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
91*49b5e25fSSatish Balay   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
92*49b5e25fSSatish Balay   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
93*49b5e25fSSatish Balay 
94*49b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
95*49b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
96*49b5e25fSSatish Balay 
97*49b5e25fSSatish Balay   smap  = (int*)PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
98*49b5e25fSSatish Balay   ssmap = smap;
99*49b5e25fSSatish Balay   lens  = (int*)PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
100*49b5e25fSSatish Balay   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
101*49b5e25fSSatish Balay   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
102*49b5e25fSSatish Balay   /* determine lens of each row */
103*49b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
104*49b5e25fSSatish Balay     kstart  = ai[irow[i]];
105*49b5e25fSSatish Balay     kend    = kstart + a->ilen[irow[i]];
106*49b5e25fSSatish Balay     lens[i] = 0;
107*49b5e25fSSatish Balay       for (k=kstart; k<kend; k++) {
108*49b5e25fSSatish Balay         if (ssmap[aj[k]]) {
109*49b5e25fSSatish Balay           lens[i]++;
110*49b5e25fSSatish Balay         }
111*49b5e25fSSatish Balay       }
112*49b5e25fSSatish Balay     }
113*49b5e25fSSatish Balay   /* Create and fill new matrix */
114*49b5e25fSSatish Balay   if (scall == MAT_REUSE_MATRIX) {
115*49b5e25fSSatish Balay     c = (Mat_SeqSBAIJ *)((*B)->data);
116*49b5e25fSSatish Balay 
117*49b5e25fSSatish Balay     if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
118*49b5e25fSSatish Balay     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);CHKERRQ(ierr);
119*49b5e25fSSatish Balay     if (flag == PETSC_FALSE) {
120*49b5e25fSSatish Balay       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
121*49b5e25fSSatish Balay     }
122*49b5e25fSSatish Balay     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
123*49b5e25fSSatish Balay     C = *B;
124*49b5e25fSSatish Balay   } else {
125*49b5e25fSSatish Balay     ierr = MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);CHKERRQ(ierr);
126*49b5e25fSSatish Balay   }
127*49b5e25fSSatish Balay   c = (Mat_SeqSBAIJ *)(C->data);
128*49b5e25fSSatish Balay   for (i=0; i<nrows; i++) {
129*49b5e25fSSatish Balay     row    = irow[i];
130*49b5e25fSSatish Balay     kstart = ai[row];
131*49b5e25fSSatish Balay     kend   = kstart + a->ilen[row];
132*49b5e25fSSatish Balay     mat_i  = c->i[i];
133*49b5e25fSSatish Balay     mat_j  = c->j + mat_i;
134*49b5e25fSSatish Balay     mat_a  = c->a + mat_i*bs2;
135*49b5e25fSSatish Balay     mat_ilen = c->ilen + i;
136*49b5e25fSSatish Balay     for (k=kstart; k<kend; k++) {
137*49b5e25fSSatish Balay       if ((tcol=ssmap[a->j[k]])) {
138*49b5e25fSSatish Balay         *mat_j++ = tcol - 1;
139*49b5e25fSSatish Balay         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
140*49b5e25fSSatish Balay         mat_a   += bs2;
141*49b5e25fSSatish Balay         (*mat_ilen)++;
142*49b5e25fSSatish Balay       }
143*49b5e25fSSatish Balay     }
144*49b5e25fSSatish Balay   }
145*49b5e25fSSatish Balay 
146*49b5e25fSSatish Balay   /* Free work space */
147*49b5e25fSSatish Balay   ierr = PetscFree(smap);CHKERRQ(ierr);
148*49b5e25fSSatish Balay   ierr = PetscFree(lens);CHKERRQ(ierr);
149*49b5e25fSSatish Balay   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
150*49b5e25fSSatish Balay   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
151*49b5e25fSSatish Balay 
152*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
153*49b5e25fSSatish Balay   *B = C;
154*49b5e25fSSatish Balay   PetscFunctionReturn(0);
155*49b5e25fSSatish Balay }
156*49b5e25fSSatish Balay 
157*49b5e25fSSatish Balay #undef __FUNC__
158*49b5e25fSSatish Balay #define __FUNC__ "MatGetSubMatrix_SeqSBAIJ"
159*49b5e25fSSatish Balay int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
160*49b5e25fSSatish Balay {
161*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
162*49b5e25fSSatish Balay   IS          is1;
163*49b5e25fSSatish Balay   int         *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
164*49b5e25fSSatish Balay 
165*49b5e25fSSatish Balay   PetscFunctionBegin;
166*49b5e25fSSatish Balay   if ( isrow != iscol ) SETERRA(1,0,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
167*49b5e25fSSatish Balay 
168*49b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
169*49b5e25fSSatish Balay   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
170*49b5e25fSSatish Balay 
171*49b5e25fSSatish Balay   /* Verify if the indices corespond to each element in a block
172*49b5e25fSSatish Balay    and form the IS with compressed IS */
173*49b5e25fSSatish Balay   vary = (int*)PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
174*49b5e25fSSatish Balay   iary = vary + a->mbs;
175*49b5e25fSSatish Balay   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
176*49b5e25fSSatish Balay   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
177*49b5e25fSSatish Balay 
178*49b5e25fSSatish Balay   count = 0;
179*49b5e25fSSatish Balay   for (i=0; i<a->mbs; i++) {
180*49b5e25fSSatish Balay     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
181*49b5e25fSSatish Balay     if (vary[i]==bs) iary[count++] = i;
182*49b5e25fSSatish Balay   }
183*49b5e25fSSatish Balay   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);CHKERRQ(ierr);
184*49b5e25fSSatish Balay 
185*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
186*49b5e25fSSatish Balay   ierr = PetscFree(vary);CHKERRQ(ierr);
187*49b5e25fSSatish Balay 
188*49b5e25fSSatish Balay   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);CHKERRQ(ierr);
189*49b5e25fSSatish Balay   ISDestroy(is1);
190*49b5e25fSSatish Balay   PetscFunctionReturn(0);
191*49b5e25fSSatish Balay }
192*49b5e25fSSatish Balay 
193*49b5e25fSSatish Balay #undef __FUNC__
194*49b5e25fSSatish Balay #define __FUNC__ "MatGetSubMatrices_SeqSBAIJ"
195*49b5e25fSSatish Balay int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
196*49b5e25fSSatish Balay {
197*49b5e25fSSatish Balay   int ierr,i;
198*49b5e25fSSatish Balay 
199*49b5e25fSSatish Balay   PetscFunctionBegin;
200*49b5e25fSSatish Balay   if (scall == MAT_INITIAL_MATRIX) {
201*49b5e25fSSatish Balay     *B = (Mat*)PetscMalloc((n+1)*sizeof(Mat));CHKPTRQ(*B);
202*49b5e25fSSatish Balay   }
203*49b5e25fSSatish Balay 
204*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
205*49b5e25fSSatish Balay     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
206*49b5e25fSSatish Balay   }
207*49b5e25fSSatish Balay   PetscFunctionReturn(0);
208*49b5e25fSSatish Balay }
209*49b5e25fSSatish Balay 
210*49b5e25fSSatish Balay /* -------------------------------------------------------*/
211*49b5e25fSSatish Balay /* Should check that shapes of vectors and matrices match */
212*49b5e25fSSatish Balay /* -------------------------------------------------------*/
213*49b5e25fSSatish Balay #include "pinclude/blaslapack.h"
214*49b5e25fSSatish Balay 
215*49b5e25fSSatish Balay #undef __FUNC__
216*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_1"
217*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
218*49b5e25fSSatish Balay {
219*49b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
220*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,zero=0.0;
221*49b5e25fSSatish Balay   MatScalar       *v;
222*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
223*49b5e25fSSatish Balay 
224*49b5e25fSSatish Balay   PetscFunctionBegin;
225*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
226*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
227*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
228*49b5e25fSSatish Balay 
229*49b5e25fSSatish Balay   v  = a->a;
230*49b5e25fSSatish Balay   xb = x;
231*49b5e25fSSatish Balay 
232*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
233*49b5e25fSSatish Balay     n  = ai[1] - ai[0];  /* length of i_th row of A */
234*49b5e25fSSatish Balay     x1 = xb[0];
235*49b5e25fSSatish Balay     ib = aj + *ai;
236*49b5e25fSSatish Balay     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
237*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
238*49b5e25fSSatish Balay       cval    = *ib;
239*49b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
240*49b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
241*49b5e25fSSatish Balay     }
242*49b5e25fSSatish Balay     xb++; ai++;
243*49b5e25fSSatish Balay   }
244*49b5e25fSSatish Balay 
245*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
246*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
247*49b5e25fSSatish Balay   PLogFlops(2*(a->s_nz*2 - a->m) - a->m);  /* s_nz = (nz+m)/2 */
248*49b5e25fSSatish Balay   PetscFunctionReturn(0);
249*49b5e25fSSatish Balay }
250*49b5e25fSSatish Balay 
251*49b5e25fSSatish Balay #undef __FUNC__
252*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_2"
253*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
254*49b5e25fSSatish Balay {
255*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
256*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,x2,zero=0.0;
257*49b5e25fSSatish Balay   MatScalar       *v;
258*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
259*49b5e25fSSatish Balay 
260*49b5e25fSSatish Balay 
261*49b5e25fSSatish Balay   PetscFunctionBegin;
262*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
263*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
264*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
265*49b5e25fSSatish Balay 
266*49b5e25fSSatish Balay   v     = a->a;
267*49b5e25fSSatish Balay   xb = x;
268*49b5e25fSSatish Balay 
269*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
270*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
271*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1];
272*49b5e25fSSatish Balay     ib = aj + *ai;
273*49b5e25fSSatish Balay     /* (diag of A)*x */
274*49b5e25fSSatish Balay     z[2*i]   += v[0]*x1 + v[2]*x2;
275*49b5e25fSSatish Balay     z[2*i+1] += v[2]*x1 + v[3]*x2;
276*49b5e25fSSatish Balay     v += 4;
277*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
278*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
279*49b5e25fSSatish Balay       cval       = ib[j]*2;
280*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2;
281*49b5e25fSSatish Balay       z[cval+1]   += v[2]*x1 + v[3]*x2;
282*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
283*49b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
284*49b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
285*49b5e25fSSatish Balay       v  += 4;
286*49b5e25fSSatish Balay     }
287*49b5e25fSSatish Balay     xb +=2; ai++;
288*49b5e25fSSatish Balay   }
289*49b5e25fSSatish Balay 
290*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
291*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
292*49b5e25fSSatish Balay   PLogFlops(8*(a->s_nz*2 - a->m) - a->m);
293*49b5e25fSSatish Balay   PetscFunctionReturn(0);
294*49b5e25fSSatish Balay }
295*49b5e25fSSatish Balay 
296*49b5e25fSSatish Balay #undef __FUNC__
297*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_3"
298*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
299*49b5e25fSSatish Balay {
300*49b5e25fSSatish Balay   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
301*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,x2,x3,zero=0.0;
302*49b5e25fSSatish Balay   MatScalar       *v;
303*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
304*49b5e25fSSatish Balay 
305*49b5e25fSSatish Balay 
306*49b5e25fSSatish Balay   PetscFunctionBegin;
307*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
308*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
309*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
310*49b5e25fSSatish Balay 
311*49b5e25fSSatish Balay   v     = a->a;
312*49b5e25fSSatish Balay   xb = x;
313*49b5e25fSSatish Balay 
314*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
315*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
316*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
317*49b5e25fSSatish Balay     ib = aj + *ai;
318*49b5e25fSSatish Balay     /* (diag of A)*x */
319*49b5e25fSSatish Balay     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
320*49b5e25fSSatish Balay     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
321*49b5e25fSSatish Balay     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
322*49b5e25fSSatish Balay     v += 9;
323*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
324*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
325*49b5e25fSSatish Balay       cval       = ib[j]*3;
326*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
327*49b5e25fSSatish Balay       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
328*49b5e25fSSatish Balay       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
329*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
330*49b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
331*49b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
332*49b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
333*49b5e25fSSatish Balay       v  += 9;
334*49b5e25fSSatish Balay     }
335*49b5e25fSSatish Balay     xb +=3; ai++;
336*49b5e25fSSatish Balay   }
337*49b5e25fSSatish Balay 
338*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
339*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
340*49b5e25fSSatish Balay   PLogFlops(18*(a->s_nz*2 - a->m) - a->m);
341*49b5e25fSSatish Balay   PetscFunctionReturn(0);
342*49b5e25fSSatish Balay }
343*49b5e25fSSatish Balay 
344*49b5e25fSSatish Balay #undef __FUNC__
345*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_4"
346*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
347*49b5e25fSSatish Balay {
348*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
349*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
350*49b5e25fSSatish Balay   MatScalar       *v;
351*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
352*49b5e25fSSatish Balay 
353*49b5e25fSSatish Balay   PetscFunctionBegin;
354*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
355*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
356*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
357*49b5e25fSSatish Balay 
358*49b5e25fSSatish Balay   v     = a->a;
359*49b5e25fSSatish Balay   xb = x;
360*49b5e25fSSatish Balay 
361*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
362*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
363*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
364*49b5e25fSSatish Balay     ib = aj + *ai;
365*49b5e25fSSatish Balay     /* (diag of A)*x */
366*49b5e25fSSatish Balay     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
367*49b5e25fSSatish Balay     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
368*49b5e25fSSatish Balay     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
369*49b5e25fSSatish Balay     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
370*49b5e25fSSatish Balay     v += 16;
371*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
372*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
373*49b5e25fSSatish Balay       cval       = ib[j]*4;
374*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
375*49b5e25fSSatish Balay       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
376*49b5e25fSSatish Balay       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
377*49b5e25fSSatish Balay       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
378*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
379*49b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
380*49b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
381*49b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
382*49b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
383*49b5e25fSSatish Balay       v  += 16;
384*49b5e25fSSatish Balay     }
385*49b5e25fSSatish Balay     xb +=4; ai++;
386*49b5e25fSSatish Balay   }
387*49b5e25fSSatish Balay 
388*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
389*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
390*49b5e25fSSatish Balay   PLogFlops(32*(a->s_nz*2 - a->m) - a->m);
391*49b5e25fSSatish Balay   PetscFunctionReturn(0);
392*49b5e25fSSatish Balay }
393*49b5e25fSSatish Balay 
394*49b5e25fSSatish Balay #undef __FUNC__
395*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_5"
396*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
397*49b5e25fSSatish Balay {
398*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
399*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
400*49b5e25fSSatish Balay   MatScalar       *v;
401*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
402*49b5e25fSSatish Balay 
403*49b5e25fSSatish Balay   PetscFunctionBegin;
404*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
405*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
406*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
407*49b5e25fSSatish Balay 
408*49b5e25fSSatish Balay   v     = a->a;
409*49b5e25fSSatish Balay   xb = x;
410*49b5e25fSSatish Balay 
411*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
412*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
413*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
414*49b5e25fSSatish Balay     ib = aj + *ai;
415*49b5e25fSSatish Balay     /* (diag of A)*x */
416*49b5e25fSSatish Balay     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
417*49b5e25fSSatish Balay     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
418*49b5e25fSSatish Balay     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
419*49b5e25fSSatish Balay     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
420*49b5e25fSSatish Balay     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
421*49b5e25fSSatish Balay     v += 25;
422*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
423*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
424*49b5e25fSSatish Balay       cval       = ib[j]*5;
425*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
426*49b5e25fSSatish Balay       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
427*49b5e25fSSatish Balay       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
428*49b5e25fSSatish Balay       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
429*49b5e25fSSatish Balay       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
430*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
431*49b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
432*49b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
433*49b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
434*49b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
435*49b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
436*49b5e25fSSatish Balay       v  += 25;
437*49b5e25fSSatish Balay     }
438*49b5e25fSSatish Balay     xb +=5; ai++;
439*49b5e25fSSatish Balay   }
440*49b5e25fSSatish Balay 
441*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
442*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
443*49b5e25fSSatish Balay   PLogFlops(50*(a->s_nz*2 - a->m) - a->m);
444*49b5e25fSSatish Balay   PetscFunctionReturn(0);
445*49b5e25fSSatish Balay }
446*49b5e25fSSatish Balay 
447*49b5e25fSSatish Balay 
448*49b5e25fSSatish Balay #undef __FUNC__
449*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_6"
450*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
451*49b5e25fSSatish Balay {
452*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
453*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
454*49b5e25fSSatish Balay   MatScalar       *v;
455*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
456*49b5e25fSSatish Balay 
457*49b5e25fSSatish Balay   PetscFunctionBegin;
458*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
459*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
460*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
461*49b5e25fSSatish Balay 
462*49b5e25fSSatish Balay   v     = a->a;
463*49b5e25fSSatish Balay   xb = x;
464*49b5e25fSSatish Balay 
465*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
466*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
467*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
468*49b5e25fSSatish Balay     ib = aj + *ai;
469*49b5e25fSSatish Balay     /* (diag of A)*x */
470*49b5e25fSSatish Balay     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
471*49b5e25fSSatish Balay     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
472*49b5e25fSSatish Balay     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
473*49b5e25fSSatish Balay     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
474*49b5e25fSSatish Balay     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
475*49b5e25fSSatish Balay     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
476*49b5e25fSSatish Balay     v += 36;
477*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
478*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
479*49b5e25fSSatish Balay       cval       = ib[j]*6;
480*49b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
481*49b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
482*49b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
483*49b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
484*49b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
485*49b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
486*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
487*49b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
488*49b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
489*49b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
490*49b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
491*49b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
492*49b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
493*49b5e25fSSatish Balay       v  += 36;
494*49b5e25fSSatish Balay     }
495*49b5e25fSSatish Balay     xb +=6; ai++;
496*49b5e25fSSatish Balay   }
497*49b5e25fSSatish Balay 
498*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
499*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
500*49b5e25fSSatish Balay   PLogFlops(72*(a->s_nz*2 - a->m) - a->m);
501*49b5e25fSSatish Balay   PetscFunctionReturn(0);
502*49b5e25fSSatish Balay }
503*49b5e25fSSatish Balay #undef __FUNC__
504*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_7"
505*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
506*49b5e25fSSatish Balay {
507*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
508*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
509*49b5e25fSSatish Balay   MatScalar       *v;
510*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
511*49b5e25fSSatish Balay 
512*49b5e25fSSatish Balay   PetscFunctionBegin;
513*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
514*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
516*49b5e25fSSatish Balay 
517*49b5e25fSSatish Balay   v     = a->a;
518*49b5e25fSSatish Balay   xb = x;
519*49b5e25fSSatish Balay 
520*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
521*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
522*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
523*49b5e25fSSatish Balay     ib = aj + *ai;
524*49b5e25fSSatish Balay     /* (diag of A)*x */
525*49b5e25fSSatish Balay     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
526*49b5e25fSSatish Balay     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
527*49b5e25fSSatish Balay     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
528*49b5e25fSSatish Balay     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
529*49b5e25fSSatish Balay     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
530*49b5e25fSSatish Balay     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
531*49b5e25fSSatish Balay     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
532*49b5e25fSSatish Balay     v += 49;
533*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
534*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
535*49b5e25fSSatish Balay       cval       = ib[j]*7;
536*49b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
537*49b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
538*49b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
539*49b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
540*49b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
541*49b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
542*49b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
543*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
544*49b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
545*49b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
546*49b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
547*49b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
548*49b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
549*49b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
550*49b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
551*49b5e25fSSatish Balay       v  += 49;
552*49b5e25fSSatish Balay     }
553*49b5e25fSSatish Balay     xb +=7; ai++;
554*49b5e25fSSatish Balay   }
555*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
556*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
557*49b5e25fSSatish Balay   PLogFlops(98*(a->s_nz*2 - a->m) - a->m);
558*49b5e25fSSatish Balay   PetscFunctionReturn(0);
559*49b5e25fSSatish Balay }
560*49b5e25fSSatish Balay 
561*49b5e25fSSatish Balay /*
562*49b5e25fSSatish Balay     This will not work with MatScalar == float because it calls the BLAS
563*49b5e25fSSatish Balay */
564*49b5e25fSSatish Balay #undef __FUNC__
565*49b5e25fSSatish Balay #define __FUNC__ "MatMult_SeqSBAIJ_N"
566*49b5e25fSSatish Balay int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
567*49b5e25fSSatish Balay {
568*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
569*49b5e25fSSatish Balay   Scalar          *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,*diag_ptr,zero=0.0;
570*49b5e25fSSatish Balay   MatScalar       *v,*vwk;
571*49b5e25fSSatish Balay   int             ierr,mbs=a->mbs,i,*idx,*idxt,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
572*49b5e25fSSatish Balay   int             ncols,k,col;
573*49b5e25fSSatish Balay 
574*49b5e25fSSatish Balay   PetscFunctionBegin;
575*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
576*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
577*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
578*49b5e25fSSatish Balay 
579*49b5e25fSSatish Balay   aj   = a->j;
580*49b5e25fSSatish Balay   v    = a->a;
581*49b5e25fSSatish Balay   ii   = a->i;
582*49b5e25fSSatish Balay 
583*49b5e25fSSatish Balay   if (!a->mult_work) {
584*49b5e25fSSatish Balay     k = a->m;
585*49b5e25fSSatish Balay     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
586*49b5e25fSSatish Balay   }
587*49b5e25fSSatish Balay   work = a->mult_work;
588*49b5e25fSSatish Balay 
589*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
590*49b5e25fSSatish Balay     n     = ii[1] - ii[0]; ncols = n*bs;
591*49b5e25fSSatish Balay     workt = work; idx=aj+ii[0];
592*49b5e25fSSatish Balay 
593*49b5e25fSSatish Balay     /* upper triangular part */
594*49b5e25fSSatish Balay     for (j=0; j<n; j++) {
595*49b5e25fSSatish Balay       col = bs*(*idx);
596*49b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
597*49b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
598*49b5e25fSSatish Balay       workt += bs;
599*49b5e25fSSatish Balay     }
600*49b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
601*49b5e25fSSatish Balay     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
602*49b5e25fSSatish Balay 
603*49b5e25fSSatish Balay     /* strict lower triangular part */
604*49b5e25fSSatish Balay     ncols -= bs;
605*49b5e25fSSatish Balay     if (ncols > 0){
606*49b5e25fSSatish Balay       workt = work;
607*49b5e25fSSatish Balay       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
608*49b5e25fSSatish Balay       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
609*49b5e25fSSatish Balay 
610*49b5e25fSSatish Balay       idx=aj+ii[0]+1;
611*49b5e25fSSatish Balay       for (j=1; j<n; j++) {
612*49b5e25fSSatish Balay         zb = z_ptr + bs*(*idx);
613*49b5e25fSSatish Balay         idx++;
614*49b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k] ;
615*49b5e25fSSatish Balay         workt += bs;
616*49b5e25fSSatish Balay       }
617*49b5e25fSSatish Balay     }
618*49b5e25fSSatish Balay 
619*49b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
620*49b5e25fSSatish Balay   }
621*49b5e25fSSatish Balay 
622*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
623*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
624*49b5e25fSSatish Balay   PLogFlops(2*(a->s_nz*2 - a->m)*bs2 - a->m);
625*49b5e25fSSatish Balay   PetscFunctionReturn(0);
626*49b5e25fSSatish Balay }
627*49b5e25fSSatish Balay 
628*49b5e25fSSatish Balay #undef __FUNC__
629*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_1"
630*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
631*49b5e25fSSatish Balay {
632*49b5e25fSSatish Balay   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
633*49b5e25fSSatish Balay   Scalar          *x,*y,*z,*xb,x1;
634*49b5e25fSSatish Balay   MatScalar       *v;
635*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
636*49b5e25fSSatish Balay 
637*49b5e25fSSatish Balay   PetscFunctionBegin;
638*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
639*49b5e25fSSatish Balay   if (yy != xx) {
640*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
641*49b5e25fSSatish Balay   } else {
642*49b5e25fSSatish Balay     y = x;
643*49b5e25fSSatish Balay   }
644*49b5e25fSSatish Balay   if (zz != yy) {
645*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
646*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
647*49b5e25fSSatish Balay   } else {
648*49b5e25fSSatish Balay     z = y;
649*49b5e25fSSatish Balay   }
650*49b5e25fSSatish Balay 
651*49b5e25fSSatish Balay   v  = a->a;
652*49b5e25fSSatish Balay   xb = x;
653*49b5e25fSSatish Balay 
654*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
655*49b5e25fSSatish Balay     n  = ai[1] - ai[0];  /* length of i_th row of A */
656*49b5e25fSSatish Balay     x1 = xb[0];
657*49b5e25fSSatish Balay     ib = aj + *ai;
658*49b5e25fSSatish Balay     z[i] += *v++ * x[*ib++];   /* (diag of A)*x */
659*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
660*49b5e25fSSatish Balay       cval    = *ib;
661*49b5e25fSSatish Balay       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
662*49b5e25fSSatish Balay       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
663*49b5e25fSSatish Balay     }
664*49b5e25fSSatish Balay     xb++; ai++;
665*49b5e25fSSatish Balay   }
666*49b5e25fSSatish Balay 
667*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
668*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
669*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
670*49b5e25fSSatish Balay 
671*49b5e25fSSatish Balay   PLogFlops(2*(a->s_nz*2 - a->m));
672*49b5e25fSSatish Balay   PetscFunctionReturn(0);
673*49b5e25fSSatish Balay }
674*49b5e25fSSatish Balay 
675*49b5e25fSSatish Balay #undef __FUNC__
676*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_2"
677*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
678*49b5e25fSSatish Balay {
679*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
680*49b5e25fSSatish Balay   Scalar          *x,*y,*z,*xb,x1,x2;
681*49b5e25fSSatish Balay   MatScalar       *v;
682*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
683*49b5e25fSSatish Balay 
684*49b5e25fSSatish Balay   PetscFunctionBegin;
685*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
686*49b5e25fSSatish Balay   if (yy != xx) {
687*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
688*49b5e25fSSatish Balay   } else {
689*49b5e25fSSatish Balay     y = x;
690*49b5e25fSSatish Balay   }
691*49b5e25fSSatish Balay   if (zz != yy) {
692*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
693*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
694*49b5e25fSSatish Balay   } else {
695*49b5e25fSSatish Balay     z = y;
696*49b5e25fSSatish Balay   }
697*49b5e25fSSatish Balay 
698*49b5e25fSSatish Balay   v     = a->a;
699*49b5e25fSSatish Balay   xb = x;
700*49b5e25fSSatish Balay 
701*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
702*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
703*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1];
704*49b5e25fSSatish Balay     ib = aj + *ai;
705*49b5e25fSSatish Balay     /* (diag of A)*x */
706*49b5e25fSSatish Balay     z[2*i]   += v[0]*x1 + v[2]*x2;
707*49b5e25fSSatish Balay     z[2*i+1] += v[2]*x1 + v[3]*x2;
708*49b5e25fSSatish Balay     v += 4;
709*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
710*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
711*49b5e25fSSatish Balay       cval       = ib[j]*2;
712*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2;
713*49b5e25fSSatish Balay       z[cval+1]   += v[2]*x1 + v[3]*x2;
714*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
715*49b5e25fSSatish Balay       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
716*49b5e25fSSatish Balay       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
717*49b5e25fSSatish Balay       v  += 4;
718*49b5e25fSSatish Balay     }
719*49b5e25fSSatish Balay     xb +=2; ai++;
720*49b5e25fSSatish Balay   }
721*49b5e25fSSatish Balay 
722*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
723*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
724*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
725*49b5e25fSSatish Balay 
726*49b5e25fSSatish Balay   PLogFlops(4*(a->s_nz*2 - a->m));
727*49b5e25fSSatish Balay   PetscFunctionReturn(0);
728*49b5e25fSSatish Balay }
729*49b5e25fSSatish Balay 
730*49b5e25fSSatish Balay #undef __FUNC__
731*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_3"
732*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
733*49b5e25fSSatish Balay {
734*49b5e25fSSatish Balay   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data;
735*49b5e25fSSatish Balay   Scalar          *x,*y,*z,*xb,x1,x2,x3;
736*49b5e25fSSatish Balay   MatScalar       *v;
737*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
738*49b5e25fSSatish Balay 
739*49b5e25fSSatish Balay   PetscFunctionBegin;
740*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
741*49b5e25fSSatish Balay   if (yy != xx) {
742*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
743*49b5e25fSSatish Balay   } else {
744*49b5e25fSSatish Balay     y = x;
745*49b5e25fSSatish Balay   }
746*49b5e25fSSatish Balay   if (zz != yy) {
747*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
748*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
749*49b5e25fSSatish Balay   } else {
750*49b5e25fSSatish Balay     z = y;
751*49b5e25fSSatish Balay   }
752*49b5e25fSSatish Balay 
753*49b5e25fSSatish Balay   v     = a->a;
754*49b5e25fSSatish Balay   xb = x;
755*49b5e25fSSatish Balay 
756*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
757*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
758*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
759*49b5e25fSSatish Balay     ib = aj + *ai;
760*49b5e25fSSatish Balay     /* (diag of A)*x */
761*49b5e25fSSatish Balay     z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
762*49b5e25fSSatish Balay     z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
763*49b5e25fSSatish Balay     z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
764*49b5e25fSSatish Balay     v += 9;
765*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
766*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
767*49b5e25fSSatish Balay       cval       = ib[j]*3;
768*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
769*49b5e25fSSatish Balay       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
770*49b5e25fSSatish Balay       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
771*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
772*49b5e25fSSatish Balay       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
773*49b5e25fSSatish Balay       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
774*49b5e25fSSatish Balay       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
775*49b5e25fSSatish Balay       v  += 9;
776*49b5e25fSSatish Balay     }
777*49b5e25fSSatish Balay     xb +=3; ai++;
778*49b5e25fSSatish Balay   }
779*49b5e25fSSatish Balay 
780*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
781*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
782*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
783*49b5e25fSSatish Balay 
784*49b5e25fSSatish Balay   PLogFlops(18*(a->s_nz*2 - a->m));
785*49b5e25fSSatish Balay   PetscFunctionReturn(0);
786*49b5e25fSSatish Balay }
787*49b5e25fSSatish Balay 
788*49b5e25fSSatish Balay #undef __FUNC__
789*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_4"
790*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
791*49b5e25fSSatish Balay {
792*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
793*49b5e25fSSatish Balay   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4;
794*49b5e25fSSatish Balay   MatScalar       *v;
795*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
796*49b5e25fSSatish Balay 
797*49b5e25fSSatish Balay   PetscFunctionBegin;
798*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799*49b5e25fSSatish Balay   if (yy != xx) {
800*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
801*49b5e25fSSatish Balay   } else {
802*49b5e25fSSatish Balay     y = x;
803*49b5e25fSSatish Balay   }
804*49b5e25fSSatish Balay   if (zz != yy) {
805*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
806*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
807*49b5e25fSSatish Balay   } else {
808*49b5e25fSSatish Balay     z = y;
809*49b5e25fSSatish Balay   }
810*49b5e25fSSatish Balay 
811*49b5e25fSSatish Balay   v     = a->a;
812*49b5e25fSSatish Balay   xb = x;
813*49b5e25fSSatish Balay 
814*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
815*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
816*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
817*49b5e25fSSatish Balay     ib = aj + *ai;
818*49b5e25fSSatish Balay     /* (diag of A)*x */
819*49b5e25fSSatish Balay     z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
820*49b5e25fSSatish Balay     z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
821*49b5e25fSSatish Balay     z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
822*49b5e25fSSatish Balay     z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
823*49b5e25fSSatish Balay     v += 16;
824*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
825*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
826*49b5e25fSSatish Balay       cval       = ib[j]*4;
827*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
828*49b5e25fSSatish Balay       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
829*49b5e25fSSatish Balay       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
830*49b5e25fSSatish Balay       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
831*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
832*49b5e25fSSatish Balay       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
833*49b5e25fSSatish Balay       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
834*49b5e25fSSatish Balay       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
835*49b5e25fSSatish Balay       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
836*49b5e25fSSatish Balay       v  += 16;
837*49b5e25fSSatish Balay     }
838*49b5e25fSSatish Balay     xb +=4; ai++;
839*49b5e25fSSatish Balay   }
840*49b5e25fSSatish Balay 
841*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
842*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
843*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
844*49b5e25fSSatish Balay 
845*49b5e25fSSatish Balay   PLogFlops(32*(a->s_nz*2 - a->m));
846*49b5e25fSSatish Balay   PetscFunctionReturn(0);
847*49b5e25fSSatish Balay }
848*49b5e25fSSatish Balay 
849*49b5e25fSSatish Balay #undef __FUNC__
850*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_5"
851*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
852*49b5e25fSSatish Balay {
853*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
854*49b5e25fSSatish Balay   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5;
855*49b5e25fSSatish Balay   MatScalar       *v;
856*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
857*49b5e25fSSatish Balay 
858*49b5e25fSSatish Balay   PetscFunctionBegin;
859*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
860*49b5e25fSSatish Balay   if (yy != xx) {
861*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
862*49b5e25fSSatish Balay   } else {
863*49b5e25fSSatish Balay     y = x;
864*49b5e25fSSatish Balay   }
865*49b5e25fSSatish Balay   if (zz != yy) {
866*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
867*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
868*49b5e25fSSatish Balay   } else {
869*49b5e25fSSatish Balay     z = y;
870*49b5e25fSSatish Balay   }
871*49b5e25fSSatish Balay 
872*49b5e25fSSatish Balay   v     = a->a;
873*49b5e25fSSatish Balay   xb = x;
874*49b5e25fSSatish Balay 
875*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
876*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
877*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
878*49b5e25fSSatish Balay     ib = aj + *ai;
879*49b5e25fSSatish Balay     /* (diag of A)*x */
880*49b5e25fSSatish Balay     z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
881*49b5e25fSSatish Balay     z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
882*49b5e25fSSatish Balay     z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
883*49b5e25fSSatish Balay     z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
884*49b5e25fSSatish Balay     z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
885*49b5e25fSSatish Balay     v += 25;
886*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
887*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
888*49b5e25fSSatish Balay       cval       = ib[j]*5;
889*49b5e25fSSatish Balay       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
890*49b5e25fSSatish Balay       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
891*49b5e25fSSatish Balay       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
892*49b5e25fSSatish Balay       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
893*49b5e25fSSatish Balay       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
894*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
895*49b5e25fSSatish Balay       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
896*49b5e25fSSatish Balay       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
897*49b5e25fSSatish Balay       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
898*49b5e25fSSatish Balay       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
899*49b5e25fSSatish Balay       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
900*49b5e25fSSatish Balay       v  += 25;
901*49b5e25fSSatish Balay     }
902*49b5e25fSSatish Balay     xb +=5; ai++;
903*49b5e25fSSatish Balay   }
904*49b5e25fSSatish Balay 
905*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
906*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
907*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
908*49b5e25fSSatish Balay 
909*49b5e25fSSatish Balay   PLogFlops(50*(a->s_nz*2 - a->m));
910*49b5e25fSSatish Balay   PetscFunctionReturn(0);
911*49b5e25fSSatish Balay }
912*49b5e25fSSatish Balay #undef __FUNC__
913*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_6"
914*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
915*49b5e25fSSatish Balay {
916*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
917*49b5e25fSSatish Balay   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
918*49b5e25fSSatish Balay   MatScalar       *v;
919*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
920*49b5e25fSSatish Balay 
921*49b5e25fSSatish Balay   PetscFunctionBegin;
922*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
923*49b5e25fSSatish Balay   if (yy != xx) {
924*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
925*49b5e25fSSatish Balay   } else {
926*49b5e25fSSatish Balay     y = x;
927*49b5e25fSSatish Balay   }
928*49b5e25fSSatish Balay   if (zz != yy) {
929*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
930*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
931*49b5e25fSSatish Balay   } else {
932*49b5e25fSSatish Balay     z = y;
933*49b5e25fSSatish Balay   }
934*49b5e25fSSatish Balay 
935*49b5e25fSSatish Balay   v     = a->a;
936*49b5e25fSSatish Balay   xb = x;
937*49b5e25fSSatish Balay 
938*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
939*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
940*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
941*49b5e25fSSatish Balay     ib = aj + *ai;
942*49b5e25fSSatish Balay     /* (diag of A)*x */
943*49b5e25fSSatish Balay     z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
944*49b5e25fSSatish Balay     z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
945*49b5e25fSSatish Balay     z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
946*49b5e25fSSatish Balay     z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
947*49b5e25fSSatish Balay     z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
948*49b5e25fSSatish Balay     z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
949*49b5e25fSSatish Balay     v += 36;
950*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
951*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
952*49b5e25fSSatish Balay       cval       = ib[j]*6;
953*49b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
954*49b5e25fSSatish Balay       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
955*49b5e25fSSatish Balay       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
956*49b5e25fSSatish Balay       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
957*49b5e25fSSatish Balay       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
958*49b5e25fSSatish Balay       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
959*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
960*49b5e25fSSatish Balay       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
961*49b5e25fSSatish Balay       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
962*49b5e25fSSatish Balay       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
963*49b5e25fSSatish Balay       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
964*49b5e25fSSatish Balay       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
965*49b5e25fSSatish Balay       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
966*49b5e25fSSatish Balay       v  += 36;
967*49b5e25fSSatish Balay     }
968*49b5e25fSSatish Balay     xb +=6; ai++;
969*49b5e25fSSatish Balay   }
970*49b5e25fSSatish Balay 
971*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
972*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
973*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
974*49b5e25fSSatish Balay 
975*49b5e25fSSatish Balay   PLogFlops(72*(a->s_nz*2 - a->m));
976*49b5e25fSSatish Balay   PetscFunctionReturn(0);
977*49b5e25fSSatish Balay }
978*49b5e25fSSatish Balay 
979*49b5e25fSSatish Balay #undef __FUNC__
980*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_7"
981*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
982*49b5e25fSSatish Balay {
983*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
984*49b5e25fSSatish Balay   Scalar          *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
985*49b5e25fSSatish Balay   MatScalar       *v;
986*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j;
987*49b5e25fSSatish Balay 
988*49b5e25fSSatish Balay   PetscFunctionBegin;
989*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
990*49b5e25fSSatish Balay   if (yy != xx) {
991*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
992*49b5e25fSSatish Balay   } else {
993*49b5e25fSSatish Balay     y = x;
994*49b5e25fSSatish Balay   }
995*49b5e25fSSatish Balay   if (zz != yy) {
996*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
997*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
998*49b5e25fSSatish Balay   } else {
999*49b5e25fSSatish Balay     z = y;
1000*49b5e25fSSatish Balay   }
1001*49b5e25fSSatish Balay 
1002*49b5e25fSSatish Balay   v     = a->a;
1003*49b5e25fSSatish Balay   xb = x;
1004*49b5e25fSSatish Balay 
1005*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
1006*49b5e25fSSatish Balay     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1007*49b5e25fSSatish Balay     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1008*49b5e25fSSatish Balay     ib = aj + *ai;
1009*49b5e25fSSatish Balay     /* (diag of A)*x */
1010*49b5e25fSSatish Balay     z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1011*49b5e25fSSatish Balay     z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1012*49b5e25fSSatish Balay     z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1013*49b5e25fSSatish Balay     z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1014*49b5e25fSSatish Balay     z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1015*49b5e25fSSatish Balay     z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1016*49b5e25fSSatish Balay     z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1017*49b5e25fSSatish Balay     v += 49;
1018*49b5e25fSSatish Balay     for (j=1; j<n; j++) {
1019*49b5e25fSSatish Balay       /* (strict lower triangular part of A)*x  */
1020*49b5e25fSSatish Balay       cval       = ib[j]*7;
1021*49b5e25fSSatish Balay       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1022*49b5e25fSSatish Balay       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1023*49b5e25fSSatish Balay       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1024*49b5e25fSSatish Balay       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1025*49b5e25fSSatish Balay       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1026*49b5e25fSSatish Balay       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1027*49b5e25fSSatish Balay       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1028*49b5e25fSSatish Balay       /* (strict upper triangular part of A)*x  */
1029*49b5e25fSSatish Balay       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1030*49b5e25fSSatish Balay       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1031*49b5e25fSSatish Balay       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1032*49b5e25fSSatish Balay       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1033*49b5e25fSSatish Balay       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1034*49b5e25fSSatish Balay       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1035*49b5e25fSSatish Balay       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1036*49b5e25fSSatish Balay       v  += 49;
1037*49b5e25fSSatish Balay     }
1038*49b5e25fSSatish Balay     xb +=7; ai++;
1039*49b5e25fSSatish Balay   }
1040*49b5e25fSSatish Balay 
1041*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1042*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1043*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1044*49b5e25fSSatish Balay 
1045*49b5e25fSSatish Balay   PLogFlops(98*(a->s_nz*2 - a->m));
1046*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1047*49b5e25fSSatish Balay }
1048*49b5e25fSSatish Balay 
1049*49b5e25fSSatish Balay #undef __FUNC__
1050*49b5e25fSSatish Balay #define __FUNC__ "MatMultAdd_SeqSBAIJ_N"
1051*49b5e25fSSatish Balay int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1052*49b5e25fSSatish Balay {
1053*49b5e25fSSatish Balay   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ*)A->data;
1054*49b5e25fSSatish Balay   Scalar          *x,*x_ptr,*y,*z,*z_ptr,*xb,*zb,*work,*workt,*diag_ptr;
1055*49b5e25fSSatish Balay   MatScalar       *v,*vwk;
1056*49b5e25fSSatish Balay   int             ierr,mbs=a->mbs,i,*idx,*idxt,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1057*49b5e25fSSatish Balay   int             ncols,k,col;
1058*49b5e25fSSatish Balay 
1059*49b5e25fSSatish Balay   PetscFunctionBegin;
1060*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1061*49b5e25fSSatish Balay   if (yy != xx) {
1062*49b5e25fSSatish Balay     ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
1063*49b5e25fSSatish Balay   } else {
1064*49b5e25fSSatish Balay     y = x;
1065*49b5e25fSSatish Balay   }
1066*49b5e25fSSatish Balay   if (zz != yy) {
1067*49b5e25fSSatish Balay     ierr = VecCopy(yy,zz); CHKERRQ(ierr);
1068*49b5e25fSSatish Balay     ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1069*49b5e25fSSatish Balay   } else {
1070*49b5e25fSSatish Balay     z = y;
1071*49b5e25fSSatish Balay   }
1072*49b5e25fSSatish Balay 
1073*49b5e25fSSatish Balay   aj   = a->j;
1074*49b5e25fSSatish Balay   v    = a->a;
1075*49b5e25fSSatish Balay   ii   = a->i;
1076*49b5e25fSSatish Balay 
1077*49b5e25fSSatish Balay   if (!a->mult_work) {
1078*49b5e25fSSatish Balay     k = a->m;
1079*49b5e25fSSatish Balay     a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1080*49b5e25fSSatish Balay   }
1081*49b5e25fSSatish Balay   work = a->mult_work;
1082*49b5e25fSSatish Balay 
1083*49b5e25fSSatish Balay 
1084*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
1085*49b5e25fSSatish Balay     n     = ii[1] - ii[0]; ncols = n*bs;
1086*49b5e25fSSatish Balay     workt = work; idx=aj+ii[0];
1087*49b5e25fSSatish Balay 
1088*49b5e25fSSatish Balay     /* upper triangular part */
1089*49b5e25fSSatish Balay     for (j=0; j<n; j++) {
1090*49b5e25fSSatish Balay       col = bs*(*idx);
1091*49b5e25fSSatish Balay       xb = x_ptr + bs*(*idx++);
1092*49b5e25fSSatish Balay       for (k=0; k<bs; k++) workt[k] = xb[k];
1093*49b5e25fSSatish Balay       workt += bs;
1094*49b5e25fSSatish Balay     }
1095*49b5e25fSSatish Balay     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1096*49b5e25fSSatish Balay     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1097*49b5e25fSSatish Balay 
1098*49b5e25fSSatish Balay     /* strict lower triangular part */
1099*49b5e25fSSatish Balay     ncols -= bs;
1100*49b5e25fSSatish Balay     if (ncols > 0){
1101*49b5e25fSSatish Balay       workt = work;
1102*49b5e25fSSatish Balay       ierr  = PetscMemzero(workt,ncols*sizeof(Scalar));CHKERRQ(ierr);
1103*49b5e25fSSatish Balay       Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v+bs2,workt);
1104*49b5e25fSSatish Balay 
1105*49b5e25fSSatish Balay       idx=aj+ii[0]+1;
1106*49b5e25fSSatish Balay       for (j=1; j<n; j++) {
1107*49b5e25fSSatish Balay         zb = z_ptr + bs*(*idx);
1108*49b5e25fSSatish Balay         idx++;
1109*49b5e25fSSatish Balay         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1110*49b5e25fSSatish Balay         workt += bs;
1111*49b5e25fSSatish Balay       }
1112*49b5e25fSSatish Balay     }
1113*49b5e25fSSatish Balay 
1114*49b5e25fSSatish Balay     x += bs; v += n*bs2; z += bs; ii++;
1115*49b5e25fSSatish Balay   }
1116*49b5e25fSSatish Balay 
1117*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1118*49b5e25fSSatish Balay   if (yy != xx) ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
1119*49b5e25fSSatish Balay   if (zz != yy) ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1120*49b5e25fSSatish Balay 
1121*49b5e25fSSatish Balay   PLogFlops(2*(a->s_nz*2 - a->m));
1122*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1123*49b5e25fSSatish Balay }
1124*49b5e25fSSatish Balay 
1125*49b5e25fSSatish Balay #undef __FUNC__
1126*49b5e25fSSatish Balay #define __FUNC__ "MatMultTranspose_SeqSBAIJ"
1127*49b5e25fSSatish Balay int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1128*49b5e25fSSatish Balay {
1129*49b5e25fSSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1130*49b5e25fSSatish Balay   Scalar          *xg,*zg,*zb,zero = 0.0;
1131*49b5e25fSSatish Balay   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
1132*49b5e25fSSatish Balay   MatScalar       *v;
1133*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
1134*49b5e25fSSatish Balay   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1135*49b5e25fSSatish Balay 
1136*49b5e25fSSatish Balay   PetscFunctionBegin;
1137*49b5e25fSSatish Balay   ierr = VecSet(&zero,zz);CHKERRQ(ierr);
1138*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1139*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1140*49b5e25fSSatish Balay 
1141*49b5e25fSSatish Balay   idx   = a->j;
1142*49b5e25fSSatish Balay   v     = a->a;
1143*49b5e25fSSatish Balay   ii    = a->i;
1144*49b5e25fSSatish Balay   xb    = x;
1145*49b5e25fSSatish Balay   switch (bs) {
1146*49b5e25fSSatish Balay   case 1:
1147*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1148*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1149*49b5e25fSSatish Balay       x1 = xb[0];
1150*49b5e25fSSatish Balay       ib = idx + ai[i];
1151*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1152*49b5e25fSSatish Balay         rval    = ib[j];
1153*49b5e25fSSatish Balay         z[rval] += *v * x1;
1154*49b5e25fSSatish Balay         v++;
1155*49b5e25fSSatish Balay       }
1156*49b5e25fSSatish Balay       xb++;
1157*49b5e25fSSatish Balay     }
1158*49b5e25fSSatish Balay     break;
1159*49b5e25fSSatish Balay   case 2:
1160*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1161*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1162*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1];
1163*49b5e25fSSatish Balay       ib = idx + ai[i];
1164*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1165*49b5e25fSSatish Balay         rval      = ib[j]*2;
1166*49b5e25fSSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2;
1167*49b5e25fSSatish Balay         z[rval]   += v[2]*x1 + v[3]*x2;
1168*49b5e25fSSatish Balay         v  += 4;
1169*49b5e25fSSatish Balay       }
1170*49b5e25fSSatish Balay       xb += 2;
1171*49b5e25fSSatish Balay     }
1172*49b5e25fSSatish Balay     break;
1173*49b5e25fSSatish Balay   case 3:
1174*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1175*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1176*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1177*49b5e25fSSatish Balay       ib = idx + ai[i];
1178*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1179*49b5e25fSSatish Balay         rval      = ib[j]*3;
1180*49b5e25fSSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1181*49b5e25fSSatish Balay         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1182*49b5e25fSSatish Balay         z[rval]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
1183*49b5e25fSSatish Balay         v  += 9;
1184*49b5e25fSSatish Balay       }
1185*49b5e25fSSatish Balay       xb += 3;
1186*49b5e25fSSatish Balay     }
1187*49b5e25fSSatish Balay     break;
1188*49b5e25fSSatish Balay   case 4:
1189*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1190*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1191*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1192*49b5e25fSSatish Balay       ib = idx + ai[i];
1193*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1194*49b5e25fSSatish Balay         rval      = ib[j]*4;
1195*49b5e25fSSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1196*49b5e25fSSatish Balay         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1197*49b5e25fSSatish Balay         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1198*49b5e25fSSatish Balay         z[rval]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1199*49b5e25fSSatish Balay         v  += 16;
1200*49b5e25fSSatish Balay       }
1201*49b5e25fSSatish Balay       xb += 4;
1202*49b5e25fSSatish Balay     }
1203*49b5e25fSSatish Balay     break;
1204*49b5e25fSSatish Balay   case 5:
1205*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1206*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1207*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1208*49b5e25fSSatish Balay       x4 = xb[3]; x5 = xb[4];
1209*49b5e25fSSatish Balay       ib = idx + ai[i];
1210*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1211*49b5e25fSSatish Balay         rval      = ib[j]*5;
1212*49b5e25fSSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1213*49b5e25fSSatish Balay         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1214*49b5e25fSSatish Balay         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1215*49b5e25fSSatish Balay         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1216*49b5e25fSSatish Balay         z[rval]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1217*49b5e25fSSatish Balay         v  += 25;
1218*49b5e25fSSatish Balay       }
1219*49b5e25fSSatish Balay       xb += 5;
1220*49b5e25fSSatish Balay     }
1221*49b5e25fSSatish Balay     break;
1222*49b5e25fSSatish Balay   case 6:
1223*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1224*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1225*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1226*49b5e25fSSatish Balay       x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1227*49b5e25fSSatish Balay       ib = idx + ai[i];
1228*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1229*49b5e25fSSatish Balay         rval      = ib[j]*6;
1230*49b5e25fSSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6;
1231*49b5e25fSSatish Balay         z[rval++] +=  v[6]*x1 +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1232*49b5e25fSSatish Balay         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1233*49b5e25fSSatish Balay         z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1234*49b5e25fSSatish Balay         z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1235*49b5e25fSSatish Balay         z[rval]   += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1236*49b5e25fSSatish Balay         v  += 36;
1237*49b5e25fSSatish Balay       }
1238*49b5e25fSSatish Balay       xb += 6;
1239*49b5e25fSSatish Balay     }
1240*49b5e25fSSatish Balay     break;
1241*49b5e25fSSatish Balay   case 7:
1242*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1243*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1244*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1245*49b5e25fSSatish Balay       x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1246*49b5e25fSSatish Balay       ib = idx + ai[i];
1247*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1248*49b5e25fSSatish Balay         rval      = ib[j]*7;
1249*49b5e25fSSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1250*49b5e25fSSatish Balay         z[rval++] +=  v[7]*x1 +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1251*49b5e25fSSatish Balay         z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1252*49b5e25fSSatish Balay         z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1253*49b5e25fSSatish Balay         z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1254*49b5e25fSSatish Balay         z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1255*49b5e25fSSatish Balay         z[rval]   += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1256*49b5e25fSSatish Balay         v  += 49;
1257*49b5e25fSSatish Balay       }
1258*49b5e25fSSatish Balay       xb += 7;
1259*49b5e25fSSatish Balay     }
1260*49b5e25fSSatish Balay     break;
1261*49b5e25fSSatish Balay   default: {       /* block sizes larger then 7 by 7 are handled by BLAS */
1262*49b5e25fSSatish Balay       int       ncols,k;
1263*49b5e25fSSatish Balay       Scalar    *work,*workt;
1264*49b5e25fSSatish Balay 
1265*49b5e25fSSatish Balay       if (!a->mult_work) {
1266*49b5e25fSSatish Balay         k = PetscMax(a->m,a->n);
1267*49b5e25fSSatish Balay         a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1268*49b5e25fSSatish Balay       }
1269*49b5e25fSSatish Balay       work = a->mult_work;
1270*49b5e25fSSatish Balay       for (i=0; i<mbs; i++) {
1271*49b5e25fSSatish Balay         n     = ii[1] - ii[0]; ii++;
1272*49b5e25fSSatish Balay         ncols = n*bs;
1273*49b5e25fSSatish Balay         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1274*49b5e25fSSatish Balay         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1275*49b5e25fSSatish Balay         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1276*49b5e25fSSatish Balay         v += n*bs2;
1277*49b5e25fSSatish Balay         x += bs;
1278*49b5e25fSSatish Balay         workt = work;
1279*49b5e25fSSatish Balay         for (j=0; j<n; j++) {
1280*49b5e25fSSatish Balay           zb = z + bs*(*idx++);
1281*49b5e25fSSatish Balay           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1282*49b5e25fSSatish Balay           workt += bs;
1283*49b5e25fSSatish Balay         }
1284*49b5e25fSSatish Balay       }
1285*49b5e25fSSatish Balay     }
1286*49b5e25fSSatish Balay   }
1287*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1288*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1289*49b5e25fSSatish Balay   PLogFlops(2*a->nz*a->bs2 - a->n);
1290*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1291*49b5e25fSSatish Balay }
1292*49b5e25fSSatish Balay 
1293*49b5e25fSSatish Balay #undef __FUNC__
1294*49b5e25fSSatish Balay #define __FUNC__ "MatMultTransposeAdd_SeqSBAIJ"
1295*49b5e25fSSatish Balay int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1296*49b5e25fSSatish Balay 
1297*49b5e25fSSatish Balay {
1298*49b5e25fSSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1299*49b5e25fSSatish Balay   Scalar          *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1300*49b5e25fSSatish Balay   MatScalar       *v;
1301*49b5e25fSSatish Balay   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1302*49b5e25fSSatish Balay 
1303*49b5e25fSSatish Balay   PetscFunctionBegin;
1304*49b5e25fSSatish Balay   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1305*49b5e25fSSatish Balay   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1306*49b5e25fSSatish Balay 
1307*49b5e25fSSatish Balay   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1308*49b5e25fSSatish Balay 
1309*49b5e25fSSatish Balay   idx   = a->j;
1310*49b5e25fSSatish Balay   v     = a->a;
1311*49b5e25fSSatish Balay   ii    = a->i;
1312*49b5e25fSSatish Balay   xb    = x;
1313*49b5e25fSSatish Balay 
1314*49b5e25fSSatish Balay   switch (bs) {
1315*49b5e25fSSatish Balay   case 1:
1316*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1317*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1318*49b5e25fSSatish Balay       x1 = xb[0];
1319*49b5e25fSSatish Balay       ib = idx + ai[i];
1320*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1321*49b5e25fSSatish Balay         rval    = ib[j];
1322*49b5e25fSSatish Balay         z[rval] += *v * x1;
1323*49b5e25fSSatish Balay         v++;
1324*49b5e25fSSatish Balay       }
1325*49b5e25fSSatish Balay       xb++;
1326*49b5e25fSSatish Balay     }
1327*49b5e25fSSatish Balay     break;
1328*49b5e25fSSatish Balay   case 2:
1329*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1330*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1331*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1];
1332*49b5e25fSSatish Balay       ib = idx + ai[i];
1333*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1334*49b5e25fSSatish Balay         rval      = ib[j]*2;
1335*49b5e25fSSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2;
1336*49b5e25fSSatish Balay         z[rval++] += v[2]*x1 + v[3]*x2;
1337*49b5e25fSSatish Balay         v  += 4;
1338*49b5e25fSSatish Balay       }
1339*49b5e25fSSatish Balay       xb += 2;
1340*49b5e25fSSatish Balay     }
1341*49b5e25fSSatish Balay     break;
1342*49b5e25fSSatish Balay   case 3:
1343*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1344*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1345*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1346*49b5e25fSSatish Balay       ib = idx + ai[i];
1347*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1348*49b5e25fSSatish Balay         rval      = ib[j]*3;
1349*49b5e25fSSatish Balay         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1350*49b5e25fSSatish Balay         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1351*49b5e25fSSatish Balay         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1352*49b5e25fSSatish Balay         v  += 9;
1353*49b5e25fSSatish Balay       }
1354*49b5e25fSSatish Balay       xb += 3;
1355*49b5e25fSSatish Balay     }
1356*49b5e25fSSatish Balay     break;
1357*49b5e25fSSatish Balay   case 4:
1358*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1359*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1360*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1361*49b5e25fSSatish Balay       ib = idx + ai[i];
1362*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1363*49b5e25fSSatish Balay         rval      = ib[j]*4;
1364*49b5e25fSSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1365*49b5e25fSSatish Balay         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1366*49b5e25fSSatish Balay         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1367*49b5e25fSSatish Balay         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1368*49b5e25fSSatish Balay         v  += 16;
1369*49b5e25fSSatish Balay       }
1370*49b5e25fSSatish Balay       xb += 4;
1371*49b5e25fSSatish Balay     }
1372*49b5e25fSSatish Balay     break;
1373*49b5e25fSSatish Balay   case 5:
1374*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1375*49b5e25fSSatish Balay       n  = ii[1] - ii[0]; ii++;
1376*49b5e25fSSatish Balay       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1377*49b5e25fSSatish Balay       x4 = xb[3]; x5 = xb[4];
1378*49b5e25fSSatish Balay       ib = idx + ai[i];
1379*49b5e25fSSatish Balay       for (j=0; j<n; j++) {
1380*49b5e25fSSatish Balay         rval      = ib[j]*5;
1381*49b5e25fSSatish Balay         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1382*49b5e25fSSatish Balay         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1383*49b5e25fSSatish Balay         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1384*49b5e25fSSatish Balay         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1385*49b5e25fSSatish Balay         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1386*49b5e25fSSatish Balay         v  += 25;
1387*49b5e25fSSatish Balay       }
1388*49b5e25fSSatish Balay       xb += 5;
1389*49b5e25fSSatish Balay     }
1390*49b5e25fSSatish Balay     break;
1391*49b5e25fSSatish Balay   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1392*49b5e25fSSatish Balay       int       ncols,k;
1393*49b5e25fSSatish Balay       Scalar    *work,*workt;
1394*49b5e25fSSatish Balay 
1395*49b5e25fSSatish Balay       if (!a->mult_work) {
1396*49b5e25fSSatish Balay         k = PetscMax(a->m,a->n);
1397*49b5e25fSSatish Balay         a->mult_work = (Scalar*)PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1398*49b5e25fSSatish Balay       }
1399*49b5e25fSSatish Balay       work = a->mult_work;
1400*49b5e25fSSatish Balay       for (i=0; i<mbs; i++) {
1401*49b5e25fSSatish Balay         n     = ii[1] - ii[0]; ii++;
1402*49b5e25fSSatish Balay         ncols = n*bs;
1403*49b5e25fSSatish Balay         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1404*49b5e25fSSatish Balay         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1405*49b5e25fSSatish Balay         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1406*49b5e25fSSatish Balay         v += n*bs2;
1407*49b5e25fSSatish Balay         x += bs;
1408*49b5e25fSSatish Balay         workt = work;
1409*49b5e25fSSatish Balay         for (j=0; j<n; j++) {
1410*49b5e25fSSatish Balay           zb = z + bs*(*idx++);
1411*49b5e25fSSatish Balay           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1412*49b5e25fSSatish Balay           workt += bs;
1413*49b5e25fSSatish Balay         }
1414*49b5e25fSSatish Balay       }
1415*49b5e25fSSatish Balay     }
1416*49b5e25fSSatish Balay   }
1417*49b5e25fSSatish Balay   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1418*49b5e25fSSatish Balay   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1419*49b5e25fSSatish Balay   PLogFlops(2*a->nz*a->bs2);
1420*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1421*49b5e25fSSatish Balay }
1422*49b5e25fSSatish Balay 
1423*49b5e25fSSatish Balay #undef __FUNC__
1424*49b5e25fSSatish Balay #define __FUNC__ "MatScale_SeqSBAIJ"
1425*49b5e25fSSatish Balay int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1426*49b5e25fSSatish Balay {
1427*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1428*49b5e25fSSatish Balay   int         one = 1,totalnz = a->bs2*a->s_nz;
1429*49b5e25fSSatish Balay 
1430*49b5e25fSSatish Balay   PetscFunctionBegin;
1431*49b5e25fSSatish Balay   BLscal_(&totalnz,alpha,a->a,&one);
1432*49b5e25fSSatish Balay   PLogFlops(totalnz);
1433*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1434*49b5e25fSSatish Balay }
1435*49b5e25fSSatish Balay 
1436*49b5e25fSSatish Balay #undef __FUNC__
1437*49b5e25fSSatish Balay #define __FUNC__ "MatNorm_SeqSBAIJ"
1438*49b5e25fSSatish Balay int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1439*49b5e25fSSatish Balay {
1440*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1441*49b5e25fSSatish Balay   MatScalar   *v = a->a;
1442*49b5e25fSSatish Balay   PetscReal   sum_diag = 0.0, sum_off = 0.0, *sum;
1443*49b5e25fSSatish Balay   int         i,j,k,bs = a->bs,nz=a->s_nz,bs2=a->bs2,k1,mbs=a->mbs;
1444*49b5e25fSSatish Balay   int         *jl,*il,jmin,jmax,ierr,nexti,ik;
1445*49b5e25fSSatish Balay 
1446*49b5e25fSSatish Balay   PetscFunctionBegin;
1447*49b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
1448*49b5e25fSSatish Balay     for (k=0; k<mbs; k++){
1449*49b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1450*49b5e25fSSatish Balay       /* diagonal block */
1451*49b5e25fSSatish Balay       for (i=0; i<bs2; i++){
1452*49b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX)
1453*49b5e25fSSatish Balay         sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1454*49b5e25fSSatish Balay #else
1455*49b5e25fSSatish Balay         sum_diag += (*v)*(*v); v++;
1456*49b5e25fSSatish Balay #endif
1457*49b5e25fSSatish Balay       }
1458*49b5e25fSSatish Balay       for (j=jmin+1; j<jmax; j++){  /* off-diagonal blocks */
1459*49b5e25fSSatish Balay         for (i=0; i<bs2; i++){
1460*49b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX)
1461*49b5e25fSSatish Balay           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1462*49b5e25fSSatish Balay #else
1463*49b5e25fSSatish Balay           sum_off += (*v)*(*v); v++;
1464*49b5e25fSSatish Balay #endif
1465*49b5e25fSSatish Balay         }
1466*49b5e25fSSatish Balay       }
1467*49b5e25fSSatish Balay     }
1468*49b5e25fSSatish Balay     *norm = sqrt(sum_diag + 2*sum_off);
1469*49b5e25fSSatish Balay 
1470*49b5e25fSSatish Balay   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1471*49b5e25fSSatish Balay     il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1472*49b5e25fSSatish Balay     jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1473*49b5e25fSSatish Balay     sum = (Scalar*)PetscMalloc(bs*sizeof(Scalar));CHKPTRQ(sum);
1474*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
1475*49b5e25fSSatish Balay       jl[i] = mbs; il[0] = 0;
1476*49b5e25fSSatish Balay     }
1477*49b5e25fSSatish Balay 
1478*49b5e25fSSatish Balay     *norm = 0.0;
1479*49b5e25fSSatish Balay     for (k=0; k<mbs; k++) { /* k_th block row */
1480*49b5e25fSSatish Balay       for (j=0; j<bs; j++) sum[j]=0.0;
1481*49b5e25fSSatish Balay 
1482*49b5e25fSSatish Balay       /*-- col sum --*/
1483*49b5e25fSSatish Balay       i = jl[k]; /* first |A(i,k)| to be added */
1484*49b5e25fSSatish Balay       while (i<mbs){
1485*49b5e25fSSatish Balay         nexti = jl[i];  /* next block row to be added */
1486*49b5e25fSSatish Balay         ik    = il[i];  /* block index of A(i,k) in the array a */
1487*49b5e25fSSatish Balay         for (j=0; j<bs; j++){
1488*49b5e25fSSatish Balay           v = a->a + ik*bs2 + j*bs;
1489*49b5e25fSSatish Balay           for (k1=0; k1<bs; k1++) {
1490*49b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v); v++;
1491*49b5e25fSSatish Balay           }
1492*49b5e25fSSatish Balay         }
1493*49b5e25fSSatish Balay         /* update il, jl */
1494*49b5e25fSSatish Balay         jmin = ik + 1; jmax = a->i[i+1];
1495*49b5e25fSSatish Balay         if (jmin < jmax){
1496*49b5e25fSSatish Balay           il[i] = jmin;
1497*49b5e25fSSatish Balay           j   = a->j[jmin];
1498*49b5e25fSSatish Balay           jl[i] = jl[j]; jl[j]=i;
1499*49b5e25fSSatish Balay         }
1500*49b5e25fSSatish Balay         i = nexti;
1501*49b5e25fSSatish Balay       }
1502*49b5e25fSSatish Balay 
1503*49b5e25fSSatish Balay       /*-- row sum --*/
1504*49b5e25fSSatish Balay       jmin = a->i[k]; jmax = a->i[k+1];
1505*49b5e25fSSatish Balay       for (i=jmin; i<jmax; i++) {
1506*49b5e25fSSatish Balay         for (j=0; j<bs; j++){
1507*49b5e25fSSatish Balay           v = a->a + i*bs2 + j;
1508*49b5e25fSSatish Balay           for (k1=0; k1<bs; k1++){
1509*49b5e25fSSatish Balay             sum[j] += PetscAbsScalar(*v);
1510*49b5e25fSSatish Balay             v   += bs;
1511*49b5e25fSSatish Balay           }
1512*49b5e25fSSatish Balay         }
1513*49b5e25fSSatish Balay       }
1514*49b5e25fSSatish Balay       /* add k_th block row to il, jl */
1515*49b5e25fSSatish Balay       jmin++;
1516*49b5e25fSSatish Balay       if (jmin < jmax){
1517*49b5e25fSSatish Balay         il[k] = jmin;
1518*49b5e25fSSatish Balay         j   = a->j[jmin];
1519*49b5e25fSSatish Balay         jl[k] = jl[j]; jl[j] = k;
1520*49b5e25fSSatish Balay       }
1521*49b5e25fSSatish Balay       for (j=0; j<bs; j++){
1522*49b5e25fSSatish Balay         if (sum[j] > *norm) *norm = sum[j];
1523*49b5e25fSSatish Balay       }
1524*49b5e25fSSatish Balay     }
1525*49b5e25fSSatish Balay     ierr = PetscFree(il);CHKERRQ(ierr);
1526*49b5e25fSSatish Balay     ierr = PetscFree(jl);CHKERRQ(ierr);
1527*49b5e25fSSatish Balay     ierr = PetscFree(sum);CHKERRQ(ierr);
1528*49b5e25fSSatish Balay   } else {
1529*49b5e25fSSatish Balay     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1530*49b5e25fSSatish Balay   }
1531*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1532*49b5e25fSSatish Balay }
1533*49b5e25fSSatish Balay 
1534*49b5e25fSSatish Balay #ifdef MatNorm_SeqBAIJ
1535*49b5e25fSSatish Balay /* This is modified MatNorm_SeqBAIJ.
1536*49b5e25fSSatish Balay    MatNorm_SeqBAIJ in baij/seq/baij2.c is not correct for bs>1 */
1537*49b5e25fSSatish Balay 
1538*49b5e25fSSatish Balay #undef __FUNC__
1539*49b5e25fSSatish Balay #define __FUNC__ "MatNorm_SeqBAIJ"
1540*49b5e25fSSatish Balay int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1541*49b5e25fSSatish Balay {
1542*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1543*49b5e25fSSatish Balay   MatScalar   *v = a->a;
1544*49b5e25fSSatish Balay   PetscReal   sum = 0.0;
1545*49b5e25fSSatish Balay   int         i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;
1546*49b5e25fSSatish Balay 
1547*49b5e25fSSatish Balay   PetscFunctionBegin;
1548*49b5e25fSSatish Balay   if (type == NORM_FROBENIUS) {
1549*49b5e25fSSatish Balay     for (i=0; i< bs2*nz; i++) {
1550*49b5e25fSSatish Balay #if defined(PETSC_USE_COMPLEX)
1551*49b5e25fSSatish Balay       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1552*49b5e25fSSatish Balay #else
1553*49b5e25fSSatish Balay       sum += (*v)*(*v); v++;
1554*49b5e25fSSatish Balay #endif
1555*49b5e25fSSatish Balay     }
1556*49b5e25fSSatish Balay     *norm = sqrt(sum);
1557*49b5e25fSSatish Balay   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1558*49b5e25fSSatish Balay     *norm = 0.0;
1559*49b5e25fSSatish Balay     for (j=0; j<a->mbs; j++) {
1560*49b5e25fSSatish Balay       for (k=0; k<bs; k++) {
1561*49b5e25fSSatish Balay         v = a->a + bs2*a->i[j] + k;
1562*49b5e25fSSatish Balay         sum = 0.0;
1563*49b5e25fSSatish Balay         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1564*49b5e25fSSatish Balay           for (k1=0; k1<bs; k1++){   /* this loop was missing in the original code*/
1565*49b5e25fSSatish Balay             sum += PetscAbsScalar(*v);
1566*49b5e25fSSatish Balay             v   += bs;
1567*49b5e25fSSatish Balay           }
1568*49b5e25fSSatish Balay         }
1569*49b5e25fSSatish Balay         if (sum > *norm) *norm = sum;
1570*49b5e25fSSatish Balay       }
1571*49b5e25fSSatish Balay     }
1572*49b5e25fSSatish Balay   } else {
1573*49b5e25fSSatish Balay     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1574*49b5e25fSSatish Balay   }
1575*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1576*49b5e25fSSatish Balay }
1577*49b5e25fSSatish Balay #endif
1578*49b5e25fSSatish Balay 
1579*49b5e25fSSatish Balay #undef __FUNC__
1580*49b5e25fSSatish Balay #define __FUNC__ "MatEqual_SeqSBAIJ"
1581*49b5e25fSSatish Balay int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1582*49b5e25fSSatish Balay {
1583*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1584*49b5e25fSSatish Balay   int         ierr;
1585*49b5e25fSSatish Balay 
1586*49b5e25fSSatish Balay   PetscFunctionBegin;
1587*49b5e25fSSatish Balay   if (B->type !=MATSEQSBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1588*49b5e25fSSatish Balay 
1589*49b5e25fSSatish Balay   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1590*49b5e25fSSatish Balay   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1591*49b5e25fSSatish Balay     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1592*49b5e25fSSatish Balay   }
1593*49b5e25fSSatish Balay 
1594*49b5e25fSSatish Balay   /* if the a->i are the same */
1595*49b5e25fSSatish Balay   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);CHKERRQ(ierr);
1596*49b5e25fSSatish Balay   if (*flg == PETSC_FALSE) {
1597*49b5e25fSSatish Balay     PetscFunctionReturn(0);
1598*49b5e25fSSatish Balay   }
1599*49b5e25fSSatish Balay 
1600*49b5e25fSSatish Balay   /* if a->j are the same */
1601*49b5e25fSSatish Balay   ierr = PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);CHKERRQ(ierr);
1602*49b5e25fSSatish Balay   if (*flg == PETSC_FALSE) {
1603*49b5e25fSSatish Balay     PetscFunctionReturn(0);
1604*49b5e25fSSatish Balay   }
1605*49b5e25fSSatish Balay   /* if a->a are the same */
1606*49b5e25fSSatish Balay   ierr = PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);CHKERRQ(ierr);
1607*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1608*49b5e25fSSatish Balay 
1609*49b5e25fSSatish Balay }
1610*49b5e25fSSatish Balay 
1611*49b5e25fSSatish Balay #undef __FUNC__
1612*49b5e25fSSatish Balay #define __FUNC__ "MatGetDiagonal_SeqSBAIJ"
1613*49b5e25fSSatish Balay int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1614*49b5e25fSSatish Balay {
1615*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1616*49b5e25fSSatish Balay   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1617*49b5e25fSSatish Balay   Scalar      *x,zero = 0.0;
1618*49b5e25fSSatish Balay   MatScalar   *aa,*aa_j;
1619*49b5e25fSSatish Balay 
1620*49b5e25fSSatish Balay   PetscFunctionBegin;
1621*49b5e25fSSatish Balay   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1622*49b5e25fSSatish Balay   bs   = a->bs;
1623*49b5e25fSSatish Balay   aa   = a->a;
1624*49b5e25fSSatish Balay   ai   = a->i;
1625*49b5e25fSSatish Balay   aj   = a->j;
1626*49b5e25fSSatish Balay   ambs = a->mbs;
1627*49b5e25fSSatish Balay   bs2  = a->bs2;
1628*49b5e25fSSatish Balay 
1629*49b5e25fSSatish Balay   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1630*49b5e25fSSatish Balay   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1631*49b5e25fSSatish Balay   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1632*49b5e25fSSatish Balay   /* if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");*/
1633*49b5e25fSSatish Balay   for (i=0; i<ambs; i++) {
1634*49b5e25fSSatish Balay     j=ai[i];
1635*49b5e25fSSatish Balay     if (aj[j] == i) {             /* if this is a diagonal element */
1636*49b5e25fSSatish Balay       row  = i*bs;
1637*49b5e25fSSatish Balay       aa_j = aa + j*bs2;
1638*49b5e25fSSatish Balay       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1639*49b5e25fSSatish Balay     }
1640*49b5e25fSSatish Balay   }
1641*49b5e25fSSatish Balay   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1642*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1643*49b5e25fSSatish Balay }
1644*49b5e25fSSatish Balay 
1645*49b5e25fSSatish Balay #undef __FUNC__
1646*49b5e25fSSatish Balay #define __FUNC__ "MatDiagonalScale_SeqSBAIJ"
1647*49b5e25fSSatish Balay int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1648*49b5e25fSSatish Balay {
1649*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1650*49b5e25fSSatish Balay   Scalar      *l,*r,x,*li,*ri;
1651*49b5e25fSSatish Balay   MatScalar   *aa,*v;
1652*49b5e25fSSatish Balay   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1653*49b5e25fSSatish Balay 
1654*49b5e25fSSatish Balay   PetscFunctionBegin;
1655*49b5e25fSSatish Balay   ai  = a->i;
1656*49b5e25fSSatish Balay   aj  = a->j;
1657*49b5e25fSSatish Balay   aa  = a->a;
1658*49b5e25fSSatish Balay   m   = a->m;
1659*49b5e25fSSatish Balay   n   = a->n;
1660*49b5e25fSSatish Balay   bs  = a->bs;
1661*49b5e25fSSatish Balay   mbs = a->mbs;
1662*49b5e25fSSatish Balay   bs2 = a->bs2;
1663*49b5e25fSSatish Balay 
1664*49b5e25fSSatish Balay   if (ll != rr) {
1665*49b5e25fSSatish Balay     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"For symmetric format, left and right scaling vectors must be same\n");
1666*49b5e25fSSatish Balay   }
1667*49b5e25fSSatish Balay   if (ll) {
1668*49b5e25fSSatish Balay     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1669*49b5e25fSSatish Balay     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1670*49b5e25fSSatish Balay     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1671*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) { /* for each block row */
1672*49b5e25fSSatish Balay       M  = ai[i+1] - ai[i];
1673*49b5e25fSSatish Balay       li = l + i*bs;
1674*49b5e25fSSatish Balay       v  = aa + bs2*ai[i];
1675*49b5e25fSSatish Balay       for (j=0; j<M; j++) { /* for each block */
1676*49b5e25fSSatish Balay         for (k=0; k<bs2; k++) {
1677*49b5e25fSSatish Balay           (*v++) *= li[k%bs];
1678*49b5e25fSSatish Balay         }
1679*49b5e25fSSatish Balay #ifdef CONT
1680*49b5e25fSSatish Balay         /* will be used to replace the above loop */
1681*49b5e25fSSatish Balay         ri = l + bs*aj[ai[i]+j];
1682*49b5e25fSSatish Balay         for (k=0; k<bs; k++) { /* column value */
1683*49b5e25fSSatish Balay           x = ri[k];
1684*49b5e25fSSatish Balay           for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1685*49b5e25fSSatish Balay         }
1686*49b5e25fSSatish Balay #endif
1687*49b5e25fSSatish Balay 
1688*49b5e25fSSatish Balay       }
1689*49b5e25fSSatish Balay     }
1690*49b5e25fSSatish Balay     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1691*49b5e25fSSatish Balay     PLogFlops(2*a->s_nz);
1692*49b5e25fSSatish Balay   }
1693*49b5e25fSSatish Balay   /* will be deleted */
1694*49b5e25fSSatish Balay   if (rr) {
1695*49b5e25fSSatish Balay     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1696*49b5e25fSSatish Balay     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1697*49b5e25fSSatish Balay     if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1698*49b5e25fSSatish Balay     for (i=0; i<mbs; i++) { /* for each block row */
1699*49b5e25fSSatish Balay       M  = ai[i+1] - ai[i];
1700*49b5e25fSSatish Balay       v  = aa + bs2*ai[i];
1701*49b5e25fSSatish Balay       for (j=0; j<M; j++) { /* for each block */
1702*49b5e25fSSatish Balay         ri = r + bs*aj[ai[i]+j];
1703*49b5e25fSSatish Balay         for (k=0; k<bs; k++) {
1704*49b5e25fSSatish Balay           x = ri[k];
1705*49b5e25fSSatish Balay           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1706*49b5e25fSSatish Balay         }
1707*49b5e25fSSatish Balay       }
1708*49b5e25fSSatish Balay     }
1709*49b5e25fSSatish Balay     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1710*49b5e25fSSatish Balay     PLogFlops(a->s_nz);
1711*49b5e25fSSatish Balay   }
1712*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1713*49b5e25fSSatish Balay }
1714*49b5e25fSSatish Balay 
1715*49b5e25fSSatish Balay #undef __FUNC__
1716*49b5e25fSSatish Balay #define __FUNC__ "MatGetInfo_SeqSBAIJ"
1717*49b5e25fSSatish Balay int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1718*49b5e25fSSatish Balay {
1719*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1720*49b5e25fSSatish Balay 
1721*49b5e25fSSatish Balay   PetscFunctionBegin;
1722*49b5e25fSSatish Balay   info->rows_global    = (double)a->m;
1723*49b5e25fSSatish Balay   info->columns_global = (double)a->m;
1724*49b5e25fSSatish Balay   info->rows_local     = (double)a->m;
1725*49b5e25fSSatish Balay   info->columns_local  = (double)a->m;
1726*49b5e25fSSatish Balay   info->block_size     = a->bs2;
1727*49b5e25fSSatish Balay   info->nz_allocated   = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1728*49b5e25fSSatish Balay   info->nz_used        = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1729*49b5e25fSSatish Balay   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1730*49b5e25fSSatish Balay   info->assemblies   = A->num_ass;
1731*49b5e25fSSatish Balay   info->mallocs      = a->reallocs;
1732*49b5e25fSSatish Balay   info->memory       = A->mem;
1733*49b5e25fSSatish Balay   if (A->factor) {
1734*49b5e25fSSatish Balay     info->fill_ratio_given  = A->info.fill_ratio_given;
1735*49b5e25fSSatish Balay     info->fill_ratio_needed = A->info.fill_ratio_needed;
1736*49b5e25fSSatish Balay     info->factor_mallocs    = A->info.factor_mallocs;
1737*49b5e25fSSatish Balay   } else {
1738*49b5e25fSSatish Balay     info->fill_ratio_given  = 0;
1739*49b5e25fSSatish Balay     info->fill_ratio_needed = 0;
1740*49b5e25fSSatish Balay     info->factor_mallocs    = 0;
1741*49b5e25fSSatish Balay   }
1742*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1743*49b5e25fSSatish Balay }
1744*49b5e25fSSatish Balay 
1745*49b5e25fSSatish Balay 
1746*49b5e25fSSatish Balay #undef __FUNC__
1747*49b5e25fSSatish Balay #define __FUNC__ "MatZeroEntries_SeqSBAIJ"
1748*49b5e25fSSatish Balay int MatZeroEntries_SeqSBAIJ(Mat A)
1749*49b5e25fSSatish Balay {
1750*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1751*49b5e25fSSatish Balay   int         ierr;
1752*49b5e25fSSatish Balay 
1753*49b5e25fSSatish Balay   PetscFunctionBegin;
1754*49b5e25fSSatish Balay   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1755*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1756*49b5e25fSSatish Balay }
1757