xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 28e30874238f574d0bb41b4014cc791ade860c23)
1*28e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h>
2*28e30874SSatish Balay #include <petsc-private/kernels/blockinvert.h>
3*28e30874SSatish Balay 
4*28e30874SSatish Balay #undef __FUNCT__
5*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
6*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
7*28e30874SSatish Balay {
8*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
9*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
10*28e30874SSatish Balay   PetscErrorCode    ierr;
11*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
12*28e30874SSatish Balay   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
13*28e30874SSatish Balay   PetscInt          i,nz;
14*28e30874SSatish Balay   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
15*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
16*28e30874SSatish Balay   PetscScalar       *x,*s,*t,*ls;
17*28e30874SSatish Balay   const PetscScalar *b;
18*28e30874SSatish Balay 
19*28e30874SSatish Balay   PetscFunctionBegin;
20*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22*28e30874SSatish Balay   t    = a->solve_work;
23*28e30874SSatish Balay 
24*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26*28e30874SSatish Balay 
27*28e30874SSatish Balay   /* forward solve the lower triangular */
28*28e30874SSatish Balay   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
29*28e30874SSatish Balay   for (i=1; i<n; i++) {
30*28e30874SSatish Balay     v    = aa + bs2*ai[i];
31*28e30874SSatish Balay     vi   = aj + ai[i];
32*28e30874SSatish Balay     nz   = a->diag[i] - ai[i];
33*28e30874SSatish Balay     s    = t + bs*i;
34*28e30874SSatish Balay     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
35*28e30874SSatish Balay     while (nz--) {
36*28e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
37*28e30874SSatish Balay       v += bs2;
38*28e30874SSatish Balay     }
39*28e30874SSatish Balay   }
40*28e30874SSatish Balay   /* backward solve the upper triangular */
41*28e30874SSatish Balay   ls = a->solve_work + A->cmap->n;
42*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
43*28e30874SSatish Balay     v    = aa + bs2*(a->diag[i] + 1);
44*28e30874SSatish Balay     vi   = aj + a->diag[i] + 1;
45*28e30874SSatish Balay     nz   = ai[i+1] - a->diag[i] - 1;
46*28e30874SSatish Balay     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
47*28e30874SSatish Balay     while (nz--) {
48*28e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
49*28e30874SSatish Balay       v += bs2;
50*28e30874SSatish Balay     }
51*28e30874SSatish Balay     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
52*28e30874SSatish Balay     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
53*28e30874SSatish Balay   }
54*28e30874SSatish Balay 
55*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
56*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
57*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
58*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
59*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
60*28e30874SSatish Balay   PetscFunctionReturn(0);
61*28e30874SSatish Balay }
62*28e30874SSatish Balay 
63*28e30874SSatish Balay #undef __FUNCT__
64*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
65*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
66*28e30874SSatish Balay {
67*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
68*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
69*28e30874SSatish Balay   PetscErrorCode    ierr;
70*28e30874SSatish Balay   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
71*28e30874SSatish Balay   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
72*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
73*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
74*28e30874SSatish Balay   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
75*28e30874SSatish Balay   const PetscScalar *b;
76*28e30874SSatish Balay 
77*28e30874SSatish Balay   PetscFunctionBegin;
78*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
79*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
80*28e30874SSatish Balay   t    = a->solve_work;
81*28e30874SSatish Balay 
82*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
83*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
84*28e30874SSatish Balay 
85*28e30874SSatish Balay   /* forward solve the lower triangular */
86*28e30874SSatish Balay   idx  = 7*(*r++);
87*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
88*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
89*28e30874SSatish Balay   t[5] = b[5+idx]; t[6] = b[6+idx];
90*28e30874SSatish Balay 
91*28e30874SSatish Balay   for (i=1; i<n; i++) {
92*28e30874SSatish Balay     v   = aa + 49*ai[i];
93*28e30874SSatish Balay     vi  = aj + ai[i];
94*28e30874SSatish Balay     nz  = diag[i] - ai[i];
95*28e30874SSatish Balay     idx = 7*(*r++);
96*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
97*28e30874SSatish Balay     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
98*28e30874SSatish Balay     while (nz--) {
99*28e30874SSatish Balay       idx = 7*(*vi++);
100*28e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
101*28e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
102*28e30874SSatish Balay       x6  = t[5+idx];x7 = t[6+idx];
103*28e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
104*28e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
105*28e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
106*28e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
107*28e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
108*28e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
109*28e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
110*28e30874SSatish Balay       v  += 49;
111*28e30874SSatish Balay     }
112*28e30874SSatish Balay     idx      = 7*i;
113*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
114*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
115*28e30874SSatish Balay     t[5+idx] = s6;t[6+idx] = s7;
116*28e30874SSatish Balay   }
117*28e30874SSatish Balay   /* backward solve the upper triangular */
118*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
119*28e30874SSatish Balay     v   = aa + 49*diag[i] + 49;
120*28e30874SSatish Balay     vi  = aj + diag[i] + 1;
121*28e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
122*28e30874SSatish Balay     idt = 7*i;
123*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
124*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
125*28e30874SSatish Balay     s6  = t[5+idt];s7 = t[6+idt];
126*28e30874SSatish Balay     while (nz--) {
127*28e30874SSatish Balay       idx = 7*(*vi++);
128*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
129*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
130*28e30874SSatish Balay       x6  = t[5+idx]; x7 = t[6+idx];
131*28e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
132*28e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
133*28e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
134*28e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
135*28e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
136*28e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
137*28e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
138*28e30874SSatish Balay       v  += 49;
139*28e30874SSatish Balay     }
140*28e30874SSatish Balay     idc    = 7*(*c--);
141*28e30874SSatish Balay     v      = aa + 49*diag[i];
142*28e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
143*28e30874SSatish Balay                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
144*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
145*28e30874SSatish Balay                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
146*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
147*28e30874SSatish Balay                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
148*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
149*28e30874SSatish Balay                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
150*28e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
151*28e30874SSatish Balay                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
152*28e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
153*28e30874SSatish Balay                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
154*28e30874SSatish Balay     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
155*28e30874SSatish Balay                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
156*28e30874SSatish Balay   }
157*28e30874SSatish Balay 
158*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
159*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
160*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
161*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
162*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
163*28e30874SSatish Balay   PetscFunctionReturn(0);
164*28e30874SSatish Balay }
165*28e30874SSatish Balay 
166*28e30874SSatish Balay #undef __FUNCT__
167*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
168*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
169*28e30874SSatish Balay {
170*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
171*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
172*28e30874SSatish Balay   PetscErrorCode    ierr;
173*28e30874SSatish Balay   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
174*28e30874SSatish Balay   const PetscInt    n=a->mbs,*rout,*cout,*vi;
175*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
176*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
177*28e30874SSatish Balay   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
178*28e30874SSatish Balay   const PetscScalar *b;
179*28e30874SSatish Balay 
180*28e30874SSatish Balay   PetscFunctionBegin;
181*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
182*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
183*28e30874SSatish Balay   t    = a->solve_work;
184*28e30874SSatish Balay 
185*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
186*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
187*28e30874SSatish Balay 
188*28e30874SSatish Balay   /* forward solve the lower triangular */
189*28e30874SSatish Balay   idx  = 7*r[0];
190*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
191*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
192*28e30874SSatish Balay   t[5] = b[5+idx]; t[6] = b[6+idx];
193*28e30874SSatish Balay 
194*28e30874SSatish Balay   for (i=1; i<n; i++) {
195*28e30874SSatish Balay     v   = aa + 49*ai[i];
196*28e30874SSatish Balay     vi  = aj + ai[i];
197*28e30874SSatish Balay     nz  = ai[i+1] - ai[i];
198*28e30874SSatish Balay     idx = 7*r[i];
199*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
200*28e30874SSatish Balay     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
201*28e30874SSatish Balay     for (m=0; m<nz; m++) {
202*28e30874SSatish Balay       idx = 7*vi[m];
203*28e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
204*28e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
205*28e30874SSatish Balay       x6  = t[5+idx];x7 = t[6+idx];
206*28e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
207*28e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
208*28e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
209*28e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
210*28e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
211*28e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
212*28e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
213*28e30874SSatish Balay       v  += 49;
214*28e30874SSatish Balay     }
215*28e30874SSatish Balay     idx      = 7*i;
216*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
217*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
218*28e30874SSatish Balay     t[5+idx] = s6;t[6+idx] = s7;
219*28e30874SSatish Balay   }
220*28e30874SSatish Balay   /* backward solve the upper triangular */
221*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
222*28e30874SSatish Balay     v   = aa + 49*(adiag[i+1]+1);
223*28e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
224*28e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
225*28e30874SSatish Balay     idt = 7*i;
226*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
227*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
228*28e30874SSatish Balay     s6  = t[5+idt];s7 = t[6+idt];
229*28e30874SSatish Balay     for (m=0; m<nz; m++) {
230*28e30874SSatish Balay       idx = 7*vi[m];
231*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
232*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
233*28e30874SSatish Balay       x6  = t[5+idx]; x7 = t[6+idx];
234*28e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
235*28e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
236*28e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
237*28e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
238*28e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
239*28e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
240*28e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
241*28e30874SSatish Balay       v  += 49;
242*28e30874SSatish Balay     }
243*28e30874SSatish Balay     idc    = 7*c[i];
244*28e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
245*28e30874SSatish Balay                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
246*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
247*28e30874SSatish Balay                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
248*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
249*28e30874SSatish Balay                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
250*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
251*28e30874SSatish Balay                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
252*28e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
253*28e30874SSatish Balay                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
254*28e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
255*28e30874SSatish Balay                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
256*28e30874SSatish Balay     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
257*28e30874SSatish Balay                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
258*28e30874SSatish Balay   }
259*28e30874SSatish Balay 
260*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
261*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
262*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
263*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
264*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
265*28e30874SSatish Balay   PetscFunctionReturn(0);
266*28e30874SSatish Balay }
267*28e30874SSatish Balay 
268*28e30874SSatish Balay #undef __FUNCT__
269*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
270*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
271*28e30874SSatish Balay {
272*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
273*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
274*28e30874SSatish Balay   PetscErrorCode    ierr;
275*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
276*28e30874SSatish Balay   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
277*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
278*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
279*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
280*28e30874SSatish Balay   const PetscScalar *b;
281*28e30874SSatish Balay 
282*28e30874SSatish Balay   PetscFunctionBegin;
283*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
284*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
285*28e30874SSatish Balay   t    = a->solve_work;
286*28e30874SSatish Balay 
287*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
288*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
289*28e30874SSatish Balay 
290*28e30874SSatish Balay   /* forward solve the lower triangular */
291*28e30874SSatish Balay   idx  = 6*(*r++);
292*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
293*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
294*28e30874SSatish Balay   t[4] = b[4+idx]; t[5] = b[5+idx];
295*28e30874SSatish Balay   for (i=1; i<n; i++) {
296*28e30874SSatish Balay     v   = aa + 36*ai[i];
297*28e30874SSatish Balay     vi  = aj + ai[i];
298*28e30874SSatish Balay     nz  = diag[i] - ai[i];
299*28e30874SSatish Balay     idx = 6*(*r++);
300*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
301*28e30874SSatish Balay     s5  = b[4+idx]; s6 = b[5+idx];
302*28e30874SSatish Balay     while (nz--) {
303*28e30874SSatish Balay       idx = 6*(*vi++);
304*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
305*28e30874SSatish Balay       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
306*28e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
307*28e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
308*28e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
309*28e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
310*28e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
311*28e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
312*28e30874SSatish Balay       v  += 36;
313*28e30874SSatish Balay     }
314*28e30874SSatish Balay     idx      = 6*i;
315*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
316*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
317*28e30874SSatish Balay     t[4+idx] = s5;t[5+idx] = s6;
318*28e30874SSatish Balay   }
319*28e30874SSatish Balay   /* backward solve the upper triangular */
320*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
321*28e30874SSatish Balay     v   = aa + 36*diag[i] + 36;
322*28e30874SSatish Balay     vi  = aj + diag[i] + 1;
323*28e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
324*28e30874SSatish Balay     idt = 6*i;
325*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
326*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
327*28e30874SSatish Balay     s5  = t[4+idt];s6 = t[5+idt];
328*28e30874SSatish Balay     while (nz--) {
329*28e30874SSatish Balay       idx = 6*(*vi++);
330*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
331*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
332*28e30874SSatish Balay       x5  = t[4+idx]; x6 = t[5+idx];
333*28e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
334*28e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
335*28e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
336*28e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
337*28e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
338*28e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
339*28e30874SSatish Balay       v  += 36;
340*28e30874SSatish Balay     }
341*28e30874SSatish Balay     idc    = 6*(*c--);
342*28e30874SSatish Balay     v      = aa + 36*diag[i];
343*28e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
344*28e30874SSatish Balay                         v[18]*s4+v[24]*s5+v[30]*s6;
345*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
346*28e30874SSatish Balay                           v[19]*s4+v[25]*s5+v[31]*s6;
347*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
348*28e30874SSatish Balay                           v[20]*s4+v[26]*s5+v[32]*s6;
349*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
350*28e30874SSatish Balay                           v[21]*s4+v[27]*s5+v[33]*s6;
351*28e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
352*28e30874SSatish Balay                           v[22]*s4+v[28]*s5+v[34]*s6;
353*28e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
354*28e30874SSatish Balay                           v[23]*s4+v[29]*s5+v[35]*s6;
355*28e30874SSatish Balay   }
356*28e30874SSatish Balay 
357*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
358*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
359*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
360*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
362*28e30874SSatish Balay   PetscFunctionReturn(0);
363*28e30874SSatish Balay }
364*28e30874SSatish Balay 
365*28e30874SSatish Balay #undef __FUNCT__
366*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
367*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
368*28e30874SSatish Balay {
369*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
370*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
371*28e30874SSatish Balay   PetscErrorCode    ierr;
372*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
373*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
374*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
375*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
376*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
377*28e30874SSatish Balay   const PetscScalar *b;
378*28e30874SSatish Balay 
379*28e30874SSatish Balay   PetscFunctionBegin;
380*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
381*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382*28e30874SSatish Balay   t    = a->solve_work;
383*28e30874SSatish Balay 
384*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
385*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
386*28e30874SSatish Balay 
387*28e30874SSatish Balay   /* forward solve the lower triangular */
388*28e30874SSatish Balay   idx  = 6*r[0];
389*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
390*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
391*28e30874SSatish Balay   t[4] = b[4+idx]; t[5] = b[5+idx];
392*28e30874SSatish Balay   for (i=1; i<n; i++) {
393*28e30874SSatish Balay     v   = aa + 36*ai[i];
394*28e30874SSatish Balay     vi  = aj + ai[i];
395*28e30874SSatish Balay     nz  = ai[i+1] - ai[i];
396*28e30874SSatish Balay     idx = 6*r[i];
397*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
398*28e30874SSatish Balay     s5  = b[4+idx]; s6 = b[5+idx];
399*28e30874SSatish Balay     for (m=0; m<nz; m++) {
400*28e30874SSatish Balay       idx = 6*vi[m];
401*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
402*28e30874SSatish Balay       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
403*28e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
404*28e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
405*28e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
406*28e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
407*28e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
408*28e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
409*28e30874SSatish Balay       v  += 36;
410*28e30874SSatish Balay     }
411*28e30874SSatish Balay     idx      = 6*i;
412*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
413*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
414*28e30874SSatish Balay     t[4+idx] = s5;t[5+idx] = s6;
415*28e30874SSatish Balay   }
416*28e30874SSatish Balay   /* backward solve the upper triangular */
417*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
418*28e30874SSatish Balay     v   = aa + 36*(adiag[i+1]+1);
419*28e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
420*28e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
421*28e30874SSatish Balay     idt = 6*i;
422*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
423*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
424*28e30874SSatish Balay     s5  = t[4+idt];s6 = t[5+idt];
425*28e30874SSatish Balay     for (m=0; m<nz; m++) {
426*28e30874SSatish Balay       idx = 6*vi[m];
427*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
428*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
429*28e30874SSatish Balay       x5  = t[4+idx]; x6 = t[5+idx];
430*28e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
431*28e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
432*28e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
433*28e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
434*28e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
435*28e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
436*28e30874SSatish Balay       v  += 36;
437*28e30874SSatish Balay     }
438*28e30874SSatish Balay     idc    = 6*c[i];
439*28e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
440*28e30874SSatish Balay                         v[18]*s4+v[24]*s5+v[30]*s6;
441*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
442*28e30874SSatish Balay                           v[19]*s4+v[25]*s5+v[31]*s6;
443*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
444*28e30874SSatish Balay                           v[20]*s4+v[26]*s5+v[32]*s6;
445*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
446*28e30874SSatish Balay                           v[21]*s4+v[27]*s5+v[33]*s6;
447*28e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
448*28e30874SSatish Balay                           v[22]*s4+v[28]*s5+v[34]*s6;
449*28e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
450*28e30874SSatish Balay                           v[23]*s4+v[29]*s5+v[35]*s6;
451*28e30874SSatish Balay   }
452*28e30874SSatish Balay 
453*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
454*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
455*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
456*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
458*28e30874SSatish Balay   PetscFunctionReturn(0);
459*28e30874SSatish Balay }
460*28e30874SSatish Balay 
461*28e30874SSatish Balay #undef __FUNCT__
462*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
463*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
464*28e30874SSatish Balay {
465*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
466*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
467*28e30874SSatish Balay   PetscErrorCode    ierr;
468*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
469*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
470*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
471*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
472*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
473*28e30874SSatish Balay   const PetscScalar *b;
474*28e30874SSatish Balay 
475*28e30874SSatish Balay   PetscFunctionBegin;
476*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
477*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478*28e30874SSatish Balay   t    = a->solve_work;
479*28e30874SSatish Balay 
480*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
482*28e30874SSatish Balay 
483*28e30874SSatish Balay   /* forward solve the lower triangular */
484*28e30874SSatish Balay   idx  = 5*(*r++);
485*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
486*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
487*28e30874SSatish Balay   for (i=1; i<n; i++) {
488*28e30874SSatish Balay     v   = aa + 25*ai[i];
489*28e30874SSatish Balay     vi  = aj + ai[i];
490*28e30874SSatish Balay     nz  = diag[i] - ai[i];
491*28e30874SSatish Balay     idx = 5*(*r++);
492*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
493*28e30874SSatish Balay     s5  = b[4+idx];
494*28e30874SSatish Balay     while (nz--) {
495*28e30874SSatish Balay       idx = 5*(*vi++);
496*28e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
497*28e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
498*28e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
499*28e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
500*28e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
501*28e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
502*28e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
503*28e30874SSatish Balay       v  += 25;
504*28e30874SSatish Balay     }
505*28e30874SSatish Balay     idx      = 5*i;
506*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
507*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
508*28e30874SSatish Balay   }
509*28e30874SSatish Balay   /* backward solve the upper triangular */
510*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
511*28e30874SSatish Balay     v   = aa + 25*diag[i] + 25;
512*28e30874SSatish Balay     vi  = aj + diag[i] + 1;
513*28e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
514*28e30874SSatish Balay     idt = 5*i;
515*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
516*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
517*28e30874SSatish Balay     while (nz--) {
518*28e30874SSatish Balay       idx = 5*(*vi++);
519*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
520*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
521*28e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
522*28e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
523*28e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
524*28e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
525*28e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
526*28e30874SSatish Balay       v  += 25;
527*28e30874SSatish Balay     }
528*28e30874SSatish Balay     idc    = 5*(*c--);
529*28e30874SSatish Balay     v      = aa + 25*diag[i];
530*28e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
531*28e30874SSatish Balay                         v[15]*s4+v[20]*s5;
532*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
533*28e30874SSatish Balay                           v[16]*s4+v[21]*s5;
534*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
535*28e30874SSatish Balay                           v[17]*s4+v[22]*s5;
536*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
537*28e30874SSatish Balay                           v[18]*s4+v[23]*s5;
538*28e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
539*28e30874SSatish Balay                           v[19]*s4+v[24]*s5;
540*28e30874SSatish Balay   }
541*28e30874SSatish Balay 
542*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
543*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
544*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
545*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
546*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
547*28e30874SSatish Balay   PetscFunctionReturn(0);
548*28e30874SSatish Balay }
549*28e30874SSatish Balay 
550*28e30874SSatish Balay #undef __FUNCT__
551*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
552*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
553*28e30874SSatish Balay {
554*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
555*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
556*28e30874SSatish Balay   PetscErrorCode    ierr;
557*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
558*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
559*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
560*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
561*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
562*28e30874SSatish Balay   const PetscScalar *b;
563*28e30874SSatish Balay 
564*28e30874SSatish Balay   PetscFunctionBegin;
565*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
566*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
567*28e30874SSatish Balay   t    = a->solve_work;
568*28e30874SSatish Balay 
569*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
570*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
571*28e30874SSatish Balay 
572*28e30874SSatish Balay   /* forward solve the lower triangular */
573*28e30874SSatish Balay   idx  = 5*r[0];
574*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
575*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
576*28e30874SSatish Balay   for (i=1; i<n; i++) {
577*28e30874SSatish Balay     v   = aa + 25*ai[i];
578*28e30874SSatish Balay     vi  = aj + ai[i];
579*28e30874SSatish Balay     nz  = ai[i+1] - ai[i];
580*28e30874SSatish Balay     idx = 5*r[i];
581*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
582*28e30874SSatish Balay     s5  = b[4+idx];
583*28e30874SSatish Balay     for (m=0; m<nz; m++) {
584*28e30874SSatish Balay       idx = 5*vi[m];
585*28e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
586*28e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
587*28e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
588*28e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
589*28e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
590*28e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
591*28e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
592*28e30874SSatish Balay       v  += 25;
593*28e30874SSatish Balay     }
594*28e30874SSatish Balay     idx      = 5*i;
595*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
596*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
597*28e30874SSatish Balay   }
598*28e30874SSatish Balay   /* backward solve the upper triangular */
599*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
600*28e30874SSatish Balay     v   = aa + 25*(adiag[i+1]+1);
601*28e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
602*28e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
603*28e30874SSatish Balay     idt = 5*i;
604*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
605*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
606*28e30874SSatish Balay     for (m=0; m<nz; m++) {
607*28e30874SSatish Balay       idx = 5*vi[m];
608*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
609*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
610*28e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
611*28e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
612*28e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
613*28e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
614*28e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
615*28e30874SSatish Balay       v  += 25;
616*28e30874SSatish Balay     }
617*28e30874SSatish Balay     idc    = 5*c[i];
618*28e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
619*28e30874SSatish Balay                         v[15]*s4+v[20]*s5;
620*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
621*28e30874SSatish Balay                           v[16]*s4+v[21]*s5;
622*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
623*28e30874SSatish Balay                           v[17]*s4+v[22]*s5;
624*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
625*28e30874SSatish Balay                           v[18]*s4+v[23]*s5;
626*28e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
627*28e30874SSatish Balay                           v[19]*s4+v[24]*s5;
628*28e30874SSatish Balay   }
629*28e30874SSatish Balay 
630*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
631*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
632*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
633*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
634*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
635*28e30874SSatish Balay   PetscFunctionReturn(0);
636*28e30874SSatish Balay }
637*28e30874SSatish Balay 
638*28e30874SSatish Balay #undef __FUNCT__
639*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
640*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
641*28e30874SSatish Balay {
642*28e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
643*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
644*28e30874SSatish Balay   PetscErrorCode    ierr;
645*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
646*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
647*28e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
648*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
649*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
650*28e30874SSatish Balay   const PetscScalar *b;
651*28e30874SSatish Balay 
652*28e30874SSatish Balay   PetscFunctionBegin;
653*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
654*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
655*28e30874SSatish Balay   t    = a->solve_work;
656*28e30874SSatish Balay 
657*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
658*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
659*28e30874SSatish Balay 
660*28e30874SSatish Balay   /* forward solve the lower triangular */
661*28e30874SSatish Balay   idx  = 4*(*r++);
662*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
663*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
664*28e30874SSatish Balay   for (i=1; i<n; i++) {
665*28e30874SSatish Balay     v   = aa + 16*ai[i];
666*28e30874SSatish Balay     vi  = aj + ai[i];
667*28e30874SSatish Balay     nz  = diag[i] - ai[i];
668*28e30874SSatish Balay     idx = 4*(*r++);
669*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
670*28e30874SSatish Balay     while (nz--) {
671*28e30874SSatish Balay       idx = 4*(*vi++);
672*28e30874SSatish Balay       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
673*28e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
674*28e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
675*28e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
676*28e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
677*28e30874SSatish Balay       v  += 16;
678*28e30874SSatish Balay     }
679*28e30874SSatish Balay     idx      = 4*i;
680*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
681*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
682*28e30874SSatish Balay   }
683*28e30874SSatish Balay   /* backward solve the upper triangular */
684*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
685*28e30874SSatish Balay     v   = aa + 16*diag[i] + 16;
686*28e30874SSatish Balay     vi  = aj + diag[i] + 1;
687*28e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
688*28e30874SSatish Balay     idt = 4*i;
689*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
690*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
691*28e30874SSatish Balay     while (nz--) {
692*28e30874SSatish Balay       idx = 4*(*vi++);
693*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
694*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
695*28e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
696*28e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
697*28e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
698*28e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
699*28e30874SSatish Balay       v  += 16;
700*28e30874SSatish Balay     }
701*28e30874SSatish Balay     idc      = 4*(*c--);
702*28e30874SSatish Balay     v        = aa + 16*diag[i];
703*28e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
704*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
705*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
706*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
707*28e30874SSatish Balay   }
708*28e30874SSatish Balay 
709*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
710*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
711*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
712*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
713*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
714*28e30874SSatish Balay   PetscFunctionReturn(0);
715*28e30874SSatish Balay }
716*28e30874SSatish Balay 
717*28e30874SSatish Balay #undef __FUNCT__
718*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
719*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
720*28e30874SSatish Balay {
721*28e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
722*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
723*28e30874SSatish Balay   PetscErrorCode    ierr;
724*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
725*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
726*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
727*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
728*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
729*28e30874SSatish Balay   const PetscScalar *b;
730*28e30874SSatish Balay 
731*28e30874SSatish Balay   PetscFunctionBegin;
732*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
733*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
734*28e30874SSatish Balay   t    = a->solve_work;
735*28e30874SSatish Balay 
736*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
737*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
738*28e30874SSatish Balay 
739*28e30874SSatish Balay   /* forward solve the lower triangular */
740*28e30874SSatish Balay   idx  = 4*r[0];
741*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
742*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
743*28e30874SSatish Balay   for (i=1; i<n; i++) {
744*28e30874SSatish Balay     v   = aa + 16*ai[i];
745*28e30874SSatish Balay     vi  = aj + ai[i];
746*28e30874SSatish Balay     nz  = ai[i+1] - ai[i];
747*28e30874SSatish Balay     idx = 4*r[i];
748*28e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
749*28e30874SSatish Balay     for (m=0; m<nz; m++) {
750*28e30874SSatish Balay       idx = 4*vi[m];
751*28e30874SSatish Balay       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
752*28e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
753*28e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
754*28e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
755*28e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
756*28e30874SSatish Balay       v  += 16;
757*28e30874SSatish Balay     }
758*28e30874SSatish Balay     idx      = 4*i;
759*28e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
760*28e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
761*28e30874SSatish Balay   }
762*28e30874SSatish Balay   /* backward solve the upper triangular */
763*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
764*28e30874SSatish Balay     v   = aa + 16*(adiag[i+1]+1);
765*28e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
766*28e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
767*28e30874SSatish Balay     idt = 4*i;
768*28e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
769*28e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
770*28e30874SSatish Balay     for (m=0; m<nz; m++) {
771*28e30874SSatish Balay       idx = 4*vi[m];
772*28e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
773*28e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
774*28e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
775*28e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
776*28e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
777*28e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
778*28e30874SSatish Balay       v  += 16;
779*28e30874SSatish Balay     }
780*28e30874SSatish Balay     idc      = 4*c[i];
781*28e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
782*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
783*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
784*28e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
785*28e30874SSatish Balay   }
786*28e30874SSatish Balay 
787*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
789*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
790*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
792*28e30874SSatish Balay   PetscFunctionReturn(0);
793*28e30874SSatish Balay }
794*28e30874SSatish Balay 
795*28e30874SSatish Balay #undef __FUNCT__
796*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
797*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
798*28e30874SSatish Balay {
799*28e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
800*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
801*28e30874SSatish Balay   PetscErrorCode    ierr;
802*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
803*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
804*28e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
805*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
806*28e30874SSatish Balay   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
807*28e30874SSatish Balay   PetscScalar       *x;
808*28e30874SSatish Balay   const PetscScalar *b;
809*28e30874SSatish Balay 
810*28e30874SSatish Balay   PetscFunctionBegin;
811*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
812*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
813*28e30874SSatish Balay   t    = (MatScalar*)a->solve_work;
814*28e30874SSatish Balay 
815*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
816*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
817*28e30874SSatish Balay 
818*28e30874SSatish Balay   /* forward solve the lower triangular */
819*28e30874SSatish Balay   idx  = 4*(*r++);
820*28e30874SSatish Balay   t[0] = (MatScalar)b[idx];
821*28e30874SSatish Balay   t[1] = (MatScalar)b[1+idx];
822*28e30874SSatish Balay   t[2] = (MatScalar)b[2+idx];
823*28e30874SSatish Balay   t[3] = (MatScalar)b[3+idx];
824*28e30874SSatish Balay   for (i=1; i<n; i++) {
825*28e30874SSatish Balay     v   = aa + 16*ai[i];
826*28e30874SSatish Balay     vi  = aj + ai[i];
827*28e30874SSatish Balay     nz  = diag[i] - ai[i];
828*28e30874SSatish Balay     idx = 4*(*r++);
829*28e30874SSatish Balay     s1  = (MatScalar)b[idx];
830*28e30874SSatish Balay     s2  = (MatScalar)b[1+idx];
831*28e30874SSatish Balay     s3  = (MatScalar)b[2+idx];
832*28e30874SSatish Balay     s4  = (MatScalar)b[3+idx];
833*28e30874SSatish Balay     while (nz--) {
834*28e30874SSatish Balay       idx = 4*(*vi++);
835*28e30874SSatish Balay       x1  = t[idx];
836*28e30874SSatish Balay       x2  = t[1+idx];
837*28e30874SSatish Balay       x3  = t[2+idx];
838*28e30874SSatish Balay       x4  = t[3+idx];
839*28e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
840*28e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
841*28e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
842*28e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
843*28e30874SSatish Balay       v  += 16;
844*28e30874SSatish Balay     }
845*28e30874SSatish Balay     idx      = 4*i;
846*28e30874SSatish Balay     t[idx]   = s1;
847*28e30874SSatish Balay     t[1+idx] = s2;
848*28e30874SSatish Balay     t[2+idx] = s3;
849*28e30874SSatish Balay     t[3+idx] = s4;
850*28e30874SSatish Balay   }
851*28e30874SSatish Balay   /* backward solve the upper triangular */
852*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
853*28e30874SSatish Balay     v   = aa + 16*diag[i] + 16;
854*28e30874SSatish Balay     vi  = aj + diag[i] + 1;
855*28e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
856*28e30874SSatish Balay     idt = 4*i;
857*28e30874SSatish Balay     s1  = t[idt];
858*28e30874SSatish Balay     s2  = t[1+idt];
859*28e30874SSatish Balay     s3  = t[2+idt];
860*28e30874SSatish Balay     s4  = t[3+idt];
861*28e30874SSatish Balay     while (nz--) {
862*28e30874SSatish Balay       idx = 4*(*vi++);
863*28e30874SSatish Balay       x1  = t[idx];
864*28e30874SSatish Balay       x2  = t[1+idx];
865*28e30874SSatish Balay       x3  = t[2+idx];
866*28e30874SSatish Balay       x4  = t[3+idx];
867*28e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
868*28e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
869*28e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
870*28e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
871*28e30874SSatish Balay       v  += 16;
872*28e30874SSatish Balay     }
873*28e30874SSatish Balay     idc      = 4*(*c--);
874*28e30874SSatish Balay     v        = aa + 16*diag[i];
875*28e30874SSatish Balay     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
876*28e30874SSatish Balay     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
877*28e30874SSatish Balay     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
878*28e30874SSatish Balay     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
879*28e30874SSatish Balay     x[idc]   = (PetscScalar)t[idt];
880*28e30874SSatish Balay     x[1+idc] = (PetscScalar)t[1+idt];
881*28e30874SSatish Balay     x[2+idc] = (PetscScalar)t[2+idt];
882*28e30874SSatish Balay     x[3+idc] = (PetscScalar)t[3+idt];
883*28e30874SSatish Balay   }
884*28e30874SSatish Balay 
885*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
886*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
887*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
888*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
889*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
890*28e30874SSatish Balay   PetscFunctionReturn(0);
891*28e30874SSatish Balay }
892*28e30874SSatish Balay 
893*28e30874SSatish Balay #if defined(PETSC_HAVE_SSE)
894*28e30874SSatish Balay 
895*28e30874SSatish Balay #include PETSC_HAVE_SSE
896*28e30874SSatish Balay 
897*28e30874SSatish Balay #undef __FUNCT__
898*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
899*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
900*28e30874SSatish Balay {
901*28e30874SSatish Balay   /*
902*28e30874SSatish Balay      Note: This code uses demotion of double
903*28e30874SSatish Balay      to float when performing the mixed-mode computation.
904*28e30874SSatish Balay      This may not be numerically reasonable for all applications.
905*28e30874SSatish Balay   */
906*28e30874SSatish Balay   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
907*28e30874SSatish Balay   IS             iscol=a->col,isrow=a->row;
908*28e30874SSatish Balay   PetscErrorCode ierr;
909*28e30874SSatish Balay   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
910*28e30874SSatish Balay   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
911*28e30874SSatish Balay   MatScalar      *aa=a->a,*v;
912*28e30874SSatish Balay   PetscScalar    *x,*b,*t;
913*28e30874SSatish Balay 
914*28e30874SSatish Balay   /* Make space in temp stack for 16 Byte Aligned arrays */
915*28e30874SSatish Balay   float         ssealignedspace[11],*tmps,*tmpx;
916*28e30874SSatish Balay   unsigned long offset;
917*28e30874SSatish Balay 
918*28e30874SSatish Balay   PetscFunctionBegin;
919*28e30874SSatish Balay   SSE_SCOPE_BEGIN;
920*28e30874SSatish Balay 
921*28e30874SSatish Balay   offset = (unsigned long)ssealignedspace % 16;
922*28e30874SSatish Balay   if (offset) offset = (16 - offset)/4;
923*28e30874SSatish Balay   tmps = &ssealignedspace[offset];
924*28e30874SSatish Balay   tmpx = &ssealignedspace[offset+4];
925*28e30874SSatish Balay   PREFETCH_NTA(aa+16*ai[1]);
926*28e30874SSatish Balay 
927*28e30874SSatish Balay   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
928*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
929*28e30874SSatish Balay   t    = a->solve_work;
930*28e30874SSatish Balay 
931*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
932*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
933*28e30874SSatish Balay 
934*28e30874SSatish Balay   /* forward solve the lower triangular */
935*28e30874SSatish Balay   idx  = 4*(*r++);
936*28e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
937*28e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
938*28e30874SSatish Balay   v    =  aa + 16*ai[1];
939*28e30874SSatish Balay 
940*28e30874SSatish Balay   for (i=1; i<n; ) {
941*28e30874SSatish Balay     PREFETCH_NTA(&v[8]);
942*28e30874SSatish Balay     vi  =  aj      + ai[i];
943*28e30874SSatish Balay     nz  =  diag[i] - ai[i];
944*28e30874SSatish Balay     idx =  4*(*r++);
945*28e30874SSatish Balay 
946*28e30874SSatish Balay     /* Demote sum from double to float */
947*28e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
948*28e30874SSatish Balay     LOAD_PS(tmps,XMM7);
949*28e30874SSatish Balay 
950*28e30874SSatish Balay     while (nz--) {
951*28e30874SSatish Balay       PREFETCH_NTA(&v[16]);
952*28e30874SSatish Balay       idx = 4*(*vi++);
953*28e30874SSatish Balay 
954*28e30874SSatish Balay       /* Demote solution (so far) from double to float */
955*28e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
956*28e30874SSatish Balay 
957*28e30874SSatish Balay       /* 4x4 Matrix-Vector product with negative accumulation: */
958*28e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx,v)
959*28e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
960*28e30874SSatish Balay 
961*28e30874SSatish Balay       /* First Column */
962*28e30874SSatish Balay       SSE_COPY_PS(XMM0,XMM6)
963*28e30874SSatish Balay       SSE_SHUFFLE(XMM0,XMM0,0x00)
964*28e30874SSatish Balay       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
965*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM0)
966*28e30874SSatish Balay 
967*28e30874SSatish Balay       /* Second Column */
968*28e30874SSatish Balay       SSE_COPY_PS(XMM1,XMM6)
969*28e30874SSatish Balay       SSE_SHUFFLE(XMM1,XMM1,0x55)
970*28e30874SSatish Balay       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
971*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM1)
972*28e30874SSatish Balay 
973*28e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
974*28e30874SSatish Balay 
975*28e30874SSatish Balay       /* Third Column */
976*28e30874SSatish Balay       SSE_COPY_PS(XMM2,XMM6)
977*28e30874SSatish Balay       SSE_SHUFFLE(XMM2,XMM2,0xAA)
978*28e30874SSatish Balay       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
979*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM2)
980*28e30874SSatish Balay 
981*28e30874SSatish Balay       /* Fourth Column */
982*28e30874SSatish Balay       SSE_COPY_PS(XMM3,XMM6)
983*28e30874SSatish Balay       SSE_SHUFFLE(XMM3,XMM3,0xFF)
984*28e30874SSatish Balay       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
985*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM3)
986*28e30874SSatish Balay       SSE_INLINE_END_2
987*28e30874SSatish Balay 
988*28e30874SSatish Balay       v += 16;
989*28e30874SSatish Balay     }
990*28e30874SSatish Balay     idx = 4*i;
991*28e30874SSatish Balay     v   = aa + 16*ai[++i];
992*28e30874SSatish Balay     PREFETCH_NTA(v);
993*28e30874SSatish Balay     STORE_PS(tmps,XMM7);
994*28e30874SSatish Balay 
995*28e30874SSatish Balay     /* Promote result from float to double */
996*28e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
997*28e30874SSatish Balay   }
998*28e30874SSatish Balay   /* backward solve the upper triangular */
999*28e30874SSatish Balay   idt  = 4*(n-1);
1000*28e30874SSatish Balay   ai16 = 16*diag[n-1];
1001*28e30874SSatish Balay   v    = aa + ai16 + 16;
1002*28e30874SSatish Balay   for (i=n-1; i>=0; ) {
1003*28e30874SSatish Balay     PREFETCH_NTA(&v[8]);
1004*28e30874SSatish Balay     vi = aj + diag[i] + 1;
1005*28e30874SSatish Balay     nz = ai[i+1] - diag[i] - 1;
1006*28e30874SSatish Balay 
1007*28e30874SSatish Balay     /* Demote accumulator from double to float */
1008*28e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1009*28e30874SSatish Balay     LOAD_PS(tmps,XMM7);
1010*28e30874SSatish Balay 
1011*28e30874SSatish Balay     while (nz--) {
1012*28e30874SSatish Balay       PREFETCH_NTA(&v[16]);
1013*28e30874SSatish Balay       idx = 4*(*vi++);
1014*28e30874SSatish Balay 
1015*28e30874SSatish Balay       /* Demote solution (so far) from double to float */
1016*28e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1017*28e30874SSatish Balay 
1018*28e30874SSatish Balay       /* 4x4 Matrix-Vector Product with negative accumulation: */
1019*28e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx,v)
1020*28e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1021*28e30874SSatish Balay 
1022*28e30874SSatish Balay       /* First Column */
1023*28e30874SSatish Balay       SSE_COPY_PS(XMM0,XMM6)
1024*28e30874SSatish Balay       SSE_SHUFFLE(XMM0,XMM0,0x00)
1025*28e30874SSatish Balay       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1026*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM0)
1027*28e30874SSatish Balay 
1028*28e30874SSatish Balay       /* Second Column */
1029*28e30874SSatish Balay       SSE_COPY_PS(XMM1,XMM6)
1030*28e30874SSatish Balay       SSE_SHUFFLE(XMM1,XMM1,0x55)
1031*28e30874SSatish Balay       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1032*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM1)
1033*28e30874SSatish Balay 
1034*28e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1035*28e30874SSatish Balay 
1036*28e30874SSatish Balay       /* Third Column */
1037*28e30874SSatish Balay       SSE_COPY_PS(XMM2,XMM6)
1038*28e30874SSatish Balay       SSE_SHUFFLE(XMM2,XMM2,0xAA)
1039*28e30874SSatish Balay       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1040*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM2)
1041*28e30874SSatish Balay 
1042*28e30874SSatish Balay       /* Fourth Column */
1043*28e30874SSatish Balay       SSE_COPY_PS(XMM3,XMM6)
1044*28e30874SSatish Balay       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1045*28e30874SSatish Balay       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1046*28e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM3)
1047*28e30874SSatish Balay       SSE_INLINE_END_2
1048*28e30874SSatish Balay       v += 16;
1049*28e30874SSatish Balay     }
1050*28e30874SSatish Balay     v    = aa + ai16;
1051*28e30874SSatish Balay     ai16 = 16*diag[--i];
1052*28e30874SSatish Balay     PREFETCH_NTA(aa+ai16+16);
1053*28e30874SSatish Balay     /*
1054*28e30874SSatish Balay        Scale the result by the diagonal 4x4 block,
1055*28e30874SSatish Balay        which was inverted as part of the factorization
1056*28e30874SSatish Balay     */
1057*28e30874SSatish Balay     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1058*28e30874SSatish Balay     /* First Column */
1059*28e30874SSatish Balay     SSE_COPY_PS(XMM0,XMM7)
1060*28e30874SSatish Balay     SSE_SHUFFLE(XMM0,XMM0,0x00)
1061*28e30874SSatish Balay     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1062*28e30874SSatish Balay 
1063*28e30874SSatish Balay     /* Second Column */
1064*28e30874SSatish Balay     SSE_COPY_PS(XMM1,XMM7)
1065*28e30874SSatish Balay     SSE_SHUFFLE(XMM1,XMM1,0x55)
1066*28e30874SSatish Balay     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1067*28e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM1)
1068*28e30874SSatish Balay 
1069*28e30874SSatish Balay     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1070*28e30874SSatish Balay 
1071*28e30874SSatish Balay     /* Third Column */
1072*28e30874SSatish Balay     SSE_COPY_PS(XMM2,XMM7)
1073*28e30874SSatish Balay     SSE_SHUFFLE(XMM2,XMM2,0xAA)
1074*28e30874SSatish Balay     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1075*28e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM2)
1076*28e30874SSatish Balay 
1077*28e30874SSatish Balay     /* Fourth Column */
1078*28e30874SSatish Balay     SSE_COPY_PS(XMM3,XMM7)
1079*28e30874SSatish Balay     SSE_SHUFFLE(XMM3,XMM3,0xFF)
1080*28e30874SSatish Balay     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1081*28e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM3)
1082*28e30874SSatish Balay 
1083*28e30874SSatish Balay     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1084*28e30874SSatish Balay     SSE_INLINE_END_3
1085*28e30874SSatish Balay 
1086*28e30874SSatish Balay     /* Promote solution from float to double */
1087*28e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1088*28e30874SSatish Balay 
1089*28e30874SSatish Balay     /* Apply reordering to t and stream into x.    */
1090*28e30874SSatish Balay     /* This way, x doesn't pollute the cache.      */
1091*28e30874SSatish Balay     /* Be careful with size: 2 doubles = 4 floats! */
1092*28e30874SSatish Balay     idc = 4*(*c--);
1093*28e30874SSatish Balay     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1094*28e30874SSatish Balay     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1095*28e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096*28e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1097*28e30874SSatish Balay     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1098*28e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099*28e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1100*28e30874SSatish Balay     SSE_INLINE_END_2
1101*28e30874SSatish Balay     v    = aa + ai16 + 16;
1102*28e30874SSatish Balay     idt -= 4;
1103*28e30874SSatish Balay   }
1104*28e30874SSatish Balay 
1105*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1106*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1107*28e30874SSatish Balay   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1108*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1109*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1110*28e30874SSatish Balay   SSE_SCOPE_END;
1111*28e30874SSatish Balay   PetscFunctionReturn(0);
1112*28e30874SSatish Balay }
1113*28e30874SSatish Balay 
1114*28e30874SSatish Balay #endif
1115*28e30874SSatish Balay 
1116*28e30874SSatish Balay #undef __FUNCT__
1117*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
1118*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1119*28e30874SSatish Balay {
1120*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1121*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
1122*28e30874SSatish Balay   PetscErrorCode    ierr;
1123*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1124*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
1125*28e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1126*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
1127*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1128*28e30874SSatish Balay   const PetscScalar *b;
1129*28e30874SSatish Balay 
1130*28e30874SSatish Balay   PetscFunctionBegin;
1131*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1132*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133*28e30874SSatish Balay   t    = a->solve_work;
1134*28e30874SSatish Balay 
1135*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1136*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1137*28e30874SSatish Balay 
1138*28e30874SSatish Balay   /* forward solve the lower triangular */
1139*28e30874SSatish Balay   idx  = 3*(*r++);
1140*28e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1141*28e30874SSatish Balay   for (i=1; i<n; i++) {
1142*28e30874SSatish Balay     v   = aa + 9*ai[i];
1143*28e30874SSatish Balay     vi  = aj + ai[i];
1144*28e30874SSatish Balay     nz  = diag[i] - ai[i];
1145*28e30874SSatish Balay     idx = 3*(*r++);
1146*28e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1147*28e30874SSatish Balay     while (nz--) {
1148*28e30874SSatish Balay       idx = 3*(*vi++);
1149*28e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1150*28e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1151*28e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1152*28e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1153*28e30874SSatish Balay       v  += 9;
1154*28e30874SSatish Balay     }
1155*28e30874SSatish Balay     idx    = 3*i;
1156*28e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1157*28e30874SSatish Balay   }
1158*28e30874SSatish Balay   /* backward solve the upper triangular */
1159*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
1160*28e30874SSatish Balay     v   = aa + 9*diag[i] + 9;
1161*28e30874SSatish Balay     vi  = aj + diag[i] + 1;
1162*28e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
1163*28e30874SSatish Balay     idt = 3*i;
1164*28e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1165*28e30874SSatish Balay     while (nz--) {
1166*28e30874SSatish Balay       idx = 3*(*vi++);
1167*28e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1168*28e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1169*28e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1170*28e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1171*28e30874SSatish Balay       v  += 9;
1172*28e30874SSatish Balay     }
1173*28e30874SSatish Balay     idc      = 3*(*c--);
1174*28e30874SSatish Balay     v        = aa + 9*diag[i];
1175*28e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1176*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1177*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1178*28e30874SSatish Balay   }
1179*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1180*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1181*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1182*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1183*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1184*28e30874SSatish Balay   PetscFunctionReturn(0);
1185*28e30874SSatish Balay }
1186*28e30874SSatish Balay 
1187*28e30874SSatish Balay #undef __FUNCT__
1188*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
1189*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1190*28e30874SSatish Balay {
1191*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1192*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
1193*28e30874SSatish Balay   PetscErrorCode    ierr;
1194*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1195*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
1196*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
1197*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
1198*28e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1199*28e30874SSatish Balay   const PetscScalar *b;
1200*28e30874SSatish Balay 
1201*28e30874SSatish Balay   PetscFunctionBegin;
1202*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1203*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204*28e30874SSatish Balay   t    = a->solve_work;
1205*28e30874SSatish Balay 
1206*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208*28e30874SSatish Balay 
1209*28e30874SSatish Balay   /* forward solve the lower triangular */
1210*28e30874SSatish Balay   idx  = 3*r[0];
1211*28e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1212*28e30874SSatish Balay   for (i=1; i<n; i++) {
1213*28e30874SSatish Balay     v   = aa + 9*ai[i];
1214*28e30874SSatish Balay     vi  = aj + ai[i];
1215*28e30874SSatish Balay     nz  = ai[i+1] - ai[i];
1216*28e30874SSatish Balay     idx = 3*r[i];
1217*28e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1218*28e30874SSatish Balay     for (m=0; m<nz; m++) {
1219*28e30874SSatish Balay       idx = 3*vi[m];
1220*28e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1221*28e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1222*28e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1223*28e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1224*28e30874SSatish Balay       v  += 9;
1225*28e30874SSatish Balay     }
1226*28e30874SSatish Balay     idx    = 3*i;
1227*28e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1228*28e30874SSatish Balay   }
1229*28e30874SSatish Balay   /* backward solve the upper triangular */
1230*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
1231*28e30874SSatish Balay     v   = aa + 9*(adiag[i+1]+1);
1232*28e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
1233*28e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
1234*28e30874SSatish Balay     idt = 3*i;
1235*28e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1236*28e30874SSatish Balay     for (m=0; m<nz; m++) {
1237*28e30874SSatish Balay       idx = 3*vi[m];
1238*28e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1239*28e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1240*28e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1241*28e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1242*28e30874SSatish Balay       v  += 9;
1243*28e30874SSatish Balay     }
1244*28e30874SSatish Balay     idc      = 3*c[i];
1245*28e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1246*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1247*28e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1248*28e30874SSatish Balay   }
1249*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1250*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1251*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1252*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1253*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1254*28e30874SSatish Balay   PetscFunctionReturn(0);
1255*28e30874SSatish Balay }
1256*28e30874SSatish Balay 
1257*28e30874SSatish Balay #undef __FUNCT__
1258*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
1259*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1260*28e30874SSatish Balay {
1261*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1262*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
1263*28e30874SSatish Balay   PetscErrorCode    ierr;
1264*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1265*28e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
1266*28e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1267*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
1268*28e30874SSatish Balay   PetscScalar       *x,s1,s2,x1,x2,*t;
1269*28e30874SSatish Balay   const PetscScalar *b;
1270*28e30874SSatish Balay 
1271*28e30874SSatish Balay   PetscFunctionBegin;
1272*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1273*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1274*28e30874SSatish Balay   t    = a->solve_work;
1275*28e30874SSatish Balay 
1276*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1277*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1278*28e30874SSatish Balay 
1279*28e30874SSatish Balay   /* forward solve the lower triangular */
1280*28e30874SSatish Balay   idx  = 2*(*r++);
1281*28e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx];
1282*28e30874SSatish Balay   for (i=1; i<n; i++) {
1283*28e30874SSatish Balay     v   = aa + 4*ai[i];
1284*28e30874SSatish Balay     vi  = aj + ai[i];
1285*28e30874SSatish Balay     nz  = diag[i] - ai[i];
1286*28e30874SSatish Balay     idx = 2*(*r++);
1287*28e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx];
1288*28e30874SSatish Balay     while (nz--) {
1289*28e30874SSatish Balay       idx = 2*(*vi++);
1290*28e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
1291*28e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
1292*28e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
1293*28e30874SSatish Balay       v  += 4;
1294*28e30874SSatish Balay     }
1295*28e30874SSatish Balay     idx    = 2*i;
1296*28e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2;
1297*28e30874SSatish Balay   }
1298*28e30874SSatish Balay   /* backward solve the upper triangular */
1299*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
1300*28e30874SSatish Balay     v   = aa + 4*diag[i] + 4;
1301*28e30874SSatish Balay     vi  = aj + diag[i] + 1;
1302*28e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
1303*28e30874SSatish Balay     idt = 2*i;
1304*28e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt];
1305*28e30874SSatish Balay     while (nz--) {
1306*28e30874SSatish Balay       idx = 2*(*vi++);
1307*28e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
1308*28e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
1309*28e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
1310*28e30874SSatish Balay       v  += 4;
1311*28e30874SSatish Balay     }
1312*28e30874SSatish Balay     idc      = 2*(*c--);
1313*28e30874SSatish Balay     v        = aa + 4*diag[i];
1314*28e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1315*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1316*28e30874SSatish Balay   }
1317*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1318*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1319*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1320*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1321*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1322*28e30874SSatish Balay   PetscFunctionReturn(0);
1323*28e30874SSatish Balay }
1324*28e30874SSatish Balay 
1325*28e30874SSatish Balay #undef __FUNCT__
1326*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
1327*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1328*28e30874SSatish Balay {
1329*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1330*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
1331*28e30874SSatish Balay   PetscErrorCode    ierr;
1332*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1333*28e30874SSatish Balay   PetscInt          i,nz,idx,jdx,idt,idc,m;
1334*28e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
1335*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
1336*28e30874SSatish Balay   PetscScalar       *x,s1,s2,x1,x2,*t;
1337*28e30874SSatish Balay   const PetscScalar *b;
1338*28e30874SSatish Balay 
1339*28e30874SSatish Balay   PetscFunctionBegin;
1340*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1341*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1342*28e30874SSatish Balay   t    = a->solve_work;
1343*28e30874SSatish Balay 
1344*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1345*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1346*28e30874SSatish Balay 
1347*28e30874SSatish Balay   /* forward solve the lower triangular */
1348*28e30874SSatish Balay   idx  = 2*r[0];
1349*28e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx];
1350*28e30874SSatish Balay   for (i=1; i<n; i++) {
1351*28e30874SSatish Balay     v   = aa + 4*ai[i];
1352*28e30874SSatish Balay     vi  = aj + ai[i];
1353*28e30874SSatish Balay     nz  = ai[i+1] - ai[i];
1354*28e30874SSatish Balay     idx = 2*r[i];
1355*28e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx];
1356*28e30874SSatish Balay     for (m=0; m<nz; m++) {
1357*28e30874SSatish Balay       jdx = 2*vi[m];
1358*28e30874SSatish Balay       x1  = t[jdx]; x2 = t[1+jdx];
1359*28e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
1360*28e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
1361*28e30874SSatish Balay       v  += 4;
1362*28e30874SSatish Balay     }
1363*28e30874SSatish Balay     idx    = 2*i;
1364*28e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2;
1365*28e30874SSatish Balay   }
1366*28e30874SSatish Balay   /* backward solve the upper triangular */
1367*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
1368*28e30874SSatish Balay     v   = aa + 4*(adiag[i+1]+1);
1369*28e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
1370*28e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
1371*28e30874SSatish Balay     idt = 2*i;
1372*28e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt];
1373*28e30874SSatish Balay     for (m=0; m<nz; m++) {
1374*28e30874SSatish Balay       idx = 2*vi[m];
1375*28e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
1376*28e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
1377*28e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
1378*28e30874SSatish Balay       v  += 4;
1379*28e30874SSatish Balay     }
1380*28e30874SSatish Balay     idc      = 2*c[i];
1381*28e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1382*28e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1383*28e30874SSatish Balay   }
1384*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1385*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1386*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1387*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1388*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1389*28e30874SSatish Balay   PetscFunctionReturn(0);
1390*28e30874SSatish Balay }
1391*28e30874SSatish Balay 
1392*28e30874SSatish Balay #undef __FUNCT__
1393*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
1394*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1395*28e30874SSatish Balay {
1396*28e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1397*28e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
1398*28e30874SSatish Balay   PetscErrorCode    ierr;
1399*28e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1400*28e30874SSatish Balay   PetscInt          i,nz;
1401*28e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1402*28e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
1403*28e30874SSatish Balay   PetscScalar       *x,s1,*t;
1404*28e30874SSatish Balay   const PetscScalar *b;
1405*28e30874SSatish Balay 
1406*28e30874SSatish Balay   PetscFunctionBegin;
1407*28e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
1408*28e30874SSatish Balay 
1409*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1410*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1411*28e30874SSatish Balay   t    = a->solve_work;
1412*28e30874SSatish Balay 
1413*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1414*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1415*28e30874SSatish Balay 
1416*28e30874SSatish Balay   /* forward solve the lower triangular */
1417*28e30874SSatish Balay   t[0] = b[*r++];
1418*28e30874SSatish Balay   for (i=1; i<n; i++) {
1419*28e30874SSatish Balay     v  = aa + ai[i];
1420*28e30874SSatish Balay     vi = aj + ai[i];
1421*28e30874SSatish Balay     nz = diag[i] - ai[i];
1422*28e30874SSatish Balay     s1 = b[*r++];
1423*28e30874SSatish Balay     while (nz--) {
1424*28e30874SSatish Balay       s1 -= (*v++)*t[*vi++];
1425*28e30874SSatish Balay     }
1426*28e30874SSatish Balay     t[i] = s1;
1427*28e30874SSatish Balay   }
1428*28e30874SSatish Balay   /* backward solve the upper triangular */
1429*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
1430*28e30874SSatish Balay     v  = aa + diag[i] + 1;
1431*28e30874SSatish Balay     vi = aj + diag[i] + 1;
1432*28e30874SSatish Balay     nz = ai[i+1] - diag[i] - 1;
1433*28e30874SSatish Balay     s1 = t[i];
1434*28e30874SSatish Balay     while (nz--) {
1435*28e30874SSatish Balay       s1 -= (*v++)*t[*vi++];
1436*28e30874SSatish Balay     }
1437*28e30874SSatish Balay     x[*c--] = t[i] = aa[diag[i]]*s1;
1438*28e30874SSatish Balay   }
1439*28e30874SSatish Balay 
1440*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1441*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1442*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1443*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1444*28e30874SSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
1445*28e30874SSatish Balay   PetscFunctionReturn(0);
1446*28e30874SSatish Balay }
1447*28e30874SSatish Balay 
1448*28e30874SSatish Balay #undef __FUNCT__
1449*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
1450*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1451*28e30874SSatish Balay {
1452*28e30874SSatish Balay   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
1453*28e30874SSatish Balay   IS                iscol = a->col,isrow = a->row;
1454*28e30874SSatish Balay   PetscErrorCode    ierr;
1455*28e30874SSatish Balay   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1456*28e30874SSatish Balay   const PetscInt    *rout,*cout,*r,*c;
1457*28e30874SSatish Balay   PetscScalar       *x,*tmp,sum;
1458*28e30874SSatish Balay   const PetscScalar *b;
1459*28e30874SSatish Balay   const MatScalar   *aa = a->a,*v;
1460*28e30874SSatish Balay 
1461*28e30874SSatish Balay   PetscFunctionBegin;
1462*28e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
1463*28e30874SSatish Balay 
1464*28e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1465*28e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1466*28e30874SSatish Balay   tmp  = a->solve_work;
1467*28e30874SSatish Balay 
1468*28e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1469*28e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1470*28e30874SSatish Balay 
1471*28e30874SSatish Balay   /* forward solve the lower triangular */
1472*28e30874SSatish Balay   tmp[0] = b[r[0]];
1473*28e30874SSatish Balay   v      = aa;
1474*28e30874SSatish Balay   vi     = aj;
1475*28e30874SSatish Balay   for (i=1; i<n; i++) {
1476*28e30874SSatish Balay     nz  = ai[i+1] - ai[i];
1477*28e30874SSatish Balay     sum = b[r[i]];
1478*28e30874SSatish Balay     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1479*28e30874SSatish Balay     tmp[i] = sum;
1480*28e30874SSatish Balay     v     += nz; vi += nz;
1481*28e30874SSatish Balay   }
1482*28e30874SSatish Balay 
1483*28e30874SSatish Balay   /* backward solve the upper triangular */
1484*28e30874SSatish Balay   for (i=n-1; i>=0; i--) {
1485*28e30874SSatish Balay     v   = aa + adiag[i+1]+1;
1486*28e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
1487*28e30874SSatish Balay     nz  = adiag[i]-adiag[i+1]-1;
1488*28e30874SSatish Balay     sum = tmp[i];
1489*28e30874SSatish Balay     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1490*28e30874SSatish Balay     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1491*28e30874SSatish Balay   }
1492*28e30874SSatish Balay 
1493*28e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1494*28e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1495*28e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1496*28e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1497*28e30874SSatish Balay   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
1498*28e30874SSatish Balay   PetscFunctionReturn(0);
1499*28e30874SSatish Balay }
1500*28e30874SSatish Balay 
1501