xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision 2c733ed45a445b0336611d812c866924feeaee2c)
1*2c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h>
2*2c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h>
3*2c733ed4SBarry Smith 
4*2c733ed4SBarry Smith /*
5*2c733ed4SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
6*2c733ed4SBarry Smith    ordering. This eliminates the need for the column and row permutation.
7*2c733ed4SBarry Smith */
8*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
9*2c733ed4SBarry Smith {
10*2c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
11*2c733ed4SBarry Smith   PetscInt          n  =a->mbs;
12*2c733ed4SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
13*2c733ed4SBarry Smith   PetscErrorCode    ierr;
14*2c733ed4SBarry Smith   const PetscInt    *diag = a->diag;
15*2c733ed4SBarry Smith   const MatScalar   *aa   =a->a;
16*2c733ed4SBarry Smith   PetscScalar       *x;
17*2c733ed4SBarry Smith   const PetscScalar *b;
18*2c733ed4SBarry Smith 
19*2c733ed4SBarry Smith   PetscFunctionBegin;
20*2c733ed4SBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21*2c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22*2c733ed4SBarry Smith 
23*2c733ed4SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
24*2c733ed4SBarry Smith   {
25*2c733ed4SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
26*2c733ed4SBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
27*2c733ed4SBarry Smith   }
28*2c733ed4SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
29*2c733ed4SBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
30*2c733ed4SBarry Smith #else
31*2c733ed4SBarry Smith   {
32*2c733ed4SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
33*2c733ed4SBarry Smith     const MatScalar *v;
34*2c733ed4SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
35*2c733ed4SBarry Smith     const PetscInt  *vi;
36*2c733ed4SBarry Smith 
37*2c733ed4SBarry Smith     /* forward solve the lower triangular */
38*2c733ed4SBarry Smith     idx  = 0;
39*2c733ed4SBarry Smith     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
40*2c733ed4SBarry Smith     for (i=1; i<n; i++) {
41*2c733ed4SBarry Smith       v    =  aa      + 16*ai[i];
42*2c733ed4SBarry Smith       vi   =  aj      + ai[i];
43*2c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
44*2c733ed4SBarry Smith       idx +=  4;
45*2c733ed4SBarry Smith       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
46*2c733ed4SBarry Smith       while (nz--) {
47*2c733ed4SBarry Smith         jdx = 4*(*vi++);
48*2c733ed4SBarry Smith         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
49*2c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
50*2c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
51*2c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
52*2c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
53*2c733ed4SBarry Smith         v  += 16;
54*2c733ed4SBarry Smith       }
55*2c733ed4SBarry Smith       x[idx]   = s1;
56*2c733ed4SBarry Smith       x[1+idx] = s2;
57*2c733ed4SBarry Smith       x[2+idx] = s3;
58*2c733ed4SBarry Smith       x[3+idx] = s4;
59*2c733ed4SBarry Smith     }
60*2c733ed4SBarry Smith     /* backward solve the upper triangular */
61*2c733ed4SBarry Smith     idt = 4*(n-1);
62*2c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
63*2c733ed4SBarry Smith       ai16 = 16*diag[i];
64*2c733ed4SBarry Smith       v    = aa + ai16 + 16;
65*2c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
66*2c733ed4SBarry Smith       nz   = ai[i+1] - diag[i] - 1;
67*2c733ed4SBarry Smith       s1   = x[idt];  s2 = x[1+idt];
68*2c733ed4SBarry Smith       s3   = x[2+idt];s4 = x[3+idt];
69*2c733ed4SBarry Smith       while (nz--) {
70*2c733ed4SBarry Smith         idx = 4*(*vi++);
71*2c733ed4SBarry Smith         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
72*2c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
73*2c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
74*2c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
75*2c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
76*2c733ed4SBarry Smith         v  += 16;
77*2c733ed4SBarry Smith       }
78*2c733ed4SBarry Smith       v        = aa + ai16;
79*2c733ed4SBarry Smith       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
80*2c733ed4SBarry Smith       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
81*2c733ed4SBarry Smith       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
82*2c733ed4SBarry Smith       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
83*2c733ed4SBarry Smith       idt     -= 4;
84*2c733ed4SBarry Smith     }
85*2c733ed4SBarry Smith   }
86*2c733ed4SBarry Smith #endif
87*2c733ed4SBarry Smith 
88*2c733ed4SBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
89*2c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
90*2c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
91*2c733ed4SBarry Smith   PetscFunctionReturn(0);
92*2c733ed4SBarry Smith }
93*2c733ed4SBarry Smith 
94*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
95*2c733ed4SBarry Smith {
96*2c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
97*2c733ed4SBarry Smith   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
98*2c733ed4SBarry Smith   PetscInt          i,k,nz,idx,jdx,idt;
99*2c733ed4SBarry Smith   PetscErrorCode    ierr;
100*2c733ed4SBarry Smith   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
101*2c733ed4SBarry Smith   const MatScalar   *aa=a->a,*v;
102*2c733ed4SBarry Smith   PetscScalar       *x;
103*2c733ed4SBarry Smith   const PetscScalar *b;
104*2c733ed4SBarry Smith   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
105*2c733ed4SBarry Smith 
106*2c733ed4SBarry Smith   PetscFunctionBegin;
107*2c733ed4SBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
108*2c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
109*2c733ed4SBarry Smith   /* forward solve the lower triangular */
110*2c733ed4SBarry Smith   idx  = 0;
111*2c733ed4SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
112*2c733ed4SBarry Smith   for (i=1; i<n; i++) {
113*2c733ed4SBarry Smith     v   = aa + bs2*ai[i];
114*2c733ed4SBarry Smith     vi  = aj + ai[i];
115*2c733ed4SBarry Smith     nz  = ai[i+1] - ai[i];
116*2c733ed4SBarry Smith     idx = bs*i;
117*2c733ed4SBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
118*2c733ed4SBarry Smith     for (k=0; k<nz; k++) {
119*2c733ed4SBarry Smith       jdx = bs*vi[k];
120*2c733ed4SBarry Smith       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
121*2c733ed4SBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
122*2c733ed4SBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
123*2c733ed4SBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
124*2c733ed4SBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
125*2c733ed4SBarry Smith 
126*2c733ed4SBarry Smith       v +=  bs2;
127*2c733ed4SBarry Smith     }
128*2c733ed4SBarry Smith 
129*2c733ed4SBarry Smith     x[idx]   = s1;
130*2c733ed4SBarry Smith     x[1+idx] = s2;
131*2c733ed4SBarry Smith     x[2+idx] = s3;
132*2c733ed4SBarry Smith     x[3+idx] = s4;
133*2c733ed4SBarry Smith   }
134*2c733ed4SBarry Smith 
135*2c733ed4SBarry Smith   /* backward solve the upper triangular */
136*2c733ed4SBarry Smith   for (i=n-1; i>=0; i--) {
137*2c733ed4SBarry Smith     v   = aa + bs2*(adiag[i+1]+1);
138*2c733ed4SBarry Smith     vi  = aj + adiag[i+1]+1;
139*2c733ed4SBarry Smith     nz  = adiag[i] - adiag[i+1]-1;
140*2c733ed4SBarry Smith     idt = bs*i;
141*2c733ed4SBarry Smith     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
142*2c733ed4SBarry Smith 
143*2c733ed4SBarry Smith     for (k=0; k<nz; k++) {
144*2c733ed4SBarry Smith       idx = bs*vi[k];
145*2c733ed4SBarry Smith       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
146*2c733ed4SBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
147*2c733ed4SBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
148*2c733ed4SBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
149*2c733ed4SBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
150*2c733ed4SBarry Smith 
151*2c733ed4SBarry Smith       v +=  bs2;
152*2c733ed4SBarry Smith     }
153*2c733ed4SBarry Smith     /* x = inv_diagonal*x */
154*2c733ed4SBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
155*2c733ed4SBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
156*2c733ed4SBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
157*2c733ed4SBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
158*2c733ed4SBarry Smith 
159*2c733ed4SBarry Smith   }
160*2c733ed4SBarry Smith 
161*2c733ed4SBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
162*2c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
163*2c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
164*2c733ed4SBarry Smith   PetscFunctionReturn(0);
165*2c733ed4SBarry Smith }
166*2c733ed4SBarry Smith 
167*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
168*2c733ed4SBarry Smith {
169*2c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
170*2c733ed4SBarry Smith   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
171*2c733ed4SBarry Smith   PetscErrorCode    ierr;
172*2c733ed4SBarry Smith   const MatScalar   *aa=a->a;
173*2c733ed4SBarry Smith   const PetscScalar *b;
174*2c733ed4SBarry Smith   PetscScalar       *x;
175*2c733ed4SBarry Smith 
176*2c733ed4SBarry Smith   PetscFunctionBegin;
177*2c733ed4SBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
178*2c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
179*2c733ed4SBarry Smith 
180*2c733ed4SBarry Smith   {
181*2c733ed4SBarry Smith     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
182*2c733ed4SBarry Smith     const MatScalar *v;
183*2c733ed4SBarry Smith     MatScalar       *t=(MatScalar*)x;
184*2c733ed4SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
185*2c733ed4SBarry Smith     const PetscInt  *vi;
186*2c733ed4SBarry Smith 
187*2c733ed4SBarry Smith     /* forward solve the lower triangular */
188*2c733ed4SBarry Smith     idx  = 0;
189*2c733ed4SBarry Smith     t[0] = (MatScalar)b[0];
190*2c733ed4SBarry Smith     t[1] = (MatScalar)b[1];
191*2c733ed4SBarry Smith     t[2] = (MatScalar)b[2];
192*2c733ed4SBarry Smith     t[3] = (MatScalar)b[3];
193*2c733ed4SBarry Smith     for (i=1; i<n; i++) {
194*2c733ed4SBarry Smith       v    =  aa      + 16*ai[i];
195*2c733ed4SBarry Smith       vi   =  aj      + ai[i];
196*2c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
197*2c733ed4SBarry Smith       idx +=  4;
198*2c733ed4SBarry Smith       s1   = (MatScalar)b[idx];
199*2c733ed4SBarry Smith       s2   = (MatScalar)b[1+idx];
200*2c733ed4SBarry Smith       s3   = (MatScalar)b[2+idx];
201*2c733ed4SBarry Smith       s4   = (MatScalar)b[3+idx];
202*2c733ed4SBarry Smith       while (nz--) {
203*2c733ed4SBarry Smith         jdx = 4*(*vi++);
204*2c733ed4SBarry Smith         x1  = t[jdx];
205*2c733ed4SBarry Smith         x2  = t[1+jdx];
206*2c733ed4SBarry Smith         x3  = t[2+jdx];
207*2c733ed4SBarry Smith         x4  = t[3+jdx];
208*2c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
209*2c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
210*2c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
211*2c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
212*2c733ed4SBarry Smith         v  += 16;
213*2c733ed4SBarry Smith       }
214*2c733ed4SBarry Smith       t[idx]   = s1;
215*2c733ed4SBarry Smith       t[1+idx] = s2;
216*2c733ed4SBarry Smith       t[2+idx] = s3;
217*2c733ed4SBarry Smith       t[3+idx] = s4;
218*2c733ed4SBarry Smith     }
219*2c733ed4SBarry Smith     /* backward solve the upper triangular */
220*2c733ed4SBarry Smith     idt = 4*(n-1);
221*2c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
222*2c733ed4SBarry Smith       ai16 = 16*diag[i];
223*2c733ed4SBarry Smith       v    = aa + ai16 + 16;
224*2c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
225*2c733ed4SBarry Smith       nz   = ai[i+1] - diag[i] - 1;
226*2c733ed4SBarry Smith       s1   = t[idt];
227*2c733ed4SBarry Smith       s2   = t[1+idt];
228*2c733ed4SBarry Smith       s3   = t[2+idt];
229*2c733ed4SBarry Smith       s4   = t[3+idt];
230*2c733ed4SBarry Smith       while (nz--) {
231*2c733ed4SBarry Smith         idx = 4*(*vi++);
232*2c733ed4SBarry Smith         x1  = (MatScalar)x[idx];
233*2c733ed4SBarry Smith         x2  = (MatScalar)x[1+idx];
234*2c733ed4SBarry Smith         x3  = (MatScalar)x[2+idx];
235*2c733ed4SBarry Smith         x4  = (MatScalar)x[3+idx];
236*2c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
237*2c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
238*2c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
239*2c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
240*2c733ed4SBarry Smith         v  += 16;
241*2c733ed4SBarry Smith       }
242*2c733ed4SBarry Smith       v        = aa + ai16;
243*2c733ed4SBarry Smith       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
244*2c733ed4SBarry Smith       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
245*2c733ed4SBarry Smith       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
246*2c733ed4SBarry Smith       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
247*2c733ed4SBarry Smith       idt     -= 4;
248*2c733ed4SBarry Smith     }
249*2c733ed4SBarry Smith   }
250*2c733ed4SBarry Smith 
251*2c733ed4SBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
252*2c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
253*2c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
254*2c733ed4SBarry Smith   PetscFunctionReturn(0);
255*2c733ed4SBarry Smith }
256*2c733ed4SBarry Smith 
257*2c733ed4SBarry Smith #if defined(PETSC_HAVE_SSE)
258*2c733ed4SBarry Smith 
259*2c733ed4SBarry Smith #include PETSC_HAVE_SSE
260*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
261*2c733ed4SBarry Smith {
262*2c733ed4SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
263*2c733ed4SBarry Smith   unsigned short *aj=(unsigned short*)a->j;
264*2c733ed4SBarry Smith   PetscErrorCode ierr;
265*2c733ed4SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
266*2c733ed4SBarry Smith   MatScalar      *aa=a->a;
267*2c733ed4SBarry Smith   PetscScalar    *x,*b;
268*2c733ed4SBarry Smith 
269*2c733ed4SBarry Smith   PetscFunctionBegin;
270*2c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
271*2c733ed4SBarry Smith   /*
272*2c733ed4SBarry Smith      Note: This code currently uses demotion of double
273*2c733ed4SBarry Smith      to float when performing the mixed-mode computation.
274*2c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
275*2c733ed4SBarry Smith   */
276*2c733ed4SBarry Smith   PREFETCH_NTA(aa+16*ai[1]);
277*2c733ed4SBarry Smith 
278*2c733ed4SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
279*2c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
280*2c733ed4SBarry Smith   {
281*2c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
282*2c733ed4SBarry Smith     MatScalar      *v,*t=(MatScalar*)x;
283*2c733ed4SBarry Smith     int            nz,i,idt,ai16;
284*2c733ed4SBarry Smith     unsigned int   jdx,idx;
285*2c733ed4SBarry Smith     unsigned short *vi;
286*2c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
287*2c733ed4SBarry Smith 
288*2c733ed4SBarry Smith     /* First block is the identity. */
289*2c733ed4SBarry Smith     idx = 0;
290*2c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t,b);
291*2c733ed4SBarry Smith     v =  aa + 16*((unsigned int)ai[1]);
292*2c733ed4SBarry Smith 
293*2c733ed4SBarry Smith     for (i=1; i<n; ) {
294*2c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
295*2c733ed4SBarry Smith       vi   =  aj      + ai[i];
296*2c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
297*2c733ed4SBarry Smith       idx +=  4;
298*2c733ed4SBarry Smith 
299*2c733ed4SBarry Smith       /* Demote RHS from double to float. */
300*2c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
301*2c733ed4SBarry Smith       LOAD_PS(&t[idx],XMM7);
302*2c733ed4SBarry Smith 
303*2c733ed4SBarry Smith       while (nz--) {
304*2c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
305*2c733ed4SBarry Smith         jdx = 4*((unsigned int)(*vi++));
306*2c733ed4SBarry Smith 
307*2c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
308*2c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx],v)
309*2c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
310*2c733ed4SBarry Smith 
311*2c733ed4SBarry Smith         /* First Column */
312*2c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
313*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
314*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
315*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
316*2c733ed4SBarry Smith 
317*2c733ed4SBarry Smith         /* Second Column */
318*2c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
319*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
320*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
321*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
322*2c733ed4SBarry Smith 
323*2c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
324*2c733ed4SBarry Smith 
325*2c733ed4SBarry Smith         /* Third Column */
326*2c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
327*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
328*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
329*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
330*2c733ed4SBarry Smith 
331*2c733ed4SBarry Smith         /* Fourth Column */
332*2c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
333*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
334*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
335*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
336*2c733ed4SBarry Smith         SSE_INLINE_END_2
337*2c733ed4SBarry Smith 
338*2c733ed4SBarry Smith         v += 16;
339*2c733ed4SBarry Smith       }
340*2c733ed4SBarry Smith       v =  aa + 16*ai[++i];
341*2c733ed4SBarry Smith       PREFETCH_NTA(v);
342*2c733ed4SBarry Smith       STORE_PS(&t[idx],XMM7);
343*2c733ed4SBarry Smith     }
344*2c733ed4SBarry Smith 
345*2c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
346*2c733ed4SBarry Smith 
347*2c733ed4SBarry Smith     idt  = 4*(n-1);
348*2c733ed4SBarry Smith     ai16 = 16*diag[n-1];
349*2c733ed4SBarry Smith     v    = aa + ai16 + 16;
350*2c733ed4SBarry Smith     for (i=n-1; i>=0; ) {
351*2c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
352*2c733ed4SBarry Smith       vi = aj + diag[i] + 1;
353*2c733ed4SBarry Smith       nz = ai[i+1] - diag[i] - 1;
354*2c733ed4SBarry Smith 
355*2c733ed4SBarry Smith       LOAD_PS(&t[idt],XMM7);
356*2c733ed4SBarry Smith 
357*2c733ed4SBarry Smith       while (nz--) {
358*2c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
359*2c733ed4SBarry Smith         idx = 4*((unsigned int)(*vi++));
360*2c733ed4SBarry Smith 
361*2c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
362*2c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx],v)
363*2c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
364*2c733ed4SBarry Smith 
365*2c733ed4SBarry Smith         /* First Column */
366*2c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
367*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
368*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
369*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
370*2c733ed4SBarry Smith 
371*2c733ed4SBarry Smith         /* Second Column */
372*2c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
373*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
374*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
375*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
376*2c733ed4SBarry Smith 
377*2c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
378*2c733ed4SBarry Smith 
379*2c733ed4SBarry Smith         /* Third Column */
380*2c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
381*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
382*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
383*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
384*2c733ed4SBarry Smith 
385*2c733ed4SBarry Smith         /* Fourth Column */
386*2c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
387*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
388*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
389*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
390*2c733ed4SBarry Smith         SSE_INLINE_END_2
391*2c733ed4SBarry Smith         v += 16;
392*2c733ed4SBarry Smith       }
393*2c733ed4SBarry Smith       v    = aa + ai16;
394*2c733ed4SBarry Smith       ai16 = 16*diag[--i];
395*2c733ed4SBarry Smith       PREFETCH_NTA(aa+ai16+16);
396*2c733ed4SBarry Smith       /*
397*2c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
398*2c733ed4SBarry Smith          which was inverted as part of the factorization
399*2c733ed4SBarry Smith       */
400*2c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
401*2c733ed4SBarry Smith       /* First Column */
402*2c733ed4SBarry Smith       SSE_COPY_PS(XMM0,XMM7)
403*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM0,XMM0,0x00)
404*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
405*2c733ed4SBarry Smith 
406*2c733ed4SBarry Smith       /* Second Column */
407*2c733ed4SBarry Smith       SSE_COPY_PS(XMM1,XMM7)
408*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM1,XMM1,0x55)
409*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
410*2c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM1)
411*2c733ed4SBarry Smith 
412*2c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
413*2c733ed4SBarry Smith 
414*2c733ed4SBarry Smith       /* Third Column */
415*2c733ed4SBarry Smith       SSE_COPY_PS(XMM2,XMM7)
416*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM2,XMM2,0xAA)
417*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
418*2c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM2)
419*2c733ed4SBarry Smith 
420*2c733ed4SBarry Smith       /* Fourth Column */
421*2c733ed4SBarry Smith       SSE_COPY_PS(XMM3,XMM7)
422*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM3,XMM3,0xFF)
423*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
424*2c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM3)
425*2c733ed4SBarry Smith 
426*2c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
427*2c733ed4SBarry Smith       SSE_INLINE_END_3
428*2c733ed4SBarry Smith 
429*2c733ed4SBarry Smith       v    = aa + ai16 + 16;
430*2c733ed4SBarry Smith       idt -= 4;
431*2c733ed4SBarry Smith     }
432*2c733ed4SBarry Smith 
433*2c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
434*2c733ed4SBarry Smith     idt = 4*(n-1);
435*2c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
436*2c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
437*2c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
438*2c733ed4SBarry Smith       PetscScalar *xtemp=&x[idt];
439*2c733ed4SBarry Smith       MatScalar   *ttemp=&t[idt];
440*2c733ed4SBarry Smith       xtemp[3] = (PetscScalar)ttemp[3];
441*2c733ed4SBarry Smith       xtemp[2] = (PetscScalar)ttemp[2];
442*2c733ed4SBarry Smith       xtemp[1] = (PetscScalar)ttemp[1];
443*2c733ed4SBarry Smith       xtemp[0] = (PetscScalar)ttemp[0];
444*2c733ed4SBarry Smith       idt     -= 4;
445*2c733ed4SBarry Smith     }
446*2c733ed4SBarry Smith 
447*2c733ed4SBarry Smith   } /* End of artificial scope. */
448*2c733ed4SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
449*2c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
450*2c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
451*2c733ed4SBarry Smith   SSE_SCOPE_END;
452*2c733ed4SBarry Smith   PetscFunctionReturn(0);
453*2c733ed4SBarry Smith }
454*2c733ed4SBarry Smith 
455*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
456*2c733ed4SBarry Smith {
457*2c733ed4SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
458*2c733ed4SBarry Smith   int            *aj=a->j;
459*2c733ed4SBarry Smith   PetscErrorCode ierr;
460*2c733ed4SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
461*2c733ed4SBarry Smith   MatScalar      *aa=a->a;
462*2c733ed4SBarry Smith   PetscScalar    *x,*b;
463*2c733ed4SBarry Smith 
464*2c733ed4SBarry Smith   PetscFunctionBegin;
465*2c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
466*2c733ed4SBarry Smith   /*
467*2c733ed4SBarry Smith      Note: This code currently uses demotion of double
468*2c733ed4SBarry Smith      to float when performing the mixed-mode computation.
469*2c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
470*2c733ed4SBarry Smith   */
471*2c733ed4SBarry Smith   PREFETCH_NTA(aa+16*ai[1]);
472*2c733ed4SBarry Smith 
473*2c733ed4SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
474*2c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
475*2c733ed4SBarry Smith   {
476*2c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
477*2c733ed4SBarry Smith     MatScalar *v,*t=(MatScalar*)x;
478*2c733ed4SBarry Smith     int       nz,i,idt,ai16;
479*2c733ed4SBarry Smith     int       jdx,idx;
480*2c733ed4SBarry Smith     int       *vi;
481*2c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
482*2c733ed4SBarry Smith 
483*2c733ed4SBarry Smith     /* First block is the identity. */
484*2c733ed4SBarry Smith     idx = 0;
485*2c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t,b);
486*2c733ed4SBarry Smith     v =  aa + 16*ai[1];
487*2c733ed4SBarry Smith 
488*2c733ed4SBarry Smith     for (i=1; i<n; ) {
489*2c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
490*2c733ed4SBarry Smith       vi   =  aj      + ai[i];
491*2c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
492*2c733ed4SBarry Smith       idx +=  4;
493*2c733ed4SBarry Smith 
494*2c733ed4SBarry Smith       /* Demote RHS from double to float. */
495*2c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
496*2c733ed4SBarry Smith       LOAD_PS(&t[idx],XMM7);
497*2c733ed4SBarry Smith 
498*2c733ed4SBarry Smith       while (nz--) {
499*2c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
500*2c733ed4SBarry Smith         jdx = 4*(*vi++);
501*2c733ed4SBarry Smith /*          jdx = *vi++; */
502*2c733ed4SBarry Smith 
503*2c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
504*2c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx],v)
505*2c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
506*2c733ed4SBarry Smith 
507*2c733ed4SBarry Smith         /* First Column */
508*2c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
509*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
510*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
511*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
512*2c733ed4SBarry Smith 
513*2c733ed4SBarry Smith         /* Second Column */
514*2c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
515*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
516*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
517*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
518*2c733ed4SBarry Smith 
519*2c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
520*2c733ed4SBarry Smith 
521*2c733ed4SBarry Smith         /* Third Column */
522*2c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
523*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
524*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
525*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
526*2c733ed4SBarry Smith 
527*2c733ed4SBarry Smith         /* Fourth Column */
528*2c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
529*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
530*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
531*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
532*2c733ed4SBarry Smith         SSE_INLINE_END_2
533*2c733ed4SBarry Smith 
534*2c733ed4SBarry Smith         v += 16;
535*2c733ed4SBarry Smith       }
536*2c733ed4SBarry Smith       v =  aa + 16*ai[++i];
537*2c733ed4SBarry Smith       PREFETCH_NTA(v);
538*2c733ed4SBarry Smith       STORE_PS(&t[idx],XMM7);
539*2c733ed4SBarry Smith     }
540*2c733ed4SBarry Smith 
541*2c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
542*2c733ed4SBarry Smith 
543*2c733ed4SBarry Smith     idt  = 4*(n-1);
544*2c733ed4SBarry Smith     ai16 = 16*diag[n-1];
545*2c733ed4SBarry Smith     v    = aa + ai16 + 16;
546*2c733ed4SBarry Smith     for (i=n-1; i>=0; ) {
547*2c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
548*2c733ed4SBarry Smith       vi = aj + diag[i] + 1;
549*2c733ed4SBarry Smith       nz = ai[i+1] - diag[i] - 1;
550*2c733ed4SBarry Smith 
551*2c733ed4SBarry Smith       LOAD_PS(&t[idt],XMM7);
552*2c733ed4SBarry Smith 
553*2c733ed4SBarry Smith       while (nz--) {
554*2c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
555*2c733ed4SBarry Smith         idx = 4*(*vi++);
556*2c733ed4SBarry Smith /*          idx = *vi++; */
557*2c733ed4SBarry Smith 
558*2c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
559*2c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx],v)
560*2c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
561*2c733ed4SBarry Smith 
562*2c733ed4SBarry Smith         /* First Column */
563*2c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
564*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
565*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
566*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
567*2c733ed4SBarry Smith 
568*2c733ed4SBarry Smith         /* Second Column */
569*2c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
570*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
571*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
572*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
573*2c733ed4SBarry Smith 
574*2c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
575*2c733ed4SBarry Smith 
576*2c733ed4SBarry Smith         /* Third Column */
577*2c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
578*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
579*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
580*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
581*2c733ed4SBarry Smith 
582*2c733ed4SBarry Smith         /* Fourth Column */
583*2c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
584*2c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
585*2c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
586*2c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
587*2c733ed4SBarry Smith         SSE_INLINE_END_2
588*2c733ed4SBarry Smith         v += 16;
589*2c733ed4SBarry Smith       }
590*2c733ed4SBarry Smith       v    = aa + ai16;
591*2c733ed4SBarry Smith       ai16 = 16*diag[--i];
592*2c733ed4SBarry Smith       PREFETCH_NTA(aa+ai16+16);
593*2c733ed4SBarry Smith       /*
594*2c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
595*2c733ed4SBarry Smith          which was inverted as part of the factorization
596*2c733ed4SBarry Smith       */
597*2c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
598*2c733ed4SBarry Smith       /* First Column */
599*2c733ed4SBarry Smith       SSE_COPY_PS(XMM0,XMM7)
600*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM0,XMM0,0x00)
601*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
602*2c733ed4SBarry Smith 
603*2c733ed4SBarry Smith       /* Second Column */
604*2c733ed4SBarry Smith       SSE_COPY_PS(XMM1,XMM7)
605*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM1,XMM1,0x55)
606*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
607*2c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM1)
608*2c733ed4SBarry Smith 
609*2c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
610*2c733ed4SBarry Smith 
611*2c733ed4SBarry Smith       /* Third Column */
612*2c733ed4SBarry Smith       SSE_COPY_PS(XMM2,XMM7)
613*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM2,XMM2,0xAA)
614*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
615*2c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM2)
616*2c733ed4SBarry Smith 
617*2c733ed4SBarry Smith       /* Fourth Column */
618*2c733ed4SBarry Smith       SSE_COPY_PS(XMM3,XMM7)
619*2c733ed4SBarry Smith       SSE_SHUFFLE(XMM3,XMM3,0xFF)
620*2c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
621*2c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM3)
622*2c733ed4SBarry Smith 
623*2c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
624*2c733ed4SBarry Smith       SSE_INLINE_END_3
625*2c733ed4SBarry Smith 
626*2c733ed4SBarry Smith       v    = aa + ai16 + 16;
627*2c733ed4SBarry Smith       idt -= 4;
628*2c733ed4SBarry Smith     }
629*2c733ed4SBarry Smith 
630*2c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
631*2c733ed4SBarry Smith     idt = 4*(n-1);
632*2c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
633*2c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
634*2c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
635*2c733ed4SBarry Smith       PetscScalar *xtemp=&x[idt];
636*2c733ed4SBarry Smith       MatScalar   *ttemp=&t[idt];
637*2c733ed4SBarry Smith       xtemp[3] = (PetscScalar)ttemp[3];
638*2c733ed4SBarry Smith       xtemp[2] = (PetscScalar)ttemp[2];
639*2c733ed4SBarry Smith       xtemp[1] = (PetscScalar)ttemp[1];
640*2c733ed4SBarry Smith       xtemp[0] = (PetscScalar)ttemp[0];
641*2c733ed4SBarry Smith       idt     -= 4;
642*2c733ed4SBarry Smith     }
643*2c733ed4SBarry Smith 
644*2c733ed4SBarry Smith   } /* End of artificial scope. */
645*2c733ed4SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
646*2c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
647*2c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
648*2c733ed4SBarry Smith   SSE_SCOPE_END;
649*2c733ed4SBarry Smith   PetscFunctionReturn(0);
650*2c733ed4SBarry Smith }
651*2c733ed4SBarry Smith 
652*2c733ed4SBarry Smith #endif
653