xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision 503c0ea9b45bcfbcebbb1ea5341243bbc69f0bea)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 /*
5       Special case where the matrix was ILU(0) factored in the natural
6    ordering. This eliminates the need for the column and row permutation.
7 */
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace - Solves A*x = b where A holds an
   in-place ILU(0) factorization of a SeqBAIJ matrix with 4x4 blocks, factored in
   the natural ordering (no row/column permutation needed).

   Each 4x4 block is stored column-major in 16 consecutive MatScalar entries, so
   v[0..3] is column 0, v[4..7] column 1, etc.  The diagonal blocks of U were
   inverted during factorization, so the backward sweep multiplies by them
   instead of solving against them.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscInt          n  =a->mbs;            /* number of 4x4 block rows */
  const PetscInt    *ai=a->i,*aj=a->j;     /* block-row pointers / block-column indices */
  const PetscInt    *diag = a->diag;       /* position of the diagonal block within each row */
  const MatScalar   *aa   =a->a;           /* factor values, 16 entries per block */
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    /* NOTE(review): fixed-size static scratch is not thread-safe and overflows
       for n > 500 block rows (w holds 4*n scalars) -- long-standing hazard */
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
#else
  {
    PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    /* first block row of L is the identity: x_0 = b_0 */
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];     /* first off-diagonal block of row i */
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];        /* number of blocks strictly left of the diagonal */
      idx +=  4;                      /* scalar offset of block row i in x/b */
      s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);              /* scalar offset of the referenced block column */
        x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
        /* s -= V * x_j, with V stored column-major */
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      x[idx]   = s1;
      x[1+idx] = s2;
      x[2+idx] = s3;
      x[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];              /* scalar offset of the (inverted) diagonal block */
      v    = aa + ai16 + 16;          /* first block strictly right of the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;   /* number of blocks right of the diagonal */
      s1   = x[idt];  s2 = x[1+idt];
      s3   = x[2+idt];s4 = x[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
        v  += 16;
      }
      /* multiply by the inverted diagonal block: x_i = inv(D_i) * s */
      v        = aa + ai16;
      x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
      x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
      x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
      x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
      idt     -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  /* 2 flops per stored entry of the factor, minus the identity first block row */
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}
92 
93 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
94 {
95   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
96   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
97   PetscInt          i,k,nz,idx,jdx,idt;
98   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
99   const MatScalar   *aa=a->a,*v;
100   PetscScalar       *x;
101   const PetscScalar *b;
102   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
103 
104   PetscFunctionBegin;
105   PetscCall(VecGetArrayRead(bb,&b));
106   PetscCall(VecGetArray(xx,&x));
107   /* forward solve the lower triangular */
108   idx  = 0;
109   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
110   for (i=1; i<n; i++) {
111     v   = aa + bs2*ai[i];
112     vi  = aj + ai[i];
113     nz  = ai[i+1] - ai[i];
114     idx = bs*i;
115     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
116     for (k=0; k<nz; k++) {
117       jdx = bs*vi[k];
118       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
119       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
120       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
121       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
122       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
123 
124       v +=  bs2;
125     }
126 
127     x[idx]   = s1;
128     x[1+idx] = s2;
129     x[2+idx] = s3;
130     x[3+idx] = s4;
131   }
132 
133   /* backward solve the upper triangular */
134   for (i=n-1; i>=0; i--) {
135     v   = aa + bs2*(adiag[i+1]+1);
136     vi  = aj + adiag[i+1]+1;
137     nz  = adiag[i] - adiag[i+1]-1;
138     idt = bs*i;
139     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
140 
141     for (k=0; k<nz; k++) {
142       idx = bs*vi[k];
143       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
144       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
145       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
146       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
147       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
148 
149       v +=  bs2;
150     }
151     /* x = inv_diagonal*x */
152     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
153     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
154     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
155     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
156 
157   }
158 
159   PetscCall(VecRestoreArrayRead(bb,&b));
160   PetscCall(VecRestoreArray(xx,&x));
161   PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n));
162   PetscFunctionReturn(0);
163 }
164 
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Same triangular solve as the
   _inplace variant, but the arithmetic is carried out in MatScalar precision
   (demoting the PetscScalar right-hand side), writing intermediate results
   in-place into the x array reinterpreted as MatScalar.

   NOTE(review): this relies on t=(MatScalar*)x aliasing the PetscScalar
   output buffer; the forward sweep and the upper-triangular accumulation read
   and write through t, while the final diagonal multiply stores promoted
   PetscScalar values into x.  The backward sweep reads x[idx] (block columns
   > i), which were already promoted by earlier iterations, so the read/write
   ordering here is load-bearing -- do not reorder.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;  /* block rows / CSR of blocks */
  const MatScalar   *aa=a->a;                                    /* factor values, 16 per block */
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

  {
    MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    MatScalar       *t=(MatScalar*)x;   /* low-precision alias of the output buffer */
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    /* first block row of L is the identity; demote b into t */
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];          /* blocks strictly left of the diagonal */
      idx +=  4;
      s1   = (MatScalar)b[idx];
      s2   = (MatScalar)b[1+idx];
      s3   = (MatScalar)b[2+idx];
      s4   = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        /* s -= V * x_j, V stored column-major */
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];                /* offset of the inverted diagonal block */
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;     /* blocks strictly right of the diagonal */
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        /* columns > i were already promoted to PetscScalar below; demote on read */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      /* x_i = inv(D_i) * s, promoted back to full precision */
      v        = aa + ai16;
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}
253 
254 #if defined(PETSC_HAVE_SSE)
255 
256 #include PETSC_HAVE_SSE
257 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
258 {
259   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
260   unsigned short *aj=(unsigned short*)a->j;
261   PetscErrorCode ierr;
262   int            *ai=a->i,n=a->mbs,*diag = a->diag;
263   MatScalar      *aa=a->a;
264   PetscScalar    *x,*b;
265 
266   PetscFunctionBegin;
267   SSE_SCOPE_BEGIN;
268   /*
269      Note: This code currently uses demotion of double
270      to float when performing the mixed-mode computation.
271      This may not be numerically reasonable for all applications.
272   */
273   PREFETCH_NTA(aa+16*ai[1]);
274 
275   PetscCall(VecGetArray(bb,&b));
276   PetscCall(VecGetArray(xx,&x));
277   {
278     /* x will first be computed in single precision then promoted inplace to double */
279     MatScalar      *v,*t=(MatScalar*)x;
280     int            nz,i,idt,ai16;
281     unsigned int   jdx,idx;
282     unsigned short *vi;
283     /* Forward solve the lower triangular factor. */
284 
285     /* First block is the identity. */
286     idx = 0;
287     CONVERT_DOUBLE4_FLOAT4(t,b);
288     v =  aa + 16*((unsigned int)ai[1]);
289 
290     for (i=1; i<n;) {
291       PREFETCH_NTA(&v[8]);
292       vi   =  aj      + ai[i];
293       nz   =  diag[i] - ai[i];
294       idx +=  4;
295 
296       /* Demote RHS from double to float. */
297       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
298       LOAD_PS(&t[idx],XMM7);
299 
300       while (nz--) {
301         PREFETCH_NTA(&v[16]);
302         jdx = 4*((unsigned int)(*vi++));
303 
304         /* 4x4 Matrix-Vector product with negative accumulation: */
305         SSE_INLINE_BEGIN_2(&t[jdx],v)
306         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
307 
308         /* First Column */
309         SSE_COPY_PS(XMM0,XMM6)
310         SSE_SHUFFLE(XMM0,XMM0,0x00)
311         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
312         SSE_SUB_PS(XMM7,XMM0)
313 
314         /* Second Column */
315         SSE_COPY_PS(XMM1,XMM6)
316         SSE_SHUFFLE(XMM1,XMM1,0x55)
317         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
318         SSE_SUB_PS(XMM7,XMM1)
319 
320         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
321 
322         /* Third Column */
323         SSE_COPY_PS(XMM2,XMM6)
324         SSE_SHUFFLE(XMM2,XMM2,0xAA)
325         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
326         SSE_SUB_PS(XMM7,XMM2)
327 
328         /* Fourth Column */
329         SSE_COPY_PS(XMM3,XMM6)
330         SSE_SHUFFLE(XMM3,XMM3,0xFF)
331         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
332         SSE_SUB_PS(XMM7,XMM3)
333         SSE_INLINE_END_2
334 
335         v += 16;
336       }
337       v =  aa + 16*ai[++i];
338       PREFETCH_NTA(v);
339       STORE_PS(&t[idx],XMM7);
340     }
341 
342     /* Backward solve the upper triangular factor.*/
343 
344     idt  = 4*(n-1);
345     ai16 = 16*diag[n-1];
346     v    = aa + ai16 + 16;
347     for (i=n-1; i>=0;) {
348       PREFETCH_NTA(&v[8]);
349       vi = aj + diag[i] + 1;
350       nz = ai[i+1] - diag[i] - 1;
351 
352       LOAD_PS(&t[idt],XMM7);
353 
354       while (nz--) {
355         PREFETCH_NTA(&v[16]);
356         idx = 4*((unsigned int)(*vi++));
357 
358         /* 4x4 Matrix-Vector Product with negative accumulation: */
359         SSE_INLINE_BEGIN_2(&t[idx],v)
360         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
361 
362         /* First Column */
363         SSE_COPY_PS(XMM0,XMM6)
364         SSE_SHUFFLE(XMM0,XMM0,0x00)
365         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
366         SSE_SUB_PS(XMM7,XMM0)
367 
368         /* Second Column */
369         SSE_COPY_PS(XMM1,XMM6)
370         SSE_SHUFFLE(XMM1,XMM1,0x55)
371         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
372         SSE_SUB_PS(XMM7,XMM1)
373 
374         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
375 
376         /* Third Column */
377         SSE_COPY_PS(XMM2,XMM6)
378         SSE_SHUFFLE(XMM2,XMM2,0xAA)
379         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
380         SSE_SUB_PS(XMM7,XMM2)
381 
382         /* Fourth Column */
383         SSE_COPY_PS(XMM3,XMM6)
384         SSE_SHUFFLE(XMM3,XMM3,0xFF)
385         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
386         SSE_SUB_PS(XMM7,XMM3)
387         SSE_INLINE_END_2
388         v += 16;
389       }
390       v    = aa + ai16;
391       ai16 = 16*diag[--i];
392       PREFETCH_NTA(aa+ai16+16);
393       /*
394          Scale the result by the diagonal 4x4 block,
395          which was inverted as part of the factorization
396       */
397       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
398       /* First Column */
399       SSE_COPY_PS(XMM0,XMM7)
400       SSE_SHUFFLE(XMM0,XMM0,0x00)
401       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
402 
403       /* Second Column */
404       SSE_COPY_PS(XMM1,XMM7)
405       SSE_SHUFFLE(XMM1,XMM1,0x55)
406       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
407       SSE_ADD_PS(XMM0,XMM1)
408 
409       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
410 
411       /* Third Column */
412       SSE_COPY_PS(XMM2,XMM7)
413       SSE_SHUFFLE(XMM2,XMM2,0xAA)
414       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
415       SSE_ADD_PS(XMM0,XMM2)
416 
417       /* Fourth Column */
418       SSE_COPY_PS(XMM3,XMM7)
419       SSE_SHUFFLE(XMM3,XMM3,0xFF)
420       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
421       SSE_ADD_PS(XMM0,XMM3)
422 
423       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
424       SSE_INLINE_END_3
425 
426       v    = aa + ai16 + 16;
427       idt -= 4;
428     }
429 
430     /* Convert t from single precision back to double precision (inplace)*/
431     idt = 4*(n-1);
432     for (i=n-1; i>=0; i--) {
433       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
434       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
435       PetscScalar *xtemp=&x[idt];
436       MatScalar   *ttemp=&t[idt];
437       xtemp[3] = (PetscScalar)ttemp[3];
438       xtemp[2] = (PetscScalar)ttemp[2];
439       xtemp[1] = (PetscScalar)ttemp[1];
440       xtemp[0] = (PetscScalar)ttemp[0];
441       idt     -= 4;
442     }
443 
444   } /* End of artificial scope. */
445   PetscCall(VecRestoreArray(bb,&b));
446   PetscCall(VecRestoreArray(xx,&x));
447   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
448   SSE_SCOPE_END;
449   PetscFunctionReturn(0);
450 }
451 
452 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
453 {
454   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
455   int            *aj=a->j;
456   PetscErrorCode ierr;
457   int            *ai=a->i,n=a->mbs,*diag = a->diag;
458   MatScalar      *aa=a->a;
459   PetscScalar    *x,*b;
460 
461   PetscFunctionBegin;
462   SSE_SCOPE_BEGIN;
463   /*
464      Note: This code currently uses demotion of double
465      to float when performing the mixed-mode computation.
466      This may not be numerically reasonable for all applications.
467   */
468   PREFETCH_NTA(aa+16*ai[1]);
469 
470   PetscCall(VecGetArray(bb,&b));
471   PetscCall(VecGetArray(xx,&x));
472   {
473     /* x will first be computed in single precision then promoted inplace to double */
474     MatScalar *v,*t=(MatScalar*)x;
475     int       nz,i,idt,ai16;
476     int       jdx,idx;
477     int       *vi;
478     /* Forward solve the lower triangular factor. */
479 
480     /* First block is the identity. */
481     idx = 0;
482     CONVERT_DOUBLE4_FLOAT4(t,b);
483     v =  aa + 16*ai[1];
484 
485     for (i=1; i<n;) {
486       PREFETCH_NTA(&v[8]);
487       vi   =  aj      + ai[i];
488       nz   =  diag[i] - ai[i];
489       idx +=  4;
490 
491       /* Demote RHS from double to float. */
492       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
493       LOAD_PS(&t[idx],XMM7);
494 
495       while (nz--) {
496         PREFETCH_NTA(&v[16]);
497         jdx = 4*(*vi++);
498 /*          jdx = *vi++; */
499 
500         /* 4x4 Matrix-Vector product with negative accumulation: */
501         SSE_INLINE_BEGIN_2(&t[jdx],v)
502         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
503 
504         /* First Column */
505         SSE_COPY_PS(XMM0,XMM6)
506         SSE_SHUFFLE(XMM0,XMM0,0x00)
507         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
508         SSE_SUB_PS(XMM7,XMM0)
509 
510         /* Second Column */
511         SSE_COPY_PS(XMM1,XMM6)
512         SSE_SHUFFLE(XMM1,XMM1,0x55)
513         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
514         SSE_SUB_PS(XMM7,XMM1)
515 
516         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
517 
518         /* Third Column */
519         SSE_COPY_PS(XMM2,XMM6)
520         SSE_SHUFFLE(XMM2,XMM2,0xAA)
521         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
522         SSE_SUB_PS(XMM7,XMM2)
523 
524         /* Fourth Column */
525         SSE_COPY_PS(XMM3,XMM6)
526         SSE_SHUFFLE(XMM3,XMM3,0xFF)
527         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
528         SSE_SUB_PS(XMM7,XMM3)
529         SSE_INLINE_END_2
530 
531         v += 16;
532       }
533       v =  aa + 16*ai[++i];
534       PREFETCH_NTA(v);
535       STORE_PS(&t[idx],XMM7);
536     }
537 
538     /* Backward solve the upper triangular factor.*/
539 
540     idt  = 4*(n-1);
541     ai16 = 16*diag[n-1];
542     v    = aa + ai16 + 16;
543     for (i=n-1; i>=0;) {
544       PREFETCH_NTA(&v[8]);
545       vi = aj + diag[i] + 1;
546       nz = ai[i+1] - diag[i] - 1;
547 
548       LOAD_PS(&t[idt],XMM7);
549 
550       while (nz--) {
551         PREFETCH_NTA(&v[16]);
552         idx = 4*(*vi++);
553 /*          idx = *vi++; */
554 
555         /* 4x4 Matrix-Vector Product with negative accumulation: */
556         SSE_INLINE_BEGIN_2(&t[idx],v)
557         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
558 
559         /* First Column */
560         SSE_COPY_PS(XMM0,XMM6)
561         SSE_SHUFFLE(XMM0,XMM0,0x00)
562         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
563         SSE_SUB_PS(XMM7,XMM0)
564 
565         /* Second Column */
566         SSE_COPY_PS(XMM1,XMM6)
567         SSE_SHUFFLE(XMM1,XMM1,0x55)
568         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
569         SSE_SUB_PS(XMM7,XMM1)
570 
571         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
572 
573         /* Third Column */
574         SSE_COPY_PS(XMM2,XMM6)
575         SSE_SHUFFLE(XMM2,XMM2,0xAA)
576         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
577         SSE_SUB_PS(XMM7,XMM2)
578 
579         /* Fourth Column */
580         SSE_COPY_PS(XMM3,XMM6)
581         SSE_SHUFFLE(XMM3,XMM3,0xFF)
582         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
583         SSE_SUB_PS(XMM7,XMM3)
584         SSE_INLINE_END_2
585         v += 16;
586       }
587       v    = aa + ai16;
588       ai16 = 16*diag[--i];
589       PREFETCH_NTA(aa+ai16+16);
590       /*
591          Scale the result by the diagonal 4x4 block,
592          which was inverted as part of the factorization
593       */
594       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
595       /* First Column */
596       SSE_COPY_PS(XMM0,XMM7)
597       SSE_SHUFFLE(XMM0,XMM0,0x00)
598       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
599 
600       /* Second Column */
601       SSE_COPY_PS(XMM1,XMM7)
602       SSE_SHUFFLE(XMM1,XMM1,0x55)
603       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
604       SSE_ADD_PS(XMM0,XMM1)
605 
606       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
607 
608       /* Third Column */
609       SSE_COPY_PS(XMM2,XMM7)
610       SSE_SHUFFLE(XMM2,XMM2,0xAA)
611       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
612       SSE_ADD_PS(XMM0,XMM2)
613 
614       /* Fourth Column */
615       SSE_COPY_PS(XMM3,XMM7)
616       SSE_SHUFFLE(XMM3,XMM3,0xFF)
617       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
618       SSE_ADD_PS(XMM0,XMM3)
619 
620       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
621       SSE_INLINE_END_3
622 
623       v    = aa + ai16 + 16;
624       idt -= 4;
625     }
626 
627     /* Convert t from single precision back to double precision (inplace)*/
628     idt = 4*(n-1);
629     for (i=n-1; i>=0; i--) {
630       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
631       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
632       PetscScalar *xtemp=&x[idt];
633       MatScalar   *ttemp=&t[idt];
634       xtemp[3] = (PetscScalar)ttemp[3];
635       xtemp[2] = (PetscScalar)ttemp[2];
636       xtemp[1] = (PetscScalar)ttemp[1];
637       xtemp[0] = (PetscScalar)ttemp[0];
638       idt     -= 4;
639     }
640 
641   } /* End of artificial scope. */
642   PetscCall(VecRestoreArray(bb,&b));
643   PetscCall(VecRestoreArray(xx,&x));
644   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
645   SSE_SCOPE_END;
646   PetscFunctionReturn(0);
647 }
648 
649 #endif
650