xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision d5b43468fb8780a8feea140ccd6fa3e6a50411cc)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 /*
5       Special case where the matrix was ILU(0) factored in the natural
6    ordering. This eliminates the need for the column and row permutation.
7 */
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  /*
     Solve A*x = b for a bs=4 SeqBAIJ matrix ILU(0)-factored in the natural
     ordering (in-place factor layout: each row stores L blocks, then the
     inverted diagonal block, then U blocks).  No permutation is applied.

     A  - the factored matrix (Mat_SeqBAIJ)
     bb - right-hand side (read only)
     xx - solution (overwritten)
  */
  Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
  PetscInt           n  = a->mbs;           /* number of 4x4 block rows */
  const PetscInt    *ai = a->i, *aj = a->j; /* block CSR row starts / block column indices */
  const PetscInt    *diag = a->diag;        /* index of the diagonal block within each row */
  const MatScalar   *aa   = a->a;           /* factor values: 16 scalars per block, column-major */
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    /* Fortran kernel needs scratch; fixed size limits n to 500 block rows. */
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    /* First block row of L is the identity, so x_0 = b_0. */
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];        /* first block of row i */
      vi = aj + ai[i];             /* its block column indices */
      nz = diag[i] - ai[i];        /* number of strictly-lower blocks in row i */
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        /* s -= V * x_col for the column-major 4x4 block V */
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;       /* first strictly-upper block (just past the diagonal) */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1; /* number of strictly-upper blocks in row i */
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* Apply the diagonal block, which was inverted during factorization,
         so this is a multiply rather than a solve. */
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}
106 
107 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
108 {
109   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
110   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
111   PetscInt           i, k, nz, idx, jdx, idt;
112   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
113   const MatScalar   *aa = a->a, *v;
114   PetscScalar       *x;
115   const PetscScalar *b;
116   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;
117 
118   PetscFunctionBegin;
119   PetscCall(VecGetArrayRead(bb, &b));
120   PetscCall(VecGetArray(xx, &x));
121   /* forward solve the lower triangular */
122   idx  = 0;
123   x[0] = b[idx];
124   x[1] = b[1 + idx];
125   x[2] = b[2 + idx];
126   x[3] = b[3 + idx];
127   for (i = 1; i < n; i++) {
128     v   = aa + bs2 * ai[i];
129     vi  = aj + ai[i];
130     nz  = ai[i + 1] - ai[i];
131     idx = bs * i;
132     s1  = b[idx];
133     s2  = b[1 + idx];
134     s3  = b[2 + idx];
135     s4  = b[3 + idx];
136     for (k = 0; k < nz; k++) {
137       jdx = bs * vi[k];
138       x1  = x[jdx];
139       x2  = x[1 + jdx];
140       x3  = x[2 + jdx];
141       x4  = x[3 + jdx];
142       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
143       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
144       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
145       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
146 
147       v += bs2;
148     }
149 
150     x[idx]     = s1;
151     x[1 + idx] = s2;
152     x[2 + idx] = s3;
153     x[3 + idx] = s4;
154   }
155 
156   /* backward solve the upper triangular */
157   for (i = n - 1; i >= 0; i--) {
158     v   = aa + bs2 * (adiag[i + 1] + 1);
159     vi  = aj + adiag[i + 1] + 1;
160     nz  = adiag[i] - adiag[i + 1] - 1;
161     idt = bs * i;
162     s1  = x[idt];
163     s2  = x[1 + idt];
164     s3  = x[2 + idt];
165     s4  = x[3 + idt];
166 
167     for (k = 0; k < nz; k++) {
168       idx = bs * vi[k];
169       x1  = x[idx];
170       x2  = x[1 + idx];
171       x3  = x[2 + idx];
172       x4  = x[3 + idx];
173       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
174       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
175       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
176       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
177 
178       v += bs2;
179     }
180     /* x = inv_diagonal*x */
181     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
182     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
183     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
184     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
185   }
186 
187   PetscCall(VecRestoreArrayRead(bb, &b));
188   PetscCall(VecRestoreArray(xx, &x));
189   PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
190   PetscFunctionReturn(0);
191 }
192 
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx)
{
  /*
     Mixed-precision variant of the bs=4 natural-ordering solve: the
     intermediate solve is carried out in MatScalar precision while b and the
     final x are PetscScalar.  Assumes MatScalar is a narrower (or equal)
     real type than PetscScalar (the "demotion" build) -- TODO confirm the
     supported configurations.
  */
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

  {
    MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    /* t is a MatScalar-typed view of the x storage: forward-solve results are
       written demoted through t, then promoted back into x during the
       backward solve.  NOTE(review): this reinterpreting cast violates strict
       aliasing when MatScalar != PetscScalar -- relies on compiler flags or
       luck; verify build settings. */
    MatScalar       *t = (MatScalar *)x;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    /* First block row of L is the identity; demote b into t. */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i]; /* strictly-lower blocks in row i */
      idx += 4;
      s1 = (MatScalar)b[idx];
      s2 = (MatScalar)b[1 + idx];
      s3 = (MatScalar)b[2 + idx];
      s4 = (MatScalar)b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        /* earlier rows were stored demoted, so read them through t */
        x1  = t[jdx];
        x2  = t[1 + jdx];
        x3  = t[2 + jdx];
        x4  = t[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      t[idx]     = s1;
      t[1 + idx] = s2;
      t[2 + idx] = s3;
      t[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;         /* first strictly-upper block */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      /* current row still holds the demoted forward-solve result in t */
      s1   = t[idt];
      s2   = t[1 + idt];
      s3   = t[2 + idt];
      s4   = t[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        /* rows below (idx > idt) were already finished and written back to x
           as PetscScalar, so read through x and demote */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1 + idx];
        x3  = (MatScalar)x[2 + idx];
        x4  = (MatScalar)x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* Apply the inverted diagonal block and promote the result into x. */
      v          = aa + ai16;
      x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
      x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
      x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
      x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
      idt -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}
281 
282 #if defined(PETSC_HAVE_SSE)
283 
284   #include PETSC_HAVE_SSE
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
{
  /*
     SSE single-precision solve for a bs=4 natural-ordering factor; the "usj"
     variant reads the column indices through an unsigned short pointer.
     NOTE(review): a->j is declared PetscInt*; reinterpreting it as unsigned
     short* only works if the factorization stored the indices as unsigned
     shorts -- verify against the matching factorization routine.
  */
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj = (unsigned short *)a->j;
  int            *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar      *aa = a->a;
  /* NOTE(review): bb is only read; VecGetArrayRead with a const pointer
     would express that -- confirm before changing. */
  PetscScalar    *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v, *t = (MatScalar *)x;
    int             nz, i, idt, ai16;
    unsigned int    jdx, idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ((unsigned int)ai[1]);

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i]; /* strictly-lower blocks in row i */
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7); /* XMM7 accumulates the running RHS of this row */

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1; /* strictly-upper blocks in row i */

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): on the final iteration (i==0) this reads diag[-1];
         the value is only used as a prefetch address, but it is still an
         out-of-bounds read -- should be guarded. */
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    /* Walk backwards so each float quad is expanded before its doubles overwrite it. */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3]           = (PetscScalar)ttemp[3];
      xtemp[2]           = (PetscScalar)ttemp[2];
      xtemp[1]           = (PetscScalar)ttemp[1];
      xtemp[0]           = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
478 
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
{
  /*
     SSE single-precision solve for a bs=4 natural-ordering factor, reading
     column indices as plain ints (see the _usj variant above for the
     unsigned-short index form).
  */
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
  int         *aj = a->j;
  int         *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar   *aa = a->a;
  /* NOTE(review): bb is only read; VecGetArrayRead with a const pointer
     would express that -- confirm before changing. */
  PetscScalar *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v, *t = (MatScalar *)x;
    int        nz, i, idt, ai16;
    int        jdx, idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ai[1];

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i]; /* strictly-lower blocks in row i */
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7); /* XMM7 accumulates the running RHS of this row */

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * (*vi++);
        /*          jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1; /* strictly-upper blocks in row i */

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * (*vi++);
        /*          idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): on the final iteration (i==0) this reads diag[-1];
         the value is only used as a prefetch address, but it is still an
         out-of-bounds read -- should be guarded. */
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    /* Walk backwards so each float quad is expanded before its doubles overwrite it. */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3]           = (PetscScalar)ttemp[3];
      xtemp[2]           = (PetscScalar)ttemp[2];
      xtemp[1]           = (PetscScalar)ttemp[1];
      xtemp[0]           = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
674 
675 #endif
676