xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
12c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h>
22c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h>
32c733ed4SBarry Smith 
42c733ed4SBarry Smith /*
52c733ed4SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
62c733ed4SBarry Smith    ordering. This eliminates the need for the column and row permutation.
72c733ed4SBarry Smith */
8*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
92c733ed4SBarry Smith   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
102c733ed4SBarry Smith   PetscInt           n  = a->mbs;
112c733ed4SBarry Smith   const PetscInt    *ai = a->i, *aj = a->j;
122c733ed4SBarry Smith   const PetscInt    *diag = a->diag;
132c733ed4SBarry Smith   const MatScalar   *aa   = a->a;
142c733ed4SBarry Smith   PetscScalar       *x;
152c733ed4SBarry Smith   const PetscScalar *b;
162c733ed4SBarry Smith 
172c733ed4SBarry Smith   PetscFunctionBegin;
189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
199566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
202c733ed4SBarry Smith 
212c733ed4SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
222c733ed4SBarry Smith   {
232c733ed4SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
242c733ed4SBarry Smith     fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
252c733ed4SBarry Smith   }
262c733ed4SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
272c733ed4SBarry Smith   fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
282c733ed4SBarry Smith #else
292c733ed4SBarry Smith   {
302c733ed4SBarry Smith     PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
312c733ed4SBarry Smith     const MatScalar *v;
322c733ed4SBarry Smith     PetscInt         jdx, idt, idx, nz, i, ai16;
332c733ed4SBarry Smith     const PetscInt  *vi;
342c733ed4SBarry Smith 
352c733ed4SBarry Smith     /* forward solve the lower triangular */
362c733ed4SBarry Smith     idx  = 0;
37*9371c9d4SSatish Balay     x[0] = b[0];
38*9371c9d4SSatish Balay     x[1] = b[1];
39*9371c9d4SSatish Balay     x[2] = b[2];
40*9371c9d4SSatish Balay     x[3] = b[3];
412c733ed4SBarry Smith     for (i = 1; i < n; i++) {
422c733ed4SBarry Smith       v  = aa + 16 * ai[i];
432c733ed4SBarry Smith       vi = aj + ai[i];
442c733ed4SBarry Smith       nz = diag[i] - ai[i];
452c733ed4SBarry Smith       idx += 4;
46*9371c9d4SSatish Balay       s1 = b[idx];
47*9371c9d4SSatish Balay       s2 = b[1 + idx];
48*9371c9d4SSatish Balay       s3 = b[2 + idx];
49*9371c9d4SSatish Balay       s4 = b[3 + idx];
502c733ed4SBarry Smith       while (nz--) {
512c733ed4SBarry Smith         jdx = 4 * (*vi++);
52*9371c9d4SSatish Balay         x1  = x[jdx];
53*9371c9d4SSatish Balay         x2  = x[1 + jdx];
54*9371c9d4SSatish Balay         x3  = x[2 + jdx];
55*9371c9d4SSatish Balay         x4  = x[3 + jdx];
562c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
572c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
582c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
592c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
602c733ed4SBarry Smith         v += 16;
612c733ed4SBarry Smith       }
622c733ed4SBarry Smith       x[idx]     = s1;
632c733ed4SBarry Smith       x[1 + idx] = s2;
642c733ed4SBarry Smith       x[2 + idx] = s3;
652c733ed4SBarry Smith       x[3 + idx] = s4;
662c733ed4SBarry Smith     }
672c733ed4SBarry Smith     /* backward solve the upper triangular */
682c733ed4SBarry Smith     idt = 4 * (n - 1);
692c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
702c733ed4SBarry Smith       ai16 = 16 * diag[i];
712c733ed4SBarry Smith       v    = aa + ai16 + 16;
722c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
732c733ed4SBarry Smith       nz   = ai[i + 1] - diag[i] - 1;
74*9371c9d4SSatish Balay       s1   = x[idt];
75*9371c9d4SSatish Balay       s2   = x[1 + idt];
76*9371c9d4SSatish Balay       s3   = x[2 + idt];
77*9371c9d4SSatish Balay       s4   = x[3 + idt];
782c733ed4SBarry Smith       while (nz--) {
792c733ed4SBarry Smith         idx = 4 * (*vi++);
80*9371c9d4SSatish Balay         x1  = x[idx];
81*9371c9d4SSatish Balay         x2  = x[1 + idx];
82*9371c9d4SSatish Balay         x3  = x[2 + idx];
83*9371c9d4SSatish Balay         x4  = x[3 + idx];
842c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
852c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
862c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
872c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
882c733ed4SBarry Smith         v += 16;
892c733ed4SBarry Smith       }
902c733ed4SBarry Smith       v          = aa + ai16;
912c733ed4SBarry Smith       x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
922c733ed4SBarry Smith       x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
932c733ed4SBarry Smith       x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
942c733ed4SBarry Smith       x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
952c733ed4SBarry Smith       idt -= 4;
962c733ed4SBarry Smith     }
972c733ed4SBarry Smith   }
982c733ed4SBarry Smith #endif
992c733ed4SBarry Smith 
1009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
1019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
1029566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1032c733ed4SBarry Smith   PetscFunctionReturn(0);
1042c733ed4SBarry Smith }
1052c733ed4SBarry Smith 
106*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1072c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1082c733ed4SBarry Smith   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1092c733ed4SBarry Smith   PetscInt           i, k, nz, idx, jdx, idt;
1102c733ed4SBarry Smith   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
1112c733ed4SBarry Smith   const MatScalar   *aa = a->a, *v;
1122c733ed4SBarry Smith   PetscScalar       *x;
1132c733ed4SBarry Smith   const PetscScalar *b;
1142c733ed4SBarry Smith   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;
1152c733ed4SBarry Smith 
1162c733ed4SBarry Smith   PetscFunctionBegin;
1179566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
1189566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
1192c733ed4SBarry Smith   /* forward solve the lower triangular */
1202c733ed4SBarry Smith   idx  = 0;
121*9371c9d4SSatish Balay   x[0] = b[idx];
122*9371c9d4SSatish Balay   x[1] = b[1 + idx];
123*9371c9d4SSatish Balay   x[2] = b[2 + idx];
124*9371c9d4SSatish Balay   x[3] = b[3 + idx];
1252c733ed4SBarry Smith   for (i = 1; i < n; i++) {
1262c733ed4SBarry Smith     v   = aa + bs2 * ai[i];
1272c733ed4SBarry Smith     vi  = aj + ai[i];
1282c733ed4SBarry Smith     nz  = ai[i + 1] - ai[i];
1292c733ed4SBarry Smith     idx = bs * i;
130*9371c9d4SSatish Balay     s1  = b[idx];
131*9371c9d4SSatish Balay     s2  = b[1 + idx];
132*9371c9d4SSatish Balay     s3  = b[2 + idx];
133*9371c9d4SSatish Balay     s4  = b[3 + idx];
1342c733ed4SBarry Smith     for (k = 0; k < nz; k++) {
1352c733ed4SBarry Smith       jdx = bs * vi[k];
136*9371c9d4SSatish Balay       x1  = x[jdx];
137*9371c9d4SSatish Balay       x2  = x[1 + jdx];
138*9371c9d4SSatish Balay       x3  = x[2 + jdx];
139*9371c9d4SSatish Balay       x4  = x[3 + jdx];
1402c733ed4SBarry Smith       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1412c733ed4SBarry Smith       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1422c733ed4SBarry Smith       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1432c733ed4SBarry Smith       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1442c733ed4SBarry Smith 
1452c733ed4SBarry Smith       v += bs2;
1462c733ed4SBarry Smith     }
1472c733ed4SBarry Smith 
1482c733ed4SBarry Smith     x[idx]     = s1;
1492c733ed4SBarry Smith     x[1 + idx] = s2;
1502c733ed4SBarry Smith     x[2 + idx] = s3;
1512c733ed4SBarry Smith     x[3 + idx] = s4;
1522c733ed4SBarry Smith   }
1532c733ed4SBarry Smith 
1542c733ed4SBarry Smith   /* backward solve the upper triangular */
1552c733ed4SBarry Smith   for (i = n - 1; i >= 0; i--) {
1562c733ed4SBarry Smith     v   = aa + bs2 * (adiag[i + 1] + 1);
1572c733ed4SBarry Smith     vi  = aj + adiag[i + 1] + 1;
1582c733ed4SBarry Smith     nz  = adiag[i] - adiag[i + 1] - 1;
1592c733ed4SBarry Smith     idt = bs * i;
160*9371c9d4SSatish Balay     s1  = x[idt];
161*9371c9d4SSatish Balay     s2  = x[1 + idt];
162*9371c9d4SSatish Balay     s3  = x[2 + idt];
163*9371c9d4SSatish Balay     s4  = x[3 + idt];
1642c733ed4SBarry Smith 
1652c733ed4SBarry Smith     for (k = 0; k < nz; k++) {
1662c733ed4SBarry Smith       idx = bs * vi[k];
167*9371c9d4SSatish Balay       x1  = x[idx];
168*9371c9d4SSatish Balay       x2  = x[1 + idx];
169*9371c9d4SSatish Balay       x3  = x[2 + idx];
170*9371c9d4SSatish Balay       x4  = x[3 + idx];
1712c733ed4SBarry Smith       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1722c733ed4SBarry Smith       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1732c733ed4SBarry Smith       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1742c733ed4SBarry Smith       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1752c733ed4SBarry Smith 
1762c733ed4SBarry Smith       v += bs2;
1772c733ed4SBarry Smith     }
1782c733ed4SBarry Smith     /* x = inv_diagonal*x */
1792c733ed4SBarry Smith     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
180feb237baSPierre Jolivet     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
1812c733ed4SBarry Smith     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
1822c733ed4SBarry Smith     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
1832c733ed4SBarry Smith   }
1842c733ed4SBarry Smith 
1859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
1869566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
1879566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
1882c733ed4SBarry Smith   PetscFunctionReturn(0);
1892c733ed4SBarry Smith }
1902c733ed4SBarry Smith 
191*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx) {
1922c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1932c733ed4SBarry Smith   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
1942c733ed4SBarry Smith   const MatScalar   *aa = a->a;
1952c733ed4SBarry Smith   const PetscScalar *b;
1962c733ed4SBarry Smith   PetscScalar       *x;
1972c733ed4SBarry Smith 
1982c733ed4SBarry Smith   PetscFunctionBegin;
1999566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
2009566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
2012c733ed4SBarry Smith 
2022c733ed4SBarry Smith   {
2032c733ed4SBarry Smith     MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
2042c733ed4SBarry Smith     const MatScalar *v;
2052c733ed4SBarry Smith     MatScalar       *t = (MatScalar *)x;
2062c733ed4SBarry Smith     PetscInt         jdx, idt, idx, nz, i, ai16;
2072c733ed4SBarry Smith     const PetscInt  *vi;
2082c733ed4SBarry Smith 
2092c733ed4SBarry Smith     /* forward solve the lower triangular */
2102c733ed4SBarry Smith     idx  = 0;
2112c733ed4SBarry Smith     t[0] = (MatScalar)b[0];
2122c733ed4SBarry Smith     t[1] = (MatScalar)b[1];
2132c733ed4SBarry Smith     t[2] = (MatScalar)b[2];
2142c733ed4SBarry Smith     t[3] = (MatScalar)b[3];
2152c733ed4SBarry Smith     for (i = 1; i < n; i++) {
2162c733ed4SBarry Smith       v  = aa + 16 * ai[i];
2172c733ed4SBarry Smith       vi = aj + ai[i];
2182c733ed4SBarry Smith       nz = diag[i] - ai[i];
2192c733ed4SBarry Smith       idx += 4;
2202c733ed4SBarry Smith       s1 = (MatScalar)b[idx];
2212c733ed4SBarry Smith       s2 = (MatScalar)b[1 + idx];
2222c733ed4SBarry Smith       s3 = (MatScalar)b[2 + idx];
2232c733ed4SBarry Smith       s4 = (MatScalar)b[3 + idx];
2242c733ed4SBarry Smith       while (nz--) {
2252c733ed4SBarry Smith         jdx = 4 * (*vi++);
2262c733ed4SBarry Smith         x1  = t[jdx];
2272c733ed4SBarry Smith         x2  = t[1 + jdx];
2282c733ed4SBarry Smith         x3  = t[2 + jdx];
2292c733ed4SBarry Smith         x4  = t[3 + jdx];
2302c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2312c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2322c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2332c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2342c733ed4SBarry Smith         v += 16;
2352c733ed4SBarry Smith       }
2362c733ed4SBarry Smith       t[idx]     = s1;
2372c733ed4SBarry Smith       t[1 + idx] = s2;
2382c733ed4SBarry Smith       t[2 + idx] = s3;
2392c733ed4SBarry Smith       t[3 + idx] = s4;
2402c733ed4SBarry Smith     }
2412c733ed4SBarry Smith     /* backward solve the upper triangular */
2422c733ed4SBarry Smith     idt = 4 * (n - 1);
2432c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
2442c733ed4SBarry Smith       ai16 = 16 * diag[i];
2452c733ed4SBarry Smith       v    = aa + ai16 + 16;
2462c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
2472c733ed4SBarry Smith       nz   = ai[i + 1] - diag[i] - 1;
2482c733ed4SBarry Smith       s1   = t[idt];
2492c733ed4SBarry Smith       s2   = t[1 + idt];
2502c733ed4SBarry Smith       s3   = t[2 + idt];
2512c733ed4SBarry Smith       s4   = t[3 + idt];
2522c733ed4SBarry Smith       while (nz--) {
2532c733ed4SBarry Smith         idx = 4 * (*vi++);
2542c733ed4SBarry Smith         x1  = (MatScalar)x[idx];
2552c733ed4SBarry Smith         x2  = (MatScalar)x[1 + idx];
2562c733ed4SBarry Smith         x3  = (MatScalar)x[2 + idx];
2572c733ed4SBarry Smith         x4  = (MatScalar)x[3 + idx];
2582c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2592c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2602c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2612c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2622c733ed4SBarry Smith         v += 16;
2632c733ed4SBarry Smith       }
2642c733ed4SBarry Smith       v          = aa + ai16;
2652c733ed4SBarry Smith       x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
2662c733ed4SBarry Smith       x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
2672c733ed4SBarry Smith       x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
2682c733ed4SBarry Smith       x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
2692c733ed4SBarry Smith       idt -= 4;
2702c733ed4SBarry Smith     }
2712c733ed4SBarry Smith   }
2722c733ed4SBarry Smith 
2739566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
2749566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
2759566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
2762c733ed4SBarry Smith   PetscFunctionReturn(0);
2772c733ed4SBarry Smith }
2782c733ed4SBarry Smith 
2792c733ed4SBarry Smith #if defined(PETSC_HAVE_SSE)
2802c733ed4SBarry Smith 
2812c733ed4SBarry Smith #include PETSC_HAVE_SSE
282*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx) {
2832c733ed4SBarry Smith   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
2842c733ed4SBarry Smith   unsigned short *aj = (unsigned short *)a->j;
2852c733ed4SBarry Smith   int            *ai = a->i, n = a->mbs, *diag = a->diag;
2862c733ed4SBarry Smith   MatScalar      *aa = a->a;
2872c733ed4SBarry Smith   PetscScalar    *x, *b;
2882c733ed4SBarry Smith 
2892c733ed4SBarry Smith   PetscFunctionBegin;
2902c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
2912c733ed4SBarry Smith   /*
2922c733ed4SBarry Smith      Note: This code currently uses demotion of double
2932c733ed4SBarry Smith      to float when performing the mixed-mode computation.
2942c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
2952c733ed4SBarry Smith   */
2962c733ed4SBarry Smith   PREFETCH_NTA(aa + 16 * ai[1]);
2972c733ed4SBarry Smith 
2989566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb, &b));
2999566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
3002c733ed4SBarry Smith   {
3012c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
3022c733ed4SBarry Smith     MatScalar      *v, *t = (MatScalar *)x;
3032c733ed4SBarry Smith     int             nz, i, idt, ai16;
3042c733ed4SBarry Smith     unsigned int    jdx, idx;
3052c733ed4SBarry Smith     unsigned short *vi;
3062c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
3072c733ed4SBarry Smith 
3082c733ed4SBarry Smith     /* First block is the identity. */
3092c733ed4SBarry Smith     idx = 0;
3102c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t, b);
3112c733ed4SBarry Smith     v = aa + 16 * ((unsigned int)ai[1]);
3122c733ed4SBarry Smith 
3132c733ed4SBarry Smith     for (i = 1; i < n;) {
3142c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
3152c733ed4SBarry Smith       vi = aj + ai[i];
3162c733ed4SBarry Smith       nz = diag[i] - ai[i];
3172c733ed4SBarry Smith       idx += 4;
3182c733ed4SBarry Smith 
3192c733ed4SBarry Smith       /* Demote RHS from double to float. */
3202c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
3212c733ed4SBarry Smith       LOAD_PS(&t[idx], XMM7);
3222c733ed4SBarry Smith 
3232c733ed4SBarry Smith       while (nz--) {
3242c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3252c733ed4SBarry Smith         jdx = 4 * ((unsigned int)(*vi++));
3262c733ed4SBarry Smith 
3272c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
3282c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx], v)
3292c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
3302c733ed4SBarry Smith 
3312c733ed4SBarry Smith         /* First Column */
3322c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
3332c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
3342c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
3352c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
3362c733ed4SBarry Smith 
3372c733ed4SBarry Smith         /* Second Column */
3382c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
3392c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
3402c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
3412c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
3422c733ed4SBarry Smith 
3432c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
3442c733ed4SBarry Smith 
3452c733ed4SBarry Smith         /* Third Column */
3462c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
3472c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
3482c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
3492c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
3502c733ed4SBarry Smith 
3512c733ed4SBarry Smith         /* Fourth Column */
3522c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
3532c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
3542c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
3552c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
3562c733ed4SBarry Smith         SSE_INLINE_END_2
3572c733ed4SBarry Smith 
3582c733ed4SBarry Smith         v += 16;
3592c733ed4SBarry Smith       }
3602c733ed4SBarry Smith       v = aa + 16 * ai[++i];
3612c733ed4SBarry Smith       PREFETCH_NTA(v);
3622c733ed4SBarry Smith       STORE_PS(&t[idx], XMM7);
3632c733ed4SBarry Smith     }
3642c733ed4SBarry Smith 
3652c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
3662c733ed4SBarry Smith 
3672c733ed4SBarry Smith     idt  = 4 * (n - 1);
3682c733ed4SBarry Smith     ai16 = 16 * diag[n - 1];
3692c733ed4SBarry Smith     v    = aa + ai16 + 16;
3702c733ed4SBarry Smith     for (i = n - 1; i >= 0;) {
3712c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
3722c733ed4SBarry Smith       vi = aj + diag[i] + 1;
3732c733ed4SBarry Smith       nz = ai[i + 1] - diag[i] - 1;
3742c733ed4SBarry Smith 
3752c733ed4SBarry Smith       LOAD_PS(&t[idt], XMM7);
3762c733ed4SBarry Smith 
3772c733ed4SBarry Smith       while (nz--) {
3782c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3792c733ed4SBarry Smith         idx = 4 * ((unsigned int)(*vi++));
3802c733ed4SBarry Smith 
3812c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
3822c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx], v)
3832c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
3842c733ed4SBarry Smith 
3852c733ed4SBarry Smith         /* First Column */
3862c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
3872c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
3882c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
3892c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
3902c733ed4SBarry Smith 
3912c733ed4SBarry Smith         /* Second Column */
3922c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
3932c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
3942c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
3952c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
3962c733ed4SBarry Smith 
3972c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
3982c733ed4SBarry Smith 
3992c733ed4SBarry Smith         /* Third Column */
4002c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
4012c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
4022c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
4032c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
4042c733ed4SBarry Smith 
4052c733ed4SBarry Smith         /* Fourth Column */
4062c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
4072c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
4082c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
4092c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
4102c733ed4SBarry Smith         SSE_INLINE_END_2
4112c733ed4SBarry Smith         v += 16;
4122c733ed4SBarry Smith       }
4132c733ed4SBarry Smith       v    = aa + ai16;
4142c733ed4SBarry Smith       ai16 = 16 * diag[--i];
4152c733ed4SBarry Smith       PREFETCH_NTA(aa + ai16 + 16);
4162c733ed4SBarry Smith       /*
4172c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
4182c733ed4SBarry Smith          which was inverted as part of the factorization
4192c733ed4SBarry Smith       */
4202c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
4212c733ed4SBarry Smith       /* First Column */
4222c733ed4SBarry Smith       SSE_COPY_PS(XMM0, XMM7)
4232c733ed4SBarry Smith       SSE_SHUFFLE(XMM0, XMM0, 0x00)
4242c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
4252c733ed4SBarry Smith 
4262c733ed4SBarry Smith       /* Second Column */
4272c733ed4SBarry Smith       SSE_COPY_PS(XMM1, XMM7)
4282c733ed4SBarry Smith       SSE_SHUFFLE(XMM1, XMM1, 0x55)
4292c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
4302c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM1)
4312c733ed4SBarry Smith 
4322c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
4332c733ed4SBarry Smith 
4342c733ed4SBarry Smith       /* Third Column */
4352c733ed4SBarry Smith       SSE_COPY_PS(XMM2, XMM7)
4362c733ed4SBarry Smith       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
4372c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
4382c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM2)
4392c733ed4SBarry Smith 
4402c733ed4SBarry Smith       /* Fourth Column */
4412c733ed4SBarry Smith       SSE_COPY_PS(XMM3, XMM7)
4422c733ed4SBarry Smith       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
4432c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
4442c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM3)
4452c733ed4SBarry Smith 
4462c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
4472c733ed4SBarry Smith       SSE_INLINE_END_3
4482c733ed4SBarry Smith 
4492c733ed4SBarry Smith       v = aa + ai16 + 16;
4502c733ed4SBarry Smith       idt -= 4;
4512c733ed4SBarry Smith     }
4522c733ed4SBarry Smith 
4532c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
4542c733ed4SBarry Smith     idt = 4 * (n - 1);
4552c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
4562c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4572c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4582c733ed4SBarry Smith       PetscScalar *xtemp = &x[idt];
4592c733ed4SBarry Smith       MatScalar   *ttemp = &t[idt];
4602c733ed4SBarry Smith       xtemp[3]           = (PetscScalar)ttemp[3];
4612c733ed4SBarry Smith       xtemp[2]           = (PetscScalar)ttemp[2];
4622c733ed4SBarry Smith       xtemp[1]           = (PetscScalar)ttemp[1];
4632c733ed4SBarry Smith       xtemp[0]           = (PetscScalar)ttemp[0];
4642c733ed4SBarry Smith       idt -= 4;
4652c733ed4SBarry Smith     }
4662c733ed4SBarry Smith 
4672c733ed4SBarry Smith   } /* End of artificial scope. */
4689566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb, &b));
4699566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
4709566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
4712c733ed4SBarry Smith   SSE_SCOPE_END;
4722c733ed4SBarry Smith   PetscFunctionReturn(0);
4732c733ed4SBarry Smith }
4742c733ed4SBarry Smith 
475*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx) {
4762c733ed4SBarry Smith   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
4772c733ed4SBarry Smith   int         *aj = a->j;
4782c733ed4SBarry Smith   int         *ai = a->i, n = a->mbs, *diag = a->diag;
4792c733ed4SBarry Smith   MatScalar   *aa = a->a;
4802c733ed4SBarry Smith   PetscScalar *x, *b;
4812c733ed4SBarry Smith 
4822c733ed4SBarry Smith   PetscFunctionBegin;
4832c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
4842c733ed4SBarry Smith   /*
4852c733ed4SBarry Smith      Note: This code currently uses demotion of double
4862c733ed4SBarry Smith      to float when performing the mixed-mode computation.
4872c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
4882c733ed4SBarry Smith   */
4892c733ed4SBarry Smith   PREFETCH_NTA(aa + 16 * ai[1]);
4902c733ed4SBarry Smith 
4919566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb, &b));
4929566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
4932c733ed4SBarry Smith   {
4942c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
4952c733ed4SBarry Smith     MatScalar *v, *t = (MatScalar *)x;
4962c733ed4SBarry Smith     int        nz, i, idt, ai16;
4972c733ed4SBarry Smith     int        jdx, idx;
4982c733ed4SBarry Smith     int       *vi;
4992c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
5002c733ed4SBarry Smith 
5012c733ed4SBarry Smith     /* First block is the identity. */
5022c733ed4SBarry Smith     idx = 0;
5032c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t, b);
5042c733ed4SBarry Smith     v = aa + 16 * ai[1];
5052c733ed4SBarry Smith 
5062c733ed4SBarry Smith     for (i = 1; i < n;) {
5072c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
5082c733ed4SBarry Smith       vi = aj + ai[i];
5092c733ed4SBarry Smith       nz = diag[i] - ai[i];
5102c733ed4SBarry Smith       idx += 4;
5112c733ed4SBarry Smith 
5122c733ed4SBarry Smith       /* Demote RHS from double to float. */
5132c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
5142c733ed4SBarry Smith       LOAD_PS(&t[idx], XMM7);
5152c733ed4SBarry Smith 
5162c733ed4SBarry Smith       while (nz--) {
5172c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
5182c733ed4SBarry Smith         jdx = 4 * (*vi++);
5192c733ed4SBarry Smith         /*          jdx = *vi++; */
5202c733ed4SBarry Smith 
5212c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
5222c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx], v)
5232c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
5242c733ed4SBarry Smith 
5252c733ed4SBarry Smith         /* First Column */
5262c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
5272c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
5282c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
5292c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
5302c733ed4SBarry Smith 
5312c733ed4SBarry Smith         /* Second Column */
5322c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
5332c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
5342c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
5352c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
5362c733ed4SBarry Smith 
5372c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
5382c733ed4SBarry Smith 
5392c733ed4SBarry Smith         /* Third Column */
5402c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
5412c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
5422c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
5432c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
5442c733ed4SBarry Smith 
5452c733ed4SBarry Smith         /* Fourth Column */
5462c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
5472c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
5482c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
5492c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
5502c733ed4SBarry Smith         SSE_INLINE_END_2
5512c733ed4SBarry Smith 
5522c733ed4SBarry Smith         v += 16;
5532c733ed4SBarry Smith       }
5542c733ed4SBarry Smith       v = aa + 16 * ai[++i];
5552c733ed4SBarry Smith       PREFETCH_NTA(v);
5562c733ed4SBarry Smith       STORE_PS(&t[idx], XMM7);
5572c733ed4SBarry Smith     }
5582c733ed4SBarry Smith 
5592c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
5602c733ed4SBarry Smith 
5612c733ed4SBarry Smith     idt  = 4 * (n - 1);
5622c733ed4SBarry Smith     ai16 = 16 * diag[n - 1];
5632c733ed4SBarry Smith     v    = aa + ai16 + 16;
5642c733ed4SBarry Smith     for (i = n - 1; i >= 0;) {
5652c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
5662c733ed4SBarry Smith       vi = aj + diag[i] + 1;
5672c733ed4SBarry Smith       nz = ai[i + 1] - diag[i] - 1;
5682c733ed4SBarry Smith 
5692c733ed4SBarry Smith       LOAD_PS(&t[idt], XMM7);
5702c733ed4SBarry Smith 
5712c733ed4SBarry Smith       while (nz--) {
5722c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
5732c733ed4SBarry Smith         idx = 4 * (*vi++);
5742c733ed4SBarry Smith         /*          idx = *vi++; */
5752c733ed4SBarry Smith 
5762c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
5772c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx], v)
5782c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
5792c733ed4SBarry Smith 
5802c733ed4SBarry Smith         /* First Column */
5812c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
5822c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
5832c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
5842c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
5852c733ed4SBarry Smith 
5862c733ed4SBarry Smith         /* Second Column */
5872c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
5882c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
5892c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
5902c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
5912c733ed4SBarry Smith 
5922c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
5932c733ed4SBarry Smith 
5942c733ed4SBarry Smith         /* Third Column */
5952c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
5962c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
5972c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
5982c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
5992c733ed4SBarry Smith 
6002c733ed4SBarry Smith         /* Fourth Column */
6012c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
6022c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
6032c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
6042c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
6052c733ed4SBarry Smith         SSE_INLINE_END_2
6062c733ed4SBarry Smith         v += 16;
6072c733ed4SBarry Smith       }
6082c733ed4SBarry Smith       v    = aa + ai16;
6092c733ed4SBarry Smith       ai16 = 16 * diag[--i];
6102c733ed4SBarry Smith       PREFETCH_NTA(aa + ai16 + 16);
6112c733ed4SBarry Smith       /*
6122c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
6132c733ed4SBarry Smith          which was inverted as part of the factorization
6142c733ed4SBarry Smith       */
6152c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
6162c733ed4SBarry Smith       /* First Column */
6172c733ed4SBarry Smith       SSE_COPY_PS(XMM0, XMM7)
6182c733ed4SBarry Smith       SSE_SHUFFLE(XMM0, XMM0, 0x00)
6192c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
6202c733ed4SBarry Smith 
6212c733ed4SBarry Smith       /* Second Column */
6222c733ed4SBarry Smith       SSE_COPY_PS(XMM1, XMM7)
6232c733ed4SBarry Smith       SSE_SHUFFLE(XMM1, XMM1, 0x55)
6242c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
6252c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM1)
6262c733ed4SBarry Smith 
6272c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
6282c733ed4SBarry Smith 
6292c733ed4SBarry Smith       /* Third Column */
6302c733ed4SBarry Smith       SSE_COPY_PS(XMM2, XMM7)
6312c733ed4SBarry Smith       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
6322c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
6332c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM2)
6342c733ed4SBarry Smith 
6352c733ed4SBarry Smith       /* Fourth Column */
6362c733ed4SBarry Smith       SSE_COPY_PS(XMM3, XMM7)
6372c733ed4SBarry Smith       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
6382c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
6392c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM3)
6402c733ed4SBarry Smith 
6412c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
6422c733ed4SBarry Smith       SSE_INLINE_END_3
6432c733ed4SBarry Smith 
6442c733ed4SBarry Smith       v = aa + ai16 + 16;
6452c733ed4SBarry Smith       idt -= 4;
6462c733ed4SBarry Smith     }
6472c733ed4SBarry Smith 
6482c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
6492c733ed4SBarry Smith     idt = 4 * (n - 1);
6502c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
6512c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
6522c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
6532c733ed4SBarry Smith       PetscScalar *xtemp = &x[idt];
6542c733ed4SBarry Smith       MatScalar   *ttemp = &t[idt];
6552c733ed4SBarry Smith       xtemp[3]           = (PetscScalar)ttemp[3];
6562c733ed4SBarry Smith       xtemp[2]           = (PetscScalar)ttemp[2];
6572c733ed4SBarry Smith       xtemp[1]           = (PetscScalar)ttemp[1];
6582c733ed4SBarry Smith       xtemp[0]           = (PetscScalar)ttemp[0];
6592c733ed4SBarry Smith       idt -= 4;
6602c733ed4SBarry Smith     }
6612c733ed4SBarry Smith 
6622c733ed4SBarry Smith   } /* End of artificial scope. */
6639566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb, &b));
6649566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
6659566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
6662c733ed4SBarry Smith   SSE_SCOPE_END;
6672c733ed4SBarry Smith   PetscFunctionReturn(0);
6682c733ed4SBarry Smith }
6692c733ed4SBarry Smith 
6702c733ed4SBarry Smith #endif
671