xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision 3ba1676111f5c958fe6c2729b46ca4d523958bb3)
12c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h>
22c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h>
32c733ed4SBarry Smith 
42c733ed4SBarry Smith /*
52c733ed4SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
62c733ed4SBarry Smith    ordering. This eliminates the need for the column and row permutation.
72c733ed4SBarry Smith */
8d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
9d71ae5a4SJacob Faibussowitsch {
102c733ed4SBarry Smith   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
112c733ed4SBarry Smith   PetscInt           n  = a->mbs;
122c733ed4SBarry Smith   const PetscInt    *ai = a->i, *aj = a->j;
132c733ed4SBarry Smith   const PetscInt    *diag = a->diag;
142c733ed4SBarry Smith   const MatScalar   *aa   = a->a;
152c733ed4SBarry Smith   PetscScalar       *x;
162c733ed4SBarry Smith   const PetscScalar *b;
172c733ed4SBarry Smith 
182c733ed4SBarry Smith   PetscFunctionBegin;
199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
209566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
212c733ed4SBarry Smith 
222c733ed4SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
232c733ed4SBarry Smith   {
242c733ed4SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
252c733ed4SBarry Smith     fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
262c733ed4SBarry Smith   }
272c733ed4SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
282c733ed4SBarry Smith   fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
292c733ed4SBarry Smith #else
302c733ed4SBarry Smith   {
312c733ed4SBarry Smith     PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
322c733ed4SBarry Smith     const MatScalar *v;
332c733ed4SBarry Smith     PetscInt         jdx, idt, idx, nz, i, ai16;
342c733ed4SBarry Smith     const PetscInt  *vi;
352c733ed4SBarry Smith 
362c733ed4SBarry Smith     /* forward solve the lower triangular */
372c733ed4SBarry Smith     idx  = 0;
389371c9d4SSatish Balay     x[0] = b[0];
399371c9d4SSatish Balay     x[1] = b[1];
409371c9d4SSatish Balay     x[2] = b[2];
419371c9d4SSatish Balay     x[3] = b[3];
422c733ed4SBarry Smith     for (i = 1; i < n; i++) {
432c733ed4SBarry Smith       v  = aa + 16 * ai[i];
442c733ed4SBarry Smith       vi = aj + ai[i];
452c733ed4SBarry Smith       nz = diag[i] - ai[i];
462c733ed4SBarry Smith       idx += 4;
479371c9d4SSatish Balay       s1 = b[idx];
489371c9d4SSatish Balay       s2 = b[1 + idx];
499371c9d4SSatish Balay       s3 = b[2 + idx];
509371c9d4SSatish Balay       s4 = b[3 + idx];
512c733ed4SBarry Smith       while (nz--) {
522c733ed4SBarry Smith         jdx = 4 * (*vi++);
539371c9d4SSatish Balay         x1  = x[jdx];
549371c9d4SSatish Balay         x2  = x[1 + jdx];
559371c9d4SSatish Balay         x3  = x[2 + jdx];
569371c9d4SSatish Balay         x4  = x[3 + jdx];
572c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
582c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
592c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
602c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
612c733ed4SBarry Smith         v += 16;
622c733ed4SBarry Smith       }
632c733ed4SBarry Smith       x[idx]     = s1;
642c733ed4SBarry Smith       x[1 + idx] = s2;
652c733ed4SBarry Smith       x[2 + idx] = s3;
662c733ed4SBarry Smith       x[3 + idx] = s4;
672c733ed4SBarry Smith     }
682c733ed4SBarry Smith     /* backward solve the upper triangular */
692c733ed4SBarry Smith     idt = 4 * (n - 1);
702c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
712c733ed4SBarry Smith       ai16 = 16 * diag[i];
722c733ed4SBarry Smith       v    = aa + ai16 + 16;
732c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
742c733ed4SBarry Smith       nz   = ai[i + 1] - diag[i] - 1;
759371c9d4SSatish Balay       s1   = x[idt];
769371c9d4SSatish Balay       s2   = x[1 + idt];
779371c9d4SSatish Balay       s3   = x[2 + idt];
789371c9d4SSatish Balay       s4   = x[3 + idt];
792c733ed4SBarry Smith       while (nz--) {
802c733ed4SBarry Smith         idx = 4 * (*vi++);
819371c9d4SSatish Balay         x1  = x[idx];
829371c9d4SSatish Balay         x2  = x[1 + idx];
839371c9d4SSatish Balay         x3  = x[2 + idx];
849371c9d4SSatish Balay         x4  = x[3 + idx];
852c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
862c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
872c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
882c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
892c733ed4SBarry Smith         v += 16;
902c733ed4SBarry Smith       }
912c733ed4SBarry Smith       v          = aa + ai16;
922c733ed4SBarry Smith       x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
932c733ed4SBarry Smith       x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
942c733ed4SBarry Smith       x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
952c733ed4SBarry Smith       x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
962c733ed4SBarry Smith       idt -= 4;
972c733ed4SBarry Smith     }
982c733ed4SBarry Smith   }
992c733ed4SBarry Smith #endif
1002c733ed4SBarry Smith 
1019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
1029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
1039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
104*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1052c733ed4SBarry Smith }
1062c733ed4SBarry Smith 
107d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
108d71ae5a4SJacob Faibussowitsch {
1092c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1102c733ed4SBarry Smith   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1112c733ed4SBarry Smith   PetscInt           i, k, nz, idx, jdx, idt;
1122c733ed4SBarry Smith   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
1132c733ed4SBarry Smith   const MatScalar   *aa = a->a, *v;
1142c733ed4SBarry Smith   PetscScalar       *x;
1152c733ed4SBarry Smith   const PetscScalar *b;
1162c733ed4SBarry Smith   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;
1172c733ed4SBarry Smith 
1182c733ed4SBarry Smith   PetscFunctionBegin;
1199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
1209566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
1212c733ed4SBarry Smith   /* forward solve the lower triangular */
1222c733ed4SBarry Smith   idx  = 0;
1239371c9d4SSatish Balay   x[0] = b[idx];
1249371c9d4SSatish Balay   x[1] = b[1 + idx];
1259371c9d4SSatish Balay   x[2] = b[2 + idx];
1269371c9d4SSatish Balay   x[3] = b[3 + idx];
1272c733ed4SBarry Smith   for (i = 1; i < n; i++) {
1282c733ed4SBarry Smith     v   = aa + bs2 * ai[i];
1292c733ed4SBarry Smith     vi  = aj + ai[i];
1302c733ed4SBarry Smith     nz  = ai[i + 1] - ai[i];
1312c733ed4SBarry Smith     idx = bs * i;
1329371c9d4SSatish Balay     s1  = b[idx];
1339371c9d4SSatish Balay     s2  = b[1 + idx];
1349371c9d4SSatish Balay     s3  = b[2 + idx];
1359371c9d4SSatish Balay     s4  = b[3 + idx];
1362c733ed4SBarry Smith     for (k = 0; k < nz; k++) {
1372c733ed4SBarry Smith       jdx = bs * vi[k];
1389371c9d4SSatish Balay       x1  = x[jdx];
1399371c9d4SSatish Balay       x2  = x[1 + jdx];
1409371c9d4SSatish Balay       x3  = x[2 + jdx];
1419371c9d4SSatish Balay       x4  = x[3 + jdx];
1422c733ed4SBarry Smith       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1432c733ed4SBarry Smith       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1442c733ed4SBarry Smith       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1452c733ed4SBarry Smith       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1462c733ed4SBarry Smith 
1472c733ed4SBarry Smith       v += bs2;
1482c733ed4SBarry Smith     }
1492c733ed4SBarry Smith 
1502c733ed4SBarry Smith     x[idx]     = s1;
1512c733ed4SBarry Smith     x[1 + idx] = s2;
1522c733ed4SBarry Smith     x[2 + idx] = s3;
1532c733ed4SBarry Smith     x[3 + idx] = s4;
1542c733ed4SBarry Smith   }
1552c733ed4SBarry Smith 
1562c733ed4SBarry Smith   /* backward solve the upper triangular */
1572c733ed4SBarry Smith   for (i = n - 1; i >= 0; i--) {
1582c733ed4SBarry Smith     v   = aa + bs2 * (adiag[i + 1] + 1);
1592c733ed4SBarry Smith     vi  = aj + adiag[i + 1] + 1;
1602c733ed4SBarry Smith     nz  = adiag[i] - adiag[i + 1] - 1;
1612c733ed4SBarry Smith     idt = bs * i;
1629371c9d4SSatish Balay     s1  = x[idt];
1639371c9d4SSatish Balay     s2  = x[1 + idt];
1649371c9d4SSatish Balay     s3  = x[2 + idt];
1659371c9d4SSatish Balay     s4  = x[3 + idt];
1662c733ed4SBarry Smith 
1672c733ed4SBarry Smith     for (k = 0; k < nz; k++) {
1682c733ed4SBarry Smith       idx = bs * vi[k];
1699371c9d4SSatish Balay       x1  = x[idx];
1709371c9d4SSatish Balay       x2  = x[1 + idx];
1719371c9d4SSatish Balay       x3  = x[2 + idx];
1729371c9d4SSatish Balay       x4  = x[3 + idx];
1732c733ed4SBarry Smith       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1742c733ed4SBarry Smith       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1752c733ed4SBarry Smith       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1762c733ed4SBarry Smith       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1772c733ed4SBarry Smith 
1782c733ed4SBarry Smith       v += bs2;
1792c733ed4SBarry Smith     }
1802c733ed4SBarry Smith     /* x = inv_diagonal*x */
1812c733ed4SBarry Smith     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
182feb237baSPierre Jolivet     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
1832c733ed4SBarry Smith     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
1842c733ed4SBarry Smith     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
1852c733ed4SBarry Smith   }
1862c733ed4SBarry Smith 
1879566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
1889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
1899566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
190*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1912c733ed4SBarry Smith }
1922c733ed4SBarry Smith 
193d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx)
194d71ae5a4SJacob Faibussowitsch {
1952c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1962c733ed4SBarry Smith   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
1972c733ed4SBarry Smith   const MatScalar   *aa = a->a;
1982c733ed4SBarry Smith   const PetscScalar *b;
1992c733ed4SBarry Smith   PetscScalar       *x;
2002c733ed4SBarry Smith 
2012c733ed4SBarry Smith   PetscFunctionBegin;
2029566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
2039566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
2042c733ed4SBarry Smith 
2052c733ed4SBarry Smith   {
2062c733ed4SBarry Smith     MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
2072c733ed4SBarry Smith     const MatScalar *v;
2082c733ed4SBarry Smith     MatScalar       *t = (MatScalar *)x;
2092c733ed4SBarry Smith     PetscInt         jdx, idt, idx, nz, i, ai16;
2102c733ed4SBarry Smith     const PetscInt  *vi;
2112c733ed4SBarry Smith 
2122c733ed4SBarry Smith     /* forward solve the lower triangular */
2132c733ed4SBarry Smith     idx  = 0;
2142c733ed4SBarry Smith     t[0] = (MatScalar)b[0];
2152c733ed4SBarry Smith     t[1] = (MatScalar)b[1];
2162c733ed4SBarry Smith     t[2] = (MatScalar)b[2];
2172c733ed4SBarry Smith     t[3] = (MatScalar)b[3];
2182c733ed4SBarry Smith     for (i = 1; i < n; i++) {
2192c733ed4SBarry Smith       v  = aa + 16 * ai[i];
2202c733ed4SBarry Smith       vi = aj + ai[i];
2212c733ed4SBarry Smith       nz = diag[i] - ai[i];
2222c733ed4SBarry Smith       idx += 4;
2232c733ed4SBarry Smith       s1 = (MatScalar)b[idx];
2242c733ed4SBarry Smith       s2 = (MatScalar)b[1 + idx];
2252c733ed4SBarry Smith       s3 = (MatScalar)b[2 + idx];
2262c733ed4SBarry Smith       s4 = (MatScalar)b[3 + idx];
2272c733ed4SBarry Smith       while (nz--) {
2282c733ed4SBarry Smith         jdx = 4 * (*vi++);
2292c733ed4SBarry Smith         x1  = t[jdx];
2302c733ed4SBarry Smith         x2  = t[1 + jdx];
2312c733ed4SBarry Smith         x3  = t[2 + jdx];
2322c733ed4SBarry Smith         x4  = t[3 + jdx];
2332c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2342c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2352c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2362c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2372c733ed4SBarry Smith         v += 16;
2382c733ed4SBarry Smith       }
2392c733ed4SBarry Smith       t[idx]     = s1;
2402c733ed4SBarry Smith       t[1 + idx] = s2;
2412c733ed4SBarry Smith       t[2 + idx] = s3;
2422c733ed4SBarry Smith       t[3 + idx] = s4;
2432c733ed4SBarry Smith     }
2442c733ed4SBarry Smith     /* backward solve the upper triangular */
2452c733ed4SBarry Smith     idt = 4 * (n - 1);
2462c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
2472c733ed4SBarry Smith       ai16 = 16 * diag[i];
2482c733ed4SBarry Smith       v    = aa + ai16 + 16;
2492c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
2502c733ed4SBarry Smith       nz   = ai[i + 1] - diag[i] - 1;
2512c733ed4SBarry Smith       s1   = t[idt];
2522c733ed4SBarry Smith       s2   = t[1 + idt];
2532c733ed4SBarry Smith       s3   = t[2 + idt];
2542c733ed4SBarry Smith       s4   = t[3 + idt];
2552c733ed4SBarry Smith       while (nz--) {
2562c733ed4SBarry Smith         idx = 4 * (*vi++);
2572c733ed4SBarry Smith         x1  = (MatScalar)x[idx];
2582c733ed4SBarry Smith         x2  = (MatScalar)x[1 + idx];
2592c733ed4SBarry Smith         x3  = (MatScalar)x[2 + idx];
2602c733ed4SBarry Smith         x4  = (MatScalar)x[3 + idx];
2612c733ed4SBarry Smith         s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2622c733ed4SBarry Smith         s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2632c733ed4SBarry Smith         s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2642c733ed4SBarry Smith         s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2652c733ed4SBarry Smith         v += 16;
2662c733ed4SBarry Smith       }
2672c733ed4SBarry Smith       v          = aa + ai16;
2682c733ed4SBarry Smith       x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
2692c733ed4SBarry Smith       x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
2702c733ed4SBarry Smith       x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
2712c733ed4SBarry Smith       x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
2722c733ed4SBarry Smith       idt -= 4;
2732c733ed4SBarry Smith     }
2742c733ed4SBarry Smith   }
2752c733ed4SBarry Smith 
2769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
2779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
2789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
279*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2802c733ed4SBarry Smith }
2812c733ed4SBarry Smith 
2822c733ed4SBarry Smith #if defined(PETSC_HAVE_SSE)
2832c733ed4SBarry Smith 
2842c733ed4SBarry Smith   #include PETSC_HAVE_SSE
285d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
286d71ae5a4SJacob Faibussowitsch {
2872c733ed4SBarry Smith   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
2882c733ed4SBarry Smith   unsigned short *aj = (unsigned short *)a->j;
2892c733ed4SBarry Smith   int            *ai = a->i, n = a->mbs, *diag = a->diag;
2902c733ed4SBarry Smith   MatScalar      *aa = a->a;
2912c733ed4SBarry Smith   PetscScalar    *x, *b;
2922c733ed4SBarry Smith 
2932c733ed4SBarry Smith   PetscFunctionBegin;
2942c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
2952c733ed4SBarry Smith   /*
2962c733ed4SBarry Smith      Note: This code currently uses demotion of double
2972c733ed4SBarry Smith      to float when performing the mixed-mode computation.
2982c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
2992c733ed4SBarry Smith   */
3002c733ed4SBarry Smith   PREFETCH_NTA(aa + 16 * ai[1]);
3012c733ed4SBarry Smith 
3029566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb, &b));
3039566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
3042c733ed4SBarry Smith   {
3052c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
3062c733ed4SBarry Smith     MatScalar      *v, *t = (MatScalar *)x;
3072c733ed4SBarry Smith     int             nz, i, idt, ai16;
3082c733ed4SBarry Smith     unsigned int    jdx, idx;
3092c733ed4SBarry Smith     unsigned short *vi;
3102c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
3112c733ed4SBarry Smith 
3122c733ed4SBarry Smith     /* First block is the identity. */
3132c733ed4SBarry Smith     idx = 0;
3142c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t, b);
3152c733ed4SBarry Smith     v = aa + 16 * ((unsigned int)ai[1]);
3162c733ed4SBarry Smith 
3172c733ed4SBarry Smith     for (i = 1; i < n;) {
3182c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
3192c733ed4SBarry Smith       vi = aj + ai[i];
3202c733ed4SBarry Smith       nz = diag[i] - ai[i];
3212c733ed4SBarry Smith       idx += 4;
3222c733ed4SBarry Smith 
3232c733ed4SBarry Smith       /* Demote RHS from double to float. */
3242c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
3252c733ed4SBarry Smith       LOAD_PS(&t[idx], XMM7);
3262c733ed4SBarry Smith 
3272c733ed4SBarry Smith       while (nz--) {
3282c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3292c733ed4SBarry Smith         jdx = 4 * ((unsigned int)(*vi++));
3302c733ed4SBarry Smith 
3312c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
3322c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx], v)
3332c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
3342c733ed4SBarry Smith 
3352c733ed4SBarry Smith         /* First Column */
3362c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
3372c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
3382c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
3392c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
3402c733ed4SBarry Smith 
3412c733ed4SBarry Smith         /* Second Column */
3422c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
3432c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
3442c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
3452c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
3462c733ed4SBarry Smith 
3472c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
3482c733ed4SBarry Smith 
3492c733ed4SBarry Smith         /* Third Column */
3502c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
3512c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
3522c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
3532c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
3542c733ed4SBarry Smith 
3552c733ed4SBarry Smith         /* Fourth Column */
3562c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
3572c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
3582c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
3592c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
3602c733ed4SBarry Smith         SSE_INLINE_END_2
3612c733ed4SBarry Smith 
3622c733ed4SBarry Smith         v += 16;
3632c733ed4SBarry Smith       }
3642c733ed4SBarry Smith       v = aa + 16 * ai[++i];
3652c733ed4SBarry Smith       PREFETCH_NTA(v);
3662c733ed4SBarry Smith       STORE_PS(&t[idx], XMM7);
3672c733ed4SBarry Smith     }
3682c733ed4SBarry Smith 
3692c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
3702c733ed4SBarry Smith 
3712c733ed4SBarry Smith     idt  = 4 * (n - 1);
3722c733ed4SBarry Smith     ai16 = 16 * diag[n - 1];
3732c733ed4SBarry Smith     v    = aa + ai16 + 16;
3742c733ed4SBarry Smith     for (i = n - 1; i >= 0;) {
3752c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
3762c733ed4SBarry Smith       vi = aj + diag[i] + 1;
3772c733ed4SBarry Smith       nz = ai[i + 1] - diag[i] - 1;
3782c733ed4SBarry Smith 
3792c733ed4SBarry Smith       LOAD_PS(&t[idt], XMM7);
3802c733ed4SBarry Smith 
3812c733ed4SBarry Smith       while (nz--) {
3822c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3832c733ed4SBarry Smith         idx = 4 * ((unsigned int)(*vi++));
3842c733ed4SBarry Smith 
3852c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
3862c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx], v)
3872c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
3882c733ed4SBarry Smith 
3892c733ed4SBarry Smith         /* First Column */
3902c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
3912c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
3922c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
3932c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
3942c733ed4SBarry Smith 
3952c733ed4SBarry Smith         /* Second Column */
3962c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
3972c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
3982c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
3992c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
4002c733ed4SBarry Smith 
4012c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
4022c733ed4SBarry Smith 
4032c733ed4SBarry Smith         /* Third Column */
4042c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
4052c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
4062c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
4072c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
4082c733ed4SBarry Smith 
4092c733ed4SBarry Smith         /* Fourth Column */
4102c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
4112c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
4122c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
4132c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
4142c733ed4SBarry Smith         SSE_INLINE_END_2
4152c733ed4SBarry Smith         v += 16;
4162c733ed4SBarry Smith       }
4172c733ed4SBarry Smith       v    = aa + ai16;
4182c733ed4SBarry Smith       ai16 = 16 * diag[--i];
4192c733ed4SBarry Smith       PREFETCH_NTA(aa + ai16 + 16);
4202c733ed4SBarry Smith       /*
4212c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
4222c733ed4SBarry Smith          which was inverted as part of the factorization
4232c733ed4SBarry Smith       */
4242c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
4252c733ed4SBarry Smith       /* First Column */
4262c733ed4SBarry Smith       SSE_COPY_PS(XMM0, XMM7)
4272c733ed4SBarry Smith       SSE_SHUFFLE(XMM0, XMM0, 0x00)
4282c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
4292c733ed4SBarry Smith 
4302c733ed4SBarry Smith       /* Second Column */
4312c733ed4SBarry Smith       SSE_COPY_PS(XMM1, XMM7)
4322c733ed4SBarry Smith       SSE_SHUFFLE(XMM1, XMM1, 0x55)
4332c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
4342c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM1)
4352c733ed4SBarry Smith 
4362c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
4372c733ed4SBarry Smith 
4382c733ed4SBarry Smith       /* Third Column */
4392c733ed4SBarry Smith       SSE_COPY_PS(XMM2, XMM7)
4402c733ed4SBarry Smith       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
4412c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
4422c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM2)
4432c733ed4SBarry Smith 
4442c733ed4SBarry Smith       /* Fourth Column */
4452c733ed4SBarry Smith       SSE_COPY_PS(XMM3, XMM7)
4462c733ed4SBarry Smith       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
4472c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
4482c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM3)
4492c733ed4SBarry Smith 
4502c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
4512c733ed4SBarry Smith       SSE_INLINE_END_3
4522c733ed4SBarry Smith 
4532c733ed4SBarry Smith       v = aa + ai16 + 16;
4542c733ed4SBarry Smith       idt -= 4;
4552c733ed4SBarry Smith     }
4562c733ed4SBarry Smith 
4572c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
4582c733ed4SBarry Smith     idt = 4 * (n - 1);
4592c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
4602c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4612c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4622c733ed4SBarry Smith       PetscScalar *xtemp = &x[idt];
4632c733ed4SBarry Smith       MatScalar   *ttemp = &t[idt];
4642c733ed4SBarry Smith       xtemp[3]           = (PetscScalar)ttemp[3];
4652c733ed4SBarry Smith       xtemp[2]           = (PetscScalar)ttemp[2];
4662c733ed4SBarry Smith       xtemp[1]           = (PetscScalar)ttemp[1];
4672c733ed4SBarry Smith       xtemp[0]           = (PetscScalar)ttemp[0];
4682c733ed4SBarry Smith       idt -= 4;
4692c733ed4SBarry Smith     }
4702c733ed4SBarry Smith 
4712c733ed4SBarry Smith   } /* End of artificial scope. */
4729566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb, &b));
4739566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
4749566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
4752c733ed4SBarry Smith   SSE_SCOPE_END;
476*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4772c733ed4SBarry Smith }
4782c733ed4SBarry Smith 
479d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
480d71ae5a4SJacob Faibussowitsch {
4812c733ed4SBarry Smith   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
4822c733ed4SBarry Smith   int         *aj = a->j;
4832c733ed4SBarry Smith   int         *ai = a->i, n = a->mbs, *diag = a->diag;
4842c733ed4SBarry Smith   MatScalar   *aa = a->a;
4852c733ed4SBarry Smith   PetscScalar *x, *b;
4862c733ed4SBarry Smith 
4872c733ed4SBarry Smith   PetscFunctionBegin;
4882c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
4892c733ed4SBarry Smith   /*
4902c733ed4SBarry Smith      Note: This code currently uses demotion of double
4912c733ed4SBarry Smith      to float when performing the mixed-mode computation.
4922c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
4932c733ed4SBarry Smith   */
4942c733ed4SBarry Smith   PREFETCH_NTA(aa + 16 * ai[1]);
4952c733ed4SBarry Smith 
4969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb, &b));
4979566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
4982c733ed4SBarry Smith   {
4992c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
5002c733ed4SBarry Smith     MatScalar *v, *t = (MatScalar *)x;
5012c733ed4SBarry Smith     int        nz, i, idt, ai16;
5022c733ed4SBarry Smith     int        jdx, idx;
5032c733ed4SBarry Smith     int       *vi;
5042c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
5052c733ed4SBarry Smith 
5062c733ed4SBarry Smith     /* First block is the identity. */
5072c733ed4SBarry Smith     idx = 0;
5082c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t, b);
5092c733ed4SBarry Smith     v = aa + 16 * ai[1];
5102c733ed4SBarry Smith 
5112c733ed4SBarry Smith     for (i = 1; i < n;) {
5122c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
5132c733ed4SBarry Smith       vi = aj + ai[i];
5142c733ed4SBarry Smith       nz = diag[i] - ai[i];
5152c733ed4SBarry Smith       idx += 4;
5162c733ed4SBarry Smith 
5172c733ed4SBarry Smith       /* Demote RHS from double to float. */
5182c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
5192c733ed4SBarry Smith       LOAD_PS(&t[idx], XMM7);
5202c733ed4SBarry Smith 
5212c733ed4SBarry Smith       while (nz--) {
5222c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
5232c733ed4SBarry Smith         jdx = 4 * (*vi++);
5242c733ed4SBarry Smith         /*          jdx = *vi++; */
5252c733ed4SBarry Smith 
5262c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
5272c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx], v)
5282c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
5292c733ed4SBarry Smith 
5302c733ed4SBarry Smith         /* First Column */
5312c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
5322c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
5332c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
5342c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
5352c733ed4SBarry Smith 
5362c733ed4SBarry Smith         /* Second Column */
5372c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
5382c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
5392c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
5402c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
5412c733ed4SBarry Smith 
5422c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
5432c733ed4SBarry Smith 
5442c733ed4SBarry Smith         /* Third Column */
5452c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
5462c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
5472c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
5482c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
5492c733ed4SBarry Smith 
5502c733ed4SBarry Smith         /* Fourth Column */
5512c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
5522c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
5532c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
5542c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
5552c733ed4SBarry Smith         SSE_INLINE_END_2
5562c733ed4SBarry Smith 
5572c733ed4SBarry Smith         v += 16;
5582c733ed4SBarry Smith       }
5592c733ed4SBarry Smith       v = aa + 16 * ai[++i];
5602c733ed4SBarry Smith       PREFETCH_NTA(v);
5612c733ed4SBarry Smith       STORE_PS(&t[idx], XMM7);
5622c733ed4SBarry Smith     }
5632c733ed4SBarry Smith 
5642c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
5652c733ed4SBarry Smith 
5662c733ed4SBarry Smith     idt  = 4 * (n - 1);
5672c733ed4SBarry Smith     ai16 = 16 * diag[n - 1];
5682c733ed4SBarry Smith     v    = aa + ai16 + 16;
5692c733ed4SBarry Smith     for (i = n - 1; i >= 0;) {
5702c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
5712c733ed4SBarry Smith       vi = aj + diag[i] + 1;
5722c733ed4SBarry Smith       nz = ai[i + 1] - diag[i] - 1;
5732c733ed4SBarry Smith 
5742c733ed4SBarry Smith       LOAD_PS(&t[idt], XMM7);
5752c733ed4SBarry Smith 
5762c733ed4SBarry Smith       while (nz--) {
5772c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
5782c733ed4SBarry Smith         idx = 4 * (*vi++);
5792c733ed4SBarry Smith         /*          idx = *vi++; */
5802c733ed4SBarry Smith 
5812c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
5822c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx], v)
5832c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
5842c733ed4SBarry Smith 
5852c733ed4SBarry Smith         /* First Column */
5862c733ed4SBarry Smith         SSE_COPY_PS(XMM0, XMM6)
5872c733ed4SBarry Smith         SSE_SHUFFLE(XMM0, XMM0, 0x00)
5882c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
5892c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM0)
5902c733ed4SBarry Smith 
5912c733ed4SBarry Smith         /* Second Column */
5922c733ed4SBarry Smith         SSE_COPY_PS(XMM1, XMM6)
5932c733ed4SBarry Smith         SSE_SHUFFLE(XMM1, XMM1, 0x55)
5942c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
5952c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM1)
5962c733ed4SBarry Smith 
5972c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
5982c733ed4SBarry Smith 
5992c733ed4SBarry Smith         /* Third Column */
6002c733ed4SBarry Smith         SSE_COPY_PS(XMM2, XMM6)
6012c733ed4SBarry Smith         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
6022c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
6032c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM2)
6042c733ed4SBarry Smith 
6052c733ed4SBarry Smith         /* Fourth Column */
6062c733ed4SBarry Smith         SSE_COPY_PS(XMM3, XMM6)
6072c733ed4SBarry Smith         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
6082c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
6092c733ed4SBarry Smith         SSE_SUB_PS(XMM7, XMM3)
6102c733ed4SBarry Smith         SSE_INLINE_END_2
6112c733ed4SBarry Smith         v += 16;
6122c733ed4SBarry Smith       }
6132c733ed4SBarry Smith       v    = aa + ai16;
6142c733ed4SBarry Smith       ai16 = 16 * diag[--i];
6152c733ed4SBarry Smith       PREFETCH_NTA(aa + ai16 + 16);
6162c733ed4SBarry Smith       /*
6172c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
6182c733ed4SBarry Smith          which was inverted as part of the factorization
6192c733ed4SBarry Smith       */
6202c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
6212c733ed4SBarry Smith       /* First Column */
6222c733ed4SBarry Smith       SSE_COPY_PS(XMM0, XMM7)
6232c733ed4SBarry Smith       SSE_SHUFFLE(XMM0, XMM0, 0x00)
6242c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
6252c733ed4SBarry Smith 
6262c733ed4SBarry Smith       /* Second Column */
6272c733ed4SBarry Smith       SSE_COPY_PS(XMM1, XMM7)
6282c733ed4SBarry Smith       SSE_SHUFFLE(XMM1, XMM1, 0x55)
6292c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
6302c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM1)
6312c733ed4SBarry Smith 
6322c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
6332c733ed4SBarry Smith 
6342c733ed4SBarry Smith       /* Third Column */
6352c733ed4SBarry Smith       SSE_COPY_PS(XMM2, XMM7)
6362c733ed4SBarry Smith       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
6372c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
6382c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM2)
6392c733ed4SBarry Smith 
6402c733ed4SBarry Smith       /* Fourth Column */
6412c733ed4SBarry Smith       SSE_COPY_PS(XMM3, XMM7)
6422c733ed4SBarry Smith       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
6432c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
6442c733ed4SBarry Smith       SSE_ADD_PS(XMM0, XMM3)
6452c733ed4SBarry Smith 
6462c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
6472c733ed4SBarry Smith       SSE_INLINE_END_3
6482c733ed4SBarry Smith 
6492c733ed4SBarry Smith       v = aa + ai16 + 16;
6502c733ed4SBarry Smith       idt -= 4;
6512c733ed4SBarry Smith     }
6522c733ed4SBarry Smith 
6532c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
6542c733ed4SBarry Smith     idt = 4 * (n - 1);
6552c733ed4SBarry Smith     for (i = n - 1; i >= 0; i--) {
6562c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
6572c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
6582c733ed4SBarry Smith       PetscScalar *xtemp = &x[idt];
6592c733ed4SBarry Smith       MatScalar   *ttemp = &t[idt];
6602c733ed4SBarry Smith       xtemp[3]           = (PetscScalar)ttemp[3];
6612c733ed4SBarry Smith       xtemp[2]           = (PetscScalar)ttemp[2];
6622c733ed4SBarry Smith       xtemp[1]           = (PetscScalar)ttemp[1];
6632c733ed4SBarry Smith       xtemp[0]           = (PetscScalar)ttemp[0];
6642c733ed4SBarry Smith       idt -= 4;
6652c733ed4SBarry Smith     }
6662c733ed4SBarry Smith 
6672c733ed4SBarry Smith   } /* End of artificial scope. */
6689566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb, &b));
6699566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
6709566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
6712c733ed4SBarry Smith   SSE_SCOPE_END;
672*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6732c733ed4SBarry Smith }
6742c733ed4SBarry Smith 
6752c733ed4SBarry Smith #endif
676