xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision 9566063d113dddea24716c546802770db7481bc0)
12c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h>
22c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h>
32c733ed4SBarry Smith 
42c733ed4SBarry Smith /*
52c733ed4SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
62c733ed4SBarry Smith    ordering. This eliminates the need for the column and row permutation.
72c733ed4SBarry Smith */
82c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
92c733ed4SBarry Smith {
102c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
112c733ed4SBarry Smith   PetscInt          n  =a->mbs;
122c733ed4SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
132c733ed4SBarry Smith   const PetscInt    *diag = a->diag;
142c733ed4SBarry Smith   const MatScalar   *aa   =a->a;
152c733ed4SBarry Smith   PetscScalar       *x;
162c733ed4SBarry Smith   const PetscScalar *b;
172c733ed4SBarry Smith 
182c733ed4SBarry Smith   PetscFunctionBegin;
19*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
20*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
212c733ed4SBarry Smith 
222c733ed4SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
232c733ed4SBarry Smith   {
242c733ed4SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
252c733ed4SBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
262c733ed4SBarry Smith   }
272c733ed4SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
282c733ed4SBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
292c733ed4SBarry Smith #else
302c733ed4SBarry Smith   {
312c733ed4SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
322c733ed4SBarry Smith     const MatScalar *v;
332c733ed4SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
342c733ed4SBarry Smith     const PetscInt  *vi;
352c733ed4SBarry Smith 
362c733ed4SBarry Smith     /* forward solve the lower triangular */
372c733ed4SBarry Smith     idx  = 0;
382c733ed4SBarry Smith     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
392c733ed4SBarry Smith     for (i=1; i<n; i++) {
402c733ed4SBarry Smith       v    =  aa      + 16*ai[i];
412c733ed4SBarry Smith       vi   =  aj      + ai[i];
422c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
432c733ed4SBarry Smith       idx +=  4;
442c733ed4SBarry Smith       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
452c733ed4SBarry Smith       while (nz--) {
462c733ed4SBarry Smith         jdx = 4*(*vi++);
472c733ed4SBarry Smith         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
482c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
492c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
502c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
512c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
522c733ed4SBarry Smith         v  += 16;
532c733ed4SBarry Smith       }
542c733ed4SBarry Smith       x[idx]   = s1;
552c733ed4SBarry Smith       x[1+idx] = s2;
562c733ed4SBarry Smith       x[2+idx] = s3;
572c733ed4SBarry Smith       x[3+idx] = s4;
582c733ed4SBarry Smith     }
592c733ed4SBarry Smith     /* backward solve the upper triangular */
602c733ed4SBarry Smith     idt = 4*(n-1);
612c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
622c733ed4SBarry Smith       ai16 = 16*diag[i];
632c733ed4SBarry Smith       v    = aa + ai16 + 16;
642c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
652c733ed4SBarry Smith       nz   = ai[i+1] - diag[i] - 1;
662c733ed4SBarry Smith       s1   = x[idt];  s2 = x[1+idt];
672c733ed4SBarry Smith       s3   = x[2+idt];s4 = x[3+idt];
682c733ed4SBarry Smith       while (nz--) {
692c733ed4SBarry Smith         idx = 4*(*vi++);
702c733ed4SBarry Smith         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
712c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
722c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
732c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
742c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
752c733ed4SBarry Smith         v  += 16;
762c733ed4SBarry Smith       }
772c733ed4SBarry Smith       v        = aa + ai16;
782c733ed4SBarry Smith       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
792c733ed4SBarry Smith       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
802c733ed4SBarry Smith       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
812c733ed4SBarry Smith       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
822c733ed4SBarry Smith       idt     -= 4;
832c733ed4SBarry Smith     }
842c733ed4SBarry Smith   }
852c733ed4SBarry Smith #endif
862c733ed4SBarry Smith 
87*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
88*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
89*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
902c733ed4SBarry Smith   PetscFunctionReturn(0);
912c733ed4SBarry Smith }
922c733ed4SBarry Smith 
932c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
942c733ed4SBarry Smith {
952c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
962c733ed4SBarry Smith   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
972c733ed4SBarry Smith   PetscInt          i,k,nz,idx,jdx,idt;
982c733ed4SBarry Smith   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
992c733ed4SBarry Smith   const MatScalar   *aa=a->a,*v;
1002c733ed4SBarry Smith   PetscScalar       *x;
1012c733ed4SBarry Smith   const PetscScalar *b;
1022c733ed4SBarry Smith   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
1032c733ed4SBarry Smith 
1042c733ed4SBarry Smith   PetscFunctionBegin;
105*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
106*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
1072c733ed4SBarry Smith   /* forward solve the lower triangular */
1082c733ed4SBarry Smith   idx  = 0;
1092c733ed4SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1102c733ed4SBarry Smith   for (i=1; i<n; i++) {
1112c733ed4SBarry Smith     v   = aa + bs2*ai[i];
1122c733ed4SBarry Smith     vi  = aj + ai[i];
1132c733ed4SBarry Smith     nz  = ai[i+1] - ai[i];
1142c733ed4SBarry Smith     idx = bs*i;
1152c733ed4SBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1162c733ed4SBarry Smith     for (k=0; k<nz; k++) {
1172c733ed4SBarry Smith       jdx = bs*vi[k];
1182c733ed4SBarry Smith       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1192c733ed4SBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1202c733ed4SBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1212c733ed4SBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1222c733ed4SBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1232c733ed4SBarry Smith 
1242c733ed4SBarry Smith       v +=  bs2;
1252c733ed4SBarry Smith     }
1262c733ed4SBarry Smith 
1272c733ed4SBarry Smith     x[idx]   = s1;
1282c733ed4SBarry Smith     x[1+idx] = s2;
1292c733ed4SBarry Smith     x[2+idx] = s3;
1302c733ed4SBarry Smith     x[3+idx] = s4;
1312c733ed4SBarry Smith   }
1322c733ed4SBarry Smith 
1332c733ed4SBarry Smith   /* backward solve the upper triangular */
1342c733ed4SBarry Smith   for (i=n-1; i>=0; i--) {
1352c733ed4SBarry Smith     v   = aa + bs2*(adiag[i+1]+1);
1362c733ed4SBarry Smith     vi  = aj + adiag[i+1]+1;
1372c733ed4SBarry Smith     nz  = adiag[i] - adiag[i+1]-1;
1382c733ed4SBarry Smith     idt = bs*i;
1392c733ed4SBarry Smith     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1402c733ed4SBarry Smith 
1412c733ed4SBarry Smith     for (k=0; k<nz; k++) {
1422c733ed4SBarry Smith       idx = bs*vi[k];
1432c733ed4SBarry Smith       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1442c733ed4SBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1452c733ed4SBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1462c733ed4SBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1472c733ed4SBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1482c733ed4SBarry Smith 
1492c733ed4SBarry Smith       v +=  bs2;
1502c733ed4SBarry Smith     }
1512c733ed4SBarry Smith     /* x = inv_diagonal*x */
1522c733ed4SBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
153feb237baSPierre Jolivet     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
1542c733ed4SBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1552c733ed4SBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1562c733ed4SBarry Smith 
1572c733ed4SBarry Smith   }
1582c733ed4SBarry Smith 
159*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
160*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
161*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n));
1622c733ed4SBarry Smith   PetscFunctionReturn(0);
1632c733ed4SBarry Smith }
1642c733ed4SBarry Smith 
1652c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
1662c733ed4SBarry Smith {
1672c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1682c733ed4SBarry Smith   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
1692c733ed4SBarry Smith   const MatScalar   *aa=a->a;
1702c733ed4SBarry Smith   const PetscScalar *b;
1712c733ed4SBarry Smith   PetscScalar       *x;
1722c733ed4SBarry Smith 
1732c733ed4SBarry Smith   PetscFunctionBegin;
174*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
175*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
1762c733ed4SBarry Smith 
1772c733ed4SBarry Smith   {
1782c733ed4SBarry Smith     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
1792c733ed4SBarry Smith     const MatScalar *v;
1802c733ed4SBarry Smith     MatScalar       *t=(MatScalar*)x;
1812c733ed4SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
1822c733ed4SBarry Smith     const PetscInt  *vi;
1832c733ed4SBarry Smith 
1842c733ed4SBarry Smith     /* forward solve the lower triangular */
1852c733ed4SBarry Smith     idx  = 0;
1862c733ed4SBarry Smith     t[0] = (MatScalar)b[0];
1872c733ed4SBarry Smith     t[1] = (MatScalar)b[1];
1882c733ed4SBarry Smith     t[2] = (MatScalar)b[2];
1892c733ed4SBarry Smith     t[3] = (MatScalar)b[3];
1902c733ed4SBarry Smith     for (i=1; i<n; i++) {
1912c733ed4SBarry Smith       v    =  aa      + 16*ai[i];
1922c733ed4SBarry Smith       vi   =  aj      + ai[i];
1932c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
1942c733ed4SBarry Smith       idx +=  4;
1952c733ed4SBarry Smith       s1   = (MatScalar)b[idx];
1962c733ed4SBarry Smith       s2   = (MatScalar)b[1+idx];
1972c733ed4SBarry Smith       s3   = (MatScalar)b[2+idx];
1982c733ed4SBarry Smith       s4   = (MatScalar)b[3+idx];
1992c733ed4SBarry Smith       while (nz--) {
2002c733ed4SBarry Smith         jdx = 4*(*vi++);
2012c733ed4SBarry Smith         x1  = t[jdx];
2022c733ed4SBarry Smith         x2  = t[1+jdx];
2032c733ed4SBarry Smith         x3  = t[2+jdx];
2042c733ed4SBarry Smith         x4  = t[3+jdx];
2052c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2062c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2072c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2082c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2092c733ed4SBarry Smith         v  += 16;
2102c733ed4SBarry Smith       }
2112c733ed4SBarry Smith       t[idx]   = s1;
2122c733ed4SBarry Smith       t[1+idx] = s2;
2132c733ed4SBarry Smith       t[2+idx] = s3;
2142c733ed4SBarry Smith       t[3+idx] = s4;
2152c733ed4SBarry Smith     }
2162c733ed4SBarry Smith     /* backward solve the upper triangular */
2172c733ed4SBarry Smith     idt = 4*(n-1);
2182c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
2192c733ed4SBarry Smith       ai16 = 16*diag[i];
2202c733ed4SBarry Smith       v    = aa + ai16 + 16;
2212c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
2222c733ed4SBarry Smith       nz   = ai[i+1] - diag[i] - 1;
2232c733ed4SBarry Smith       s1   = t[idt];
2242c733ed4SBarry Smith       s2   = t[1+idt];
2252c733ed4SBarry Smith       s3   = t[2+idt];
2262c733ed4SBarry Smith       s4   = t[3+idt];
2272c733ed4SBarry Smith       while (nz--) {
2282c733ed4SBarry Smith         idx = 4*(*vi++);
2292c733ed4SBarry Smith         x1  = (MatScalar)x[idx];
2302c733ed4SBarry Smith         x2  = (MatScalar)x[1+idx];
2312c733ed4SBarry Smith         x3  = (MatScalar)x[2+idx];
2322c733ed4SBarry Smith         x4  = (MatScalar)x[3+idx];
2332c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2342c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2352c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2362c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2372c733ed4SBarry Smith         v  += 16;
2382c733ed4SBarry Smith       }
2392c733ed4SBarry Smith       v        = aa + ai16;
2402c733ed4SBarry Smith       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2412c733ed4SBarry Smith       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2422c733ed4SBarry Smith       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2432c733ed4SBarry Smith       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2442c733ed4SBarry Smith       idt     -= 4;
2452c733ed4SBarry Smith     }
2462c733ed4SBarry Smith   }
2472c733ed4SBarry Smith 
248*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
249*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
250*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
2512c733ed4SBarry Smith   PetscFunctionReturn(0);
2522c733ed4SBarry Smith }
2532c733ed4SBarry Smith 
2542c733ed4SBarry Smith #if defined(PETSC_HAVE_SSE)
2552c733ed4SBarry Smith 
2562c733ed4SBarry Smith #include PETSC_HAVE_SSE
2572c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2582c733ed4SBarry Smith {
2592c733ed4SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2602c733ed4SBarry Smith   unsigned short *aj=(unsigned short*)a->j;
2612c733ed4SBarry Smith   PetscErrorCode ierr;
2622c733ed4SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2632c733ed4SBarry Smith   MatScalar      *aa=a->a;
2642c733ed4SBarry Smith   PetscScalar    *x,*b;
2652c733ed4SBarry Smith 
2662c733ed4SBarry Smith   PetscFunctionBegin;
2672c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
2682c733ed4SBarry Smith   /*
2692c733ed4SBarry Smith      Note: This code currently uses demotion of double
2702c733ed4SBarry Smith      to float when performing the mixed-mode computation.
2712c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
2722c733ed4SBarry Smith   */
2732c733ed4SBarry Smith   PREFETCH_NTA(aa+16*ai[1]);
2742c733ed4SBarry Smith 
275*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb,&b));
276*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
2772c733ed4SBarry Smith   {
2782c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
2792c733ed4SBarry Smith     MatScalar      *v,*t=(MatScalar*)x;
2802c733ed4SBarry Smith     int            nz,i,idt,ai16;
2812c733ed4SBarry Smith     unsigned int   jdx,idx;
2822c733ed4SBarry Smith     unsigned short *vi;
2832c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
2842c733ed4SBarry Smith 
2852c733ed4SBarry Smith     /* First block is the identity. */
2862c733ed4SBarry Smith     idx = 0;
2872c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t,b);
2882c733ed4SBarry Smith     v =  aa + 16*((unsigned int)ai[1]);
2892c733ed4SBarry Smith 
2902c733ed4SBarry Smith     for (i=1; i<n;) {
2912c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
2922c733ed4SBarry Smith       vi   =  aj      + ai[i];
2932c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
2942c733ed4SBarry Smith       idx +=  4;
2952c733ed4SBarry Smith 
2962c733ed4SBarry Smith       /* Demote RHS from double to float. */
2972c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2982c733ed4SBarry Smith       LOAD_PS(&t[idx],XMM7);
2992c733ed4SBarry Smith 
3002c733ed4SBarry Smith       while (nz--) {
3012c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3022c733ed4SBarry Smith         jdx = 4*((unsigned int)(*vi++));
3032c733ed4SBarry Smith 
3042c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
3052c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx],v)
3062c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3072c733ed4SBarry Smith 
3082c733ed4SBarry Smith         /* First Column */
3092c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
3102c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
3112c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3122c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
3132c733ed4SBarry Smith 
3142c733ed4SBarry Smith         /* Second Column */
3152c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
3162c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
3172c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3182c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
3192c733ed4SBarry Smith 
3202c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3212c733ed4SBarry Smith 
3222c733ed4SBarry Smith         /* Third Column */
3232c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
3242c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3252c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3262c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
3272c733ed4SBarry Smith 
3282c733ed4SBarry Smith         /* Fourth Column */
3292c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
3302c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3312c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3322c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
3332c733ed4SBarry Smith         SSE_INLINE_END_2
3342c733ed4SBarry Smith 
3352c733ed4SBarry Smith         v += 16;
3362c733ed4SBarry Smith       }
3372c733ed4SBarry Smith       v =  aa + 16*ai[++i];
3382c733ed4SBarry Smith       PREFETCH_NTA(v);
3392c733ed4SBarry Smith       STORE_PS(&t[idx],XMM7);
3402c733ed4SBarry Smith     }
3412c733ed4SBarry Smith 
3422c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
3432c733ed4SBarry Smith 
3442c733ed4SBarry Smith     idt  = 4*(n-1);
3452c733ed4SBarry Smith     ai16 = 16*diag[n-1];
3462c733ed4SBarry Smith     v    = aa + ai16 + 16;
3472c733ed4SBarry Smith     for (i=n-1; i>=0;) {
3482c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
3492c733ed4SBarry Smith       vi = aj + diag[i] + 1;
3502c733ed4SBarry Smith       nz = ai[i+1] - diag[i] - 1;
3512c733ed4SBarry Smith 
3522c733ed4SBarry Smith       LOAD_PS(&t[idt],XMM7);
3532c733ed4SBarry Smith 
3542c733ed4SBarry Smith       while (nz--) {
3552c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3562c733ed4SBarry Smith         idx = 4*((unsigned int)(*vi++));
3572c733ed4SBarry Smith 
3582c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
3592c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx],v)
3602c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3612c733ed4SBarry Smith 
3622c733ed4SBarry Smith         /* First Column */
3632c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
3642c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
3652c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3662c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
3672c733ed4SBarry Smith 
3682c733ed4SBarry Smith         /* Second Column */
3692c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
3702c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
3712c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3722c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
3732c733ed4SBarry Smith 
3742c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3752c733ed4SBarry Smith 
3762c733ed4SBarry Smith         /* Third Column */
3772c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
3782c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3792c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3802c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
3812c733ed4SBarry Smith 
3822c733ed4SBarry Smith         /* Fourth Column */
3832c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
3842c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3852c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3862c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
3872c733ed4SBarry Smith         SSE_INLINE_END_2
3882c733ed4SBarry Smith         v += 16;
3892c733ed4SBarry Smith       }
3902c733ed4SBarry Smith       v    = aa + ai16;
3912c733ed4SBarry Smith       ai16 = 16*diag[--i];
3922c733ed4SBarry Smith       PREFETCH_NTA(aa+ai16+16);
3932c733ed4SBarry Smith       /*
3942c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
3952c733ed4SBarry Smith          which was inverted as part of the factorization
3962c733ed4SBarry Smith       */
3972c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
3982c733ed4SBarry Smith       /* First Column */
3992c733ed4SBarry Smith       SSE_COPY_PS(XMM0,XMM7)
4002c733ed4SBarry Smith       SSE_SHUFFLE(XMM0,XMM0,0x00)
4012c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4022c733ed4SBarry Smith 
4032c733ed4SBarry Smith       /* Second Column */
4042c733ed4SBarry Smith       SSE_COPY_PS(XMM1,XMM7)
4052c733ed4SBarry Smith       SSE_SHUFFLE(XMM1,XMM1,0x55)
4062c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4072c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM1)
4082c733ed4SBarry Smith 
4092c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4102c733ed4SBarry Smith 
4112c733ed4SBarry Smith       /* Third Column */
4122c733ed4SBarry Smith       SSE_COPY_PS(XMM2,XMM7)
4132c733ed4SBarry Smith       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4142c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4152c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM2)
4162c733ed4SBarry Smith 
4172c733ed4SBarry Smith       /* Fourth Column */
4182c733ed4SBarry Smith       SSE_COPY_PS(XMM3,XMM7)
4192c733ed4SBarry Smith       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4202c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4212c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM3)
4222c733ed4SBarry Smith 
4232c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4242c733ed4SBarry Smith       SSE_INLINE_END_3
4252c733ed4SBarry Smith 
4262c733ed4SBarry Smith       v    = aa + ai16 + 16;
4272c733ed4SBarry Smith       idt -= 4;
4282c733ed4SBarry Smith     }
4292c733ed4SBarry Smith 
4302c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
4312c733ed4SBarry Smith     idt = 4*(n-1);
4322c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
4332c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4342c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4352c733ed4SBarry Smith       PetscScalar *xtemp=&x[idt];
4362c733ed4SBarry Smith       MatScalar   *ttemp=&t[idt];
4372c733ed4SBarry Smith       xtemp[3] = (PetscScalar)ttemp[3];
4382c733ed4SBarry Smith       xtemp[2] = (PetscScalar)ttemp[2];
4392c733ed4SBarry Smith       xtemp[1] = (PetscScalar)ttemp[1];
4402c733ed4SBarry Smith       xtemp[0] = (PetscScalar)ttemp[0];
4412c733ed4SBarry Smith       idt     -= 4;
4422c733ed4SBarry Smith     }
4432c733ed4SBarry Smith 
4442c733ed4SBarry Smith   } /* End of artificial scope. */
445*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb,&b));
446*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
447*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
4482c733ed4SBarry Smith   SSE_SCOPE_END;
4492c733ed4SBarry Smith   PetscFunctionReturn(0);
4502c733ed4SBarry Smith }
4512c733ed4SBarry Smith 
4522c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4532c733ed4SBarry Smith {
4542c733ed4SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
4552c733ed4SBarry Smith   int            *aj=a->j;
4562c733ed4SBarry Smith   PetscErrorCode ierr;
4572c733ed4SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4582c733ed4SBarry Smith   MatScalar      *aa=a->a;
4592c733ed4SBarry Smith   PetscScalar    *x,*b;
4602c733ed4SBarry Smith 
4612c733ed4SBarry Smith   PetscFunctionBegin;
4622c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
4632c733ed4SBarry Smith   /*
4642c733ed4SBarry Smith      Note: This code currently uses demotion of double
4652c733ed4SBarry Smith      to float when performing the mixed-mode computation.
4662c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
4672c733ed4SBarry Smith   */
4682c733ed4SBarry Smith   PREFETCH_NTA(aa+16*ai[1]);
4692c733ed4SBarry Smith 
470*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb,&b));
471*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
4722c733ed4SBarry Smith   {
4732c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
4742c733ed4SBarry Smith     MatScalar *v,*t=(MatScalar*)x;
4752c733ed4SBarry Smith     int       nz,i,idt,ai16;
4762c733ed4SBarry Smith     int       jdx,idx;
4772c733ed4SBarry Smith     int       *vi;
4782c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
4792c733ed4SBarry Smith 
4802c733ed4SBarry Smith     /* First block is the identity. */
4812c733ed4SBarry Smith     idx = 0;
4822c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t,b);
4832c733ed4SBarry Smith     v =  aa + 16*ai[1];
4842c733ed4SBarry Smith 
4852c733ed4SBarry Smith     for (i=1; i<n;) {
4862c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
4872c733ed4SBarry Smith       vi   =  aj      + ai[i];
4882c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
4892c733ed4SBarry Smith       idx +=  4;
4902c733ed4SBarry Smith 
4912c733ed4SBarry Smith       /* Demote RHS from double to float. */
4922c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4932c733ed4SBarry Smith       LOAD_PS(&t[idx],XMM7);
4942c733ed4SBarry Smith 
4952c733ed4SBarry Smith       while (nz--) {
4962c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
4972c733ed4SBarry Smith         jdx = 4*(*vi++);
4982c733ed4SBarry Smith /*          jdx = *vi++; */
4992c733ed4SBarry Smith 
5002c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
5012c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx],v)
5022c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
5032c733ed4SBarry Smith 
5042c733ed4SBarry Smith         /* First Column */
5052c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
5062c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
5072c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
5082c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
5092c733ed4SBarry Smith 
5102c733ed4SBarry Smith         /* Second Column */
5112c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
5122c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
5132c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
5142c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
5152c733ed4SBarry Smith 
5162c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
5172c733ed4SBarry Smith 
5182c733ed4SBarry Smith         /* Third Column */
5192c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
5202c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
5212c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
5222c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
5232c733ed4SBarry Smith 
5242c733ed4SBarry Smith         /* Fourth Column */
5252c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
5262c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
5272c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
5282c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
5292c733ed4SBarry Smith         SSE_INLINE_END_2
5302c733ed4SBarry Smith 
5312c733ed4SBarry Smith         v += 16;
5322c733ed4SBarry Smith       }
5332c733ed4SBarry Smith       v =  aa + 16*ai[++i];
5342c733ed4SBarry Smith       PREFETCH_NTA(v);
5352c733ed4SBarry Smith       STORE_PS(&t[idx],XMM7);
5362c733ed4SBarry Smith     }
5372c733ed4SBarry Smith 
5382c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
5392c733ed4SBarry Smith 
5402c733ed4SBarry Smith     idt  = 4*(n-1);
5412c733ed4SBarry Smith     ai16 = 16*diag[n-1];
5422c733ed4SBarry Smith     v    = aa + ai16 + 16;
5432c733ed4SBarry Smith     for (i=n-1; i>=0;) {
5442c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
5452c733ed4SBarry Smith       vi = aj + diag[i] + 1;
5462c733ed4SBarry Smith       nz = ai[i+1] - diag[i] - 1;
5472c733ed4SBarry Smith 
5482c733ed4SBarry Smith       LOAD_PS(&t[idt],XMM7);
5492c733ed4SBarry Smith 
5502c733ed4SBarry Smith       while (nz--) {
5512c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
5522c733ed4SBarry Smith         idx = 4*(*vi++);
5532c733ed4SBarry Smith /*          idx = *vi++; */
5542c733ed4SBarry Smith 
5552c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
5562c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx],v)
5572c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
5582c733ed4SBarry Smith 
5592c733ed4SBarry Smith         /* First Column */
5602c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
5612c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
5622c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
5632c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
5642c733ed4SBarry Smith 
5652c733ed4SBarry Smith         /* Second Column */
5662c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
5672c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
5682c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
5692c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
5702c733ed4SBarry Smith 
5712c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
5722c733ed4SBarry Smith 
5732c733ed4SBarry Smith         /* Third Column */
5742c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
5752c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
5762c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
5772c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
5782c733ed4SBarry Smith 
5792c733ed4SBarry Smith         /* Fourth Column */
5802c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
5812c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
5822c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
5832c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
5842c733ed4SBarry Smith         SSE_INLINE_END_2
5852c733ed4SBarry Smith         v += 16;
5862c733ed4SBarry Smith       }
5872c733ed4SBarry Smith       v    = aa + ai16;
5882c733ed4SBarry Smith       ai16 = 16*diag[--i];
5892c733ed4SBarry Smith       PREFETCH_NTA(aa+ai16+16);
5902c733ed4SBarry Smith       /*
5912c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
5922c733ed4SBarry Smith          which was inverted as part of the factorization
5932c733ed4SBarry Smith       */
5942c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
5952c733ed4SBarry Smith       /* First Column */
5962c733ed4SBarry Smith       SSE_COPY_PS(XMM0,XMM7)
5972c733ed4SBarry Smith       SSE_SHUFFLE(XMM0,XMM0,0x00)
5982c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
5992c733ed4SBarry Smith 
6002c733ed4SBarry Smith       /* Second Column */
6012c733ed4SBarry Smith       SSE_COPY_PS(XMM1,XMM7)
6022c733ed4SBarry Smith       SSE_SHUFFLE(XMM1,XMM1,0x55)
6032c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
6042c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM1)
6052c733ed4SBarry Smith 
6062c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
6072c733ed4SBarry Smith 
6082c733ed4SBarry Smith       /* Third Column */
6092c733ed4SBarry Smith       SSE_COPY_PS(XMM2,XMM7)
6102c733ed4SBarry Smith       SSE_SHUFFLE(XMM2,XMM2,0xAA)
6112c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
6122c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM2)
6132c733ed4SBarry Smith 
6142c733ed4SBarry Smith       /* Fourth Column */
6152c733ed4SBarry Smith       SSE_COPY_PS(XMM3,XMM7)
6162c733ed4SBarry Smith       SSE_SHUFFLE(XMM3,XMM3,0xFF)
6172c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
6182c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM3)
6192c733ed4SBarry Smith 
6202c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
6212c733ed4SBarry Smith       SSE_INLINE_END_3
6222c733ed4SBarry Smith 
6232c733ed4SBarry Smith       v    = aa + ai16 + 16;
6242c733ed4SBarry Smith       idt -= 4;
6252c733ed4SBarry Smith     }
6262c733ed4SBarry Smith 
6272c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
6282c733ed4SBarry Smith     idt = 4*(n-1);
6292c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
6302c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
6312c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
6322c733ed4SBarry Smith       PetscScalar *xtemp=&x[idt];
6332c733ed4SBarry Smith       MatScalar   *ttemp=&t[idt];
6342c733ed4SBarry Smith       xtemp[3] = (PetscScalar)ttemp[3];
6352c733ed4SBarry Smith       xtemp[2] = (PetscScalar)ttemp[2];
6362c733ed4SBarry Smith       xtemp[1] = (PetscScalar)ttemp[1];
6372c733ed4SBarry Smith       xtemp[0] = (PetscScalar)ttemp[0];
6382c733ed4SBarry Smith       idt     -= 4;
6392c733ed4SBarry Smith     }
6402c733ed4SBarry Smith 
6412c733ed4SBarry Smith   } /* End of artificial scope. */
642*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb,&b));
643*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
644*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
6452c733ed4SBarry Smith   SSE_SCOPE_END;
6462c733ed4SBarry Smith   PetscFunctionReturn(0);
6472c733ed4SBarry Smith }
6482c733ed4SBarry Smith 
6492c733ed4SBarry Smith #endif
650