xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision feb237ba1401b79cfa9a99e81307a265c4a68462)
12c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h>
22c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h>
32c733ed4SBarry Smith 
42c733ed4SBarry Smith /*
52c733ed4SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
62c733ed4SBarry Smith    ordering. This eliminates the need for the column and row permutation.
72c733ed4SBarry Smith */
82c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
92c733ed4SBarry Smith {
102c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
112c733ed4SBarry Smith   PetscInt          n  =a->mbs;
122c733ed4SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
132c733ed4SBarry Smith   PetscErrorCode    ierr;
142c733ed4SBarry Smith   const PetscInt    *diag = a->diag;
152c733ed4SBarry Smith   const MatScalar   *aa   =a->a;
162c733ed4SBarry Smith   PetscScalar       *x;
172c733ed4SBarry Smith   const PetscScalar *b;
182c733ed4SBarry Smith 
192c733ed4SBarry Smith   PetscFunctionBegin;
202c733ed4SBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
212c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
222c733ed4SBarry Smith 
232c733ed4SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
242c733ed4SBarry Smith   {
252c733ed4SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
262c733ed4SBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
272c733ed4SBarry Smith   }
282c733ed4SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
292c733ed4SBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
302c733ed4SBarry Smith #else
312c733ed4SBarry Smith   {
322c733ed4SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
332c733ed4SBarry Smith     const MatScalar *v;
342c733ed4SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
352c733ed4SBarry Smith     const PetscInt  *vi;
362c733ed4SBarry Smith 
372c733ed4SBarry Smith     /* forward solve the lower triangular */
382c733ed4SBarry Smith     idx  = 0;
392c733ed4SBarry Smith     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
402c733ed4SBarry Smith     for (i=1; i<n; i++) {
412c733ed4SBarry Smith       v    =  aa      + 16*ai[i];
422c733ed4SBarry Smith       vi   =  aj      + ai[i];
432c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
442c733ed4SBarry Smith       idx +=  4;
452c733ed4SBarry Smith       s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
462c733ed4SBarry Smith       while (nz--) {
472c733ed4SBarry Smith         jdx = 4*(*vi++);
482c733ed4SBarry Smith         x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
492c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
502c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
512c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
522c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
532c733ed4SBarry Smith         v  += 16;
542c733ed4SBarry Smith       }
552c733ed4SBarry Smith       x[idx]   = s1;
562c733ed4SBarry Smith       x[1+idx] = s2;
572c733ed4SBarry Smith       x[2+idx] = s3;
582c733ed4SBarry Smith       x[3+idx] = s4;
592c733ed4SBarry Smith     }
602c733ed4SBarry Smith     /* backward solve the upper triangular */
612c733ed4SBarry Smith     idt = 4*(n-1);
622c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
632c733ed4SBarry Smith       ai16 = 16*diag[i];
642c733ed4SBarry Smith       v    = aa + ai16 + 16;
652c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
662c733ed4SBarry Smith       nz   = ai[i+1] - diag[i] - 1;
672c733ed4SBarry Smith       s1   = x[idt];  s2 = x[1+idt];
682c733ed4SBarry Smith       s3   = x[2+idt];s4 = x[3+idt];
692c733ed4SBarry Smith       while (nz--) {
702c733ed4SBarry Smith         idx = 4*(*vi++);
712c733ed4SBarry Smith         x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
722c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
732c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
742c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
752c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
762c733ed4SBarry Smith         v  += 16;
772c733ed4SBarry Smith       }
782c733ed4SBarry Smith       v        = aa + ai16;
792c733ed4SBarry Smith       x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
802c733ed4SBarry Smith       x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
812c733ed4SBarry Smith       x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
822c733ed4SBarry Smith       x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
832c733ed4SBarry Smith       idt     -= 4;
842c733ed4SBarry Smith     }
852c733ed4SBarry Smith   }
862c733ed4SBarry Smith #endif
872c733ed4SBarry Smith 
882c733ed4SBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
892c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
902c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
912c733ed4SBarry Smith   PetscFunctionReturn(0);
922c733ed4SBarry Smith }
932c733ed4SBarry Smith 
942c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
952c733ed4SBarry Smith {
962c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
972c733ed4SBarry Smith   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
982c733ed4SBarry Smith   PetscInt          i,k,nz,idx,jdx,idt;
992c733ed4SBarry Smith   PetscErrorCode    ierr;
1002c733ed4SBarry Smith   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
1012c733ed4SBarry Smith   const MatScalar   *aa=a->a,*v;
1022c733ed4SBarry Smith   PetscScalar       *x;
1032c733ed4SBarry Smith   const PetscScalar *b;
1042c733ed4SBarry Smith   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
1052c733ed4SBarry Smith 
1062c733ed4SBarry Smith   PetscFunctionBegin;
1072c733ed4SBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1082c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1092c733ed4SBarry Smith   /* forward solve the lower triangular */
1102c733ed4SBarry Smith   idx  = 0;
1112c733ed4SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1122c733ed4SBarry Smith   for (i=1; i<n; i++) {
1132c733ed4SBarry Smith     v   = aa + bs2*ai[i];
1142c733ed4SBarry Smith     vi  = aj + ai[i];
1152c733ed4SBarry Smith     nz  = ai[i+1] - ai[i];
1162c733ed4SBarry Smith     idx = bs*i;
1172c733ed4SBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1182c733ed4SBarry Smith     for (k=0; k<nz; k++) {
1192c733ed4SBarry Smith       jdx = bs*vi[k];
1202c733ed4SBarry Smith       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1212c733ed4SBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1222c733ed4SBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1232c733ed4SBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1242c733ed4SBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1252c733ed4SBarry Smith 
1262c733ed4SBarry Smith       v +=  bs2;
1272c733ed4SBarry Smith     }
1282c733ed4SBarry Smith 
1292c733ed4SBarry Smith     x[idx]   = s1;
1302c733ed4SBarry Smith     x[1+idx] = s2;
1312c733ed4SBarry Smith     x[2+idx] = s3;
1322c733ed4SBarry Smith     x[3+idx] = s4;
1332c733ed4SBarry Smith   }
1342c733ed4SBarry Smith 
1352c733ed4SBarry Smith   /* backward solve the upper triangular */
1362c733ed4SBarry Smith   for (i=n-1; i>=0; i--) {
1372c733ed4SBarry Smith     v   = aa + bs2*(adiag[i+1]+1);
1382c733ed4SBarry Smith     vi  = aj + adiag[i+1]+1;
1392c733ed4SBarry Smith     nz  = adiag[i] - adiag[i+1]-1;
1402c733ed4SBarry Smith     idt = bs*i;
1412c733ed4SBarry Smith     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1422c733ed4SBarry Smith 
1432c733ed4SBarry Smith     for (k=0; k<nz; k++) {
1442c733ed4SBarry Smith       idx = bs*vi[k];
1452c733ed4SBarry Smith       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1462c733ed4SBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1472c733ed4SBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1482c733ed4SBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1492c733ed4SBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1502c733ed4SBarry Smith 
1512c733ed4SBarry Smith       v +=  bs2;
1522c733ed4SBarry Smith     }
1532c733ed4SBarry Smith     /* x = inv_diagonal*x */
1542c733ed4SBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
155*feb237baSPierre Jolivet     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
1562c733ed4SBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1572c733ed4SBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1582c733ed4SBarry Smith 
1592c733ed4SBarry Smith   }
1602c733ed4SBarry Smith 
1612c733ed4SBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1622c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1632c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1642c733ed4SBarry Smith   PetscFunctionReturn(0);
1652c733ed4SBarry Smith }
1662c733ed4SBarry Smith 
1672c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
1682c733ed4SBarry Smith {
1692c733ed4SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1702c733ed4SBarry Smith   const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
1712c733ed4SBarry Smith   PetscErrorCode    ierr;
1722c733ed4SBarry Smith   const MatScalar   *aa=a->a;
1732c733ed4SBarry Smith   const PetscScalar *b;
1742c733ed4SBarry Smith   PetscScalar       *x;
1752c733ed4SBarry Smith 
1762c733ed4SBarry Smith   PetscFunctionBegin;
1772c733ed4SBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1782c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1792c733ed4SBarry Smith 
1802c733ed4SBarry Smith   {
1812c733ed4SBarry Smith     MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
1822c733ed4SBarry Smith     const MatScalar *v;
1832c733ed4SBarry Smith     MatScalar       *t=(MatScalar*)x;
1842c733ed4SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
1852c733ed4SBarry Smith     const PetscInt  *vi;
1862c733ed4SBarry Smith 
1872c733ed4SBarry Smith     /* forward solve the lower triangular */
1882c733ed4SBarry Smith     idx  = 0;
1892c733ed4SBarry Smith     t[0] = (MatScalar)b[0];
1902c733ed4SBarry Smith     t[1] = (MatScalar)b[1];
1912c733ed4SBarry Smith     t[2] = (MatScalar)b[2];
1922c733ed4SBarry Smith     t[3] = (MatScalar)b[3];
1932c733ed4SBarry Smith     for (i=1; i<n; i++) {
1942c733ed4SBarry Smith       v    =  aa      + 16*ai[i];
1952c733ed4SBarry Smith       vi   =  aj      + ai[i];
1962c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
1972c733ed4SBarry Smith       idx +=  4;
1982c733ed4SBarry Smith       s1   = (MatScalar)b[idx];
1992c733ed4SBarry Smith       s2   = (MatScalar)b[1+idx];
2002c733ed4SBarry Smith       s3   = (MatScalar)b[2+idx];
2012c733ed4SBarry Smith       s4   = (MatScalar)b[3+idx];
2022c733ed4SBarry Smith       while (nz--) {
2032c733ed4SBarry Smith         jdx = 4*(*vi++);
2042c733ed4SBarry Smith         x1  = t[jdx];
2052c733ed4SBarry Smith         x2  = t[1+jdx];
2062c733ed4SBarry Smith         x3  = t[2+jdx];
2072c733ed4SBarry Smith         x4  = t[3+jdx];
2082c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2092c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2102c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2112c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2122c733ed4SBarry Smith         v  += 16;
2132c733ed4SBarry Smith       }
2142c733ed4SBarry Smith       t[idx]   = s1;
2152c733ed4SBarry Smith       t[1+idx] = s2;
2162c733ed4SBarry Smith       t[2+idx] = s3;
2172c733ed4SBarry Smith       t[3+idx] = s4;
2182c733ed4SBarry Smith     }
2192c733ed4SBarry Smith     /* backward solve the upper triangular */
2202c733ed4SBarry Smith     idt = 4*(n-1);
2212c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
2222c733ed4SBarry Smith       ai16 = 16*diag[i];
2232c733ed4SBarry Smith       v    = aa + ai16 + 16;
2242c733ed4SBarry Smith       vi   = aj + diag[i] + 1;
2252c733ed4SBarry Smith       nz   = ai[i+1] - diag[i] - 1;
2262c733ed4SBarry Smith       s1   = t[idt];
2272c733ed4SBarry Smith       s2   = t[1+idt];
2282c733ed4SBarry Smith       s3   = t[2+idt];
2292c733ed4SBarry Smith       s4   = t[3+idt];
2302c733ed4SBarry Smith       while (nz--) {
2312c733ed4SBarry Smith         idx = 4*(*vi++);
2322c733ed4SBarry Smith         x1  = (MatScalar)x[idx];
2332c733ed4SBarry Smith         x2  = (MatScalar)x[1+idx];
2342c733ed4SBarry Smith         x3  = (MatScalar)x[2+idx];
2352c733ed4SBarry Smith         x4  = (MatScalar)x[3+idx];
2362c733ed4SBarry Smith         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2372c733ed4SBarry Smith         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2382c733ed4SBarry Smith         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2392c733ed4SBarry Smith         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2402c733ed4SBarry Smith         v  += 16;
2412c733ed4SBarry Smith       }
2422c733ed4SBarry Smith       v        = aa + ai16;
2432c733ed4SBarry Smith       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2442c733ed4SBarry Smith       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2452c733ed4SBarry Smith       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2462c733ed4SBarry Smith       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2472c733ed4SBarry Smith       idt     -= 4;
2482c733ed4SBarry Smith     }
2492c733ed4SBarry Smith   }
2502c733ed4SBarry Smith 
2512c733ed4SBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2522c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2532c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2542c733ed4SBarry Smith   PetscFunctionReturn(0);
2552c733ed4SBarry Smith }
2562c733ed4SBarry Smith 
2572c733ed4SBarry Smith #if defined(PETSC_HAVE_SSE)
2582c733ed4SBarry Smith 
2592c733ed4SBarry Smith #include PETSC_HAVE_SSE
2602c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2612c733ed4SBarry Smith {
2622c733ed4SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2632c733ed4SBarry Smith   unsigned short *aj=(unsigned short*)a->j;
2642c733ed4SBarry Smith   PetscErrorCode ierr;
2652c733ed4SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2662c733ed4SBarry Smith   MatScalar      *aa=a->a;
2672c733ed4SBarry Smith   PetscScalar    *x,*b;
2682c733ed4SBarry Smith 
2692c733ed4SBarry Smith   PetscFunctionBegin;
2702c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
2712c733ed4SBarry Smith   /*
2722c733ed4SBarry Smith      Note: This code currently uses demotion of double
2732c733ed4SBarry Smith      to float when performing the mixed-mode computation.
2742c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
2752c733ed4SBarry Smith   */
2762c733ed4SBarry Smith   PREFETCH_NTA(aa+16*ai[1]);
2772c733ed4SBarry Smith 
2782c733ed4SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2792c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2802c733ed4SBarry Smith   {
2812c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
2822c733ed4SBarry Smith     MatScalar      *v,*t=(MatScalar*)x;
2832c733ed4SBarry Smith     int            nz,i,idt,ai16;
2842c733ed4SBarry Smith     unsigned int   jdx,idx;
2852c733ed4SBarry Smith     unsigned short *vi;
2862c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
2872c733ed4SBarry Smith 
2882c733ed4SBarry Smith     /* First block is the identity. */
2892c733ed4SBarry Smith     idx = 0;
2902c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t,b);
2912c733ed4SBarry Smith     v =  aa + 16*((unsigned int)ai[1]);
2922c733ed4SBarry Smith 
2932c733ed4SBarry Smith     for (i=1; i<n; ) {
2942c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
2952c733ed4SBarry Smith       vi   =  aj      + ai[i];
2962c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
2972c733ed4SBarry Smith       idx +=  4;
2982c733ed4SBarry Smith 
2992c733ed4SBarry Smith       /* Demote RHS from double to float. */
3002c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3012c733ed4SBarry Smith       LOAD_PS(&t[idx],XMM7);
3022c733ed4SBarry Smith 
3032c733ed4SBarry Smith       while (nz--) {
3042c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3052c733ed4SBarry Smith         jdx = 4*((unsigned int)(*vi++));
3062c733ed4SBarry Smith 
3072c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
3082c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx],v)
3092c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3102c733ed4SBarry Smith 
3112c733ed4SBarry Smith         /* First Column */
3122c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
3132c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
3142c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3152c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
3162c733ed4SBarry Smith 
3172c733ed4SBarry Smith         /* Second Column */
3182c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
3192c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
3202c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3212c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
3222c733ed4SBarry Smith 
3232c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3242c733ed4SBarry Smith 
3252c733ed4SBarry Smith         /* Third Column */
3262c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
3272c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3282c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3292c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
3302c733ed4SBarry Smith 
3312c733ed4SBarry Smith         /* Fourth Column */
3322c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
3332c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3342c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3352c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
3362c733ed4SBarry Smith         SSE_INLINE_END_2
3372c733ed4SBarry Smith 
3382c733ed4SBarry Smith         v += 16;
3392c733ed4SBarry Smith       }
3402c733ed4SBarry Smith       v =  aa + 16*ai[++i];
3412c733ed4SBarry Smith       PREFETCH_NTA(v);
3422c733ed4SBarry Smith       STORE_PS(&t[idx],XMM7);
3432c733ed4SBarry Smith     }
3442c733ed4SBarry Smith 
3452c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
3462c733ed4SBarry Smith 
3472c733ed4SBarry Smith     idt  = 4*(n-1);
3482c733ed4SBarry Smith     ai16 = 16*diag[n-1];
3492c733ed4SBarry Smith     v    = aa + ai16 + 16;
3502c733ed4SBarry Smith     for (i=n-1; i>=0; ) {
3512c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
3522c733ed4SBarry Smith       vi = aj + diag[i] + 1;
3532c733ed4SBarry Smith       nz = ai[i+1] - diag[i] - 1;
3542c733ed4SBarry Smith 
3552c733ed4SBarry Smith       LOAD_PS(&t[idt],XMM7);
3562c733ed4SBarry Smith 
3572c733ed4SBarry Smith       while (nz--) {
3582c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
3592c733ed4SBarry Smith         idx = 4*((unsigned int)(*vi++));
3602c733ed4SBarry Smith 
3612c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
3622c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx],v)
3632c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3642c733ed4SBarry Smith 
3652c733ed4SBarry Smith         /* First Column */
3662c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
3672c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
3682c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3692c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
3702c733ed4SBarry Smith 
3712c733ed4SBarry Smith         /* Second Column */
3722c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
3732c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
3742c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3752c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
3762c733ed4SBarry Smith 
3772c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3782c733ed4SBarry Smith 
3792c733ed4SBarry Smith         /* Third Column */
3802c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
3812c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3822c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3832c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
3842c733ed4SBarry Smith 
3852c733ed4SBarry Smith         /* Fourth Column */
3862c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
3872c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3882c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3892c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
3902c733ed4SBarry Smith         SSE_INLINE_END_2
3912c733ed4SBarry Smith         v += 16;
3922c733ed4SBarry Smith       }
3932c733ed4SBarry Smith       v    = aa + ai16;
3942c733ed4SBarry Smith       ai16 = 16*diag[--i];
3952c733ed4SBarry Smith       PREFETCH_NTA(aa+ai16+16);
3962c733ed4SBarry Smith       /*
3972c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
3982c733ed4SBarry Smith          which was inverted as part of the factorization
3992c733ed4SBarry Smith       */
4002c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4012c733ed4SBarry Smith       /* First Column */
4022c733ed4SBarry Smith       SSE_COPY_PS(XMM0,XMM7)
4032c733ed4SBarry Smith       SSE_SHUFFLE(XMM0,XMM0,0x00)
4042c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4052c733ed4SBarry Smith 
4062c733ed4SBarry Smith       /* Second Column */
4072c733ed4SBarry Smith       SSE_COPY_PS(XMM1,XMM7)
4082c733ed4SBarry Smith       SSE_SHUFFLE(XMM1,XMM1,0x55)
4092c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4102c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM1)
4112c733ed4SBarry Smith 
4122c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4132c733ed4SBarry Smith 
4142c733ed4SBarry Smith       /* Third Column */
4152c733ed4SBarry Smith       SSE_COPY_PS(XMM2,XMM7)
4162c733ed4SBarry Smith       SSE_SHUFFLE(XMM2,XMM2,0xAA)
4172c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4182c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM2)
4192c733ed4SBarry Smith 
4202c733ed4SBarry Smith       /* Fourth Column */
4212c733ed4SBarry Smith       SSE_COPY_PS(XMM3,XMM7)
4222c733ed4SBarry Smith       SSE_SHUFFLE(XMM3,XMM3,0xFF)
4232c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4242c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM3)
4252c733ed4SBarry Smith 
4262c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4272c733ed4SBarry Smith       SSE_INLINE_END_3
4282c733ed4SBarry Smith 
4292c733ed4SBarry Smith       v    = aa + ai16 + 16;
4302c733ed4SBarry Smith       idt -= 4;
4312c733ed4SBarry Smith     }
4322c733ed4SBarry Smith 
4332c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
4342c733ed4SBarry Smith     idt = 4*(n-1);
4352c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
4362c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4372c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4382c733ed4SBarry Smith       PetscScalar *xtemp=&x[idt];
4392c733ed4SBarry Smith       MatScalar   *ttemp=&t[idt];
4402c733ed4SBarry Smith       xtemp[3] = (PetscScalar)ttemp[3];
4412c733ed4SBarry Smith       xtemp[2] = (PetscScalar)ttemp[2];
4422c733ed4SBarry Smith       xtemp[1] = (PetscScalar)ttemp[1];
4432c733ed4SBarry Smith       xtemp[0] = (PetscScalar)ttemp[0];
4442c733ed4SBarry Smith       idt     -= 4;
4452c733ed4SBarry Smith     }
4462c733ed4SBarry Smith 
4472c733ed4SBarry Smith   } /* End of artificial scope. */
4482c733ed4SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4492c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4502c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4512c733ed4SBarry Smith   SSE_SCOPE_END;
4522c733ed4SBarry Smith   PetscFunctionReturn(0);
4532c733ed4SBarry Smith }
4542c733ed4SBarry Smith 
4552c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4562c733ed4SBarry Smith {
4572c733ed4SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
4582c733ed4SBarry Smith   int            *aj=a->j;
4592c733ed4SBarry Smith   PetscErrorCode ierr;
4602c733ed4SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4612c733ed4SBarry Smith   MatScalar      *aa=a->a;
4622c733ed4SBarry Smith   PetscScalar    *x,*b;
4632c733ed4SBarry Smith 
4642c733ed4SBarry Smith   PetscFunctionBegin;
4652c733ed4SBarry Smith   SSE_SCOPE_BEGIN;
4662c733ed4SBarry Smith   /*
4672c733ed4SBarry Smith      Note: This code currently uses demotion of double
4682c733ed4SBarry Smith      to float when performing the mixed-mode computation.
4692c733ed4SBarry Smith      This may not be numerically reasonable for all applications.
4702c733ed4SBarry Smith   */
4712c733ed4SBarry Smith   PREFETCH_NTA(aa+16*ai[1]);
4722c733ed4SBarry Smith 
4732c733ed4SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4742c733ed4SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4752c733ed4SBarry Smith   {
4762c733ed4SBarry Smith     /* x will first be computed in single precision then promoted inplace to double */
4772c733ed4SBarry Smith     MatScalar *v,*t=(MatScalar*)x;
4782c733ed4SBarry Smith     int       nz,i,idt,ai16;
4792c733ed4SBarry Smith     int       jdx,idx;
4802c733ed4SBarry Smith     int       *vi;
4812c733ed4SBarry Smith     /* Forward solve the lower triangular factor. */
4822c733ed4SBarry Smith 
4832c733ed4SBarry Smith     /* First block is the identity. */
4842c733ed4SBarry Smith     idx = 0;
4852c733ed4SBarry Smith     CONVERT_DOUBLE4_FLOAT4(t,b);
4862c733ed4SBarry Smith     v =  aa + 16*ai[1];
4872c733ed4SBarry Smith 
4882c733ed4SBarry Smith     for (i=1; i<n; ) {
4892c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
4902c733ed4SBarry Smith       vi   =  aj      + ai[i];
4912c733ed4SBarry Smith       nz   =  diag[i] - ai[i];
4922c733ed4SBarry Smith       idx +=  4;
4932c733ed4SBarry Smith 
4942c733ed4SBarry Smith       /* Demote RHS from double to float. */
4952c733ed4SBarry Smith       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4962c733ed4SBarry Smith       LOAD_PS(&t[idx],XMM7);
4972c733ed4SBarry Smith 
4982c733ed4SBarry Smith       while (nz--) {
4992c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
5002c733ed4SBarry Smith         jdx = 4*(*vi++);
5012c733ed4SBarry Smith /*          jdx = *vi++; */
5022c733ed4SBarry Smith 
5032c733ed4SBarry Smith         /* 4x4 Matrix-Vector product with negative accumulation: */
5042c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[jdx],v)
5052c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
5062c733ed4SBarry Smith 
5072c733ed4SBarry Smith         /* First Column */
5082c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
5092c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
5102c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
5112c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
5122c733ed4SBarry Smith 
5132c733ed4SBarry Smith         /* Second Column */
5142c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
5152c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
5162c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
5172c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
5182c733ed4SBarry Smith 
5192c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
5202c733ed4SBarry Smith 
5212c733ed4SBarry Smith         /* Third Column */
5222c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
5232c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
5242c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
5252c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
5262c733ed4SBarry Smith 
5272c733ed4SBarry Smith         /* Fourth Column */
5282c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
5292c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
5302c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
5312c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
5322c733ed4SBarry Smith         SSE_INLINE_END_2
5332c733ed4SBarry Smith 
5342c733ed4SBarry Smith         v += 16;
5352c733ed4SBarry Smith       }
5362c733ed4SBarry Smith       v =  aa + 16*ai[++i];
5372c733ed4SBarry Smith       PREFETCH_NTA(v);
5382c733ed4SBarry Smith       STORE_PS(&t[idx],XMM7);
5392c733ed4SBarry Smith     }
5402c733ed4SBarry Smith 
5412c733ed4SBarry Smith     /* Backward solve the upper triangular factor.*/
5422c733ed4SBarry Smith 
5432c733ed4SBarry Smith     idt  = 4*(n-1);
5442c733ed4SBarry Smith     ai16 = 16*diag[n-1];
5452c733ed4SBarry Smith     v    = aa + ai16 + 16;
5462c733ed4SBarry Smith     for (i=n-1; i>=0; ) {
5472c733ed4SBarry Smith       PREFETCH_NTA(&v[8]);
5482c733ed4SBarry Smith       vi = aj + diag[i] + 1;
5492c733ed4SBarry Smith       nz = ai[i+1] - diag[i] - 1;
5502c733ed4SBarry Smith 
5512c733ed4SBarry Smith       LOAD_PS(&t[idt],XMM7);
5522c733ed4SBarry Smith 
5532c733ed4SBarry Smith       while (nz--) {
5542c733ed4SBarry Smith         PREFETCH_NTA(&v[16]);
5552c733ed4SBarry Smith         idx = 4*(*vi++);
5562c733ed4SBarry Smith /*          idx = *vi++; */
5572c733ed4SBarry Smith 
5582c733ed4SBarry Smith         /* 4x4 Matrix-Vector Product with negative accumulation: */
5592c733ed4SBarry Smith         SSE_INLINE_BEGIN_2(&t[idx],v)
5602c733ed4SBarry Smith         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
5612c733ed4SBarry Smith 
5622c733ed4SBarry Smith         /* First Column */
5632c733ed4SBarry Smith         SSE_COPY_PS(XMM0,XMM6)
5642c733ed4SBarry Smith         SSE_SHUFFLE(XMM0,XMM0,0x00)
5652c733ed4SBarry Smith         SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
5662c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM0)
5672c733ed4SBarry Smith 
5682c733ed4SBarry Smith         /* Second Column */
5692c733ed4SBarry Smith         SSE_COPY_PS(XMM1,XMM6)
5702c733ed4SBarry Smith         SSE_SHUFFLE(XMM1,XMM1,0x55)
5712c733ed4SBarry Smith         SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
5722c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM1)
5732c733ed4SBarry Smith 
5742c733ed4SBarry Smith         SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
5752c733ed4SBarry Smith 
5762c733ed4SBarry Smith         /* Third Column */
5772c733ed4SBarry Smith         SSE_COPY_PS(XMM2,XMM6)
5782c733ed4SBarry Smith         SSE_SHUFFLE(XMM2,XMM2,0xAA)
5792c733ed4SBarry Smith         SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
5802c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM2)
5812c733ed4SBarry Smith 
5822c733ed4SBarry Smith         /* Fourth Column */
5832c733ed4SBarry Smith         SSE_COPY_PS(XMM3,XMM6)
5842c733ed4SBarry Smith         SSE_SHUFFLE(XMM3,XMM3,0xFF)
5852c733ed4SBarry Smith         SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
5862c733ed4SBarry Smith         SSE_SUB_PS(XMM7,XMM3)
5872c733ed4SBarry Smith         SSE_INLINE_END_2
5882c733ed4SBarry Smith         v += 16;
5892c733ed4SBarry Smith       }
5902c733ed4SBarry Smith       v    = aa + ai16;
5912c733ed4SBarry Smith       ai16 = 16*diag[--i];
5922c733ed4SBarry Smith       PREFETCH_NTA(aa+ai16+16);
5932c733ed4SBarry Smith       /*
5942c733ed4SBarry Smith          Scale the result by the diagonal 4x4 block,
5952c733ed4SBarry Smith          which was inverted as part of the factorization
5962c733ed4SBarry Smith       */
5972c733ed4SBarry Smith       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
5982c733ed4SBarry Smith       /* First Column */
5992c733ed4SBarry Smith       SSE_COPY_PS(XMM0,XMM7)
6002c733ed4SBarry Smith       SSE_SHUFFLE(XMM0,XMM0,0x00)
6012c733ed4SBarry Smith       SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
6022c733ed4SBarry Smith 
6032c733ed4SBarry Smith       /* Second Column */
6042c733ed4SBarry Smith       SSE_COPY_PS(XMM1,XMM7)
6052c733ed4SBarry Smith       SSE_SHUFFLE(XMM1,XMM1,0x55)
6062c733ed4SBarry Smith       SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
6072c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM1)
6082c733ed4SBarry Smith 
6092c733ed4SBarry Smith       SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
6102c733ed4SBarry Smith 
6112c733ed4SBarry Smith       /* Third Column */
6122c733ed4SBarry Smith       SSE_COPY_PS(XMM2,XMM7)
6132c733ed4SBarry Smith       SSE_SHUFFLE(XMM2,XMM2,0xAA)
6142c733ed4SBarry Smith       SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
6152c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM2)
6162c733ed4SBarry Smith 
6172c733ed4SBarry Smith       /* Fourth Column */
6182c733ed4SBarry Smith       SSE_COPY_PS(XMM3,XMM7)
6192c733ed4SBarry Smith       SSE_SHUFFLE(XMM3,XMM3,0xFF)
6202c733ed4SBarry Smith       SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
6212c733ed4SBarry Smith       SSE_ADD_PS(XMM0,XMM3)
6222c733ed4SBarry Smith 
6232c733ed4SBarry Smith       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
6242c733ed4SBarry Smith       SSE_INLINE_END_3
6252c733ed4SBarry Smith 
6262c733ed4SBarry Smith       v    = aa + ai16 + 16;
6272c733ed4SBarry Smith       idt -= 4;
6282c733ed4SBarry Smith     }
6292c733ed4SBarry Smith 
6302c733ed4SBarry Smith     /* Convert t from single precision back to double precision (inplace)*/
6312c733ed4SBarry Smith     idt = 4*(n-1);
6322c733ed4SBarry Smith     for (i=n-1; i>=0; i--) {
6332c733ed4SBarry Smith       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
6342c733ed4SBarry Smith       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
6352c733ed4SBarry Smith       PetscScalar *xtemp=&x[idt];
6362c733ed4SBarry Smith       MatScalar   *ttemp=&t[idt];
6372c733ed4SBarry Smith       xtemp[3] = (PetscScalar)ttemp[3];
6382c733ed4SBarry Smith       xtemp[2] = (PetscScalar)ttemp[2];
6392c733ed4SBarry Smith       xtemp[1] = (PetscScalar)ttemp[1];
6402c733ed4SBarry Smith       xtemp[0] = (PetscScalar)ttemp[0];
6412c733ed4SBarry Smith       idt     -= 4;
6422c733ed4SBarry Smith     }
6432c733ed4SBarry Smith 
6442c733ed4SBarry Smith   } /* End of artificial scope. */
6452c733ed4SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6462c733ed4SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
6472c733ed4SBarry Smith   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
6482c733ed4SBarry Smith   SSE_SCOPE_END;
6492c733ed4SBarry Smith   PetscFunctionReturn(0);
6502c733ed4SBarry Smith }
6512c733ed4SBarry Smith 
6522c733ed4SBarry Smith #endif
653