1*2c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h> 2*2c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h> 3*2c733ed4SBarry Smith 4*2c733ed4SBarry Smith /* 5*2c733ed4SBarry Smith Special case where the matrix was ILU(0) factored in the natural 6*2c733ed4SBarry Smith ordering. This eliminates the need for the column and row permutation. 7*2c733ed4SBarry Smith */ 8*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 9*2c733ed4SBarry Smith { 10*2c733ed4SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 11*2c733ed4SBarry Smith PetscInt n =a->mbs; 12*2c733ed4SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 13*2c733ed4SBarry Smith PetscErrorCode ierr; 14*2c733ed4SBarry Smith const PetscInt *diag = a->diag; 15*2c733ed4SBarry Smith const MatScalar *aa =a->a; 16*2c733ed4SBarry Smith PetscScalar *x; 17*2c733ed4SBarry Smith const PetscScalar *b; 18*2c733ed4SBarry Smith 19*2c733ed4SBarry Smith PetscFunctionBegin; 20*2c733ed4SBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21*2c733ed4SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22*2c733ed4SBarry Smith 23*2c733ed4SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 24*2c733ed4SBarry Smith { 25*2c733ed4SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 26*2c733ed4SBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 27*2c733ed4SBarry Smith } 28*2c733ed4SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 29*2c733ed4SBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 30*2c733ed4SBarry Smith #else 31*2c733ed4SBarry Smith { 32*2c733ed4SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 33*2c733ed4SBarry Smith const MatScalar *v; 34*2c733ed4SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 35*2c733ed4SBarry Smith const PetscInt *vi; 36*2c733ed4SBarry Smith 37*2c733ed4SBarry Smith /* forward solve the lower triangular */ 38*2c733ed4SBarry Smith idx = 0; 39*2c733ed4SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 40*2c733ed4SBarry Smith for (i=1; i<n; i++) { 41*2c733ed4SBarry Smith v = aa + 16*ai[i]; 42*2c733ed4SBarry Smith vi = aj + ai[i]; 43*2c733ed4SBarry Smith nz = diag[i] - ai[i]; 44*2c733ed4SBarry Smith idx += 4; 45*2c733ed4SBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 46*2c733ed4SBarry Smith while (nz--) { 47*2c733ed4SBarry Smith jdx = 4*(*vi++); 48*2c733ed4SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 49*2c733ed4SBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 50*2c733ed4SBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 51*2c733ed4SBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 52*2c733ed4SBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 53*2c733ed4SBarry Smith v += 16; 54*2c733ed4SBarry Smith } 55*2c733ed4SBarry Smith x[idx] = s1; 56*2c733ed4SBarry Smith x[1+idx] = s2; 57*2c733ed4SBarry Smith x[2+idx] = s3; 58*2c733ed4SBarry Smith x[3+idx] = s4; 59*2c733ed4SBarry Smith } 60*2c733ed4SBarry Smith /* backward solve the upper triangular */ 61*2c733ed4SBarry Smith idt = 4*(n-1); 62*2c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 63*2c733ed4SBarry Smith ai16 = 16*diag[i]; 64*2c733ed4SBarry Smith v = aa + ai16 + 16; 65*2c733ed4SBarry Smith vi = aj + diag[i] + 1; 66*2c733ed4SBarry Smith nz = ai[i+1] - diag[i] - 1; 67*2c733ed4SBarry Smith s1 = x[idt]; s2 = x[1+idt]; 68*2c733ed4SBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 69*2c733ed4SBarry Smith while (nz--) { 70*2c733ed4SBarry Smith idx = 4*(*vi++); 71*2c733ed4SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 72*2c733ed4SBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 73*2c733ed4SBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 74*2c733ed4SBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 75*2c733ed4SBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 76*2c733ed4SBarry Smith v += 16; 77*2c733ed4SBarry Smith } 78*2c733ed4SBarry Smith v = aa + ai16; 79*2c733ed4SBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 80*2c733ed4SBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 81*2c733ed4SBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 82*2c733ed4SBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 83*2c733ed4SBarry Smith idt -= 4; 84*2c733ed4SBarry Smith } 85*2c733ed4SBarry Smith } 86*2c733ed4SBarry Smith #endif 87*2c733ed4SBarry Smith 88*2c733ed4SBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 89*2c733ed4SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 90*2c733ed4SBarry Smith ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 91*2c733ed4SBarry Smith PetscFunctionReturn(0); 92*2c733ed4SBarry Smith } 93*2c733ed4SBarry Smith 94*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 95*2c733ed4SBarry Smith { 96*2c733ed4SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 97*2c733ed4SBarry Smith const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 98*2c733ed4SBarry Smith PetscInt i,k,nz,idx,jdx,idt; 99*2c733ed4SBarry Smith PetscErrorCode ierr; 100*2c733ed4SBarry Smith const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 101*2c733ed4SBarry Smith const MatScalar *aa=a->a,*v; 102*2c733ed4SBarry Smith PetscScalar *x; 103*2c733ed4SBarry Smith const PetscScalar *b; 104*2c733ed4SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 105*2c733ed4SBarry Smith 106*2c733ed4SBarry Smith PetscFunctionBegin; 107*2c733ed4SBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 108*2c733ed4SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 109*2c733ed4SBarry Smith /* forward solve the lower triangular */ 110*2c733ed4SBarry Smith idx = 0; 111*2c733ed4SBarry Smith x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 112*2c733ed4SBarry Smith for (i=1; i<n; i++) { 113*2c733ed4SBarry Smith v = aa + bs2*ai[i]; 114*2c733ed4SBarry Smith vi = aj + ai[i]; 115*2c733ed4SBarry Smith nz = ai[i+1] - ai[i]; 116*2c733ed4SBarry Smith idx = bs*i; 117*2c733ed4SBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 118*2c733ed4SBarry Smith for (k=0; k<nz; k++) { 119*2c733ed4SBarry Smith jdx = bs*vi[k]; 120*2c733ed4SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 121*2c733ed4SBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 122*2c733ed4SBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 123*2c733ed4SBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 124*2c733ed4SBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 125*2c733ed4SBarry Smith 126*2c733ed4SBarry Smith v += bs2; 127*2c733ed4SBarry Smith } 128*2c733ed4SBarry Smith 129*2c733ed4SBarry Smith x[idx] = s1; 130*2c733ed4SBarry Smith x[1+idx] = s2; 131*2c733ed4SBarry Smith x[2+idx] = s3; 132*2c733ed4SBarry Smith x[3+idx] = s4; 133*2c733ed4SBarry Smith } 134*2c733ed4SBarry Smith 135*2c733ed4SBarry Smith /* backward solve the upper triangular */ 136*2c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 137*2c733ed4SBarry Smith v = aa + bs2*(adiag[i+1]+1); 138*2c733ed4SBarry Smith vi = aj + adiag[i+1]+1; 139*2c733ed4SBarry Smith nz = adiag[i] - adiag[i+1]-1; 140*2c733ed4SBarry Smith idt = bs*i; 141*2c733ed4SBarry Smith s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 142*2c733ed4SBarry Smith 143*2c733ed4SBarry Smith for (k=0; k<nz; k++) { 144*2c733ed4SBarry Smith idx = bs*vi[k]; 145*2c733ed4SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 146*2c733ed4SBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 147*2c733ed4SBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 148*2c733ed4SBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 149*2c733ed4SBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 150*2c733ed4SBarry Smith 151*2c733ed4SBarry Smith v += bs2; 152*2c733ed4SBarry Smith } 153*2c733ed4SBarry Smith /* x = inv_diagonal*x */ 154*2c733ed4SBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 155*2c733ed4SBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 156*2c733ed4SBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 157*2c733ed4SBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 158*2c733ed4SBarry Smith 159*2c733ed4SBarry Smith } 160*2c733ed4SBarry Smith 161*2c733ed4SBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 162*2c733ed4SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 163*2c733ed4SBarry Smith ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 164*2c733ed4SBarry Smith PetscFunctionReturn(0); 165*2c733ed4SBarry Smith } 166*2c733ed4SBarry Smith 167*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 168*2c733ed4SBarry Smith { 169*2c733ed4SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 170*2c733ed4SBarry Smith const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 171*2c733ed4SBarry Smith PetscErrorCode ierr; 172*2c733ed4SBarry Smith const MatScalar *aa=a->a; 173*2c733ed4SBarry Smith const PetscScalar *b; 174*2c733ed4SBarry Smith PetscScalar *x; 175*2c733ed4SBarry Smith 176*2c733ed4SBarry Smith PetscFunctionBegin; 177*2c733ed4SBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 178*2c733ed4SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 179*2c733ed4SBarry Smith 180*2c733ed4SBarry Smith { 181*2c733ed4SBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 182*2c733ed4SBarry Smith const MatScalar *v; 183*2c733ed4SBarry Smith MatScalar *t=(MatScalar*)x; 184*2c733ed4SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 185*2c733ed4SBarry Smith const PetscInt *vi; 186*2c733ed4SBarry Smith 187*2c733ed4SBarry Smith /* forward solve the lower triangular */ 188*2c733ed4SBarry Smith idx = 0; 189*2c733ed4SBarry Smith t[0] = (MatScalar)b[0]; 190*2c733ed4SBarry Smith t[1] = (MatScalar)b[1]; 191*2c733ed4SBarry Smith t[2] = (MatScalar)b[2]; 192*2c733ed4SBarry Smith t[3] = (MatScalar)b[3]; 193*2c733ed4SBarry Smith for (i=1; i<n; i++) { 194*2c733ed4SBarry Smith v = aa + 16*ai[i]; 195*2c733ed4SBarry Smith vi = aj + ai[i]; 196*2c733ed4SBarry Smith nz = diag[i] - ai[i]; 197*2c733ed4SBarry Smith idx += 4; 198*2c733ed4SBarry Smith s1 = (MatScalar)b[idx]; 199*2c733ed4SBarry Smith s2 = (MatScalar)b[1+idx]; 200*2c733ed4SBarry Smith s3 = (MatScalar)b[2+idx]; 201*2c733ed4SBarry Smith s4 = (MatScalar)b[3+idx]; 202*2c733ed4SBarry Smith while (nz--) { 203*2c733ed4SBarry Smith jdx = 4*(*vi++); 204*2c733ed4SBarry Smith x1 = t[jdx]; 205*2c733ed4SBarry Smith x2 = t[1+jdx]; 206*2c733ed4SBarry Smith x3 = t[2+jdx]; 207*2c733ed4SBarry Smith x4 = t[3+jdx]; 208*2c733ed4SBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 209*2c733ed4SBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 210*2c733ed4SBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 211*2c733ed4SBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 212*2c733ed4SBarry Smith v += 16; 213*2c733ed4SBarry Smith } 214*2c733ed4SBarry Smith t[idx] = s1; 215*2c733ed4SBarry Smith t[1+idx] = s2; 216*2c733ed4SBarry Smith t[2+idx] = s3; 217*2c733ed4SBarry Smith t[3+idx] = s4; 218*2c733ed4SBarry Smith } 219*2c733ed4SBarry Smith /* backward solve the upper triangular */ 220*2c733ed4SBarry Smith idt = 4*(n-1); 221*2c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 222*2c733ed4SBarry Smith ai16 = 16*diag[i]; 223*2c733ed4SBarry Smith v = aa + ai16 + 16; 224*2c733ed4SBarry Smith vi = aj + diag[i] + 1; 225*2c733ed4SBarry Smith nz = ai[i+1] - diag[i] - 1; 226*2c733ed4SBarry Smith s1 = t[idt]; 227*2c733ed4SBarry Smith s2 = t[1+idt]; 228*2c733ed4SBarry Smith s3 = t[2+idt]; 229*2c733ed4SBarry Smith s4 = t[3+idt]; 230*2c733ed4SBarry Smith while (nz--) { 231*2c733ed4SBarry Smith idx = 4*(*vi++); 232*2c733ed4SBarry Smith x1 = (MatScalar)x[idx]; 233*2c733ed4SBarry Smith x2 = (MatScalar)x[1+idx]; 234*2c733ed4SBarry Smith x3 = (MatScalar)x[2+idx]; 235*2c733ed4SBarry Smith x4 = (MatScalar)x[3+idx]; 236*2c733ed4SBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 237*2c733ed4SBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 238*2c733ed4SBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 239*2c733ed4SBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 240*2c733ed4SBarry Smith v += 16; 241*2c733ed4SBarry Smith } 242*2c733ed4SBarry Smith v = aa + ai16; 243*2c733ed4SBarry Smith x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 244*2c733ed4SBarry Smith x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 245*2c733ed4SBarry Smith x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 246*2c733ed4SBarry Smith x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 247*2c733ed4SBarry Smith idt -= 4; 248*2c733ed4SBarry Smith } 249*2c733ed4SBarry Smith } 250*2c733ed4SBarry Smith 251*2c733ed4SBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 252*2c733ed4SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 253*2c733ed4SBarry Smith ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 254*2c733ed4SBarry Smith PetscFunctionReturn(0); 255*2c733ed4SBarry Smith } 256*2c733ed4SBarry Smith 257*2c733ed4SBarry Smith #if defined(PETSC_HAVE_SSE) 258*2c733ed4SBarry Smith 259*2c733ed4SBarry Smith #include PETSC_HAVE_SSE 260*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 261*2c733ed4SBarry Smith { 262*2c733ed4SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 263*2c733ed4SBarry Smith unsigned short *aj=(unsigned short*)a->j; 264*2c733ed4SBarry Smith PetscErrorCode ierr; 265*2c733ed4SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 266*2c733ed4SBarry Smith MatScalar *aa=a->a; 267*2c733ed4SBarry Smith PetscScalar *x,*b; 268*2c733ed4SBarry Smith 269*2c733ed4SBarry Smith PetscFunctionBegin; 270*2c733ed4SBarry Smith SSE_SCOPE_BEGIN; 271*2c733ed4SBarry Smith /* 272*2c733ed4SBarry Smith Note: This code currently uses demotion of double 273*2c733ed4SBarry Smith to float when performing the mixed-mode computation. 274*2c733ed4SBarry Smith This may not be numerically reasonable for all applications. 275*2c733ed4SBarry Smith */ 276*2c733ed4SBarry Smith PREFETCH_NTA(aa+16*ai[1]); 277*2c733ed4SBarry Smith 278*2c733ed4SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 279*2c733ed4SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 280*2c733ed4SBarry Smith { 281*2c733ed4SBarry Smith /* x will first be computed in single precision then promoted inplace to double */ 282*2c733ed4SBarry Smith MatScalar *v,*t=(MatScalar*)x; 283*2c733ed4SBarry Smith int nz,i,idt,ai16; 284*2c733ed4SBarry Smith unsigned int jdx,idx; 285*2c733ed4SBarry Smith unsigned short *vi; 286*2c733ed4SBarry Smith /* Forward solve the lower triangular factor. */ 287*2c733ed4SBarry Smith 288*2c733ed4SBarry Smith /* First block is the identity. */ 289*2c733ed4SBarry Smith idx = 0; 290*2c733ed4SBarry Smith CONVERT_DOUBLE4_FLOAT4(t,b); 291*2c733ed4SBarry Smith v = aa + 16*((unsigned int)ai[1]); 292*2c733ed4SBarry Smith 293*2c733ed4SBarry Smith for (i=1; i<n; ) { 294*2c733ed4SBarry Smith PREFETCH_NTA(&v[8]); 295*2c733ed4SBarry Smith vi = aj + ai[i]; 296*2c733ed4SBarry Smith nz = diag[i] - ai[i]; 297*2c733ed4SBarry Smith idx += 4; 298*2c733ed4SBarry Smith 299*2c733ed4SBarry Smith /* Demote RHS from double to float. */ 300*2c733ed4SBarry Smith CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 301*2c733ed4SBarry Smith LOAD_PS(&t[idx],XMM7); 302*2c733ed4SBarry Smith 303*2c733ed4SBarry Smith while (nz--) { 304*2c733ed4SBarry Smith PREFETCH_NTA(&v[16]); 305*2c733ed4SBarry Smith jdx = 4*((unsigned int)(*vi++)); 306*2c733ed4SBarry Smith 307*2c733ed4SBarry Smith /* 4x4 Matrix-Vector product with negative accumulation: */ 308*2c733ed4SBarry Smith SSE_INLINE_BEGIN_2(&t[jdx],v) 309*2c733ed4SBarry Smith SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 310*2c733ed4SBarry Smith 311*2c733ed4SBarry Smith /* First Column */ 312*2c733ed4SBarry Smith SSE_COPY_PS(XMM0,XMM6) 313*2c733ed4SBarry Smith SSE_SHUFFLE(XMM0,XMM0,0x00) 314*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 315*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM0) 316*2c733ed4SBarry Smith 317*2c733ed4SBarry Smith /* Second Column */ 318*2c733ed4SBarry Smith SSE_COPY_PS(XMM1,XMM6) 319*2c733ed4SBarry Smith SSE_SHUFFLE(XMM1,XMM1,0x55) 320*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 321*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM1) 322*2c733ed4SBarry Smith 323*2c733ed4SBarry Smith SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 324*2c733ed4SBarry Smith 325*2c733ed4SBarry Smith /* Third Column */ 326*2c733ed4SBarry Smith SSE_COPY_PS(XMM2,XMM6) 327*2c733ed4SBarry Smith SSE_SHUFFLE(XMM2,XMM2,0xAA) 328*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 329*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM2) 330*2c733ed4SBarry Smith 331*2c733ed4SBarry Smith /* Fourth Column */ 332*2c733ed4SBarry Smith SSE_COPY_PS(XMM3,XMM6) 333*2c733ed4SBarry Smith SSE_SHUFFLE(XMM3,XMM3,0xFF) 334*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 335*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM3) 336*2c733ed4SBarry Smith SSE_INLINE_END_2 337*2c733ed4SBarry Smith 338*2c733ed4SBarry Smith v += 16; 339*2c733ed4SBarry Smith } 340*2c733ed4SBarry Smith v = aa + 16*ai[++i]; 341*2c733ed4SBarry Smith PREFETCH_NTA(v); 342*2c733ed4SBarry Smith STORE_PS(&t[idx],XMM7); 343*2c733ed4SBarry Smith } 344*2c733ed4SBarry Smith 345*2c733ed4SBarry Smith /* Backward solve the upper triangular factor.*/ 346*2c733ed4SBarry Smith 347*2c733ed4SBarry Smith idt = 4*(n-1); 348*2c733ed4SBarry Smith ai16 = 16*diag[n-1]; 349*2c733ed4SBarry Smith v = aa + ai16 + 16; 350*2c733ed4SBarry Smith for (i=n-1; i>=0; ) { 351*2c733ed4SBarry Smith PREFETCH_NTA(&v[8]); 352*2c733ed4SBarry Smith vi = aj + diag[i] + 1; 353*2c733ed4SBarry Smith nz = ai[i+1] - diag[i] - 1; 354*2c733ed4SBarry Smith 355*2c733ed4SBarry Smith LOAD_PS(&t[idt],XMM7); 356*2c733ed4SBarry Smith 357*2c733ed4SBarry Smith while (nz--) { 358*2c733ed4SBarry Smith PREFETCH_NTA(&v[16]); 359*2c733ed4SBarry Smith idx = 4*((unsigned int)(*vi++)); 360*2c733ed4SBarry Smith 361*2c733ed4SBarry Smith /* 4x4 Matrix-Vector Product with negative accumulation: */ 362*2c733ed4SBarry Smith SSE_INLINE_BEGIN_2(&t[idx],v) 363*2c733ed4SBarry Smith SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 364*2c733ed4SBarry Smith 365*2c733ed4SBarry Smith /* First Column */ 366*2c733ed4SBarry Smith SSE_COPY_PS(XMM0,XMM6) 367*2c733ed4SBarry Smith SSE_SHUFFLE(XMM0,XMM0,0x00) 368*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 369*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM0) 370*2c733ed4SBarry Smith 371*2c733ed4SBarry Smith /* Second Column */ 372*2c733ed4SBarry Smith SSE_COPY_PS(XMM1,XMM6) 373*2c733ed4SBarry Smith SSE_SHUFFLE(XMM1,XMM1,0x55) 374*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 375*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM1) 376*2c733ed4SBarry Smith 377*2c733ed4SBarry Smith SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 378*2c733ed4SBarry Smith 379*2c733ed4SBarry Smith /* Third Column */ 380*2c733ed4SBarry Smith SSE_COPY_PS(XMM2,XMM6) 381*2c733ed4SBarry Smith SSE_SHUFFLE(XMM2,XMM2,0xAA) 382*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 383*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM2) 384*2c733ed4SBarry Smith 385*2c733ed4SBarry Smith /* Fourth Column */ 386*2c733ed4SBarry Smith SSE_COPY_PS(XMM3,XMM6) 387*2c733ed4SBarry Smith SSE_SHUFFLE(XMM3,XMM3,0xFF) 388*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 389*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM3) 390*2c733ed4SBarry Smith SSE_INLINE_END_2 391*2c733ed4SBarry Smith v += 16; 392*2c733ed4SBarry Smith } 393*2c733ed4SBarry Smith v = aa + ai16; 394*2c733ed4SBarry Smith ai16 = 16*diag[--i]; 395*2c733ed4SBarry Smith PREFETCH_NTA(aa+ai16+16); 396*2c733ed4SBarry Smith /* 397*2c733ed4SBarry Smith Scale the result by the diagonal 4x4 block, 398*2c733ed4SBarry Smith which was inverted as part of the factorization 399*2c733ed4SBarry Smith */ 400*2c733ed4SBarry Smith SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 401*2c733ed4SBarry Smith /* First Column */ 402*2c733ed4SBarry Smith SSE_COPY_PS(XMM0,XMM7) 403*2c733ed4SBarry Smith SSE_SHUFFLE(XMM0,XMM0,0x00) 404*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 405*2c733ed4SBarry Smith 406*2c733ed4SBarry Smith /* Second Column */ 407*2c733ed4SBarry Smith SSE_COPY_PS(XMM1,XMM7) 408*2c733ed4SBarry Smith SSE_SHUFFLE(XMM1,XMM1,0x55) 409*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 410*2c733ed4SBarry Smith SSE_ADD_PS(XMM0,XMM1) 411*2c733ed4SBarry Smith 412*2c733ed4SBarry Smith SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 413*2c733ed4SBarry Smith 414*2c733ed4SBarry Smith /* Third Column */ 415*2c733ed4SBarry Smith SSE_COPY_PS(XMM2,XMM7) 416*2c733ed4SBarry Smith SSE_SHUFFLE(XMM2,XMM2,0xAA) 417*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 418*2c733ed4SBarry Smith SSE_ADD_PS(XMM0,XMM2) 419*2c733ed4SBarry Smith 420*2c733ed4SBarry Smith /* Fourth Column */ 421*2c733ed4SBarry Smith SSE_COPY_PS(XMM3,XMM7) 422*2c733ed4SBarry Smith SSE_SHUFFLE(XMM3,XMM3,0xFF) 423*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 424*2c733ed4SBarry Smith SSE_ADD_PS(XMM0,XMM3) 425*2c733ed4SBarry Smith 426*2c733ed4SBarry Smith SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 427*2c733ed4SBarry Smith SSE_INLINE_END_3 428*2c733ed4SBarry Smith 429*2c733ed4SBarry Smith v = aa + ai16 + 16; 430*2c733ed4SBarry Smith idt -= 4; 431*2c733ed4SBarry Smith } 432*2c733ed4SBarry Smith 433*2c733ed4SBarry Smith /* Convert t from single precision back to double precision (inplace)*/ 434*2c733ed4SBarry Smith idt = 4*(n-1); 435*2c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 436*2c733ed4SBarry Smith /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 437*2c733ed4SBarry Smith /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 438*2c733ed4SBarry Smith PetscScalar *xtemp=&x[idt]; 439*2c733ed4SBarry Smith MatScalar *ttemp=&t[idt]; 440*2c733ed4SBarry Smith xtemp[3] = (PetscScalar)ttemp[3]; 441*2c733ed4SBarry Smith xtemp[2] = (PetscScalar)ttemp[2]; 442*2c733ed4SBarry Smith xtemp[1] = (PetscScalar)ttemp[1]; 443*2c733ed4SBarry Smith xtemp[0] = (PetscScalar)ttemp[0]; 444*2c733ed4SBarry Smith idt -= 4; 445*2c733ed4SBarry Smith } 446*2c733ed4SBarry Smith 447*2c733ed4SBarry Smith } /* End of artificial scope. */ 448*2c733ed4SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 449*2c733ed4SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 450*2c733ed4SBarry Smith ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 451*2c733ed4SBarry Smith SSE_SCOPE_END; 452*2c733ed4SBarry Smith PetscFunctionReturn(0); 453*2c733ed4SBarry Smith } 454*2c733ed4SBarry Smith 455*2c733ed4SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 456*2c733ed4SBarry Smith { 457*2c733ed4SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 458*2c733ed4SBarry Smith int *aj=a->j; 459*2c733ed4SBarry Smith PetscErrorCode ierr; 460*2c733ed4SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 461*2c733ed4SBarry Smith MatScalar *aa=a->a; 462*2c733ed4SBarry Smith PetscScalar *x,*b; 463*2c733ed4SBarry Smith 464*2c733ed4SBarry Smith PetscFunctionBegin; 465*2c733ed4SBarry Smith SSE_SCOPE_BEGIN; 466*2c733ed4SBarry Smith /* 467*2c733ed4SBarry Smith Note: This code currently uses demotion of double 468*2c733ed4SBarry Smith to float when performing the mixed-mode computation. 469*2c733ed4SBarry Smith This may not be numerically reasonable for all applications. 470*2c733ed4SBarry Smith */ 471*2c733ed4SBarry Smith PREFETCH_NTA(aa+16*ai[1]); 472*2c733ed4SBarry Smith 473*2c733ed4SBarry Smith ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 474*2c733ed4SBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 475*2c733ed4SBarry Smith { 476*2c733ed4SBarry Smith /* x will first be computed in single precision then promoted inplace to double */ 477*2c733ed4SBarry Smith MatScalar *v,*t=(MatScalar*)x; 478*2c733ed4SBarry Smith int nz,i,idt,ai16; 479*2c733ed4SBarry Smith int jdx,idx; 480*2c733ed4SBarry Smith int *vi; 481*2c733ed4SBarry Smith /* Forward solve the lower triangular factor. */ 482*2c733ed4SBarry Smith 483*2c733ed4SBarry Smith /* First block is the identity. */ 484*2c733ed4SBarry Smith idx = 0; 485*2c733ed4SBarry Smith CONVERT_DOUBLE4_FLOAT4(t,b); 486*2c733ed4SBarry Smith v = aa + 16*ai[1]; 487*2c733ed4SBarry Smith 488*2c733ed4SBarry Smith for (i=1; i<n; ) { 489*2c733ed4SBarry Smith PREFETCH_NTA(&v[8]); 490*2c733ed4SBarry Smith vi = aj + ai[i]; 491*2c733ed4SBarry Smith nz = diag[i] - ai[i]; 492*2c733ed4SBarry Smith idx += 4; 493*2c733ed4SBarry Smith 494*2c733ed4SBarry Smith /* Demote RHS from double to float. */ 495*2c733ed4SBarry Smith CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 496*2c733ed4SBarry Smith LOAD_PS(&t[idx],XMM7); 497*2c733ed4SBarry Smith 498*2c733ed4SBarry Smith while (nz--) { 499*2c733ed4SBarry Smith PREFETCH_NTA(&v[16]); 500*2c733ed4SBarry Smith jdx = 4*(*vi++); 501*2c733ed4SBarry Smith /* jdx = *vi++; */ 502*2c733ed4SBarry Smith 503*2c733ed4SBarry Smith /* 4x4 Matrix-Vector product with negative accumulation: */ 504*2c733ed4SBarry Smith SSE_INLINE_BEGIN_2(&t[jdx],v) 505*2c733ed4SBarry Smith SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 506*2c733ed4SBarry Smith 507*2c733ed4SBarry Smith /* First Column */ 508*2c733ed4SBarry Smith SSE_COPY_PS(XMM0,XMM6) 509*2c733ed4SBarry Smith SSE_SHUFFLE(XMM0,XMM0,0x00) 510*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 511*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM0) 512*2c733ed4SBarry Smith 513*2c733ed4SBarry Smith /* Second Column */ 514*2c733ed4SBarry Smith SSE_COPY_PS(XMM1,XMM6) 515*2c733ed4SBarry Smith SSE_SHUFFLE(XMM1,XMM1,0x55) 516*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 517*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM1) 518*2c733ed4SBarry Smith 519*2c733ed4SBarry Smith SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 520*2c733ed4SBarry Smith 521*2c733ed4SBarry Smith /* Third Column */ 522*2c733ed4SBarry Smith SSE_COPY_PS(XMM2,XMM6) 523*2c733ed4SBarry Smith SSE_SHUFFLE(XMM2,XMM2,0xAA) 524*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 525*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM2) 526*2c733ed4SBarry Smith 527*2c733ed4SBarry Smith /* Fourth Column */ 528*2c733ed4SBarry Smith SSE_COPY_PS(XMM3,XMM6) 529*2c733ed4SBarry Smith SSE_SHUFFLE(XMM3,XMM3,0xFF) 530*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 531*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM3) 532*2c733ed4SBarry Smith SSE_INLINE_END_2 533*2c733ed4SBarry Smith 534*2c733ed4SBarry Smith v += 16; 535*2c733ed4SBarry Smith } 536*2c733ed4SBarry Smith v = aa + 16*ai[++i]; 537*2c733ed4SBarry Smith PREFETCH_NTA(v); 538*2c733ed4SBarry Smith STORE_PS(&t[idx],XMM7); 539*2c733ed4SBarry Smith } 540*2c733ed4SBarry Smith 541*2c733ed4SBarry Smith /* Backward solve the upper triangular factor.*/ 542*2c733ed4SBarry Smith 543*2c733ed4SBarry Smith idt = 4*(n-1); 544*2c733ed4SBarry Smith ai16 = 16*diag[n-1]; 545*2c733ed4SBarry Smith v = aa + ai16 + 16; 546*2c733ed4SBarry Smith for (i=n-1; i>=0; ) { 547*2c733ed4SBarry Smith PREFETCH_NTA(&v[8]); 548*2c733ed4SBarry Smith vi = aj + diag[i] + 1; 549*2c733ed4SBarry Smith nz = ai[i+1] - diag[i] - 1; 550*2c733ed4SBarry Smith 551*2c733ed4SBarry Smith LOAD_PS(&t[idt],XMM7); 552*2c733ed4SBarry Smith 553*2c733ed4SBarry Smith while (nz--) { 554*2c733ed4SBarry Smith PREFETCH_NTA(&v[16]); 555*2c733ed4SBarry Smith idx = 4*(*vi++); 556*2c733ed4SBarry Smith /* idx = *vi++; */ 557*2c733ed4SBarry Smith 558*2c733ed4SBarry Smith /* 4x4 Matrix-Vector Product with negative accumulation: */ 559*2c733ed4SBarry Smith SSE_INLINE_BEGIN_2(&t[idx],v) 560*2c733ed4SBarry Smith SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 561*2c733ed4SBarry Smith 562*2c733ed4SBarry Smith /* First Column */ 563*2c733ed4SBarry Smith SSE_COPY_PS(XMM0,XMM6) 564*2c733ed4SBarry Smith SSE_SHUFFLE(XMM0,XMM0,0x00) 565*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 566*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM0) 567*2c733ed4SBarry Smith 568*2c733ed4SBarry Smith /* Second Column */ 569*2c733ed4SBarry Smith SSE_COPY_PS(XMM1,XMM6) 570*2c733ed4SBarry Smith SSE_SHUFFLE(XMM1,XMM1,0x55) 571*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 572*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM1) 573*2c733ed4SBarry Smith 574*2c733ed4SBarry Smith SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 575*2c733ed4SBarry Smith 576*2c733ed4SBarry Smith /* Third Column */ 577*2c733ed4SBarry Smith SSE_COPY_PS(XMM2,XMM6) 578*2c733ed4SBarry Smith SSE_SHUFFLE(XMM2,XMM2,0xAA) 579*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 580*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM2) 581*2c733ed4SBarry Smith 582*2c733ed4SBarry Smith /* Fourth Column */ 583*2c733ed4SBarry Smith SSE_COPY_PS(XMM3,XMM6) 584*2c733ed4SBarry Smith SSE_SHUFFLE(XMM3,XMM3,0xFF) 585*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 586*2c733ed4SBarry Smith SSE_SUB_PS(XMM7,XMM3) 587*2c733ed4SBarry Smith SSE_INLINE_END_2 588*2c733ed4SBarry Smith v += 16; 589*2c733ed4SBarry Smith } 590*2c733ed4SBarry Smith v = aa + ai16; 591*2c733ed4SBarry Smith ai16 = 16*diag[--i]; 592*2c733ed4SBarry Smith PREFETCH_NTA(aa+ai16+16); 593*2c733ed4SBarry Smith /* 594*2c733ed4SBarry Smith Scale the result by the diagonal 4x4 block, 595*2c733ed4SBarry Smith which was inverted as part of the factorization 596*2c733ed4SBarry Smith */ 597*2c733ed4SBarry Smith SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 598*2c733ed4SBarry Smith /* First Column */ 599*2c733ed4SBarry Smith SSE_COPY_PS(XMM0,XMM7) 600*2c733ed4SBarry Smith SSE_SHUFFLE(XMM0,XMM0,0x00) 601*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 602*2c733ed4SBarry Smith 603*2c733ed4SBarry Smith /* Second Column */ 604*2c733ed4SBarry Smith SSE_COPY_PS(XMM1,XMM7) 605*2c733ed4SBarry Smith SSE_SHUFFLE(XMM1,XMM1,0x55) 606*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 607*2c733ed4SBarry Smith SSE_ADD_PS(XMM0,XMM1) 608*2c733ed4SBarry Smith 609*2c733ed4SBarry Smith SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 610*2c733ed4SBarry Smith 611*2c733ed4SBarry Smith /* Third Column */ 612*2c733ed4SBarry Smith SSE_COPY_PS(XMM2,XMM7) 613*2c733ed4SBarry Smith SSE_SHUFFLE(XMM2,XMM2,0xAA) 614*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 615*2c733ed4SBarry Smith SSE_ADD_PS(XMM0,XMM2) 616*2c733ed4SBarry Smith 617*2c733ed4SBarry Smith /* Fourth Column */ 618*2c733ed4SBarry Smith SSE_COPY_PS(XMM3,XMM7) 619*2c733ed4SBarry Smith SSE_SHUFFLE(XMM3,XMM3,0xFF) 620*2c733ed4SBarry Smith SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 621*2c733ed4SBarry Smith SSE_ADD_PS(XMM0,XMM3) 622*2c733ed4SBarry Smith 623*2c733ed4SBarry Smith SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 624*2c733ed4SBarry Smith SSE_INLINE_END_3 625*2c733ed4SBarry Smith 626*2c733ed4SBarry Smith v = aa + ai16 + 16; 627*2c733ed4SBarry Smith idt -= 4; 628*2c733ed4SBarry Smith } 629*2c733ed4SBarry Smith 630*2c733ed4SBarry Smith /* Convert t from single precision back to double precision (inplace)*/ 631*2c733ed4SBarry Smith idt = 4*(n-1); 632*2c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 633*2c733ed4SBarry Smith /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 634*2c733ed4SBarry Smith /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 635*2c733ed4SBarry Smith PetscScalar *xtemp=&x[idt]; 636*2c733ed4SBarry Smith MatScalar *ttemp=&t[idt]; 637*2c733ed4SBarry Smith xtemp[3] = (PetscScalar)ttemp[3]; 638*2c733ed4SBarry Smith xtemp[2] = (PetscScalar)ttemp[2]; 639*2c733ed4SBarry Smith xtemp[1] = (PetscScalar)ttemp[1]; 640*2c733ed4SBarry Smith xtemp[0] = (PetscScalar)ttemp[0]; 641*2c733ed4SBarry Smith idt -= 4; 642*2c733ed4SBarry Smith } 643*2c733ed4SBarry Smith 644*2c733ed4SBarry Smith } /* End of artificial scope. */ 645*2c733ed4SBarry Smith ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 646*2c733ed4SBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 647*2c733ed4SBarry Smith ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 648*2c733ed4SBarry Smith SSE_SCOPE_END; 649*2c733ed4SBarry Smith PetscFunctionReturn(0); 650*2c733ed4SBarry Smith } 651*2c733ed4SBarry Smith 652*2c733ed4SBarry Smith #endif 653