xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision e6580ceefd67cfee5d890f7b8ecb709dd93cb596)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
383287d42SBarry Smith /*
483287d42SBarry Smith     Factorization code for BAIJ format.
583287d42SBarry Smith */
67c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
7c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
883287d42SBarry Smith 
983287d42SBarry Smith /* ------------------------------------------------------------*/
1083287d42SBarry Smith /*
1183287d42SBarry Smith       Version for when blocks are 4 by 4
1283287d42SBarry Smith */
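/*
   Outline of the per-block-row elimination used below:
     - each 4 by 4 block is 16 contiguous MatScalar entries stored column-major,
       so entry (r,c) of a block sits at offset r + 4*c;
     - rtmp is a dense work row of n+1 blocks addressed by block column; the
       sparse row of the factor is scattered into it, overwritten with the row
       of A (permuted through r[] and ic[]), and gathered back at the end;
     - for each already-factored pivot row (row < i) the multiplier
       pc = pc * inv(diagonal block of that row) is formed (diagonal blocks are
       kept already inverted in ba), then the trailing blocks of row i are
       updated with x -= multiplier * U(row,j);
     - finally the diagonal block of row i is inverted in place, with or without
       pivoting inside the block depending on pivotinblocks.
*/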
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
150481f469SBarry Smith PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
1683287d42SBarry Smith {
1783287d42SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1883287d42SBarry Smith   IS             isrow = b->row,isicol = b->icol;
196849ba73SBarry Smith   PetscErrorCode ierr;
205d0c19d7SBarry Smith   const PetscInt *r,*ic;
215d0c19d7SBarry Smith   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22690b6cddSBarry Smith   PetscInt       *ajtmpold,*ajtmp,nz,row;
23690b6cddSBarry Smith   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
2483287d42SBarry Smith   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
2583287d42SBarry Smith   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
2683287d42SBarry Smith   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
2783287d42SBarry Smith   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
2883287d42SBarry Smith   MatScalar      m13,m14,m15,m16;
2983287d42SBarry Smith   MatScalar      *ba = b->a,*aa = a->a;
30bcd9e38bSBarry Smith   PetscTruth     pivotinblocks = b->pivotinblocks;
3162bba022SBarry Smith   PetscReal      shift = info->shiftinblocks;
3283287d42SBarry Smith 
3383287d42SBarry Smith   PetscFunctionBegin;
3483287d42SBarry Smith   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3583287d42SBarry Smith   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36b0a32e0cSBarry Smith   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3783287d42SBarry Smith 
3883287d42SBarry Smith   for (i=0; i<n; i++) {
3983287d42SBarry Smith     nz    = bi[i+1] - bi[i];
4083287d42SBarry Smith     ajtmp = bj + bi[i];
4183287d42SBarry Smith     for  (j=0; j<nz; j++) {
4283287d42SBarry Smith       x = rtmp+16*ajtmp[j];
4383287d42SBarry Smith       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
4483287d42SBarry Smith       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
4583287d42SBarry Smith     }
4683287d42SBarry Smith     /* load in initial (unfactored) row */
4783287d42SBarry Smith     idx      = r[i];
4883287d42SBarry Smith     nz       = ai[idx+1] - ai[idx];
4983287d42SBarry Smith     ajtmpold = aj + ai[idx];
5083287d42SBarry Smith     v        = aa + 16*ai[idx];
5183287d42SBarry Smith     for (j=0; j<nz; j++) {
5283287d42SBarry Smith       x    = rtmp+16*ic[ajtmpold[j]];
5383287d42SBarry Smith       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
5483287d42SBarry Smith       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
5583287d42SBarry Smith       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
5683287d42SBarry Smith       x[14] = v[14]; x[15] = v[15];
5783287d42SBarry Smith       v    += 16;
5883287d42SBarry Smith     }
5983287d42SBarry Smith     row = *ajtmp++;
6083287d42SBarry Smith     while (row < i) {
6183287d42SBarry Smith       pc  = rtmp + 16*row;
6283287d42SBarry Smith       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
6383287d42SBarry Smith       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
6483287d42SBarry Smith       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
6583287d42SBarry Smith       p15 = pc[14]; p16 = pc[15];
6683287d42SBarry Smith       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
6783287d42SBarry Smith           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
6883287d42SBarry Smith           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
6983287d42SBarry Smith           || p16 != 0.0) {
7083287d42SBarry Smith         pv = ba + 16*diag_offset[row];
7183287d42SBarry Smith         pj = bj + diag_offset[row] + 1;
7283287d42SBarry Smith         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
7383287d42SBarry Smith         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
7483287d42SBarry Smith         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
7583287d42SBarry Smith         x15 = pv[14]; x16 = pv[15];
7683287d42SBarry Smith         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
7783287d42SBarry Smith         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
7883287d42SBarry Smith         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
7983287d42SBarry Smith         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
8083287d42SBarry Smith 
8183287d42SBarry Smith         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
8283287d42SBarry Smith         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
8383287d42SBarry Smith         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
8483287d42SBarry Smith         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
8583287d42SBarry Smith 
8683287d42SBarry Smith         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
8783287d42SBarry Smith         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
8883287d42SBarry Smith         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
8983287d42SBarry Smith         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
9083287d42SBarry Smith 
9183287d42SBarry Smith         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
9283287d42SBarry Smith         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
9383287d42SBarry Smith         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
9483287d42SBarry Smith         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
9583287d42SBarry Smith 
9683287d42SBarry Smith         nz = bi[row+1] - diag_offset[row] - 1;
9783287d42SBarry Smith         pv += 16;
9883287d42SBarry Smith         for (j=0; j<nz; j++) {
9983287d42SBarry Smith           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
10083287d42SBarry Smith           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
10183287d42SBarry Smith           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
10283287d42SBarry Smith           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
10383287d42SBarry Smith           x    = rtmp + 16*pj[j];
10483287d42SBarry Smith           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
10583287d42SBarry Smith           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
10683287d42SBarry Smith           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
10783287d42SBarry Smith           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
10883287d42SBarry Smith 
10983287d42SBarry Smith           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
11083287d42SBarry Smith           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
11183287d42SBarry Smith           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
11283287d42SBarry Smith           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
11383287d42SBarry Smith 
11483287d42SBarry Smith           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
11583287d42SBarry Smith           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
11683287d42SBarry Smith           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
11783287d42SBarry Smith           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
11883287d42SBarry Smith 
11983287d42SBarry Smith           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
12083287d42SBarry Smith           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
12183287d42SBarry Smith           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
12283287d42SBarry Smith           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
12383287d42SBarry Smith 
12483287d42SBarry Smith           pv   += 16;
12583287d42SBarry Smith         }
126dc0b31edSSatish Balay         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
12783287d42SBarry Smith       }
12883287d42SBarry Smith       row = *ajtmp++;
12983287d42SBarry Smith     }
13083287d42SBarry Smith     /* finished row so stick it into b->a */
13183287d42SBarry Smith     pv = ba + 16*bi[i];
13283287d42SBarry Smith     pj = bj + bi[i];
13383287d42SBarry Smith     nz = bi[i+1] - bi[i];
13483287d42SBarry Smith     for (j=0; j<nz; j++) {
13583287d42SBarry Smith       x      = rtmp+16*pj[j];
13683287d42SBarry Smith       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
13783287d42SBarry Smith       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
13883287d42SBarry Smith       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
13983287d42SBarry Smith       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
14083287d42SBarry Smith       pv   += 16;
14183287d42SBarry Smith     }
14283287d42SBarry Smith     /* invert diagonal block */
14383287d42SBarry Smith     w    = ba + 16*diag_offset[i];
144bcd9e38bSBarry Smith     if (pivotinblocks) {
14562bba022SBarry Smith       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146bcd9e38bSBarry Smith     } else {
147bcd9e38bSBarry Smith       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148bcd9e38bSBarry Smith     }
14983287d42SBarry Smith   }
15083287d42SBarry Smith 
15183287d42SBarry Smith   ierr = PetscFree(rtmp);CHKERRQ(ierr);
15283287d42SBarry Smith   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
15383287d42SBarry Smith   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154db4efbfdSBarry Smith   C->ops->solve          = MatSolve_SeqBAIJ_4;
155db4efbfdSBarry Smith   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
15683287d42SBarry Smith   C->assembled = PETSC_TRUE;
157efee365bSSatish Balay   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
15883287d42SBarry Smith   PetscFunctionReturn(0);
15983287d42SBarry Smith }
160*e6580ceeSShri Abhyankar 
161*e6580ceeSShri Abhyankar #undef __FUNCT__
162*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
163*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
164*e6580ceeSShri Abhyankar {
165*e6580ceeSShri Abhyankar /*
166*e6580ceeSShri Abhyankar     Default version for when blocks are 4 by 4, using the natural ordering
167*e6580ceeSShri Abhyankar */
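/*
   This differs from MatLUFactorNumeric_SeqBAIJ_4() above only in that no row or
   column permutation (isrow/isicol) is applied, so block row i of A is loaded
   directly, and the natural-ordering solve routines are installed at the end.
*/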
168*e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
169*e6580ceeSShri Abhyankar   PetscErrorCode ierr;
170*e6580ceeSShri Abhyankar   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
171*e6580ceeSShri Abhyankar   PetscInt       *ajtmpold,*ajtmp,nz,row;
172*e6580ceeSShri Abhyankar   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
173*e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
174*e6580ceeSShri Abhyankar   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
175*e6580ceeSShri Abhyankar   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
176*e6580ceeSShri Abhyankar   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
177*e6580ceeSShri Abhyankar   MatScalar      m13,m14,m15,m16;
178*e6580ceeSShri Abhyankar   MatScalar      *ba = b->a,*aa = a->a;
179*e6580ceeSShri Abhyankar   PetscTruth     pivotinblocks = b->pivotinblocks;
180*e6580ceeSShri Abhyankar   PetscReal      shift = info->shiftinblocks;
181*e6580ceeSShri Abhyankar 
182*e6580ceeSShri Abhyankar   PetscFunctionBegin;
183*e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
184*e6580ceeSShri Abhyankar 
185*e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
186*e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
187*e6580ceeSShri Abhyankar     ajtmp = bj + bi[i];
188*e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
189*e6580ceeSShri Abhyankar       x = rtmp+16*ajtmp[j];
190*e6580ceeSShri Abhyankar       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
191*e6580ceeSShri Abhyankar       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
192*e6580ceeSShri Abhyankar     }
193*e6580ceeSShri Abhyankar     /* load in initial (unfactored) row */
194*e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
195*e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
196*e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
197*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
198*e6580ceeSShri Abhyankar       x    = rtmp+16*ajtmpold[j];
199*e6580ceeSShri Abhyankar       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
200*e6580ceeSShri Abhyankar       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
201*e6580ceeSShri Abhyankar       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
202*e6580ceeSShri Abhyankar       x[14] = v[14]; x[15] = v[15];
203*e6580ceeSShri Abhyankar       v    += 16;
204*e6580ceeSShri Abhyankar     }
205*e6580ceeSShri Abhyankar     row = *ajtmp++;
206*e6580ceeSShri Abhyankar     while (row < i) {
207*e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
208*e6580ceeSShri Abhyankar       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
209*e6580ceeSShri Abhyankar       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
210*e6580ceeSShri Abhyankar       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
211*e6580ceeSShri Abhyankar       p15 = pc[14]; p16 = pc[15];
212*e6580ceeSShri Abhyankar       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
213*e6580ceeSShri Abhyankar           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
214*e6580ceeSShri Abhyankar           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
215*e6580ceeSShri Abhyankar           || p16 != 0.0) {
216*e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
217*e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
218*e6580ceeSShri Abhyankar         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
219*e6580ceeSShri Abhyankar         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
220*e6580ceeSShri Abhyankar         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
221*e6580ceeSShri Abhyankar         x15 = pv[14]; x16 = pv[15];
222*e6580ceeSShri Abhyankar         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
223*e6580ceeSShri Abhyankar         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
224*e6580ceeSShri Abhyankar         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
225*e6580ceeSShri Abhyankar         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
226*e6580ceeSShri Abhyankar 
227*e6580ceeSShri Abhyankar         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
228*e6580ceeSShri Abhyankar         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
229*e6580ceeSShri Abhyankar         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
230*e6580ceeSShri Abhyankar         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
231*e6580ceeSShri Abhyankar 
232*e6580ceeSShri Abhyankar         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
233*e6580ceeSShri Abhyankar         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
234*e6580ceeSShri Abhyankar         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
235*e6580ceeSShri Abhyankar         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
236*e6580ceeSShri Abhyankar 
237*e6580ceeSShri Abhyankar         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
238*e6580ceeSShri Abhyankar         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
239*e6580ceeSShri Abhyankar         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
240*e6580ceeSShri Abhyankar         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
241*e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
242*e6580ceeSShri Abhyankar         pv += 16;
243*e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
244*e6580ceeSShri Abhyankar           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
245*e6580ceeSShri Abhyankar           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
246*e6580ceeSShri Abhyankar           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
247*e6580ceeSShri Abhyankar           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
248*e6580ceeSShri Abhyankar           x    = rtmp + 16*pj[j];
249*e6580ceeSShri Abhyankar           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
250*e6580ceeSShri Abhyankar           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
251*e6580ceeSShri Abhyankar           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
252*e6580ceeSShri Abhyankar           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
253*e6580ceeSShri Abhyankar 
254*e6580ceeSShri Abhyankar           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
255*e6580ceeSShri Abhyankar           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
256*e6580ceeSShri Abhyankar           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
257*e6580ceeSShri Abhyankar           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
258*e6580ceeSShri Abhyankar 
259*e6580ceeSShri Abhyankar           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
260*e6580ceeSShri Abhyankar           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
261*e6580ceeSShri Abhyankar           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
262*e6580ceeSShri Abhyankar           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
263*e6580ceeSShri Abhyankar 
264*e6580ceeSShri Abhyankar           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
265*e6580ceeSShri Abhyankar           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
266*e6580ceeSShri Abhyankar           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
267*e6580ceeSShri Abhyankar           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
268*e6580ceeSShri Abhyankar 
269*e6580ceeSShri Abhyankar           pv   += 16;
270*e6580ceeSShri Abhyankar         }
271*e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
272*e6580ceeSShri Abhyankar       }
273*e6580ceeSShri Abhyankar       row = *ajtmp++;
274*e6580ceeSShri Abhyankar     }
275*e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
276*e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
277*e6580ceeSShri Abhyankar     pj = bj + bi[i];
278*e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
279*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
280*e6580ceeSShri Abhyankar       x      = rtmp+16*pj[j];
281*e6580ceeSShri Abhyankar       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
282*e6580ceeSShri Abhyankar       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
283*e6580ceeSShri Abhyankar       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
284*e6580ceeSShri Abhyankar       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
285*e6580ceeSShri Abhyankar       pv   += 16;
286*e6580ceeSShri Abhyankar     }
287*e6580ceeSShri Abhyankar     /* invert diagonal block */
288*e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
289*e6580ceeSShri Abhyankar     if (pivotinblocks) {
290*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
291*e6580ceeSShri Abhyankar     } else {
292*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
293*e6580ceeSShri Abhyankar     }
294*e6580ceeSShri Abhyankar   }
295*e6580ceeSShri Abhyankar 
296*e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
297*e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
298*e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
299*e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
300*e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
301*e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
302*e6580ceeSShri Abhyankar }
303*e6580ceeSShri Abhyankar 
304*e6580ceeSShri Abhyankar 
305*e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE)
306*e6580ceeSShri Abhyankar 
307*e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE
308*e6580ceeSShri Abhyankar 
309*e6580ceeSShri Abhyankar /* SSE version for when blocks are 4 by 4, using the natural ordering */
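/*
   A sketch of the data layout this kernel relies on: each 4x4 block is held in
   four XMM registers, one block column per register (XMM0-XMM3), and the packed
   single-precision (_PS) operations assume MatScalar is a 4-byte float.  XMM7
   doubles as a zero register for the compare that lets all-zero blocks be
   skipped.
*/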
310*e6580ceeSShri Abhyankar #undef __FUNCT__
311*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
312*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
313*e6580ceeSShri Abhyankar {
314*e6580ceeSShri Abhyankar   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
315*e6580ceeSShri Abhyankar   PetscErrorCode ierr;
316*e6580ceeSShri Abhyankar   int i,j,n = a->mbs;
317*e6580ceeSShri Abhyankar   int         *bj = b->j,*bjtmp,*pj;
318*e6580ceeSShri Abhyankar   int         row;
319*e6580ceeSShri Abhyankar   int         *ajtmpold,nz,*bi=b->i;
320*e6580ceeSShri Abhyankar   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
321*e6580ceeSShri Abhyankar   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
322*e6580ceeSShri Abhyankar   MatScalar   *ba = b->a,*aa = a->a;
323*e6580ceeSShri Abhyankar   int         nonzero=0;
324*e6580ceeSShri Abhyankar /*    int            nonzero=0,colscale = 16; */
325*e6580ceeSShri Abhyankar   PetscTruth  pivotinblocks = b->pivotinblocks;
326*e6580ceeSShri Abhyankar   PetscReal      shift = info->shiftinblocks;
327*e6580ceeSShri Abhyankar 
328*e6580ceeSShri Abhyankar   PetscFunctionBegin;
329*e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
330*e6580ceeSShri Abhyankar 
331*e6580ceeSShri Abhyankar   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
332*e6580ceeSShri Abhyankar   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
333*e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
334*e6580ceeSShri Abhyankar   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
335*e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
336*e6580ceeSShri Abhyankar /*      colscale = 4; */
337*e6580ceeSShri Abhyankar /*    } */
338*e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
339*e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
340*e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
341*e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
342*e6580ceeSShri Abhyankar     /* zero out one register */
343*e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
344*e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
345*e6580ceeSShri Abhyankar       x = rtmp+16*bjtmp[j];
346*e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
347*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
348*e6580ceeSShri Abhyankar         /* Copy zero register to memory locations */
349*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
350*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
351*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
352*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
353*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
354*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
355*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
356*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
357*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
358*e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
359*e6580ceeSShri Abhyankar     }
360*e6580ceeSShri Abhyankar     /* load in initial (unfactored) row */
361*e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
362*e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
363*e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
364*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
365*e6580ceeSShri Abhyankar       x = rtmp+16*ajtmpold[j];
366*e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmpold[j]; */
367*e6580ceeSShri Abhyankar       /* Copy v block into x block */
368*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
369*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
370*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
371*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
372*e6580ceeSShri Abhyankar 
373*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
374*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
375*e6580ceeSShri Abhyankar 
376*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
377*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
378*e6580ceeSShri Abhyankar 
379*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
380*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
381*e6580ceeSShri Abhyankar 
382*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
383*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
384*e6580ceeSShri Abhyankar 
385*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
386*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
387*e6580ceeSShri Abhyankar 
388*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
389*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
390*e6580ceeSShri Abhyankar 
391*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
392*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
393*e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
394*e6580ceeSShri Abhyankar 
395*e6580ceeSShri Abhyankar       v += 16;
396*e6580ceeSShri Abhyankar     }
397*e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
398*e6580ceeSShri Abhyankar     row = *bjtmp++;
399*e6580ceeSShri Abhyankar     while (row < i) {
400*e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
401*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
402*e6580ceeSShri Abhyankar         /* Load block from lower triangle */
403*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
404*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
405*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
406*e6580ceeSShri Abhyankar 
407*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
408*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
409*e6580ceeSShri Abhyankar 
410*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
411*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
412*e6580ceeSShri Abhyankar 
413*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
414*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
415*e6580ceeSShri Abhyankar 
416*e6580ceeSShri Abhyankar         /* Compare block to zero block */
417*e6580ceeSShri Abhyankar 
418*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM4,XMM7)
419*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM4,XMM0)
420*e6580ceeSShri Abhyankar 
421*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM5,XMM7)
422*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM5,XMM1)
423*e6580ceeSShri Abhyankar 
424*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM6,XMM7)
425*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM6,XMM2)
426*e6580ceeSShri Abhyankar 
427*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM7,XMM3)
428*e6580ceeSShri Abhyankar 
429*e6580ceeSShri Abhyankar         /* Reduce the comparisons to one SSE register */
430*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM6,XMM7)
431*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM4)
432*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM6)
433*e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
434*e6580ceeSShri Abhyankar 
435*e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
436*e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
437*e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
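      /* MOVEMASK gathers the per-lane sign bits of the comparison result into an
         int, so nonzero != 0 exactly when some entry of the block differs from zero */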
438*e6580ceeSShri Abhyankar 
439*e6580ceeSShri Abhyankar       /* If block is nonzero ... */
440*e6580ceeSShri Abhyankar       if (nonzero) {
441*e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
442*e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
443*e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
444*e6580ceeSShri Abhyankar 
445*e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
446*e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
447*e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
448*e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
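        /* Each group below forms one column c of the product pc*pv: the scalars
           pv[4*c+0..3] are broadcast with SHUFFLE, multiplied against the old pc
           columns held in XMM0-XMM3, accumulated, and stored back over column c */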
449*e6580ceeSShri Abhyankar 
450*e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
451*e6580ceeSShri Abhyankar           /* Column 0, product is accumulated in XMM4 */
452*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
453*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
454*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM0)
455*e6580ceeSShri Abhyankar 
456*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
457*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
458*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM1)
459*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM5)
460*e6580ceeSShri Abhyankar 
461*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
462*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
463*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM2)
464*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM6)
465*e6580ceeSShri Abhyankar 
466*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
467*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
468*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
469*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM7)
470*e6580ceeSShri Abhyankar 
471*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
472*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
473*e6580ceeSShri Abhyankar 
474*e6580ceeSShri Abhyankar           /* Column 1, product is accumulated in XMM5 */
475*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
476*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
477*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
478*e6580ceeSShri Abhyankar 
479*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
480*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
481*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
482*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
483*e6580ceeSShri Abhyankar 
484*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
485*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
486*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
487*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM7)
488*e6580ceeSShri Abhyankar 
489*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
490*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
491*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
492*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
493*e6580ceeSShri Abhyankar 
494*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
495*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
496*e6580ceeSShri Abhyankar 
497*e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
498*e6580ceeSShri Abhyankar 
499*e6580ceeSShri Abhyankar           /* Column 2, product is accumulated in XMM6 */
500*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
501*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
502*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
503*e6580ceeSShri Abhyankar 
504*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
505*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
506*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
507*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
508*e6580ceeSShri Abhyankar 
509*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
510*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
511*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
512*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
513*e6580ceeSShri Abhyankar 
514*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
515*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
516*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
517*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
518*e6580ceeSShri Abhyankar 
519*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
520*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
521*e6580ceeSShri Abhyankar 
522*e6580ceeSShri Abhyankar           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
523*e6580ceeSShri Abhyankar           /* Column 3, product is accumulated in XMM0 */
524*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
525*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
526*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM0,XMM7)
527*e6580ceeSShri Abhyankar 
528*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
529*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
530*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM7)
531*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
532*e6580ceeSShri Abhyankar 
533*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
534*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM1,XMM1,0x00)
535*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM2)
536*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
537*e6580ceeSShri Abhyankar 
538*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
539*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
540*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM3,XMM7)
541*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM3)
542*e6580ceeSShri Abhyankar 
543*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
544*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
545*e6580ceeSShri Abhyankar 
546*e6580ceeSShri Abhyankar           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
547*e6580ceeSShri Abhyankar           /* This is code to be maintained and read by humans after all. */
548*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 3 into XMM3 */
549*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM3,XMM0)
550*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 2 into XMM2 */
551*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM2,XMM6)
552*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 1 into XMM1 */
553*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM1,XMM5)
554*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 0 into XMM0 */
555*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM0,XMM4)
556*e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
557*e6580ceeSShri Abhyankar 
558*e6580ceeSShri Abhyankar         /* Update the row: */
559*e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
560*e6580ceeSShri Abhyankar         pv += 16;
561*e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
562*e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
563*e6580ceeSShri Abhyankar           x = rtmp + 16*pj[j];
564*e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
565*e6580ceeSShri Abhyankar 
566*e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
567*e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
568*e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
569*e6580ceeSShri Abhyankar             /* Load first column of X */
570*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
571*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
572*e6580ceeSShri Abhyankar 
573*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
574*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
575*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
576*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
577*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
578*e6580ceeSShri Abhyankar 
579*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
580*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
581*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
582*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
583*e6580ceeSShri Abhyankar 
584*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
585*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
586*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
587*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
588*e6580ceeSShri Abhyankar 
589*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
590*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
591*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
592*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
593*e6580ceeSShri Abhyankar 
594*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
595*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
596*e6580ceeSShri Abhyankar 
597*e6580ceeSShri Abhyankar             /* Second Column */
598*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
599*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
600*e6580ceeSShri Abhyankar 
601*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
602*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
603*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
604*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM0)
605*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
606*e6580ceeSShri Abhyankar 
607*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
608*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
609*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM1)
610*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM7)
611*e6580ceeSShri Abhyankar 
612*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
613*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
614*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM2)
615*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM4)
616*e6580ceeSShri Abhyankar 
617*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
618*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
619*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM3)
620*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
621*e6580ceeSShri Abhyankar 
622*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
623*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
624*e6580ceeSShri Abhyankar 
625*e6580ceeSShri Abhyankar             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
626*e6580ceeSShri Abhyankar 
627*e6580ceeSShri Abhyankar             /* Third Column */
628*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
629*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
630*e6580ceeSShri Abhyankar 
631*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
632*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
633*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
634*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM0)
635*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
636*e6580ceeSShri Abhyankar 
637*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
638*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
639*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM1)
640*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM4)
641*e6580ceeSShri Abhyankar 
642*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
643*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
644*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM2)
645*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM5)
646*e6580ceeSShri Abhyankar 
647*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
648*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
649*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM3)
650*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
651*e6580ceeSShri Abhyankar 
652*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
653*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
654*e6580ceeSShri Abhyankar 
655*e6580ceeSShri Abhyankar             /* Fourth Column */
656*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
657*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
658*e6580ceeSShri Abhyankar 
659*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
660*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
661*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
662*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
663*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
664*e6580ceeSShri Abhyankar 
665*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
666*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
667*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
668*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
669*e6580ceeSShri Abhyankar 
670*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
671*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
672*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
673*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
674*e6580ceeSShri Abhyankar 
675*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
676*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
677*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
678*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
679*e6580ceeSShri Abhyankar 
680*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
681*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
682*e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
683*e6580ceeSShri Abhyankar           pv   += 16;
684*e6580ceeSShri Abhyankar         }
685*e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
686*e6580ceeSShri Abhyankar       }
687*e6580ceeSShri Abhyankar       row = *bjtmp++;
688*e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
689*e6580ceeSShri Abhyankar     }
690*e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
691*e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
692*e6580ceeSShri Abhyankar     pj = bj + bi[i];
693*e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
694*e6580ceeSShri Abhyankar 
695*e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
696*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
697*e6580ceeSShri Abhyankar       x  = rtmp+16*pj[j];
698*e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
699*e6580ceeSShri Abhyankar 
700*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
701*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
702*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
703*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
704*e6580ceeSShri Abhyankar 
705*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
706*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
707*e6580ceeSShri Abhyankar 
708*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
709*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
710*e6580ceeSShri Abhyankar 
711*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
712*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
713*e6580ceeSShri Abhyankar 
714*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
715*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
716*e6580ceeSShri Abhyankar 
717*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
718*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
719*e6580ceeSShri Abhyankar 
720*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
721*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
722*e6580ceeSShri Abhyankar 
723*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
724*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
725*e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
726*e6580ceeSShri Abhyankar       pv += 16;
727*e6580ceeSShri Abhyankar     }
728*e6580ceeSShri Abhyankar     /* invert diagonal block */
729*e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
730*e6580ceeSShri Abhyankar     if (pivotinblocks) {
731*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
732*e6580ceeSShri Abhyankar     } else {
733*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
734*e6580ceeSShri Abhyankar     }
735*e6580ceeSShri Abhyankar /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
736*e6580ceeSShri Abhyankar     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
737*e6580ceeSShri Abhyankar   }
738*e6580ceeSShri Abhyankar 
739*e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
740*e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
741*e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
742*e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
743*e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
744*e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
745*e6580ceeSShri Abhyankar   SSE_SCOPE_END;
746*e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
747*e6580ceeSShri Abhyankar }
748*e6580ceeSShri Abhyankar 
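/*
   In-place variant of the SSE kernel above: the factor overwrites C itself
   (A is simply aliased to C), and the block column indices are read through
   unsigned short pointers ("usj"), which presumes the index arrays were built
   in that 16-bit form so that the number of block columns fits in an unsigned
   short.
*/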
749*e6580ceeSShri Abhyankar #undef __FUNCT__
750*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
751*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
752*e6580ceeSShri Abhyankar {
753*e6580ceeSShri Abhyankar   Mat            A=C;
754*e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
755*e6580ceeSShri Abhyankar   PetscErrorCode ierr;
756*e6580ceeSShri Abhyankar   int i,j,n = a->mbs;
757*e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
758*e6580ceeSShri Abhyankar   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
759*e6580ceeSShri Abhyankar   unsigned int   row;
760*e6580ceeSShri Abhyankar   int            nz,*bi=b->i;
761*e6580ceeSShri Abhyankar   int            *diag_offset = b->diag,*ai=a->i;
762*e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
763*e6580ceeSShri Abhyankar   MatScalar      *ba = b->a,*aa = a->a;
764*e6580ceeSShri Abhyankar   int            nonzero=0;
765*e6580ceeSShri Abhyankar /*    int            nonzero=0,colscale = 16; */
766*e6580ceeSShri Abhyankar   PetscTruth     pivotinblocks = b->pivotinblocks;
767*e6580ceeSShri Abhyankar   PetscReal      shift = 0.0; /* no MatFactorInfo is passed to this in-place variant, so no shift is applied */
768*e6580ceeSShri Abhyankar 
769*e6580ceeSShri Abhyankar   PetscFunctionBegin;
770*e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
771*e6580ceeSShri Abhyankar 
772*e6580ceeSShri Abhyankar   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
773*e6580ceeSShri Abhyankar   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
774*e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
775*e6580ceeSShri Abhyankar   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
776*e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
777*e6580ceeSShri Abhyankar /*      colscale = 4; */
778*e6580ceeSShri Abhyankar /*    } */
779*e6580ceeSShri Abhyankar 
780*e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
781*e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
782*e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
783*e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
784*e6580ceeSShri Abhyankar     /* zero out one register */
785*e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
786*e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
787*e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)bjtmp[j]);
788*e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
789*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
790*e6580ceeSShri Abhyankar         /* Copy zero register to memory locations */
791*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
792*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
793*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
794*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
795*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
796*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
797*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
798*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
799*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
800*e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
801*e6580ceeSShri Abhyankar     }
802*e6580ceeSShri Abhyankar     /* load in initial (unfactored) row */
803*e6580ceeSShri Abhyankar     nz    = ai[i+1] - ai[i];
804*e6580ceeSShri Abhyankar     ajtmp = aj + ai[i];
805*e6580ceeSShri Abhyankar     v     = aa + 16*ai[i];
806*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
807*e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)ajtmp[j]);
808*e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmp[j]; */
809*e6580ceeSShri Abhyankar       /* Copy v block into x block */
810*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
811*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
812*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
813*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
814*e6580ceeSShri Abhyankar 
815*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
816*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
817*e6580ceeSShri Abhyankar 
818*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
819*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
820*e6580ceeSShri Abhyankar 
821*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
822*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
823*e6580ceeSShri Abhyankar 
824*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
825*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
826*e6580ceeSShri Abhyankar 
827*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
828*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
829*e6580ceeSShri Abhyankar 
830*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
831*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
832*e6580ceeSShri Abhyankar 
833*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
834*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
835*e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
836*e6580ceeSShri Abhyankar 
837*e6580ceeSShri Abhyankar       v += 16;
838*e6580ceeSShri Abhyankar     }
839*e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
840*e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
841*e6580ceeSShri Abhyankar     while (row < i) {
842*e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
843*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
844*e6580ceeSShri Abhyankar         /* Load block from lower triangle */
845*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
846*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
847*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
848*e6580ceeSShri Abhyankar 
849*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
850*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
851*e6580ceeSShri Abhyankar 
852*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
853*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
854*e6580ceeSShri Abhyankar 
855*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
856*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
857*e6580ceeSShri Abhyankar 
858*e6580ceeSShri Abhyankar         /* Compare block to zero block */
859*e6580ceeSShri Abhyankar 
860*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM4,XMM7)
861*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM4,XMM0)
862*e6580ceeSShri Abhyankar 
863*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM5,XMM7)
864*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM5,XMM1)
865*e6580ceeSShri Abhyankar 
866*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM6,XMM7)
867*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM6,XMM2)
868*e6580ceeSShri Abhyankar 
869*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM7,XMM3)
870*e6580ceeSShri Abhyankar 
871*e6580ceeSShri Abhyankar         /* Reduce the comparisons to one SSE register */
872*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM6,XMM7)
873*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM4)
874*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM6)
875*e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
876*e6580ceeSShri Abhyankar 
877*e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
878*e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
879*e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
880*e6580ceeSShri Abhyankar 
881*e6580ceeSShri Abhyankar       /* If block is nonzero ... */
882*e6580ceeSShri Abhyankar       if (nonzero) {
883*e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
884*e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
885*e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
886*e6580ceeSShri Abhyankar 
887*e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
888*e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
889*e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
890*e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
891*e6580ceeSShri Abhyankar 
892*e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
893*e6580ceeSShri Abhyankar           /* Column 0, product is accumulated in XMM4 */
894*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
895*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
896*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM0)
897*e6580ceeSShri Abhyankar 
898*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
899*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
900*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM1)
901*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM5)
902*e6580ceeSShri Abhyankar 
903*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
904*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
905*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM2)
906*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM6)
907*e6580ceeSShri Abhyankar 
908*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
909*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
910*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
911*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM7)
912*e6580ceeSShri Abhyankar 
913*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
914*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
915*e6580ceeSShri Abhyankar 
916*e6580ceeSShri Abhyankar           /* Column 1, product is accumulated in XMM5 */
917*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
918*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
919*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
920*e6580ceeSShri Abhyankar 
921*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
922*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
923*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
924*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
925*e6580ceeSShri Abhyankar 
926*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
927*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
928*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
929*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM7)
930*e6580ceeSShri Abhyankar 
931*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
932*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
933*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
934*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
935*e6580ceeSShri Abhyankar 
936*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
937*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
938*e6580ceeSShri Abhyankar 
939*e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
940*e6580ceeSShri Abhyankar 
941*e6580ceeSShri Abhyankar           /* Column 2, product is accumulated in XMM6 */
942*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
943*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
944*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
945*e6580ceeSShri Abhyankar 
946*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
947*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
948*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
949*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
950*e6580ceeSShri Abhyankar 
951*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
952*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
953*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
954*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
955*e6580ceeSShri Abhyankar 
956*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
957*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
958*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
959*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
960*e6580ceeSShri Abhyankar 
961*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
962*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
963*e6580ceeSShri Abhyankar 
964*e6580ceeSShri Abhyankar           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
965*e6580ceeSShri Abhyankar           /* Column 3, product is accumulated in XMM0 */
966*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
967*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
968*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM0,XMM7)
969*e6580ceeSShri Abhyankar 
970*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
971*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
972*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM7)
973*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
974*e6580ceeSShri Abhyankar 
975*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
976*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM1,XMM1,0x00)
977*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM2)
978*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
979*e6580ceeSShri Abhyankar 
980*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
981*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
982*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM3,XMM7)
983*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM3)
984*e6580ceeSShri Abhyankar 
985*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
986*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
987*e6580ceeSShri Abhyankar 
988*e6580ceeSShri Abhyankar           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
989*e6580ceeSShri Abhyankar           /* This is code to be maintained and read by humans, after all. */
990*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 3 into XMM3 */
991*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM3,XMM0)
992*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 2 into XMM2 */
993*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM2,XMM6)
994*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 1 into XMM1 */
995*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM1,XMM5)
996*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 0 into XMM0 */
997*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM0,XMM4)
998*e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
999*e6580ceeSShri Abhyankar 
1000*e6580ceeSShri Abhyankar         /* Update the row: */
1001*e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
1002*e6580ceeSShri Abhyankar         pv += 16;
1003*e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
1004*e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1005*e6580ceeSShri Abhyankar           x = rtmp + 16*((unsigned int)pj[j]);
1006*e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
1007*e6580ceeSShri Abhyankar 
1008*e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1009*e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
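          /* For reference only (added for exposition, not part of the original source):
             a hedged scalar sketch of the update performed by the SSE block below; the
             multiplier M is the current contents of pc, so on the column-major 4x4 blocks
                 int r,c;
                 for (c=0; c<4; c++) {
                   for (r=0; r<4; r++) {
                     x[4*c+r] -= pc[r]*pv[4*c]      + pc[4+r]*pv[4*c+1]
                               + pc[8+r]*pv[4*c+2]  + pc[12+r]*pv[4*c+3];
                   }
                 }
             i.e. X := X - M*PV, one column at a time. */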
1010*e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
1011*e6580ceeSShri Abhyankar             /* Load First Column of X */
1012*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1013*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1014*e6580ceeSShri Abhyankar 
1015*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1016*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1017*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1018*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1019*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1020*e6580ceeSShri Abhyankar 
1021*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1022*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1023*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1024*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1025*e6580ceeSShri Abhyankar 
1026*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1027*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1028*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1029*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1030*e6580ceeSShri Abhyankar 
1031*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1032*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1033*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1034*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1035*e6580ceeSShri Abhyankar 
1036*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1037*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1038*e6580ceeSShri Abhyankar 
1039*e6580ceeSShri Abhyankar             /* Second Column */
1040*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1041*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1042*e6580ceeSShri Abhyankar 
1043*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1044*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1045*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1046*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM0)
1047*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1048*e6580ceeSShri Abhyankar 
1049*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1050*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1051*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM1)
1052*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM7)
1053*e6580ceeSShri Abhyankar 
1054*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1055*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1056*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM2)
1057*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM4)
1058*e6580ceeSShri Abhyankar 
1059*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1060*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1061*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM3)
1062*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1063*e6580ceeSShri Abhyankar 
1064*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1065*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1066*e6580ceeSShri Abhyankar 
1067*e6580ceeSShri Abhyankar             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1068*e6580ceeSShri Abhyankar 
1069*e6580ceeSShri Abhyankar             /* Third Column */
1070*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1071*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1072*e6580ceeSShri Abhyankar 
1073*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1074*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1075*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1076*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM0)
1077*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1078*e6580ceeSShri Abhyankar 
1079*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1080*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1081*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM1)
1082*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM4)
1083*e6580ceeSShri Abhyankar 
1084*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1085*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1086*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM2)
1087*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM5)
1088*e6580ceeSShri Abhyankar 
1089*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1090*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1091*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM3)
1092*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1093*e6580ceeSShri Abhyankar 
1094*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1095*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1096*e6580ceeSShri Abhyankar 
1097*e6580ceeSShri Abhyankar             /* Fourth Column */
1098*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1099*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1100*e6580ceeSShri Abhyankar 
1101*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1102*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1103*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1104*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1105*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1106*e6580ceeSShri Abhyankar 
1107*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1108*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1109*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1110*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1111*e6580ceeSShri Abhyankar 
1112*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1113*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1114*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1115*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1116*e6580ceeSShri Abhyankar 
1117*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1118*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1119*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1120*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1121*e6580ceeSShri Abhyankar 
1122*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1123*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1124*e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1125*e6580ceeSShri Abhyankar           pv   += 16;
1126*e6580ceeSShri Abhyankar         }
1127*e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1128*e6580ceeSShri Abhyankar       }
1129*e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
1130*e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
1131*e6580ceeSShri Abhyankar /*        bjtmp++; */
1132*e6580ceeSShri Abhyankar     }
1133*e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1134*e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
1135*e6580ceeSShri Abhyankar     pj = bj + bi[i];
1136*e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
1137*e6580ceeSShri Abhyankar 
1138*e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1139*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1140*e6580ceeSShri Abhyankar       x  = rtmp+16*((unsigned int)pj[j]);
1141*e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
1142*e6580ceeSShri Abhyankar 
1143*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
1144*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1145*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1146*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1147*e6580ceeSShri Abhyankar 
1148*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1149*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1150*e6580ceeSShri Abhyankar 
1151*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1152*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1153*e6580ceeSShri Abhyankar 
1154*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1155*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1156*e6580ceeSShri Abhyankar 
1157*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1158*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1159*e6580ceeSShri Abhyankar 
1160*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1161*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1162*e6580ceeSShri Abhyankar 
1163*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1164*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1165*e6580ceeSShri Abhyankar 
1166*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1167*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1168*e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1169*e6580ceeSShri Abhyankar       pv += 16;
1170*e6580ceeSShri Abhyankar     }
1171*e6580ceeSShri Abhyankar     /* invert diagonal block */
1172*e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
1173*e6580ceeSShri Abhyankar     if (pivotinblocks) {
1174*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1175*e6580ceeSShri Abhyankar     } else {
1176*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1177*e6580ceeSShri Abhyankar     }
1178*e6580ceeSShri Abhyankar /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1179*e6580ceeSShri Abhyankar     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1180*e6580ceeSShri Abhyankar   }
1181*e6580ceeSShri Abhyankar 
1182*e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1183*e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1184*e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1185*e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
1186*e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1187*e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
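  /* (The constant 1.3333*64 is presumably (4/3)*4^3 ~= 85, the standard flop estimate for inverting one 4x4 block; added for exposition only.) */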
1188*e6580ceeSShri Abhyankar   SSE_SCOPE_END;
1189*e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
1190*e6580ceeSShri Abhyankar }
1191*e6580ceeSShri Abhyankar 
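/*
   Usage sketch (added for exposition, not part of the original source): this numeric
   factorization kernel is normally reached through the generic PETSc factorization
   interface on a block-size-4 SEQBAIJ matrix.  Names follow the PETSc 3.0/3.1-era API
   and may differ in other versions; error checking is abbreviated:

       Mat            F;
       IS             row,col;
       MatFactorInfo  info;
       ierr = MatGetOrdering(A,MATORDERING_NATURAL,&row,&col);CHKERRQ(ierr);
       ierr = MatGetFactor(A,MAT_SOLVER_PETSC,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
       ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
       ierr = MatLUFactorSymbolic(F,A,row,col,&info);CHKERRQ(ierr);
       ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
*/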
1192*e6580ceeSShri Abhyankar #undef __FUNCT__
1193*e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1194*e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1195*e6580ceeSShri Abhyankar {
1196*e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1197*e6580ceeSShri Abhyankar   PetscErrorCode ierr;
1198*e6580ceeSShri Abhyankar   int  i,j,n = a->mbs;
1199*e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1200*e6580ceeSShri Abhyankar   unsigned int   row;
1201*e6580ceeSShri Abhyankar   int            *ajtmpold,nz,*bi=b->i;
1202*e6580ceeSShri Abhyankar   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1203*e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1204*e6580ceeSShri Abhyankar   MatScalar      *ba = b->a,*aa = a->a;
1205*e6580ceeSShri Abhyankar   int            nonzero=0;
1206*e6580ceeSShri Abhyankar /*    int            nonzero=0,colscale = 16; */
1207*e6580ceeSShri Abhyankar   PetscTruth     pivotinblocks = b->pivotinblocks;
1208*e6580ceeSShri Abhyankar   PetscReal      shift = info->shiftinblocks;
1209*e6580ceeSShri Abhyankar 
1210*e6580ceeSShri Abhyankar   PetscFunctionBegin;
1211*e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
1212*e6580ceeSShri Abhyankar 
1213*e6580ceeSShri Abhyankar   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1214*e6580ceeSShri Abhyankar   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1215*e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1216*e6580ceeSShri Abhyankar   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1217*e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
1218*e6580ceeSShri Abhyankar /*      colscale = 4; */
1219*e6580ceeSShri Abhyankar /*    } */
1220*e6580ceeSShri Abhyankar   if ((unsigned long)bj==(unsigned long)aj) {
    /* free the work array before dispatching to the in-place variant so rtmp is not leaked */
    ierr = PetscFree(rtmp);CHKERRQ(ierr);
1221*e6580ceeSShri Abhyankar     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1222*e6580ceeSShri Abhyankar   }
1223*e6580ceeSShri Abhyankar 
1224*e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
1225*e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
1226*e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
1227*e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
1228*e6580ceeSShri Abhyankar     /* zero out one register */
1229*e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
1230*e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
1231*e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)bjtmp[j]);
1232*e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
1233*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
1234*e6580ceeSShri Abhyankar         /* Copy zero register to memory locations */
1235*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1236*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1237*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1238*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1239*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1240*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1241*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1242*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1243*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1244*e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1245*e6580ceeSShri Abhyankar     }
1246*e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
1247*e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
1248*e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
1249*e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
1250*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1251*e6580ceeSShri Abhyankar       x = rtmp+16*ajtmpold[j];
1252*e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmpold[j]; */
1253*e6580ceeSShri Abhyankar       /* Copy v block into x block */
1254*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
1255*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1256*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1257*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1258*e6580ceeSShri Abhyankar 
1259*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1260*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1261*e6580ceeSShri Abhyankar 
1262*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1263*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1264*e6580ceeSShri Abhyankar 
1265*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1266*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1267*e6580ceeSShri Abhyankar 
1268*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1269*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1270*e6580ceeSShri Abhyankar 
1271*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1272*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1273*e6580ceeSShri Abhyankar 
1274*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1275*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1276*e6580ceeSShri Abhyankar 
1277*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1278*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1279*e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1280*e6580ceeSShri Abhyankar 
1281*e6580ceeSShri Abhyankar       v += 16;
1282*e6580ceeSShri Abhyankar     }
1283*e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
1284*e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
1285*e6580ceeSShri Abhyankar     while (row < i) {
1286*e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
1287*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
1288*e6580ceeSShri Abhyankar         /* Load block from lower triangle */
1289*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1290*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1291*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1292*e6580ceeSShri Abhyankar 
1293*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1294*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1295*e6580ceeSShri Abhyankar 
1296*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1297*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1298*e6580ceeSShri Abhyankar 
1299*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1300*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1301*e6580ceeSShri Abhyankar 
1302*e6580ceeSShri Abhyankar         /* Compare block to zero block */
1303*e6580ceeSShri Abhyankar 
1304*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM4,XMM7)
1305*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM4,XMM0)
1306*e6580ceeSShri Abhyankar 
1307*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM5,XMM7)
1308*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM5,XMM1)
1309*e6580ceeSShri Abhyankar 
1310*e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM6,XMM7)
1311*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM6,XMM2)
1312*e6580ceeSShri Abhyankar 
1313*e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM7,XMM3)
1314*e6580ceeSShri Abhyankar 
1315*e6580ceeSShri Abhyankar         /* Reduce the comparisons to one SSE register */
1316*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM6,XMM7)
1317*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM4)
1318*e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM6)
1319*e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1320*e6580ceeSShri Abhyankar 
1321*e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
1322*e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1323*e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
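      /* (As in the previous routine, nonzero ends up != 0 exactly when some entry of the
         4x4 block at pc is nonzero; comment added for exposition only.) */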
1324*e6580ceeSShri Abhyankar 
1325*e6580ceeSShri Abhyankar       /* If block is nonzero ... */
1326*e6580ceeSShri Abhyankar       if (nonzero) {
1327*e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
1328*e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
1329*e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
1330*e6580ceeSShri Abhyankar 
1331*e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1332*e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1333*e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
1334*e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
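        /* (Same hedged scalar sketch as in the previous routine: each multiplier entry is
           m[4c+r] = sum_k pc_old[4k+r]*pv[4c+k] on the column-major 4x4 blocks, written
           back into pc; comment added for exposition only.) */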
1335*e6580ceeSShri Abhyankar 
1336*e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
1337*e6580ceeSShri Abhyankar           /* Column 0, product is accumulated in XMM4 */
1338*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1339*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
1340*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM0)
1341*e6580ceeSShri Abhyankar 
1342*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1343*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1344*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM1)
1345*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM5)
1346*e6580ceeSShri Abhyankar 
1347*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1348*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1349*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM2)
1350*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM6)
1351*e6580ceeSShri Abhyankar 
1352*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1353*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1354*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1355*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM7)
1356*e6580ceeSShri Abhyankar 
1357*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1358*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1359*e6580ceeSShri Abhyankar 
1360*e6580ceeSShri Abhyankar           /* Column 1, product is accumulated in XMM5 */
1361*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1362*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1363*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
1364*e6580ceeSShri Abhyankar 
1365*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1366*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1367*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
1368*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
1369*e6580ceeSShri Abhyankar 
1370*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1371*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1372*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1373*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM7)
1374*e6580ceeSShri Abhyankar 
1375*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1376*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1377*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
1378*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
1379*e6580ceeSShri Abhyankar 
1380*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1381*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1382*e6580ceeSShri Abhyankar 
1383*e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1384*e6580ceeSShri Abhyankar 
1385*e6580ceeSShri Abhyankar           /* Column 2, product is accumulated in XMM6 */
1386*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1387*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1388*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
1389*e6580ceeSShri Abhyankar 
1390*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1391*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1392*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
1393*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1394*e6580ceeSShri Abhyankar 
1395*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1396*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1397*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1398*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1399*e6580ceeSShri Abhyankar 
1400*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1401*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1402*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1403*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1404*e6580ceeSShri Abhyankar 
1405*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1406*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1407*e6580ceeSShri Abhyankar 
1408*e6580ceeSShri Abhyankar           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1409*e6580ceeSShri Abhyankar           /* Column 3, product is accumulated in XMM0 */
1410*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1411*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1412*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM0,XMM7)
1413*e6580ceeSShri Abhyankar 
1414*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1415*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1416*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM7)
1417*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
1418*e6580ceeSShri Abhyankar 
1419*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1420*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM1,XMM1,0x00)
1421*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM2)
1422*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
1423*e6580ceeSShri Abhyankar 
1424*e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1425*e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1426*e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM3,XMM7)
1427*e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM3)
1428*e6580ceeSShri Abhyankar 
1429*e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1430*e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1431*e6580ceeSShri Abhyankar 
1432*e6580ceeSShri Abhyankar           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1433*e6580ceeSShri Abhyankar           /* This is code to be maintained and read by humans, after all. */
1434*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 3 into XMM3 */
1435*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM3,XMM0)
1436*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 2 into XMM2 */
1437*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM2,XMM6)
1438*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 1 into XMM1 */
1439*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM1,XMM5)
1440*e6580ceeSShri Abhyankar           /* Copy Multiplier Col 0 into XMM0 */
1441*e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM0,XMM4)
1442*e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
1443*e6580ceeSShri Abhyankar 
1444*e6580ceeSShri Abhyankar         /* Update the row: */
1445*e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
1446*e6580ceeSShri Abhyankar         pv += 16;
1447*e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
1448*e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1449*e6580ceeSShri Abhyankar           x = rtmp + 16*((unsigned int)pj[j]);
1450*e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
1451*e6580ceeSShri Abhyankar 
1452*e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1453*e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
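          /* (As in the previous routine, the scalar equivalent is
             x[4c+r] -= sum_k pc[4k+r]*pv[4c+k]; comment added for exposition only.) */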
1454*e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
1455*e6580ceeSShri Abhyankar             /* Load First Column of X */
1456*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1457*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1458*e6580ceeSShri Abhyankar 
1459*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1460*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1461*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1462*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1463*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1464*e6580ceeSShri Abhyankar 
1465*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1466*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1467*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1468*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1469*e6580ceeSShri Abhyankar 
1470*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1471*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1472*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1473*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1474*e6580ceeSShri Abhyankar 
1475*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1476*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1477*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1478*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1479*e6580ceeSShri Abhyankar 
1480*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1481*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1482*e6580ceeSShri Abhyankar 
1483*e6580ceeSShri Abhyankar             /* Second Column */
1484*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1485*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1486*e6580ceeSShri Abhyankar 
1487*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1488*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1489*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1490*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM0)
1491*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1492*e6580ceeSShri Abhyankar 
1493*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1494*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1495*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM1)
1496*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM7)
1497*e6580ceeSShri Abhyankar 
1498*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1499*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1500*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM2)
1501*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM4)
1502*e6580ceeSShri Abhyankar 
1503*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1504*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1505*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM3)
1506*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1507*e6580ceeSShri Abhyankar 
1508*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1509*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1510*e6580ceeSShri Abhyankar 
1511*e6580ceeSShri Abhyankar             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1512*e6580ceeSShri Abhyankar 
1513*e6580ceeSShri Abhyankar             /* Third Column */
1514*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1515*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1516*e6580ceeSShri Abhyankar 
1517*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1518*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1519*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1520*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM0)
1521*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1522*e6580ceeSShri Abhyankar 
1523*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1524*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1525*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM1)
1526*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM4)
1527*e6580ceeSShri Abhyankar 
1528*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1529*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1530*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM2)
1531*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM5)
1532*e6580ceeSShri Abhyankar 
1533*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1534*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1535*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM3)
1536*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1537*e6580ceeSShri Abhyankar 
1538*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1539*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1540*e6580ceeSShri Abhyankar 
1541*e6580ceeSShri Abhyankar             /* Fourth Column */
1542*e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1543*e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1544*e6580ceeSShri Abhyankar 
1545*e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1546*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1547*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1548*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1549*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1550*e6580ceeSShri Abhyankar 
1551*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1552*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1553*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1554*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1555*e6580ceeSShri Abhyankar 
1556*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1557*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1558*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1559*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1560*e6580ceeSShri Abhyankar 
1561*e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1562*e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1563*e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1564*e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1565*e6580ceeSShri Abhyankar 
1566*e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1567*e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1568*e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1569*e6580ceeSShri Abhyankar           pv   += 16;
1570*e6580ceeSShri Abhyankar         }
1571*e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1572*e6580ceeSShri Abhyankar       }
1573*e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
1574*e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
1575*e6580ceeSShri Abhyankar /*        bjtmp++; */
1576*e6580ceeSShri Abhyankar     }
1577*e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1578*e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
1579*e6580ceeSShri Abhyankar     pj = bj + bi[i];
1580*e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
1581*e6580ceeSShri Abhyankar 
1582*e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1583*e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1584*e6580ceeSShri Abhyankar       x  = rtmp+16*((unsigned int)pj[j]);
1585*e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
1586*e6580ceeSShri Abhyankar 
1587*e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
1588*e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1589*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1590*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1591*e6580ceeSShri Abhyankar 
1592*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1593*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1594*e6580ceeSShri Abhyankar 
1595*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1596*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1597*e6580ceeSShri Abhyankar 
1598*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1599*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1600*e6580ceeSShri Abhyankar 
1601*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1602*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1603*e6580ceeSShri Abhyankar 
1604*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1605*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1606*e6580ceeSShri Abhyankar 
1607*e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1608*e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1609*e6580ceeSShri Abhyankar 
1610*e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1611*e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1612*e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1613*e6580ceeSShri Abhyankar       pv += 16;
1614*e6580ceeSShri Abhyankar     }
1615*e6580ceeSShri Abhyankar     /* invert diagonal block */
1616*e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
1617*e6580ceeSShri Abhyankar     if (pivotinblocks) {
1618*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1619*e6580ceeSShri Abhyankar     } else {
1620*e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1621*e6580ceeSShri Abhyankar     }
1622*e6580ceeSShri Abhyankar /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1623*e6580ceeSShri Abhyankar     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1624*e6580ceeSShri Abhyankar   }
1625*e6580ceeSShri Abhyankar 
1626*e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1627*e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1628*e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1629*e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
1630*e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1631*e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
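  /* (As above, 1.3333*64 is presumably the (4/3)*4^3 ~= 85 flop estimate for one 4x4 block inverse; added for exposition only.) */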
1632*e6580ceeSShri Abhyankar   SSE_SCOPE_END;
1633*e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
1634*e6580ceeSShri Abhyankar }
1635*e6580ceeSShri Abhyankar 
1636*e6580ceeSShri Abhyankar #endif
1637