xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision 78bb40077513d5120f4c52e0fb25a84efb280004)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
383287d42SBarry Smith /*
483287d42SBarry Smith     Factorization code for BAIJ format.
583287d42SBarry Smith */
67c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
7c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
883287d42SBarry Smith 
983287d42SBarry Smith /* ------------------------------------------------------------*/
1083287d42SBarry Smith /*
1183287d42SBarry Smith       Version for when blocks are 4 by 4
1283287d42SBarry Smith */
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
150481f469SBarry Smith PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
1683287d42SBarry Smith {
1783287d42SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1883287d42SBarry Smith   IS             isrow = b->row,isicol = b->icol;
196849ba73SBarry Smith   PetscErrorCode ierr;
205d0c19d7SBarry Smith   const PetscInt *r,*ic;
215d0c19d7SBarry Smith   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22690b6cddSBarry Smith   PetscInt       *ajtmpold,*ajtmp,nz,row;
23690b6cddSBarry Smith   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
2483287d42SBarry Smith   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
2583287d42SBarry Smith   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
2683287d42SBarry Smith   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
2783287d42SBarry Smith   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
2883287d42SBarry Smith   MatScalar      m13,m14,m15,m16;
2983287d42SBarry Smith   MatScalar      *ba = b->a,*aa = a->a;
30bcd9e38bSBarry Smith   PetscTruth     pivotinblocks = b->pivotinblocks;
3162bba022SBarry Smith   PetscReal      shift = info->shiftinblocks;
3283287d42SBarry Smith 
3383287d42SBarry Smith   PetscFunctionBegin;
3483287d42SBarry Smith   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3583287d42SBarry Smith   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36b0a32e0cSBarry Smith   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3783287d42SBarry Smith 
3883287d42SBarry Smith   for (i=0; i<n; i++) {
3983287d42SBarry Smith     nz    = bi[i+1] - bi[i];
4083287d42SBarry Smith     ajtmp = bj + bi[i];
4183287d42SBarry Smith     for  (j=0; j<nz; j++) {
4283287d42SBarry Smith       x = rtmp+16*ajtmp[j];
4383287d42SBarry Smith       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
4483287d42SBarry Smith       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
4583287d42SBarry Smith     }
4683287d42SBarry Smith     /* load in initial (unfactored row) */
4783287d42SBarry Smith     idx      = r[i];
4883287d42SBarry Smith     nz       = ai[idx+1] - ai[idx];
4983287d42SBarry Smith     ajtmpold = aj + ai[idx];
5083287d42SBarry Smith     v        = aa + 16*ai[idx];
5183287d42SBarry Smith     for (j=0; j<nz; j++) {
5283287d42SBarry Smith       x    = rtmp+16*ic[ajtmpold[j]];
5383287d42SBarry Smith       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
5483287d42SBarry Smith       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
5583287d42SBarry Smith       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
5683287d42SBarry Smith       x[14] = v[14]; x[15] = v[15];
5783287d42SBarry Smith       v    += 16;
5883287d42SBarry Smith     }
5983287d42SBarry Smith     row = *ajtmp++;
6083287d42SBarry Smith     while (row < i) {
6183287d42SBarry Smith       pc  = rtmp + 16*row;
6283287d42SBarry Smith       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
6383287d42SBarry Smith       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
6483287d42SBarry Smith       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
6583287d42SBarry Smith       p15 = pc[14]; p16 = pc[15];
6683287d42SBarry Smith       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
6783287d42SBarry Smith           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
6883287d42SBarry Smith           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
6983287d42SBarry Smith           || p16 != 0.0) {
7083287d42SBarry Smith         pv = ba + 16*diag_offset[row];
7183287d42SBarry Smith         pj = bj + diag_offset[row] + 1;
7283287d42SBarry Smith         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
7383287d42SBarry Smith         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
7483287d42SBarry Smith         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
7583287d42SBarry Smith         x15 = pv[14]; x16 = pv[15];
7683287d42SBarry Smith         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
7783287d42SBarry Smith         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
7883287d42SBarry Smith         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
7983287d42SBarry Smith         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
8083287d42SBarry Smith 
8183287d42SBarry Smith         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
8283287d42SBarry Smith         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
8383287d42SBarry Smith         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
8483287d42SBarry Smith         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
8583287d42SBarry Smith 
8683287d42SBarry Smith         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
8783287d42SBarry Smith         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
8883287d42SBarry Smith         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
8983287d42SBarry Smith         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
9083287d42SBarry Smith 
9183287d42SBarry Smith         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
9283287d42SBarry Smith         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
9383287d42SBarry Smith         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
9483287d42SBarry Smith         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
9583287d42SBarry Smith 
9683287d42SBarry Smith         nz = bi[row+1] - diag_offset[row] - 1;
9783287d42SBarry Smith         pv += 16;
9883287d42SBarry Smith         for (j=0; j<nz; j++) {
9983287d42SBarry Smith           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
10083287d42SBarry Smith           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
10183287d42SBarry Smith           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
10283287d42SBarry Smith           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
10383287d42SBarry Smith           x    = rtmp + 16*pj[j];
10483287d42SBarry Smith           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
10583287d42SBarry Smith           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
10683287d42SBarry Smith           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
10783287d42SBarry Smith           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
10883287d42SBarry Smith 
10983287d42SBarry Smith           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
11083287d42SBarry Smith           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
11183287d42SBarry Smith           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
11283287d42SBarry Smith           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
11383287d42SBarry Smith 
11483287d42SBarry Smith           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
11583287d42SBarry Smith           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
11683287d42SBarry Smith           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
11783287d42SBarry Smith           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
11883287d42SBarry Smith 
11983287d42SBarry Smith           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
12083287d42SBarry Smith           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
12183287d42SBarry Smith           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
12283287d42SBarry Smith           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
12383287d42SBarry Smith 
12483287d42SBarry Smith           pv   += 16;
12583287d42SBarry Smith         }
126dc0b31edSSatish Balay         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
12783287d42SBarry Smith       }
12883287d42SBarry Smith       row = *ajtmp++;
12983287d42SBarry Smith     }
13083287d42SBarry Smith     /* finished row so stick it into b->a */
13183287d42SBarry Smith     pv = ba + 16*bi[i];
13283287d42SBarry Smith     pj = bj + bi[i];
13383287d42SBarry Smith     nz = bi[i+1] - bi[i];
13483287d42SBarry Smith     for (j=0; j<nz; j++) {
13583287d42SBarry Smith       x      = rtmp+16*pj[j];
13683287d42SBarry Smith       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
13783287d42SBarry Smith       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
13883287d42SBarry Smith       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
13983287d42SBarry Smith       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
14083287d42SBarry Smith       pv   += 16;
14183287d42SBarry Smith     }
14283287d42SBarry Smith     /* invert diagonal block */
14383287d42SBarry Smith     w    = ba + 16*diag_offset[i];
144bcd9e38bSBarry Smith     if (pivotinblocks) {
14562bba022SBarry Smith       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146bcd9e38bSBarry Smith     } else {
147bcd9e38bSBarry Smith       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148bcd9e38bSBarry Smith     }
14983287d42SBarry Smith   }
15083287d42SBarry Smith 
15183287d42SBarry Smith   ierr = PetscFree(rtmp);CHKERRQ(ierr);
15283287d42SBarry Smith   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
15383287d42SBarry Smith   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154db4efbfdSBarry Smith   C->ops->solve          = MatSolve_SeqBAIJ_4;
155db4efbfdSBarry Smith   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
15683287d42SBarry Smith   C->assembled = PETSC_TRUE;
157efee365bSSatish Balay   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
15883287d42SBarry Smith   PetscFunctionReturn(0);
15983287d42SBarry Smith }
160e6580ceeSShri Abhyankar 
161209027a4SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
162209027a4SShri Abhyankar      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
163209027a4SShri Abhyankar        Kernel_A_gets_A_times_B()
164209027a4SShri Abhyankar        Kernel_A_gets_A_minus_B_times_C()
165209027a4SShri Abhyankar        Kernel_A_gets_inverse_A()
166209027a4SShri Abhyankar */
167209027a4SShri Abhyankar #undef __FUNCT__
168209027a4SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
169209027a4SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
170209027a4SShri Abhyankar {
171209027a4SShri Abhyankar   Mat            C=B;
172209027a4SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
173209027a4SShri Abhyankar   IS             isrow = b->row,isicol = b->icol;
174209027a4SShri Abhyankar   PetscErrorCode ierr;
175209027a4SShri Abhyankar   const PetscInt *r,*ic,*ics;
176209027a4SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
177209027a4SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
178209027a4SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
179209027a4SShri Abhyankar   PetscInt       bs2 = a->bs2,flg;
180209027a4SShri Abhyankar   PetscReal      shift = info->shiftinblocks;
181209027a4SShri Abhyankar 
182209027a4SShri Abhyankar   PetscFunctionBegin;
183209027a4SShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
184209027a4SShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
185209027a4SShri Abhyankar 
186209027a4SShri Abhyankar   /* generate work space needed by the factorization */
187209027a4SShri Abhyankar   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
188209027a4SShri Abhyankar   mwork = rtmp + bs2*n;
189209027a4SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
190209027a4SShri Abhyankar   ics  = ic;
191209027a4SShri Abhyankar 
192209027a4SShri Abhyankar   for (i=0; i<n; i++){
193209027a4SShri Abhyankar     /* zero rtmp */
194209027a4SShri Abhyankar     /* L part */
195209027a4SShri Abhyankar     nz    = bi[i+1] - bi[i];
196209027a4SShri Abhyankar     bjtmp = bj + bi[i];
197209027a4SShri Abhyankar     for  (j=0; j<nz; j++){
198209027a4SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
199209027a4SShri Abhyankar     }
200209027a4SShri Abhyankar 
201209027a4SShri Abhyankar     /* U part */
202209027a4SShri Abhyankar     nz = bi[2*n-i+1] - bi[2*n-i];
203209027a4SShri Abhyankar     bjtmp = bj + bi[2*n-i];
204209027a4SShri Abhyankar     for  (j=0; j<nz; j++){
205209027a4SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
206209027a4SShri Abhyankar     }
207209027a4SShri Abhyankar 
208209027a4SShri Abhyankar     /* load in initial (unfactored row) */
209209027a4SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
210209027a4SShri Abhyankar     ajtmp = aj + ai[r[i]];
211209027a4SShri Abhyankar     v     = aa + bs2*ai[r[i]];
212209027a4SShri Abhyankar     for (j=0; j<nz; j++) {
213209027a4SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
214209027a4SShri Abhyankar     }
215209027a4SShri Abhyankar 
216209027a4SShri Abhyankar     /* elimination */
217209027a4SShri Abhyankar     bjtmp = bj + bi[i];
218209027a4SShri Abhyankar     nzL   = bi[i+1] - bi[i];
219b1646270SShri Abhyankar     for(k=0;k < nzL;k++) {
220b1646270SShri Abhyankar       row = bjtmp[k];
221209027a4SShri Abhyankar       pc = rtmp + bs2*row;
222209027a4SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
223209027a4SShri Abhyankar       if (flg) {
224209027a4SShri Abhyankar         pv = b->a + bs2*bdiag[row];
225209027a4SShri Abhyankar         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
226209027a4SShri Abhyankar         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
227209027a4SShri Abhyankar 
228209027a4SShri Abhyankar         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
229209027a4SShri Abhyankar         pv = b->a + bs2*bi[2*n-row];
230209027a4SShri Abhyankar         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
231209027a4SShri Abhyankar         for (j=0; j<nz; j++) {
232209027a4SShri Abhyankar           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
233209027a4SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
234209027a4SShri Abhyankar           v    = rtmp + bs2*pj[j];
235209027a4SShri Abhyankar           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
236209027a4SShri Abhyankar           pv  += bs2;
237209027a4SShri Abhyankar         }
238209027a4SShri Abhyankar         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
239209027a4SShri Abhyankar       }
240209027a4SShri Abhyankar     }
241209027a4SShri Abhyankar 
242209027a4SShri Abhyankar     /* finished row so stick it into b->a */
243209027a4SShri Abhyankar     /* L part */
244209027a4SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
245209027a4SShri Abhyankar     pj   = b->j + bi[i] ;
246209027a4SShri Abhyankar     nz   = bi[i+1] - bi[i];
247209027a4SShri Abhyankar     for (j=0; j<nz; j++) {
248209027a4SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
249209027a4SShri Abhyankar     }
250209027a4SShri Abhyankar 
251209027a4SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
252209027a4SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
253209027a4SShri Abhyankar     pj   = b->j + bdiag[i];
254209027a4SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
255209027a4SShri Abhyankar     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
256209027a4SShri Abhyankar     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
257209027a4SShri Abhyankar 
258209027a4SShri Abhyankar     /* U part */
259209027a4SShri Abhyankar     pv = b->a + bs2*bi[2*n-i];
260209027a4SShri Abhyankar     pj = b->j + bi[2*n-i];
261209027a4SShri Abhyankar     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
262209027a4SShri Abhyankar     for (j=0; j<nz; j++){
263209027a4SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
264209027a4SShri Abhyankar     }
265209027a4SShri Abhyankar   }
266209027a4SShri Abhyankar 
267209027a4SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
268209027a4SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
269209027a4SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
270209027a4SShri Abhyankar 
271209027a4SShri Abhyankar   C->assembled = PETSC_TRUE;
272209027a4SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
273209027a4SShri Abhyankar   PetscFunctionReturn(0);
274209027a4SShri Abhyankar }
275209027a4SShri Abhyankar 
276e6580ceeSShri Abhyankar #undef __FUNCT__
277*78bb4007SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2"
278*78bb4007SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
279*78bb4007SShri Abhyankar {
280*78bb4007SShri Abhyankar   Mat            C=B;
281*78bb4007SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
282*78bb4007SShri Abhyankar   IS             isrow = b->row,isicol = b->icol;
283*78bb4007SShri Abhyankar   PetscErrorCode ierr;
284*78bb4007SShri Abhyankar   const PetscInt *r,*ic,*ics;
285*78bb4007SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
286*78bb4007SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
287*78bb4007SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
288*78bb4007SShri Abhyankar   PetscInt       bs2 = a->bs2,flg;
289*78bb4007SShri Abhyankar   PetscReal      shift = info->shiftinblocks;
290*78bb4007SShri Abhyankar 
291*78bb4007SShri Abhyankar   PetscFunctionBegin;
292*78bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
293*78bb4007SShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
294*78bb4007SShri Abhyankar 
295*78bb4007SShri Abhyankar   /* generate work space needed by the factorization */
296*78bb4007SShri Abhyankar   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
297*78bb4007SShri Abhyankar   mwork = rtmp + bs2*n;
298*78bb4007SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
299*78bb4007SShri Abhyankar   ics  = ic;
300*78bb4007SShri Abhyankar 
301*78bb4007SShri Abhyankar   for (i=0; i<n; i++){
302*78bb4007SShri Abhyankar     /* zero rtmp */
303*78bb4007SShri Abhyankar     /* L part */
304*78bb4007SShri Abhyankar     nz    = bi[i+1] - bi[i];
305*78bb4007SShri Abhyankar     bjtmp = bj + bi[i];
306*78bb4007SShri Abhyankar     for  (j=0; j<nz; j++){
307*78bb4007SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
308*78bb4007SShri Abhyankar     }
309*78bb4007SShri Abhyankar 
310*78bb4007SShri Abhyankar     /* U part */
311*78bb4007SShri Abhyankar     nz = bdiag[i]-bdiag[i+1];
312*78bb4007SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
313*78bb4007SShri Abhyankar     for  (j=0; j<nz; j++){
314*78bb4007SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
315*78bb4007SShri Abhyankar     }
316*78bb4007SShri Abhyankar 
317*78bb4007SShri Abhyankar     /* load in initial (unfactored row) */
318*78bb4007SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
319*78bb4007SShri Abhyankar     ajtmp = aj + ai[r[i]];
320*78bb4007SShri Abhyankar     v     = aa + bs2*ai[r[i]];
321*78bb4007SShri Abhyankar     for (j=0; j<nz; j++) {
322*78bb4007SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
323*78bb4007SShri Abhyankar     }
324*78bb4007SShri Abhyankar 
325*78bb4007SShri Abhyankar     /* elimination */
326*78bb4007SShri Abhyankar     bjtmp = bj + bi[i];
327*78bb4007SShri Abhyankar     nzL   = bi[i+1] - bi[i];
328*78bb4007SShri Abhyankar     for(k=0;k < nzL;k++) {
329*78bb4007SShri Abhyankar       row = bjtmp[k];
330*78bb4007SShri Abhyankar       pc = rtmp + bs2*row;
331*78bb4007SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
332*78bb4007SShri Abhyankar       if (flg) {
333*78bb4007SShri Abhyankar         pv = b->a + bs2*bdiag[row];
334*78bb4007SShri Abhyankar         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
335*78bb4007SShri Abhyankar         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
336*78bb4007SShri Abhyankar 
337*78bb4007SShri Abhyankar         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
338*78bb4007SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
339*78bb4007SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
340*78bb4007SShri Abhyankar         for (j=0; j<nz; j++) {
341*78bb4007SShri Abhyankar           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
342*78bb4007SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
343*78bb4007SShri Abhyankar           v    = rtmp + bs2*pj[j];
344*78bb4007SShri Abhyankar           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
345*78bb4007SShri Abhyankar           pv  += bs2;
346*78bb4007SShri Abhyankar         }
347*78bb4007SShri Abhyankar         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
348*78bb4007SShri Abhyankar       }
349*78bb4007SShri Abhyankar     }
350*78bb4007SShri Abhyankar 
351*78bb4007SShri Abhyankar     /* finished row so stick it into b->a */
352*78bb4007SShri Abhyankar     /* L part */
353*78bb4007SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
354*78bb4007SShri Abhyankar     pj   = b->j + bi[i] ;
355*78bb4007SShri Abhyankar     nz   = bi[i+1] - bi[i];
356*78bb4007SShri Abhyankar     for (j=0; j<nz; j++) {
357*78bb4007SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
358*78bb4007SShri Abhyankar     }
359*78bb4007SShri Abhyankar 
360*78bb4007SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
361*78bb4007SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
362*78bb4007SShri Abhyankar     pj   = b->j + bdiag[i];
363*78bb4007SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
364*78bb4007SShri Abhyankar     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
365*78bb4007SShri Abhyankar     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
366*78bb4007SShri Abhyankar 
367*78bb4007SShri Abhyankar     /* U part */
368*78bb4007SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
369*78bb4007SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
370*78bb4007SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
371*78bb4007SShri Abhyankar     for (j=0; j<nz; j++){
372*78bb4007SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
373*78bb4007SShri Abhyankar     }
374*78bb4007SShri Abhyankar   }
375*78bb4007SShri Abhyankar 
376*78bb4007SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
377*78bb4007SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
378*78bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
379*78bb4007SShri Abhyankar 
380*78bb4007SShri Abhyankar   C->assembled = PETSC_TRUE;
381*78bb4007SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
382*78bb4007SShri Abhyankar   PetscFunctionReturn(0);
383*78bb4007SShri Abhyankar }
384*78bb4007SShri Abhyankar 
385*78bb4007SShri Abhyankar #undef __FUNCT__
386e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
387e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
388e6580ceeSShri Abhyankar {
389e6580ceeSShri Abhyankar /*
390e6580ceeSShri Abhyankar     Default Version for when blocks are 4 by 4 Using natural ordering
391e6580ceeSShri Abhyankar */
392e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
393e6580ceeSShri Abhyankar   PetscErrorCode ierr;
394e6580ceeSShri Abhyankar   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
395e6580ceeSShri Abhyankar   PetscInt       *ajtmpold,*ajtmp,nz,row;
396e6580ceeSShri Abhyankar   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
397e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
398e6580ceeSShri Abhyankar   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
399e6580ceeSShri Abhyankar   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
400e6580ceeSShri Abhyankar   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
401e6580ceeSShri Abhyankar   MatScalar      m13,m14,m15,m16;
402e6580ceeSShri Abhyankar   MatScalar      *ba = b->a,*aa = a->a;
403e6580ceeSShri Abhyankar   PetscTruth     pivotinblocks = b->pivotinblocks;
404e6580ceeSShri Abhyankar   PetscReal      shift = info->shiftinblocks;
405e6580ceeSShri Abhyankar 
406e6580ceeSShri Abhyankar   PetscFunctionBegin;
407e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
408e6580ceeSShri Abhyankar 
409e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
410e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
411e6580ceeSShri Abhyankar     ajtmp = bj + bi[i];
412e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
413e6580ceeSShri Abhyankar       x = rtmp+16*ajtmp[j];
414e6580ceeSShri Abhyankar       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
415e6580ceeSShri Abhyankar       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
416e6580ceeSShri Abhyankar     }
417e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
418e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
419e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
420e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
421e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
422e6580ceeSShri Abhyankar       x    = rtmp+16*ajtmpold[j];
423e6580ceeSShri Abhyankar       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
424e6580ceeSShri Abhyankar       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
425e6580ceeSShri Abhyankar       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
426e6580ceeSShri Abhyankar       x[14] = v[14]; x[15] = v[15];
427e6580ceeSShri Abhyankar       v    += 16;
428e6580ceeSShri Abhyankar     }
429e6580ceeSShri Abhyankar     row = *ajtmp++;
430e6580ceeSShri Abhyankar     while (row < i) {
431e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
432e6580ceeSShri Abhyankar       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
433e6580ceeSShri Abhyankar       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
434e6580ceeSShri Abhyankar       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
435e6580ceeSShri Abhyankar       p15 = pc[14]; p16 = pc[15];
436e6580ceeSShri Abhyankar       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
437e6580ceeSShri Abhyankar           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
438e6580ceeSShri Abhyankar           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
439e6580ceeSShri Abhyankar           || p16 != 0.0) {
440e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
441e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
442e6580ceeSShri Abhyankar         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
443e6580ceeSShri Abhyankar         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
444e6580ceeSShri Abhyankar         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
445e6580ceeSShri Abhyankar         x15 = pv[14]; x16 = pv[15];
446e6580ceeSShri Abhyankar         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
447e6580ceeSShri Abhyankar         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
448e6580ceeSShri Abhyankar         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
449e6580ceeSShri Abhyankar         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
450e6580ceeSShri Abhyankar 
451e6580ceeSShri Abhyankar         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
452e6580ceeSShri Abhyankar         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
453e6580ceeSShri Abhyankar         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
454e6580ceeSShri Abhyankar         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
455e6580ceeSShri Abhyankar 
456e6580ceeSShri Abhyankar         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
457e6580ceeSShri Abhyankar         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
458e6580ceeSShri Abhyankar         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
459e6580ceeSShri Abhyankar         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
460e6580ceeSShri Abhyankar 
461e6580ceeSShri Abhyankar         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
462e6580ceeSShri Abhyankar         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
463e6580ceeSShri Abhyankar         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
464e6580ceeSShri Abhyankar         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
465e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
466e6580ceeSShri Abhyankar         pv += 16;
467e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
468e6580ceeSShri Abhyankar           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
469e6580ceeSShri Abhyankar           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
470e6580ceeSShri Abhyankar           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
471e6580ceeSShri Abhyankar           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
472e6580ceeSShri Abhyankar           x    = rtmp + 16*pj[j];
473e6580ceeSShri Abhyankar           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
474e6580ceeSShri Abhyankar           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
475e6580ceeSShri Abhyankar           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
476e6580ceeSShri Abhyankar           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
477e6580ceeSShri Abhyankar 
478e6580ceeSShri Abhyankar           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
479e6580ceeSShri Abhyankar           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
480e6580ceeSShri Abhyankar           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
481e6580ceeSShri Abhyankar           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
482e6580ceeSShri Abhyankar 
483e6580ceeSShri Abhyankar           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
484e6580ceeSShri Abhyankar           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
485e6580ceeSShri Abhyankar           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
486e6580ceeSShri Abhyankar           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
487e6580ceeSShri Abhyankar 
488e6580ceeSShri Abhyankar           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
489e6580ceeSShri Abhyankar           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
490e6580ceeSShri Abhyankar           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
491e6580ceeSShri Abhyankar           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
492e6580ceeSShri Abhyankar 
493e6580ceeSShri Abhyankar           pv   += 16;
494e6580ceeSShri Abhyankar         }
495e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
496e6580ceeSShri Abhyankar       }
497e6580ceeSShri Abhyankar       row = *ajtmp++;
498e6580ceeSShri Abhyankar     }
499e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
500e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
501e6580ceeSShri Abhyankar     pj = bj + bi[i];
502e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
503e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
504e6580ceeSShri Abhyankar       x      = rtmp+16*pj[j];
505e6580ceeSShri Abhyankar       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
506e6580ceeSShri Abhyankar       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
507e6580ceeSShri Abhyankar       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
508e6580ceeSShri Abhyankar       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
509e6580ceeSShri Abhyankar       pv   += 16;
510e6580ceeSShri Abhyankar     }
511e6580ceeSShri Abhyankar     /* invert diagonal block */
512e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
513e6580ceeSShri Abhyankar     if (pivotinblocks) {
514e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
515e6580ceeSShri Abhyankar     } else {
516e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
517e6580ceeSShri Abhyankar     }
518e6580ceeSShri Abhyankar   }
519e6580ceeSShri Abhyankar 
520e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
521e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
522e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
523e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
524e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
525e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
526e6580ceeSShri Abhyankar }
527209027a4SShri Abhyankar /*
528209027a4SShri Abhyankar   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
529209027a4SShri Abhyankar     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
530209027a4SShri Abhyankar */
531209027a4SShri Abhyankar #undef __FUNCT__
532209027a4SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
533209027a4SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
534209027a4SShri Abhyankar {
535209027a4SShri Abhyankar   Mat            C=B;
536209027a4SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
537209027a4SShri Abhyankar   PetscErrorCode ierr;
538209027a4SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
539209027a4SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
540209027a4SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
541209027a4SShri Abhyankar   PetscInt       bs2 = a->bs2,flg;
542209027a4SShri Abhyankar   PetscReal      shift = info->shiftinblocks;
543e6580ceeSShri Abhyankar 
544209027a4SShri Abhyankar   PetscFunctionBegin;
545209027a4SShri Abhyankar   /* generate work space needed by the factorization */
546209027a4SShri Abhyankar   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
547209027a4SShri Abhyankar   mwork = rtmp + bs2*n;
548209027a4SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
549209027a4SShri Abhyankar 
550209027a4SShri Abhyankar   for (i=0; i<n; i++){
551209027a4SShri Abhyankar     /* zero rtmp */
552209027a4SShri Abhyankar     /* L part */
553209027a4SShri Abhyankar     nz    = bi[i+1] - bi[i];
554209027a4SShri Abhyankar     bjtmp = bj + bi[i];
555209027a4SShri Abhyankar     for  (j=0; j<nz; j++){
556209027a4SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
557209027a4SShri Abhyankar     }
558209027a4SShri Abhyankar 
559209027a4SShri Abhyankar     /* U part */
560209027a4SShri Abhyankar     nz = bi[2*n-i+1] - bi[2*n-i];
561209027a4SShri Abhyankar     bjtmp = bj + bi[2*n-i];
562209027a4SShri Abhyankar     for  (j=0; j<nz; j++){
563209027a4SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
564209027a4SShri Abhyankar     }
565209027a4SShri Abhyankar 
566209027a4SShri Abhyankar     /* load in initial (unfactored row) */
567209027a4SShri Abhyankar     nz    = ai[i+1] - ai[i];
568209027a4SShri Abhyankar     ajtmp = aj + ai[i];
569209027a4SShri Abhyankar     v     = aa + bs2*ai[i];
570209027a4SShri Abhyankar     for (j=0; j<nz; j++) {
571209027a4SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
572209027a4SShri Abhyankar     }
573209027a4SShri Abhyankar 
574209027a4SShri Abhyankar     /* elimination */
575209027a4SShri Abhyankar     bjtmp = bj + bi[i];
576209027a4SShri Abhyankar     nzL   = bi[i+1] - bi[i];
577b1646270SShri Abhyankar     for(k=0;k < nzL;k++) {
578b1646270SShri Abhyankar       row = bjtmp[k];
579209027a4SShri Abhyankar       pc = rtmp + bs2*row;
580209027a4SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
581209027a4SShri Abhyankar       if (flg) {
582209027a4SShri Abhyankar         pv = b->a + bs2*bdiag[row];
583209027a4SShri Abhyankar         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
584209027a4SShri Abhyankar         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
585209027a4SShri Abhyankar 
586209027a4SShri Abhyankar         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
587209027a4SShri Abhyankar         pv = b->a + bs2*bi[2*n-row];
588209027a4SShri Abhyankar         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
589209027a4SShri Abhyankar         for (j=0; j<nz; j++) {
590209027a4SShri Abhyankar           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
591209027a4SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
592209027a4SShri Abhyankar           v    = rtmp + bs2*pj[j];
593209027a4SShri Abhyankar           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
594209027a4SShri Abhyankar           pv  += bs2;
595209027a4SShri Abhyankar         }
596209027a4SShri Abhyankar         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
597209027a4SShri Abhyankar       }
598209027a4SShri Abhyankar     }
599209027a4SShri Abhyankar 
600209027a4SShri Abhyankar     /* finished row so stick it into b->a */
601209027a4SShri Abhyankar     /* L part */
602209027a4SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
603209027a4SShri Abhyankar     pj   = b->j + bi[i] ;
604209027a4SShri Abhyankar     nz   = bi[i+1] - bi[i];
605209027a4SShri Abhyankar     for (j=0; j<nz; j++) {
606209027a4SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
607209027a4SShri Abhyankar     }
608209027a4SShri Abhyankar 
609209027a4SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
610209027a4SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
611209027a4SShri Abhyankar     pj   = b->j + bdiag[i];
612209027a4SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
613209027a4SShri Abhyankar     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
614209027a4SShri Abhyankar     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
615209027a4SShri Abhyankar 
616209027a4SShri Abhyankar     /* U part */
617209027a4SShri Abhyankar     pv = b->a + bs2*bi[2*n-i];
618209027a4SShri Abhyankar     pj = b->j + bi[2*n-i];
619209027a4SShri Abhyankar     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
620209027a4SShri Abhyankar     for (j=0; j<nz; j++){
621209027a4SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
622209027a4SShri Abhyankar     }
623209027a4SShri Abhyankar   }
624209027a4SShri Abhyankar 
625209027a4SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
626209027a4SShri Abhyankar   C->assembled = PETSC_TRUE;
627209027a4SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
628209027a4SShri Abhyankar   PetscFunctionReturn(0);
629209027a4SShri Abhyankar }
630e6580ceeSShri Abhyankar 
631b2b2dd24SShri Abhyankar #undef __FUNCT__
632b2b2dd24SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
633b2b2dd24SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
634b2b2dd24SShri Abhyankar {
635b2b2dd24SShri Abhyankar   Mat            C=B;
636b2b2dd24SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
637b2b2dd24SShri Abhyankar   PetscErrorCode ierr;
638b2b2dd24SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
639b2b2dd24SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
640b2b2dd24SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
641b2b2dd24SShri Abhyankar   PetscInt       bs2 = a->bs2,flg;
642b2b2dd24SShri Abhyankar   PetscReal      shift = info->shiftinblocks;
643b2b2dd24SShri Abhyankar 
644b2b2dd24SShri Abhyankar   PetscFunctionBegin;
645b2b2dd24SShri Abhyankar   /* generate work space needed by the factorization */
646b2b2dd24SShri Abhyankar   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
647b2b2dd24SShri Abhyankar   mwork = rtmp + bs2*n;
648b2b2dd24SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
649b2b2dd24SShri Abhyankar 
650b2b2dd24SShri Abhyankar   for (i=0; i<n; i++){
651b2b2dd24SShri Abhyankar     /* zero rtmp */
652b2b2dd24SShri Abhyankar     /* L part */
653b2b2dd24SShri Abhyankar     nz    = bi[i+1] - bi[i];
654b2b2dd24SShri Abhyankar     bjtmp = bj + bi[i];
655b2b2dd24SShri Abhyankar     for  (j=0; j<nz; j++){
656b2b2dd24SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
657b2b2dd24SShri Abhyankar     }
658b2b2dd24SShri Abhyankar 
659b2b2dd24SShri Abhyankar     /* U part */
660b2b2dd24SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
661b2b2dd24SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
662b2b2dd24SShri Abhyankar     for  (j=0; j<nz; j++){
663b2b2dd24SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
664b2b2dd24SShri Abhyankar     }
665b2b2dd24SShri Abhyankar 
666b2b2dd24SShri Abhyankar     /* load in initial (unfactored row) */
667b2b2dd24SShri Abhyankar     nz    = ai[i+1] - ai[i];
668b2b2dd24SShri Abhyankar     ajtmp = aj + ai[i];
669b2b2dd24SShri Abhyankar     v     = aa + bs2*ai[i];
670b2b2dd24SShri Abhyankar     for (j=0; j<nz; j++) {
671b2b2dd24SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
672b2b2dd24SShri Abhyankar     }
673b2b2dd24SShri Abhyankar 
674b2b2dd24SShri Abhyankar     /* elimination */
675b2b2dd24SShri Abhyankar     bjtmp = bj + bi[i];
676b2b2dd24SShri Abhyankar     nzL   = bi[i+1] - bi[i];
677b2b2dd24SShri Abhyankar     for(k=0;k < nzL;k++) {
678b2b2dd24SShri Abhyankar       row = bjtmp[k];
679b2b2dd24SShri Abhyankar       pc = rtmp + bs2*row;
680b2b2dd24SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
681b2b2dd24SShri Abhyankar       if (flg) {
682b2b2dd24SShri Abhyankar         pv = b->a + bs2*bdiag[row];
683b2b2dd24SShri Abhyankar         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
684b2b2dd24SShri Abhyankar         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
685b2b2dd24SShri Abhyankar 
686b2b2dd24SShri Abhyankar         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
687b2b2dd24SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
688b2b2dd24SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
689b2b2dd24SShri Abhyankar         for (j=0; j<nz; j++) {
690b2b2dd24SShri Abhyankar           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
691b2b2dd24SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
692b2b2dd24SShri Abhyankar           v    = rtmp + bs2*pj[j];
693b2b2dd24SShri Abhyankar           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
694b2b2dd24SShri Abhyankar           pv  += bs2;
695b2b2dd24SShri Abhyankar         }
696b2b2dd24SShri Abhyankar         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
697b2b2dd24SShri Abhyankar       }
698b2b2dd24SShri Abhyankar     }
699b2b2dd24SShri Abhyankar 
700b2b2dd24SShri Abhyankar     /* finished row so stick it into b->a */
701b2b2dd24SShri Abhyankar     /* L part */
702b2b2dd24SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
703b2b2dd24SShri Abhyankar     pj   = b->j + bi[i] ;
704b2b2dd24SShri Abhyankar     nz   = bi[i+1] - bi[i];
705b2b2dd24SShri Abhyankar     for (j=0; j<nz; j++) {
706b2b2dd24SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
707b2b2dd24SShri Abhyankar     }
708b2b2dd24SShri Abhyankar 
709b2b2dd24SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
710b2b2dd24SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
711b2b2dd24SShri Abhyankar     pj   = b->j + bdiag[i];
712b2b2dd24SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
713b2b2dd24SShri Abhyankar     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
714b2b2dd24SShri Abhyankar     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
715b2b2dd24SShri Abhyankar 
716b2b2dd24SShri Abhyankar     /* U part */
717b2b2dd24SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
718b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
719b2b2dd24SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
720b2b2dd24SShri Abhyankar     for (j=0; j<nz; j++){
721b2b2dd24SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
722b2b2dd24SShri Abhyankar     }
723b2b2dd24SShri Abhyankar   }
724b2b2dd24SShri Abhyankar 
725b2b2dd24SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
726b2b2dd24SShri Abhyankar   C->assembled = PETSC_TRUE;
727b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
728b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
729b2b2dd24SShri Abhyankar }
730b2b2dd24SShri Abhyankar 
731e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE)
732e6580ceeSShri Abhyankar 
733e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE
734e6580ceeSShri Abhyankar 
735e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */
736e6580ceeSShri Abhyankar #undef __FUNCT__
737e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
738e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
739e6580ceeSShri Abhyankar {
740e6580ceeSShri Abhyankar   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
741e6580ceeSShri Abhyankar   PetscErrorCode ierr;
742e6580ceeSShri Abhyankar   int i,j,n = a->mbs;
743e6580ceeSShri Abhyankar   int         *bj = b->j,*bjtmp,*pj;
744e6580ceeSShri Abhyankar   int         row;
745e6580ceeSShri Abhyankar   int         *ajtmpold,nz,*bi=b->i;
746e6580ceeSShri Abhyankar   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
747e6580ceeSShri Abhyankar   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
748e6580ceeSShri Abhyankar   MatScalar   *ba = b->a,*aa = a->a;
749e6580ceeSShri Abhyankar   int         nonzero=0;
750e6580ceeSShri Abhyankar /*    int            nonzero=0,colscale = 16; */
751e6580ceeSShri Abhyankar   PetscTruth  pivotinblocks = b->pivotinblocks;
752e6580ceeSShri Abhyankar   PetscReal      shift = info->shiftinblocks;
753e6580ceeSShri Abhyankar 
754e6580ceeSShri Abhyankar   PetscFunctionBegin;
755e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
756e6580ceeSShri Abhyankar 
757e6580ceeSShri Abhyankar   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
758e6580ceeSShri Abhyankar   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
759e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
760e6580ceeSShri Abhyankar   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
761e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
762e6580ceeSShri Abhyankar /*      colscale = 4; */
763e6580ceeSShri Abhyankar /*    } */
764e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
765e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
766e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
767e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
768e6580ceeSShri Abhyankar     /* zero out one register */
769e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
770e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
771e6580ceeSShri Abhyankar       x = rtmp+16*bjtmp[j];
772e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
773e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
774e6580ceeSShri Abhyankar         /* Copy zero register to memory locations */
775e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
776e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
777e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
778e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
779e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
780e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
781e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
782e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
783e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
784e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
785e6580ceeSShri Abhyankar     }
786e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
787e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
788e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
789e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
790e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
791e6580ceeSShri Abhyankar       x = rtmp+16*ajtmpold[j];
792e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmpold[j]; */
793e6580ceeSShri Abhyankar       /* Copy v block into x block */
794e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
795e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
796e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
797e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
798e6580ceeSShri Abhyankar 
799e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
800e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
801e6580ceeSShri Abhyankar 
802e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
803e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
804e6580ceeSShri Abhyankar 
805e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
806e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
807e6580ceeSShri Abhyankar 
808e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
809e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
810e6580ceeSShri Abhyankar 
811e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
812e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
813e6580ceeSShri Abhyankar 
814e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
815e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
816e6580ceeSShri Abhyankar 
817e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
818e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
819e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
820e6580ceeSShri Abhyankar 
821e6580ceeSShri Abhyankar       v += 16;
822e6580ceeSShri Abhyankar     }
823e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
824e6580ceeSShri Abhyankar     row = *bjtmp++;
825e6580ceeSShri Abhyankar     while (row < i) {
826e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
827e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
828e6580ceeSShri Abhyankar         /* Load block from lower triangle */
829e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
830e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
831e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
832e6580ceeSShri Abhyankar 
833e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
834e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
835e6580ceeSShri Abhyankar 
836e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
837e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
838e6580ceeSShri Abhyankar 
839e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
840e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
841e6580ceeSShri Abhyankar 
842e6580ceeSShri Abhyankar         /* Compare block to zero block */
843e6580ceeSShri Abhyankar 
844e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM4,XMM7)
845e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM4,XMM0)
846e6580ceeSShri Abhyankar 
847e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM5,XMM7)
848e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM5,XMM1)
849e6580ceeSShri Abhyankar 
850e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM6,XMM7)
851e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM6,XMM2)
852e6580ceeSShri Abhyankar 
853e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM7,XMM3)
854e6580ceeSShri Abhyankar 
855e6580ceeSShri Abhyankar         /* Reduce the comparisons to one SSE register */
856e6580ceeSShri Abhyankar         SSE_OR_PS(XMM6,XMM7)
857e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM4)
858e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM6)
859e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
860e6580ceeSShri Abhyankar 
861e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
862e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
863e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
864e6580ceeSShri Abhyankar 
865e6580ceeSShri Abhyankar       /* If block is nonzero ... */
866e6580ceeSShri Abhyankar       if (nonzero) {
867e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
868e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
869e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
870e6580ceeSShri Abhyankar 
871e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
872e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
873e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
874e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
875e6580ceeSShri Abhyankar 
876e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
877e6580ceeSShri Abhyankar           /* Column 0, product is accumulated in XMM4 */
878e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
879e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
880e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM0)
881e6580ceeSShri Abhyankar 
882e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
883e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
884e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM1)
885e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM5)
886e6580ceeSShri Abhyankar 
887e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
888e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
889e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM2)
890e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM6)
891e6580ceeSShri Abhyankar 
892e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
893e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
894e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
895e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM7)
896e6580ceeSShri Abhyankar 
897e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
898e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
899e6580ceeSShri Abhyankar 
900e6580ceeSShri Abhyankar           /* Column 1, product is accumulated in XMM5 */
901e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
902e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
903e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
904e6580ceeSShri Abhyankar 
905e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
906e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
907e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
908e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
909e6580ceeSShri Abhyankar 
910e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
911e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
912e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
913e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM7)
914e6580ceeSShri Abhyankar 
915e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
916e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
917e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
918e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
919e6580ceeSShri Abhyankar 
920e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
921e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
922e6580ceeSShri Abhyankar 
923e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
924e6580ceeSShri Abhyankar 
925e6580ceeSShri Abhyankar           /* Column 2, product is accumulated in XMM6 */
926e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
927e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
928e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
929e6580ceeSShri Abhyankar 
930e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
931e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
932e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
933e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
934e6580ceeSShri Abhyankar 
935e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
936e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
937e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
938e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
939e6580ceeSShri Abhyankar 
940e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
941e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
942e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
943e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
944e6580ceeSShri Abhyankar 
945e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
946e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
947e6580ceeSShri Abhyankar 
948e6580ceeSShri Abhyankar           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
949e6580ceeSShri Abhyankar           /* Column 3, product is accumulated in XMM0 */
950e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
951e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
952e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM0,XMM7)
953e6580ceeSShri Abhyankar 
954e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
955e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
956e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM7)
957e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
958e6580ceeSShri Abhyankar 
959e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
960e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM1,XMM1,0x00)
961e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM2)
962e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
963e6580ceeSShri Abhyankar 
964e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
965e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
966e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM3,XMM7)
967e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM3)
968e6580ceeSShri Abhyankar 
969e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
970e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
971e6580ceeSShri Abhyankar 
972e6580ceeSShri Abhyankar           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
973e6580ceeSShri Abhyankar           /* This is code to be maintained and read by humans afterall. */
974e6580ceeSShri Abhyankar           /* Copy Multiplier Col 3 into XMM3 */
975e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM3,XMM0)
976e6580ceeSShri Abhyankar           /* Copy Multiplier Col 2 into XMM2 */
977e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM2,XMM6)
978e6580ceeSShri Abhyankar           /* Copy Multiplier Col 1 into XMM1 */
979e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM1,XMM5)
980e6580ceeSShri Abhyankar           /* Copy Multiplier Col 0 into XMM0 */
981e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM0,XMM4)
982e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
983e6580ceeSShri Abhyankar 
984e6580ceeSShri Abhyankar         /* Update the row: */
985e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
986e6580ceeSShri Abhyankar         pv += 16;
987e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
988e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
989e6580ceeSShri Abhyankar           x = rtmp + 16*pj[j];
990e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
991e6580ceeSShri Abhyankar 
992e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
993e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
994e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
995e6580ceeSShri Abhyankar             /* Load First Column of X*/
996e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
997e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
998e6580ceeSShri Abhyankar 
999e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1000e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1001e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1002e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1003e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1004e6580ceeSShri Abhyankar 
1005e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1006e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1007e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1008e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1009e6580ceeSShri Abhyankar 
1010e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1011e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1012e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1013e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1014e6580ceeSShri Abhyankar 
1015e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1016e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1017e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1018e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1019e6580ceeSShri Abhyankar 
1020e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1021e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1022e6580ceeSShri Abhyankar 
1023e6580ceeSShri Abhyankar             /* Second Column */
1024e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1025e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1026e6580ceeSShri Abhyankar 
1027e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1028e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1029e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1030e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM0)
1031e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1032e6580ceeSShri Abhyankar 
1033e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1034e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1035e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM1)
1036e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM7)
1037e6580ceeSShri Abhyankar 
1038e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1039e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1040e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM2)
1041e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM4)
1042e6580ceeSShri Abhyankar 
1043e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1044e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1045e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM3)
1046e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1047e6580ceeSShri Abhyankar 
1048e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1049e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1050e6580ceeSShri Abhyankar 
1051e6580ceeSShri Abhyankar             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1052e6580ceeSShri Abhyankar 
1053e6580ceeSShri Abhyankar             /* Third Column */
1054e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1055e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1056e6580ceeSShri Abhyankar 
1057e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1058e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1059e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1060e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM0)
1061e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1062e6580ceeSShri Abhyankar 
1063e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1064e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1065e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM1)
1066e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM4)
1067e6580ceeSShri Abhyankar 
1068e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1069e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1070e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM2)
1071e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM5)
1072e6580ceeSShri Abhyankar 
1073e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1074e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1075e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM3)
1076e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1077e6580ceeSShri Abhyankar 
1078e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1079e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1080e6580ceeSShri Abhyankar 
1081e6580ceeSShri Abhyankar             /* Fourth Column */
1082e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1083e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1084e6580ceeSShri Abhyankar 
1085e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1086e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1087e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1088e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1089e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1090e6580ceeSShri Abhyankar 
1091e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1092e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1093e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1094e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1095e6580ceeSShri Abhyankar 
1096e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1097e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1098e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1099e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1100e6580ceeSShri Abhyankar 
1101e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1102e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1103e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1104e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1105e6580ceeSShri Abhyankar 
1106e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1107e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1108e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1109e6580ceeSShri Abhyankar           pv   += 16;
1110e6580ceeSShri Abhyankar         }
1111e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1112e6580ceeSShri Abhyankar       }
1113e6580ceeSShri Abhyankar       row = *bjtmp++;
1114e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
1115e6580ceeSShri Abhyankar     }
1116e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1117e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
1118e6580ceeSShri Abhyankar     pj = bj + bi[i];
1119e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
1120e6580ceeSShri Abhyankar 
1121e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1122e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1123e6580ceeSShri Abhyankar       x  = rtmp+16*pj[j];
1124e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
1125e6580ceeSShri Abhyankar 
1126e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
1127e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1128e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1129e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1130e6580ceeSShri Abhyankar 
1131e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1132e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1133e6580ceeSShri Abhyankar 
1134e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1135e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1136e6580ceeSShri Abhyankar 
1137e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1138e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1139e6580ceeSShri Abhyankar 
1140e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1141e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1142e6580ceeSShri Abhyankar 
1143e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1144e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1145e6580ceeSShri Abhyankar 
1146e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1147e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1148e6580ceeSShri Abhyankar 
1149e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1150e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1151e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1152e6580ceeSShri Abhyankar       pv += 16;
1153e6580ceeSShri Abhyankar     }
1154e6580ceeSShri Abhyankar     /* invert diagonal block */
1155e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
1156e6580ceeSShri Abhyankar     if (pivotinblocks) {
1157e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1158e6580ceeSShri Abhyankar     } else {
1159e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1160e6580ceeSShri Abhyankar     }
1161e6580ceeSShri Abhyankar /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1162e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1163e6580ceeSShri Abhyankar   }
1164e6580ceeSShri Abhyankar 
1165e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1166e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1167e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1168e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
1169e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1170e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1171e6580ceeSShri Abhyankar   SSE_SCOPE_END;
1172e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
1173e6580ceeSShri Abhyankar }
1174e6580ceeSShri Abhyankar 
1175e6580ceeSShri Abhyankar #undef __FUNCT__
1176e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
1177e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1178e6580ceeSShri Abhyankar {
1179e6580ceeSShri Abhyankar   Mat            A=C;
1180e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1181e6580ceeSShri Abhyankar   PetscErrorCode ierr;
1182e6580ceeSShri Abhyankar   int i,j,n = a->mbs;
1183e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1184e6580ceeSShri Abhyankar   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
1185e6580ceeSShri Abhyankar   unsigned int   row;
1186e6580ceeSShri Abhyankar   int            nz,*bi=b->i;
1187e6580ceeSShri Abhyankar   int            *diag_offset = b->diag,*ai=a->i;
1188e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1189e6580ceeSShri Abhyankar   MatScalar      *ba = b->a,*aa = a->a;
1190e6580ceeSShri Abhyankar   int            nonzero=0;
1191e6580ceeSShri Abhyankar /*    int            nonzero=0,colscale = 16; */
1192e6580ceeSShri Abhyankar   PetscTruth     pivotinblocks = b->pivotinblocks;
1193e6580ceeSShri Abhyankar   PetscReal      shift = info->shiftinblocks;
1194e6580ceeSShri Abhyankar 
1195e6580ceeSShri Abhyankar   PetscFunctionBegin;
1196e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
1197e6580ceeSShri Abhyankar 
1198e6580ceeSShri Abhyankar   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1199e6580ceeSShri Abhyankar   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1200e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1201e6580ceeSShri Abhyankar   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1202e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
1203e6580ceeSShri Abhyankar /*      colscale = 4; */
1204e6580ceeSShri Abhyankar /*    } */
1205e6580ceeSShri Abhyankar 
1206e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
1207e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
1208e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
1209e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
1210e6580ceeSShri Abhyankar     /* zero out one register */
1211e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
1212e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
1213e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)bjtmp[j]);
1214e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
1215e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
1216e6580ceeSShri Abhyankar         /* Copy zero register to memory locations */
1217e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1218e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1219e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1220e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1221e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1222e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1223e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1224e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1225e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1226e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1227e6580ceeSShri Abhyankar     }
1228e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
1229e6580ceeSShri Abhyankar     nz    = ai[i+1] - ai[i];
1230e6580ceeSShri Abhyankar     ajtmp = aj + ai[i];
1231e6580ceeSShri Abhyankar     v     = aa + 16*ai[i];
1232e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1233e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)ajtmp[j]);
1234e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmp[j]; */
1235e6580ceeSShri Abhyankar       /* Copy v block into x block */
1236e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
1237e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1238e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1239e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1240e6580ceeSShri Abhyankar 
1241e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1242e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1243e6580ceeSShri Abhyankar 
1244e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1245e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1246e6580ceeSShri Abhyankar 
1247e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1248e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1249e6580ceeSShri Abhyankar 
1250e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1251e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1252e6580ceeSShri Abhyankar 
1253e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1254e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1255e6580ceeSShri Abhyankar 
1256e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1257e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1258e6580ceeSShri Abhyankar 
1259e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1260e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1261e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1262e6580ceeSShri Abhyankar 
1263e6580ceeSShri Abhyankar       v += 16;
1264e6580ceeSShri Abhyankar     }
1265e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
1266e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
1267e6580ceeSShri Abhyankar     while (row < i) {
1268e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
1269e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
1270e6580ceeSShri Abhyankar         /* Load block from lower triangle */
1271e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1272e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1273e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1274e6580ceeSShri Abhyankar 
1275e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1276e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1277e6580ceeSShri Abhyankar 
1278e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1279e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1280e6580ceeSShri Abhyankar 
1281e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1282e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1283e6580ceeSShri Abhyankar 
1284e6580ceeSShri Abhyankar         /* Compare block to zero block */
1285e6580ceeSShri Abhyankar 
1286e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM4,XMM7)
1287e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM4,XMM0)
1288e6580ceeSShri Abhyankar 
1289e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM5,XMM7)
1290e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM5,XMM1)
1291e6580ceeSShri Abhyankar 
1292e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM6,XMM7)
1293e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM6,XMM2)
1294e6580ceeSShri Abhyankar 
1295e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM7,XMM3)
1296e6580ceeSShri Abhyankar 
1297e6580ceeSShri Abhyankar         /* Reduce the comparisons to one SSE register */
1298e6580ceeSShri Abhyankar         SSE_OR_PS(XMM6,XMM7)
1299e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM4)
1300e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM6)
1301e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1302e6580ceeSShri Abhyankar 
1303e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
1304e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1305e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
1306e6580ceeSShri Abhyankar 
1307e6580ceeSShri Abhyankar       /* If block is nonzero ... */
1308e6580ceeSShri Abhyankar       if (nonzero) {
1309e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
1310e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
1311e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
1312e6580ceeSShri Abhyankar 
1313e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1314e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1315e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
1316e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1317e6580ceeSShri Abhyankar 
1318e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
1319e6580ceeSShri Abhyankar           /* Column 0, product is accumulated in XMM4 */
1320e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1321e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
1322e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM0)
1323e6580ceeSShri Abhyankar 
1324e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1325e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1326e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM1)
1327e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM5)
1328e6580ceeSShri Abhyankar 
1329e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1330e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1331e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM2)
1332e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM6)
1333e6580ceeSShri Abhyankar 
1334e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1335e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1336e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1337e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM7)
1338e6580ceeSShri Abhyankar 
1339e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1340e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1341e6580ceeSShri Abhyankar 
1342e6580ceeSShri Abhyankar           /* Column 1, product is accumulated in XMM5 */
1343e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1344e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1345e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
1346e6580ceeSShri Abhyankar 
1347e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1348e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1349e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
1350e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
1351e6580ceeSShri Abhyankar 
1352e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1353e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1354e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1355e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM7)
1356e6580ceeSShri Abhyankar 
1357e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1358e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1359e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
1360e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
1361e6580ceeSShri Abhyankar 
1362e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1363e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1364e6580ceeSShri Abhyankar 
1365e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1366e6580ceeSShri Abhyankar 
1367e6580ceeSShri Abhyankar           /* Column 2, product is accumulated in XMM6 */
1368e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1369e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1370e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
1371e6580ceeSShri Abhyankar 
1372e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1373e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1374e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
1375e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1376e6580ceeSShri Abhyankar 
1377e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1378e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1379e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1380e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1381e6580ceeSShri Abhyankar 
1382e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1383e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1384e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1385e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1386e6580ceeSShri Abhyankar 
1387e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1388e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1389e6580ceeSShri Abhyankar 
1390e6580ceeSShri Abhyankar           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1391e6580ceeSShri Abhyankar           /* Column 3, product is accumulated in XMM0 */
1392e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1393e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1394e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM0,XMM7)
1395e6580ceeSShri Abhyankar 
1396e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1397e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1398e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM7)
1399e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
1400e6580ceeSShri Abhyankar 
1401e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1402e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM1,XMM1,0x00)
1403e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM2)
1404e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
1405e6580ceeSShri Abhyankar 
1406e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1407e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1408e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM3,XMM7)
1409e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM3)
1410e6580ceeSShri Abhyankar 
1411e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1412e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1413e6580ceeSShri Abhyankar 
1414e6580ceeSShri Abhyankar           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1415e6580ceeSShri Abhyankar           /* This is code to be maintained and read by humans afterall. */
1416e6580ceeSShri Abhyankar           /* Copy Multiplier Col 3 into XMM3 */
1417e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM3,XMM0)
1418e6580ceeSShri Abhyankar           /* Copy Multiplier Col 2 into XMM2 */
1419e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM2,XMM6)
1420e6580ceeSShri Abhyankar           /* Copy Multiplier Col 1 into XMM1 */
1421e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM1,XMM5)
1422e6580ceeSShri Abhyankar           /* Copy Multiplier Col 0 into XMM0 */
1423e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM0,XMM4)
1424e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
1425e6580ceeSShri Abhyankar 
1426e6580ceeSShri Abhyankar         /* Update the row: */
1427e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
1428e6580ceeSShri Abhyankar         pv += 16;
1429e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
1430e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1431e6580ceeSShri Abhyankar           x = rtmp + 16*((unsigned int)pj[j]);
1432e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
1433e6580ceeSShri Abhyankar 
1434e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1435e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1436e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
1437e6580ceeSShri Abhyankar             /* Load First Column of X*/
1438e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1439e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1440e6580ceeSShri Abhyankar 
1441e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1442e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1443e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1444e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1445e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1446e6580ceeSShri Abhyankar 
1447e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1448e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1449e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1450e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1451e6580ceeSShri Abhyankar 
1452e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1453e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1454e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1455e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1456e6580ceeSShri Abhyankar 
1457e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1458e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1459e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1460e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1461e6580ceeSShri Abhyankar 
1462e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1463e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1464e6580ceeSShri Abhyankar 
1465e6580ceeSShri Abhyankar             /* Second Column */
1466e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1467e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1468e6580ceeSShri Abhyankar 
1469e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1470e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1471e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1472e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM0)
1473e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1474e6580ceeSShri Abhyankar 
1475e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1476e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1477e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM1)
1478e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM7)
1479e6580ceeSShri Abhyankar 
1480e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1481e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1482e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM2)
1483e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM4)
1484e6580ceeSShri Abhyankar 
1485e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1486e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1487e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM3)
1488e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1489e6580ceeSShri Abhyankar 
1490e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1491e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1492e6580ceeSShri Abhyankar 
1493e6580ceeSShri Abhyankar             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1494e6580ceeSShri Abhyankar 
1495e6580ceeSShri Abhyankar             /* Third Column */
1496e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1497e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1498e6580ceeSShri Abhyankar 
1499e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1500e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1501e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1502e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM0)
1503e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1504e6580ceeSShri Abhyankar 
1505e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1506e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1507e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM1)
1508e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM4)
1509e6580ceeSShri Abhyankar 
1510e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1511e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1512e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM2)
1513e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM5)
1514e6580ceeSShri Abhyankar 
1515e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1516e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1517e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM3)
1518e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1519e6580ceeSShri Abhyankar 
1520e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1521e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1522e6580ceeSShri Abhyankar 
1523e6580ceeSShri Abhyankar             /* Fourth Column */
1524e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1525e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1526e6580ceeSShri Abhyankar 
1527e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1528e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1529e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1530e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1531e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1532e6580ceeSShri Abhyankar 
1533e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1534e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1535e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1536e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1537e6580ceeSShri Abhyankar 
1538e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1539e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1540e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1541e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1542e6580ceeSShri Abhyankar 
1543e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1544e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1545e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1546e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1547e6580ceeSShri Abhyankar 
1548e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1549e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1550e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1551e6580ceeSShri Abhyankar           pv   += 16;
1552e6580ceeSShri Abhyankar         }
1553e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1554e6580ceeSShri Abhyankar       }
1555e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
1556e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
1557e6580ceeSShri Abhyankar /*        bjtmp++; */
1558e6580ceeSShri Abhyankar     }
1559e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1560e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
1561e6580ceeSShri Abhyankar     pj = bj + bi[i];
1562e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
1563e6580ceeSShri Abhyankar 
1564e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1565e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1566e6580ceeSShri Abhyankar       x  = rtmp+16*((unsigned int)pj[j]);
1567e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
1568e6580ceeSShri Abhyankar 
1569e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
1570e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1571e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1572e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1573e6580ceeSShri Abhyankar 
1574e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1575e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1576e6580ceeSShri Abhyankar 
1577e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1578e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1579e6580ceeSShri Abhyankar 
1580e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1581e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1582e6580ceeSShri Abhyankar 
1583e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1584e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1585e6580ceeSShri Abhyankar 
1586e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1587e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1588e6580ceeSShri Abhyankar 
1589e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1590e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1591e6580ceeSShri Abhyankar 
1592e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1593e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1594e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1595e6580ceeSShri Abhyankar       pv += 16;
1596e6580ceeSShri Abhyankar     }
1597e6580ceeSShri Abhyankar     /* invert diagonal block */
1598e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
1599e6580ceeSShri Abhyankar     if (pivotinblocks) {
1600e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1601e6580ceeSShri Abhyankar     } else {
1602e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1603e6580ceeSShri Abhyankar     }
1604e6580ceeSShri Abhyankar /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1605e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1606e6580ceeSShri Abhyankar   }
1607e6580ceeSShri Abhyankar 
1608e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1609e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1610e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1611e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
1612e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1613e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1614e6580ceeSShri Abhyankar   SSE_SCOPE_END;
1615e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
1616e6580ceeSShri Abhyankar }
1617e6580ceeSShri Abhyankar 
1618e6580ceeSShri Abhyankar #undef __FUNCT__
1619e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1620e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1621e6580ceeSShri Abhyankar {
1622e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1623e6580ceeSShri Abhyankar   PetscErrorCode ierr;
1624e6580ceeSShri Abhyankar   int  i,j,n = a->mbs;
1625e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1626e6580ceeSShri Abhyankar   unsigned int   row;
1627e6580ceeSShri Abhyankar   int            *ajtmpold,nz,*bi=b->i;
1628e6580ceeSShri Abhyankar   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1629e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1630e6580ceeSShri Abhyankar   MatScalar      *ba = b->a,*aa = a->a;
1631e6580ceeSShri Abhyankar   int            nonzero=0;
1632e6580ceeSShri Abhyankar /*    int            nonzero=0,colscale = 16; */
1633e6580ceeSShri Abhyankar   PetscTruth     pivotinblocks = b->pivotinblocks;
1634e6580ceeSShri Abhyankar   PetscReal      shift = info->shiftinblocks;
1635e6580ceeSShri Abhyankar 
1636e6580ceeSShri Abhyankar   PetscFunctionBegin;
1637e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
1638e6580ceeSShri Abhyankar 
1639e6580ceeSShri Abhyankar   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1640e6580ceeSShri Abhyankar   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1641e6580ceeSShri Abhyankar   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1642e6580ceeSShri Abhyankar   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1643e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
1644e6580ceeSShri Abhyankar /*      colscale = 4; */
1645e6580ceeSShri Abhyankar /*    } */
1646e6580ceeSShri Abhyankar   if ((unsigned long)bj==(unsigned long)aj) {
1647e6580ceeSShri Abhyankar     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1648e6580ceeSShri Abhyankar   }
1649e6580ceeSShri Abhyankar 
1650e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
1651e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
1652e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
1653e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
1654e6580ceeSShri Abhyankar     /* zero out one register */
1655e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
1656e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
1657e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)bjtmp[j]);
1658e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
1659e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
1660e6580ceeSShri Abhyankar         /* Copy zero register to memory locations */
1661e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1662e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1663e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1664e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1665e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1666e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1667e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1668e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1669e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1670e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1671e6580ceeSShri Abhyankar     }
1672e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
1673e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
1674e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
1675e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
1676e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1677e6580ceeSShri Abhyankar       x = rtmp+16*ajtmpold[j];
1678e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmpold[j]; */
1679e6580ceeSShri Abhyankar       /* Copy v block into x block */
1680e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
1681e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1682e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1683e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1684e6580ceeSShri Abhyankar 
1685e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1686e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1687e6580ceeSShri Abhyankar 
1688e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1689e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1690e6580ceeSShri Abhyankar 
1691e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1692e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1693e6580ceeSShri Abhyankar 
1694e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1695e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1696e6580ceeSShri Abhyankar 
1697e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1698e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1699e6580ceeSShri Abhyankar 
1700e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1701e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1702e6580ceeSShri Abhyankar 
1703e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1704e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1705e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1706e6580ceeSShri Abhyankar 
1707e6580ceeSShri Abhyankar       v += 16;
1708e6580ceeSShri Abhyankar     }
1709e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
1710e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
1711e6580ceeSShri Abhyankar     while (row < i) {
1712e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
1713e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
1714e6580ceeSShri Abhyankar         /* Load block from lower triangle */
1715e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1716e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1717e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1718e6580ceeSShri Abhyankar 
1719e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1720e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1721e6580ceeSShri Abhyankar 
1722e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1723e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1724e6580ceeSShri Abhyankar 
1725e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1726e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1727e6580ceeSShri Abhyankar 
1728e6580ceeSShri Abhyankar         /* Compare block to zero block */
1729e6580ceeSShri Abhyankar 
1730e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM4,XMM7)
1731e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM4,XMM0)
1732e6580ceeSShri Abhyankar 
1733e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM5,XMM7)
1734e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM5,XMM1)
1735e6580ceeSShri Abhyankar 
1736e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM6,XMM7)
1737e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM6,XMM2)
1738e6580ceeSShri Abhyankar 
1739e6580ceeSShri Abhyankar         SSE_CMPNEQ_PS(XMM7,XMM3)
1740e6580ceeSShri Abhyankar 
1741e6580ceeSShri Abhyankar         /* Reduce the comparisons to one SSE register */
1742e6580ceeSShri Abhyankar         SSE_OR_PS(XMM6,XMM7)
1743e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM4)
1744e6580ceeSShri Abhyankar         SSE_OR_PS(XMM5,XMM6)
1745e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1746e6580ceeSShri Abhyankar 
1747e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
1748e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1749e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
1750e6580ceeSShri Abhyankar 
1751e6580ceeSShri Abhyankar       /* If block is nonzero ... */
1752e6580ceeSShri Abhyankar       if (nonzero) {
1753e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
1754e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
1755e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
1756e6580ceeSShri Abhyankar 
1757e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1758e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1759e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
1760e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1761e6580ceeSShri Abhyankar 
1762e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
1763e6580ceeSShri Abhyankar           /* Column 0, product is accumulated in XMM4 */
1764e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1765e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
1766e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM0)
1767e6580ceeSShri Abhyankar 
1768e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1769e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1770e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM1)
1771e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM5)
1772e6580ceeSShri Abhyankar 
1773e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1774e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1775e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM2)
1776e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM6)
1777e6580ceeSShri Abhyankar 
1778e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1779e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1780e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1781e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM4,XMM7)
1782e6580ceeSShri Abhyankar 
1783e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1784e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1785e6580ceeSShri Abhyankar 
1786e6580ceeSShri Abhyankar           /* Column 1, product is accumulated in XMM5 */
1787e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1788e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1789e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
1790e6580ceeSShri Abhyankar 
1791e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1792e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1793e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
1794e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
1795e6580ceeSShri Abhyankar 
1796e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1797e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1798e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1799e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM7)
1800e6580ceeSShri Abhyankar 
1801e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1802e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1803e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
1804e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM5,XMM6)
1805e6580ceeSShri Abhyankar 
1806e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1807e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1808e6580ceeSShri Abhyankar 
1809e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1810e6580ceeSShri Abhyankar 
1811e6580ceeSShri Abhyankar           /* Column 2, product is accumulated in XMM6 */
1812e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1813e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1814e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
1815e6580ceeSShri Abhyankar 
1816e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1817e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1818e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
1819e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1820e6580ceeSShri Abhyankar 
1821e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1822e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1823e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1824e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1825e6580ceeSShri Abhyankar 
1826e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1827e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1828e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1829e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM6,XMM7)
1830e6580ceeSShri Abhyankar 
1831e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1832e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1833e6580ceeSShri Abhyankar 
1834e6580ceeSShri Abhyankar           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1835e6580ceeSShri Abhyankar           /* Column 3, product is accumulated in XMM0 */
1836e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1837e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1838e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM0,XMM7)
1839e6580ceeSShri Abhyankar 
1840e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1841e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1842e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM7)
1843e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
1844e6580ceeSShri Abhyankar 
1845e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1846e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM1,XMM1,0x00)
1847e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM1,XMM2)
1848e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM1)
1849e6580ceeSShri Abhyankar 
1850e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1851e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1852e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM3,XMM7)
1853e6580ceeSShri Abhyankar           SSE_ADD_PS(XMM0,XMM3)
1854e6580ceeSShri Abhyankar 
1855e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1856e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1857e6580ceeSShri Abhyankar 
1858e6580ceeSShri Abhyankar           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1859e6580ceeSShri Abhyankar           /* This is code to be maintained and read by humans afterall. */
1860e6580ceeSShri Abhyankar           /* Copy Multiplier Col 3 into XMM3 */
1861e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM3,XMM0)
1862e6580ceeSShri Abhyankar           /* Copy Multiplier Col 2 into XMM2 */
1863e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM2,XMM6)
1864e6580ceeSShri Abhyankar           /* Copy Multiplier Col 1 into XMM1 */
1865e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM1,XMM5)
1866e6580ceeSShri Abhyankar           /* Copy Multiplier Col 0 into XMM0 */
1867e6580ceeSShri Abhyankar           SSE_COPY_PS(XMM0,XMM4)
1868e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
1869e6580ceeSShri Abhyankar 
1870e6580ceeSShri Abhyankar         /* Update the row: */
1871e6580ceeSShri Abhyankar         nz = bi[row+1] - diag_offset[row] - 1;
1872e6580ceeSShri Abhyankar         pv += 16;
1873e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
1874e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1875e6580ceeSShri Abhyankar           x = rtmp + 16*((unsigned int)pj[j]);
1876e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
1877e6580ceeSShri Abhyankar 
1878e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1879e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1880e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
1881e6580ceeSShri Abhyankar             /* Load First Column of X*/
1882e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1883e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1884e6580ceeSShri Abhyankar 
1885e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1886e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1887e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1888e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1889e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1890e6580ceeSShri Abhyankar 
1891e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1892e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1893e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1894e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1895e6580ceeSShri Abhyankar 
1896e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1897e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1898e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1899e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1900e6580ceeSShri Abhyankar 
1901e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1902e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1903e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1904e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1905e6580ceeSShri Abhyankar 
1906e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1907e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1908e6580ceeSShri Abhyankar 
1909e6580ceeSShri Abhyankar             /* Second Column */
1910e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1911e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1912e6580ceeSShri Abhyankar 
1913e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1914e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1915e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1916e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM0)
1917e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1918e6580ceeSShri Abhyankar 
1919e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1920e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1921e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM1)
1922e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM7)
1923e6580ceeSShri Abhyankar 
1924e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1925e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1926e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM2)
1927e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM4)
1928e6580ceeSShri Abhyankar 
1929e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1930e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1931e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM3)
1932e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM5,XMM6)
1933e6580ceeSShri Abhyankar 
1934e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1935e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1936e6580ceeSShri Abhyankar 
1937e6580ceeSShri Abhyankar             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1938e6580ceeSShri Abhyankar 
1939e6580ceeSShri Abhyankar             /* Third Column */
1940e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1941e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1942e6580ceeSShri Abhyankar 
1943e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1944e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1945e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1946e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM0)
1947e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1948e6580ceeSShri Abhyankar 
1949e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1950e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM4,XMM4,0x00)
1951e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM4,XMM1)
1952e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM4)
1953e6580ceeSShri Abhyankar 
1954e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1955e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1956e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM2)
1957e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM5)
1958e6580ceeSShri Abhyankar 
1959e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1960e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1961e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM3)
1962e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM6,XMM7)
1963e6580ceeSShri Abhyankar 
1964e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1965e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1966e6580ceeSShri Abhyankar 
1967e6580ceeSShri Abhyankar             /* Fourth Column */
1968e6580ceeSShri Abhyankar             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1969e6580ceeSShri Abhyankar             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1970e6580ceeSShri Abhyankar 
1971e6580ceeSShri Abhyankar             /* Matrix-Vector Product: */
1972e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1973e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1974e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM0)
1975e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1976e6580ceeSShri Abhyankar 
1977e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1978e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM6,XMM6,0x00)
1979e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM6,XMM1)
1980e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM6)
1981e6580ceeSShri Abhyankar 
1982e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1983e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM7,XMM7,0x00)
1984e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM7,XMM2)
1985e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM7)
1986e6580ceeSShri Abhyankar 
1987e6580ceeSShri Abhyankar             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1988e6580ceeSShri Abhyankar             SSE_SHUFFLE(XMM5,XMM5,0x00)
1989e6580ceeSShri Abhyankar             SSE_MULT_PS(XMM5,XMM3)
1990e6580ceeSShri Abhyankar             SSE_SUB_PS(XMM4,XMM5)
1991e6580ceeSShri Abhyankar 
1992e6580ceeSShri Abhyankar             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1993e6580ceeSShri Abhyankar             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1994e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1995e6580ceeSShri Abhyankar           pv   += 16;
1996e6580ceeSShri Abhyankar         }
1997e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1998e6580ceeSShri Abhyankar       }
1999e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
2000e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
2001e6580ceeSShri Abhyankar /*        bjtmp++; */
2002e6580ceeSShri Abhyankar     }
2003e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
2004e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
2005e6580ceeSShri Abhyankar     pj = bj + bi[i];
2006e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
2007e6580ceeSShri Abhyankar 
2008e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
2009e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
2010e6580ceeSShri Abhyankar       x  = rtmp+16*((unsigned int)pj[j]);
2011e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
2012e6580ceeSShri Abhyankar 
2013e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
2014e6580ceeSShri Abhyankar         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
2015e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
2016e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
2017e6580ceeSShri Abhyankar 
2018e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
2019e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
2020e6580ceeSShri Abhyankar 
2021e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
2022e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
2023e6580ceeSShri Abhyankar 
2024e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
2025e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
2026e6580ceeSShri Abhyankar 
2027e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
2028e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
2029e6580ceeSShri Abhyankar 
2030e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
2031e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
2032e6580ceeSShri Abhyankar 
2033e6580ceeSShri Abhyankar         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
2034e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
2035e6580ceeSShri Abhyankar 
2036e6580ceeSShri Abhyankar         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
2037e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
2038e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
2039e6580ceeSShri Abhyankar       pv += 16;
2040e6580ceeSShri Abhyankar     }
2041e6580ceeSShri Abhyankar     /* invert diagonal block */
2042e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
2043e6580ceeSShri Abhyankar     if (pivotinblocks) {
2044e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
2045e6580ceeSShri Abhyankar     } else {
2046e6580ceeSShri Abhyankar       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
2047e6580ceeSShri Abhyankar     }
2048e6580ceeSShri Abhyankar /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
2049e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
2050e6580ceeSShri Abhyankar   }
2051e6580ceeSShri Abhyankar 
2052e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2053e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
2054e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
2055e6580ceeSShri Abhyankar   C->assembled = PETSC_TRUE;
2056e6580ceeSShri Abhyankar   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
2057e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
2058e6580ceeSShri Abhyankar   SSE_SCOPE_END;
2059e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
2060e6580ceeSShri Abhyankar }
2061e6580ceeSShri Abhyankar 
2062e6580ceeSShri Abhyankar #endif
2063