xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision a455e9261d77363e13c8b22a0e0ea831d71fb0ff)
1be1d678aSKris Buschelman 
283287d42SBarry Smith /*
383287d42SBarry Smith     Factorization code for BAIJ format.
483287d42SBarry Smith */
5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
783287d42SBarry Smith 
883287d42SBarry Smith /* ------------------------------------------------------------*/
983287d42SBarry Smith /*
1083287d42SBarry Smith       Version for when blocks are 4 by 4
1183287d42SBarry Smith */
124a2ae208SSatish Balay #undef __FUNCT__
1306e38f1dSHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
1406e38f1dSHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
1583287d42SBarry Smith {
1683287d42SBarry Smith   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1783287d42SBarry Smith   IS             isrow = b->row,isicol = b->icol;
186849ba73SBarry Smith   PetscErrorCode ierr;
195d0c19d7SBarry Smith   const PetscInt *r,*ic;
205d0c19d7SBarry Smith   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21690b6cddSBarry Smith   PetscInt       *ajtmpold,*ajtmp,nz,row;
22690b6cddSBarry Smith   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
2383287d42SBarry Smith   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
2483287d42SBarry Smith   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
2583287d42SBarry Smith   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
2683287d42SBarry Smith   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
2783287d42SBarry Smith   MatScalar      m13,m14,m15,m16;
2883287d42SBarry Smith   MatScalar      *ba           = b->a,*aa = a->a;
29*a455e926SHong Zhang   PetscBool      pivotinblocks = b->pivotinblocks;
30182b8fbaSHong Zhang   PetscReal      shift         = info->shiftamount;
3183287d42SBarry Smith 
3283287d42SBarry Smith   PetscFunctionBegin;
3383287d42SBarry Smith   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3483287d42SBarry Smith   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
35785e854fSJed Brown   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
3683287d42SBarry Smith 
3783287d42SBarry Smith   for (i=0; i<n; i++) {
3883287d42SBarry Smith     nz    = bi[i+1] - bi[i];
3983287d42SBarry Smith     ajtmp = bj + bi[i];
4083287d42SBarry Smith     for  (j=0; j<nz; j++) {
4183287d42SBarry Smith       x     = rtmp+16*ajtmp[j];
4283287d42SBarry Smith       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
4383287d42SBarry Smith       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
4483287d42SBarry Smith     }
4583287d42SBarry Smith     /* load in initial (unfactored row) */
4683287d42SBarry Smith     idx      = r[i];
4783287d42SBarry Smith     nz       = ai[idx+1] - ai[idx];
4883287d42SBarry Smith     ajtmpold = aj + ai[idx];
4983287d42SBarry Smith     v        = aa + 16*ai[idx];
5083287d42SBarry Smith     for (j=0; j<nz; j++) {
5183287d42SBarry Smith       x     = rtmp+16*ic[ajtmpold[j]];
5283287d42SBarry Smith       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
5383287d42SBarry Smith       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
5483287d42SBarry Smith       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
5583287d42SBarry Smith       x[14] = v[14]; x[15] = v[15];
5683287d42SBarry Smith       v    += 16;
5783287d42SBarry Smith     }
5883287d42SBarry Smith     row = *ajtmp++;
5983287d42SBarry Smith     while (row < i) {
6083287d42SBarry Smith       pc  = rtmp + 16*row;
6183287d42SBarry Smith       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
6283287d42SBarry Smith       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
6383287d42SBarry Smith       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
6483287d42SBarry Smith       p15 = pc[14]; p16 = pc[15];
6583287d42SBarry Smith       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
6683287d42SBarry Smith           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
6783287d42SBarry Smith           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
6883287d42SBarry Smith           || p16 != 0.0) {
6983287d42SBarry Smith         pv    = ba + 16*diag_offset[row];
7083287d42SBarry Smith         pj    = bj + diag_offset[row] + 1;
7183287d42SBarry Smith         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
7283287d42SBarry Smith         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
7383287d42SBarry Smith         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
7483287d42SBarry Smith         x15   = pv[14]; x16 = pv[15];
7583287d42SBarry Smith         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
7683287d42SBarry Smith         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
7783287d42SBarry Smith         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
7883287d42SBarry Smith         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
7983287d42SBarry Smith 
8083287d42SBarry Smith         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
8183287d42SBarry Smith         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
8283287d42SBarry Smith         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
8383287d42SBarry Smith         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
8483287d42SBarry Smith 
8583287d42SBarry Smith         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
8683287d42SBarry Smith         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
8783287d42SBarry Smith         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
8883287d42SBarry Smith         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
8983287d42SBarry Smith 
9083287d42SBarry Smith         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
9183287d42SBarry Smith         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
9283287d42SBarry Smith         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
9383287d42SBarry Smith         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
9483287d42SBarry Smith 
9583287d42SBarry Smith         nz  = bi[row+1] - diag_offset[row] - 1;
9683287d42SBarry Smith         pv += 16;
9783287d42SBarry Smith         for (j=0; j<nz; j++) {
9883287d42SBarry Smith           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
9983287d42SBarry Smith           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
10083287d42SBarry Smith           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
10183287d42SBarry Smith           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
10283287d42SBarry Smith           x     = rtmp + 16*pj[j];
10383287d42SBarry Smith           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
10483287d42SBarry Smith           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
10583287d42SBarry Smith           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
10683287d42SBarry Smith           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
10783287d42SBarry Smith 
10883287d42SBarry Smith           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
10983287d42SBarry Smith           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
11083287d42SBarry Smith           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
11183287d42SBarry Smith           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
11283287d42SBarry Smith 
11383287d42SBarry Smith           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
11483287d42SBarry Smith           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
11583287d42SBarry Smith           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
11683287d42SBarry Smith           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
11783287d42SBarry Smith 
11883287d42SBarry Smith           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
11983287d42SBarry Smith           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
12083287d42SBarry Smith           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
12183287d42SBarry Smith           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
12283287d42SBarry Smith 
12383287d42SBarry Smith           pv += 16;
12483287d42SBarry Smith         }
125dc0b31edSSatish Balay         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
12683287d42SBarry Smith       }
12783287d42SBarry Smith       row = *ajtmp++;
12883287d42SBarry Smith     }
12983287d42SBarry Smith     /* finished row so stick it into b->a */
13083287d42SBarry Smith     pv = ba + 16*bi[i];
13183287d42SBarry Smith     pj = bj + bi[i];
13283287d42SBarry Smith     nz = bi[i+1] - bi[i];
13383287d42SBarry Smith     for (j=0; j<nz; j++) {
13483287d42SBarry Smith       x      = rtmp+16*pj[j];
13583287d42SBarry Smith       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
13683287d42SBarry Smith       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
13783287d42SBarry Smith       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
13883287d42SBarry Smith       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
13983287d42SBarry Smith       pv    += 16;
14083287d42SBarry Smith     }
14183287d42SBarry Smith     /* invert diagonal block */
14283287d42SBarry Smith     w = ba + 16*diag_offset[i];
143bcd9e38bSBarry Smith     if (pivotinblocks) {
144*a455e926SHong Zhang       PetscBool allowzeropivot,zeropivotdetected;
145*a455e926SHong Zhang       allowzeropivot = PetscNot(A->erroriffailure);
146*a455e926SHong Zhang       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1472e92ee13SHong Zhang       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
148bcd9e38bSBarry Smith     } else {
14996b95a6bSBarry Smith       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
150bcd9e38bSBarry Smith     }
15183287d42SBarry Smith   }
15283287d42SBarry Smith 
15383287d42SBarry Smith   ierr = PetscFree(rtmp);CHKERRQ(ierr);
15483287d42SBarry Smith   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
15583287d42SBarry Smith   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
15626fbe8dcSKarl Rupp 
15706e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
15806e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
15983287d42SBarry Smith   C->assembled           = PETSC_TRUE;
16026fbe8dcSKarl Rupp 
161766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
16283287d42SBarry Smith   PetscFunctionReturn(0);
16383287d42SBarry Smith }
164e6580ceeSShri Abhyankar 
1654dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_4 -
1664dd39f65SShri Abhyankar      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
16796b95a6bSBarry Smith        PetscKernel_A_gets_A_times_B()
16896b95a6bSBarry Smith        PetscKernel_A_gets_A_minus_B_times_C()
16996b95a6bSBarry Smith        PetscKernel_A_gets_inverse_A()
170209027a4SShri Abhyankar */
171c0c7eb62SShri Abhyankar 
172209027a4SShri Abhyankar #undef __FUNCT__
1734dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
1744dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
175209027a4SShri Abhyankar {
176209027a4SShri Abhyankar   Mat            C     = B;
177209027a4SShri Abhyankar   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
178209027a4SShri Abhyankar   IS             isrow = b->row,isicol = b->icol;
179209027a4SShri Abhyankar   PetscErrorCode ierr;
1805a586d82SBarry Smith   const PetscInt *r,*ic;
181bbd65245SShri Abhyankar   PetscInt       i,j,k,nz,nzL,row;
182bbd65245SShri Abhyankar   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
183bbd65245SShri Abhyankar   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
184209027a4SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
185bbd65245SShri Abhyankar   PetscInt       flg;
1866ba06ab7SHong Zhang   PetscReal      shift;
187*a455e926SHong Zhang   PetscBool      allowzeropivot,zeropivotdetected;
188209027a4SShri Abhyankar 
189209027a4SShri Abhyankar   PetscFunctionBegin;
190209027a4SShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
191209027a4SShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
192209027a4SShri Abhyankar 
1936ba06ab7SHong Zhang   if (info->shifttype == MAT_SHIFT_NONE) {
1946ba06ab7SHong Zhang     shift = 0;
1956ba06ab7SHong Zhang   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
1966ba06ab7SHong Zhang     shift = info->shiftamount;
1976ba06ab7SHong Zhang   }
1986ba06ab7SHong Zhang 
199209027a4SShri Abhyankar   /* generate work space needed by the factorization */
200dcca6d9dSJed Brown   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
201209027a4SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
202209027a4SShri Abhyankar 
203209027a4SShri Abhyankar   for (i=0; i<n; i++) {
204209027a4SShri Abhyankar     /* zero rtmp */
205209027a4SShri Abhyankar     /* L part */
206209027a4SShri Abhyankar     nz    = bi[i+1] - bi[i];
207209027a4SShri Abhyankar     bjtmp = bj + bi[i];
208209027a4SShri Abhyankar     for  (j=0; j<nz; j++) {
209209027a4SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
210209027a4SShri Abhyankar     }
211209027a4SShri Abhyankar 
212209027a4SShri Abhyankar     /* U part */
21378bb4007SShri Abhyankar     nz    = bdiag[i]-bdiag[i+1];
21478bb4007SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
21578bb4007SShri Abhyankar     for  (j=0; j<nz; j++) {
21678bb4007SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
21778bb4007SShri Abhyankar     }
21878bb4007SShri Abhyankar 
21978bb4007SShri Abhyankar     /* load in initial (unfactored row) */
22078bb4007SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
22178bb4007SShri Abhyankar     ajtmp = aj + ai[r[i]];
22278bb4007SShri Abhyankar     v     = aa + bs2*ai[r[i]];
22378bb4007SShri Abhyankar     for (j=0; j<nz; j++) {
22478bb4007SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
22578bb4007SShri Abhyankar     }
22678bb4007SShri Abhyankar 
22778bb4007SShri Abhyankar     /* elimination */
22878bb4007SShri Abhyankar     bjtmp = bj + bi[i];
22978bb4007SShri Abhyankar     nzL   = bi[i+1] - bi[i];
23078bb4007SShri Abhyankar     for (k=0; k < nzL; k++) {
23178bb4007SShri Abhyankar       row = bjtmp[k];
23278bb4007SShri Abhyankar       pc  = rtmp + bs2*row;
233c35f09e5SBarry Smith       for (flg=0,j=0; j<bs2; j++) {
234c35f09e5SBarry Smith         if (pc[j]!=0.0) {
235c35f09e5SBarry Smith           flg = 1;
236c35f09e5SBarry Smith           break;
237c35f09e5SBarry Smith         }
238c35f09e5SBarry Smith       }
23978bb4007SShri Abhyankar       if (flg) {
24078bb4007SShri Abhyankar         pv = b->a + bs2*bdiag[row];
24196b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
24296b95a6bSBarry Smith         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
24378bb4007SShri Abhyankar 
24478bb4007SShri Abhyankar         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
24578bb4007SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
24678bb4007SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
24778bb4007SShri Abhyankar         for (j=0; j<nz; j++) {
24896b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
24978bb4007SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
25078bb4007SShri Abhyankar           v    = rtmp + bs2*pj[j];
25196b95a6bSBarry Smith           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
25278bb4007SShri Abhyankar           pv  += bs2;
25378bb4007SShri Abhyankar         }
25478bb4007SShri Abhyankar         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
25578bb4007SShri Abhyankar       }
25678bb4007SShri Abhyankar     }
25778bb4007SShri Abhyankar 
25878bb4007SShri Abhyankar     /* finished row so stick it into b->a */
25978bb4007SShri Abhyankar     /* L part */
26078bb4007SShri Abhyankar     pv = b->a + bs2*bi[i];
26178bb4007SShri Abhyankar     pj = b->j + bi[i];
26278bb4007SShri Abhyankar     nz = bi[i+1] - bi[i];
26378bb4007SShri Abhyankar     for (j=0; j<nz; j++) {
26478bb4007SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
26578bb4007SShri Abhyankar     }
26678bb4007SShri Abhyankar 
26778bb4007SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
26878bb4007SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
26978bb4007SShri Abhyankar     pj   = b->j + bdiag[i];
27078bb4007SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
271*a455e926SHong Zhang     allowzeropivot = PetscNot(A->erroriffailure);
272*a455e926SHong Zhang     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2732e92ee13SHong Zhang     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
27478bb4007SShri Abhyankar 
27578bb4007SShri Abhyankar     /* U part */
27678bb4007SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
27778bb4007SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
27878bb4007SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
27978bb4007SShri Abhyankar     for (j=0; j<nz; j++) {
28078bb4007SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
28178bb4007SShri Abhyankar     }
28278bb4007SShri Abhyankar   }
28378bb4007SShri Abhyankar 
284fca92195SBarry Smith   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
28578bb4007SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
28678bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
28726fbe8dcSKarl Rupp 
2884dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4;
2894dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
29078bb4007SShri Abhyankar   C->assembled           = PETSC_TRUE;
29126fbe8dcSKarl Rupp 
292766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
29378bb4007SShri Abhyankar   PetscFunctionReturn(0);
29478bb4007SShri Abhyankar }
29578bb4007SShri Abhyankar 
29678bb4007SShri Abhyankar #undef __FUNCT__
29706e38f1dSHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
29806e38f1dSHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
299e6580ceeSShri Abhyankar {
300e6580ceeSShri Abhyankar /*
301e6580ceeSShri Abhyankar     Default Version for when blocks are 4 by 4 Using natural ordering
302e6580ceeSShri Abhyankar */
303e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
304e6580ceeSShri Abhyankar   PetscErrorCode ierr;
305e6580ceeSShri Abhyankar   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
306e6580ceeSShri Abhyankar   PetscInt       *ajtmpold,*ajtmp,nz,row;
307e6580ceeSShri Abhyankar   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
308e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
309e6580ceeSShri Abhyankar   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
310e6580ceeSShri Abhyankar   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
311e6580ceeSShri Abhyankar   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
312e6580ceeSShri Abhyankar   MatScalar      m13,m14,m15,m16;
313e6580ceeSShri Abhyankar   MatScalar      *ba           = b->a,*aa = a->a;
314*a455e926SHong Zhang   PetscBool      pivotinblocks = b->pivotinblocks;
315182b8fbaSHong Zhang   PetscReal      shift         = info->shiftamount;
316e6580ceeSShri Abhyankar 
317e6580ceeSShri Abhyankar   PetscFunctionBegin;
318785e854fSJed Brown   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
319e6580ceeSShri Abhyankar 
320e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
321e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
322e6580ceeSShri Abhyankar     ajtmp = bj + bi[i];
323e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
324e6580ceeSShri Abhyankar       x     = rtmp+16*ajtmp[j];
325e6580ceeSShri Abhyankar       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
326e6580ceeSShri Abhyankar       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
327e6580ceeSShri Abhyankar     }
328e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
329e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
330e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
331e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
332e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
333e6580ceeSShri Abhyankar       x     = rtmp+16*ajtmpold[j];
334e6580ceeSShri Abhyankar       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
335e6580ceeSShri Abhyankar       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
336e6580ceeSShri Abhyankar       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
337e6580ceeSShri Abhyankar       x[14] = v[14]; x[15] = v[15];
338e6580ceeSShri Abhyankar       v    += 16;
339e6580ceeSShri Abhyankar     }
340e6580ceeSShri Abhyankar     row = *ajtmp++;
341e6580ceeSShri Abhyankar     while (row < i) {
342e6580ceeSShri Abhyankar       pc  = rtmp + 16*row;
343e6580ceeSShri Abhyankar       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
344e6580ceeSShri Abhyankar       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
345e6580ceeSShri Abhyankar       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
346e6580ceeSShri Abhyankar       p15 = pc[14]; p16 = pc[15];
347e6580ceeSShri Abhyankar       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
348e6580ceeSShri Abhyankar           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
349e6580ceeSShri Abhyankar           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
350e6580ceeSShri Abhyankar           || p16 != 0.0) {
351e6580ceeSShri Abhyankar         pv    = ba + 16*diag_offset[row];
352e6580ceeSShri Abhyankar         pj    = bj + diag_offset[row] + 1;
353e6580ceeSShri Abhyankar         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
354e6580ceeSShri Abhyankar         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
355e6580ceeSShri Abhyankar         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
356e6580ceeSShri Abhyankar         x15   = pv[14]; x16 = pv[15];
357e6580ceeSShri Abhyankar         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
358e6580ceeSShri Abhyankar         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
359e6580ceeSShri Abhyankar         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
360e6580ceeSShri Abhyankar         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
361e6580ceeSShri Abhyankar 
362e6580ceeSShri Abhyankar         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
363e6580ceeSShri Abhyankar         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
364e6580ceeSShri Abhyankar         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
365e6580ceeSShri Abhyankar         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
366e6580ceeSShri Abhyankar 
367e6580ceeSShri Abhyankar         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
368e6580ceeSShri Abhyankar         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
369e6580ceeSShri Abhyankar         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
370e6580ceeSShri Abhyankar         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
371e6580ceeSShri Abhyankar 
372e6580ceeSShri Abhyankar         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
373e6580ceeSShri Abhyankar         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
374e6580ceeSShri Abhyankar         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
375e6580ceeSShri Abhyankar         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
376e6580ceeSShri Abhyankar         nz     = bi[row+1] - diag_offset[row] - 1;
377e6580ceeSShri Abhyankar         pv    += 16;
378e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
379e6580ceeSShri Abhyankar           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
380e6580ceeSShri Abhyankar           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
381e6580ceeSShri Abhyankar           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
382e6580ceeSShri Abhyankar           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
383e6580ceeSShri Abhyankar           x     = rtmp + 16*pj[j];
384e6580ceeSShri Abhyankar           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
385e6580ceeSShri Abhyankar           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
386e6580ceeSShri Abhyankar           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
387e6580ceeSShri Abhyankar           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
388e6580ceeSShri Abhyankar 
389e6580ceeSShri Abhyankar           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
390e6580ceeSShri Abhyankar           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
391e6580ceeSShri Abhyankar           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
392e6580ceeSShri Abhyankar           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
393e6580ceeSShri Abhyankar 
394e6580ceeSShri Abhyankar           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
395e6580ceeSShri Abhyankar           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
396e6580ceeSShri Abhyankar           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
397e6580ceeSShri Abhyankar           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
398e6580ceeSShri Abhyankar 
399e6580ceeSShri Abhyankar           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
400e6580ceeSShri Abhyankar           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
401e6580ceeSShri Abhyankar           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
402e6580ceeSShri Abhyankar           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
403e6580ceeSShri Abhyankar 
404e6580ceeSShri Abhyankar           pv += 16;
405e6580ceeSShri Abhyankar         }
406e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
407e6580ceeSShri Abhyankar       }
408e6580ceeSShri Abhyankar       row = *ajtmp++;
409e6580ceeSShri Abhyankar     }
410e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
411e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
412e6580ceeSShri Abhyankar     pj = bj + bi[i];
413e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
414e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
415e6580ceeSShri Abhyankar       x      = rtmp+16*pj[j];
416e6580ceeSShri Abhyankar       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
417e6580ceeSShri Abhyankar       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
418e6580ceeSShri Abhyankar       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
419e6580ceeSShri Abhyankar       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
420e6580ceeSShri Abhyankar       pv    += 16;
421e6580ceeSShri Abhyankar     }
422e6580ceeSShri Abhyankar     /* invert diagonal block */
423e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
424e6580ceeSShri Abhyankar     if (pivotinblocks) {
425*a455e926SHong Zhang       PetscBool allowzeropivot,zeropivotdetected;
426*a455e926SHong Zhang       allowzeropivot = PetscNot(A->erroriffailure);
427*a455e926SHong Zhang       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
4282e92ee13SHong Zhang       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
429e6580ceeSShri Abhyankar     } else {
43096b95a6bSBarry Smith       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
431e6580ceeSShri Abhyankar     }
432e6580ceeSShri Abhyankar   }
433e6580ceeSShri Abhyankar 
434e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
43526fbe8dcSKarl Rupp 
43606e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
43706e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
438e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
43926fbe8dcSKarl Rupp 
440766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
441e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
442e6580ceeSShri Abhyankar }
443c0c7eb62SShri Abhyankar 
444209027a4SShri Abhyankar /*
4454dd39f65SShri Abhyankar   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
4464dd39f65SShri Abhyankar     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
447209027a4SShri Abhyankar */
448209027a4SShri Abhyankar #undef __FUNCT__
4494dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
4504dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
451209027a4SShri Abhyankar {
452209027a4SShri Abhyankar   Mat            C =B;
453209027a4SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
454209027a4SShri Abhyankar   PetscErrorCode ierr;
455bbd65245SShri Abhyankar   PetscInt       i,j,k,nz,nzL,row;
456bbd65245SShri Abhyankar   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
457bbd65245SShri Abhyankar   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
458209027a4SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
459bbd65245SShri Abhyankar   PetscInt       flg;
4606ba06ab7SHong Zhang   PetscReal      shift;
461*a455e926SHong Zhang   PetscBool      allowzeropivot,zeropivotdetected;
462e6580ceeSShri Abhyankar 
463209027a4SShri Abhyankar   PetscFunctionBegin;
464209027a4SShri Abhyankar   /* generate work space needed by the factorization */
465dcca6d9dSJed Brown   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
466209027a4SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
467209027a4SShri Abhyankar 
4686ba06ab7SHong Zhang   if (info->shifttype == MAT_SHIFT_NONE) {
4696ba06ab7SHong Zhang     shift = 0;
4706ba06ab7SHong Zhang   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
4716ba06ab7SHong Zhang     shift = info->shiftamount;
4726ba06ab7SHong Zhang   }
4736ba06ab7SHong Zhang 
474209027a4SShri Abhyankar   for (i=0; i<n; i++) {
475209027a4SShri Abhyankar     /* zero rtmp */
476209027a4SShri Abhyankar     /* L part */
477209027a4SShri Abhyankar     nz    = bi[i+1] - bi[i];
478209027a4SShri Abhyankar     bjtmp = bj + bi[i];
479209027a4SShri Abhyankar     for  (j=0; j<nz; j++) {
480209027a4SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
481209027a4SShri Abhyankar     }
482209027a4SShri Abhyankar 
483209027a4SShri Abhyankar     /* U part */
484b2b2dd24SShri Abhyankar     nz    = bdiag[i] - bdiag[i+1];
485b2b2dd24SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
486b2b2dd24SShri Abhyankar     for  (j=0; j<nz; j++) {
487b2b2dd24SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
488b2b2dd24SShri Abhyankar     }
489b2b2dd24SShri Abhyankar 
490b2b2dd24SShri Abhyankar     /* load in initial (unfactored row) */
491b2b2dd24SShri Abhyankar     nz    = ai[i+1] - ai[i];
492b2b2dd24SShri Abhyankar     ajtmp = aj + ai[i];
493b2b2dd24SShri Abhyankar     v     = aa + bs2*ai[i];
494b2b2dd24SShri Abhyankar     for (j=0; j<nz; j++) {
495b2b2dd24SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
496b2b2dd24SShri Abhyankar     }
497b2b2dd24SShri Abhyankar 
498b2b2dd24SShri Abhyankar     /* elimination */
499b2b2dd24SShri Abhyankar     bjtmp = bj + bi[i];
500b2b2dd24SShri Abhyankar     nzL   = bi[i+1] - bi[i];
501b2b2dd24SShri Abhyankar     for (k=0; k < nzL; k++) {
502b2b2dd24SShri Abhyankar       row = bjtmp[k];
503b2b2dd24SShri Abhyankar       pc  = rtmp + bs2*row;
504c35f09e5SBarry Smith       for (flg=0,j=0; j<bs2; j++) {
505c35f09e5SBarry Smith         if (pc[j]!=0.0) {
506c35f09e5SBarry Smith           flg = 1;
507c35f09e5SBarry Smith           break;
508c35f09e5SBarry Smith         }
509c35f09e5SBarry Smith       }
510b2b2dd24SShri Abhyankar       if (flg) {
511b2b2dd24SShri Abhyankar         pv = b->a + bs2*bdiag[row];
51296b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
51396b95a6bSBarry Smith         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
514b2b2dd24SShri Abhyankar 
515b2b2dd24SShri Abhyankar         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
516b2b2dd24SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
517b2b2dd24SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
518b2b2dd24SShri Abhyankar         for (j=0; j<nz; j++) {
51996b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
520b2b2dd24SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
521b2b2dd24SShri Abhyankar           v    = rtmp + bs2*pj[j];
52296b95a6bSBarry Smith           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
523b2b2dd24SShri Abhyankar           pv  += bs2;
524b2b2dd24SShri Abhyankar         }
525b2b2dd24SShri Abhyankar         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
526b2b2dd24SShri Abhyankar       }
527b2b2dd24SShri Abhyankar     }
528b2b2dd24SShri Abhyankar 
529b2b2dd24SShri Abhyankar     /* finished row so stick it into b->a */
530b2b2dd24SShri Abhyankar     /* L part */
531b2b2dd24SShri Abhyankar     pv = b->a + bs2*bi[i];
532b2b2dd24SShri Abhyankar     pj = b->j + bi[i];
533b2b2dd24SShri Abhyankar     nz = bi[i+1] - bi[i];
534b2b2dd24SShri Abhyankar     for (j=0; j<nz; j++) {
535b2b2dd24SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
536b2b2dd24SShri Abhyankar     }
537b2b2dd24SShri Abhyankar 
538b2b2dd24SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
539b2b2dd24SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
540b2b2dd24SShri Abhyankar     pj   = b->j + bdiag[i];
541b2b2dd24SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
542*a455e926SHong Zhang     allowzeropivot = PetscNot(A->erroriffailure);
543*a455e926SHong Zhang     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
5442e92ee13SHong Zhang     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
545b2b2dd24SShri Abhyankar 
546b2b2dd24SShri Abhyankar     /* U part */
547b2b2dd24SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
548b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
549b2b2dd24SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
550b2b2dd24SShri Abhyankar     for (j=0; j<nz; j++) {
551b2b2dd24SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
552b2b2dd24SShri Abhyankar     }
553b2b2dd24SShri Abhyankar   }
554fca92195SBarry Smith   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
55526fbe8dcSKarl Rupp 
5564dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
5574dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
558b2b2dd24SShri Abhyankar   C->assembled           = PETSC_TRUE;
55926fbe8dcSKarl Rupp 
560766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
561b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
562b2b2dd24SShri Abhyankar }
563b2b2dd24SShri Abhyankar 
564e6580ceeSShri Abhyankar #if defined(PETSC_HAVE_SSE)
565e6580ceeSShri Abhyankar 
566e6580ceeSShri Abhyankar #include PETSC_HAVE_SSE
567e6580ceeSShri Abhyankar 
568e6580ceeSShri Abhyankar /* SSE Version for when blocks are 4 by 4 Using natural ordering */
569e6580ceeSShri Abhyankar #undef __FUNCT__
570e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
571e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
572e6580ceeSShri Abhyankar {
573e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
574e6580ceeSShri Abhyankar   PetscErrorCode ierr;
575e6580ceeSShri Abhyankar   int            i,j,n = a->mbs;
576e6580ceeSShri Abhyankar   int            *bj = b->j,*bjtmp,*pj;
577e6580ceeSShri Abhyankar   int            row;
578e6580ceeSShri Abhyankar   int            *ajtmpold,nz,*bi=b->i;
579e6580ceeSShri Abhyankar   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
580e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
581e6580ceeSShri Abhyankar   MatScalar      *ba    = b->a,*aa = a->a;
582e6580ceeSShri Abhyankar   int            nonzero=0;
583*a455e926SHong Zhang   PetscBool pivotinblocks = b->pivotinblocks;
584182b8fbaSHong Zhang   PetscReal shift         = info->shiftamount;
585e6580ceeSShri Abhyankar 
586e6580ceeSShri Abhyankar   PetscFunctionBegin;
587e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
588e6580ceeSShri Abhyankar 
589e32f2f54SBarry Smith   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
590e32f2f54SBarry Smith   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
591785e854fSJed Brown   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
592e32f2f54SBarry Smith   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
593e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
594e6580ceeSShri Abhyankar /*      colscale = 4; */
595e6580ceeSShri Abhyankar /*    } */
596e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
597e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
598e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
599e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
600e6580ceeSShri Abhyankar     /* zero out one register */
601e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
602e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
603e6580ceeSShri Abhyankar       x = rtmp+16*bjtmp[j];
604e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
605e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
606e6580ceeSShri Abhyankar       /* Copy zero register to memory locations */
607e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
608e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
609e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
610e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
611e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
612e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
613e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
614e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
615e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
616e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
617e6580ceeSShri Abhyankar     }
618e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
619e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
620e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
621e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
622e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
623e6580ceeSShri Abhyankar       x = rtmp+16*ajtmpold[j];
624e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmpold[j]; */
625e6580ceeSShri Abhyankar       /* Copy v block into x block */
626e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
627e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
628e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
629e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
630e6580ceeSShri Abhyankar 
631e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
632e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
633e6580ceeSShri Abhyankar 
634e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
635e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
636e6580ceeSShri Abhyankar 
637e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
638e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
639e6580ceeSShri Abhyankar 
640e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
641e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
642e6580ceeSShri Abhyankar 
643e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
644e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
645e6580ceeSShri Abhyankar 
646e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
647e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
648e6580ceeSShri Abhyankar 
649e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
650e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
651e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
652e6580ceeSShri Abhyankar 
653e6580ceeSShri Abhyankar       v += 16;
654e6580ceeSShri Abhyankar     }
655e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
656e6580ceeSShri Abhyankar     row = *bjtmp++;
657e6580ceeSShri Abhyankar     while (row < i) {
658e6580ceeSShri Abhyankar       pc = rtmp + 16*row;
659e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
660e6580ceeSShri Abhyankar       /* Load block from lower triangle */
661e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
662e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
663e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
664e6580ceeSShri Abhyankar 
665e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
666e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
667e6580ceeSShri Abhyankar 
668e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
669e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
670e6580ceeSShri Abhyankar 
671e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
672e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
673e6580ceeSShri Abhyankar 
674e6580ceeSShri Abhyankar       /* Compare block to zero block */
675e6580ceeSShri Abhyankar 
676e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM4,XMM7)
677e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM4,XMM0)
678e6580ceeSShri Abhyankar 
679e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM5,XMM7)
680e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM5,XMM1)
681e6580ceeSShri Abhyankar 
682e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM6,XMM7)
683e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM6,XMM2)
684e6580ceeSShri Abhyankar 
685e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM7,XMM3)
686e6580ceeSShri Abhyankar 
687e6580ceeSShri Abhyankar       /* Reduce the comparisons to one SSE register */
688e6580ceeSShri Abhyankar       SSE_OR_PS(XMM6,XMM7)
689e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5,XMM4)
690e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5,XMM6)
691e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
692e6580ceeSShri Abhyankar 
693e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
694e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
695e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
696e6580ceeSShri Abhyankar 
697e6580ceeSShri Abhyankar       /* If block is nonzero ... */
698e6580ceeSShri Abhyankar       if (nonzero) {
699e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
700e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
701e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
702e6580ceeSShri Abhyankar 
703e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
704e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
705e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
706e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
707e6580ceeSShri Abhyankar 
708e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
709e6580ceeSShri Abhyankar         /* Column 0, product is accumulated in XMM4 */
710e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
711e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM4,XMM4,0x00)
712e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM4,XMM0)
713e6580ceeSShri Abhyankar 
714e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
715e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5,XMM5,0x00)
716e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5,XMM1)
717e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM5)
718e6580ceeSShri Abhyankar 
719e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
720e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
721e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM2)
722e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM6)
723e6580ceeSShri Abhyankar 
724e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
725e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
726e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM3)
727e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM7)
728e6580ceeSShri Abhyankar 
729e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
730e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
731e6580ceeSShri Abhyankar 
732e6580ceeSShri Abhyankar         /* Column 1, product is accumulated in XMM5 */
733e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
734e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5,XMM5,0x00)
735e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5,XMM0)
736e6580ceeSShri Abhyankar 
737e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
738e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
739e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM1)
740e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM6)
741e6580ceeSShri Abhyankar 
742e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
743e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
744e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM2)
745e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM7)
746e6580ceeSShri Abhyankar 
747e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
748e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
749e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM3)
750e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM6)
751e6580ceeSShri Abhyankar 
752e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
753e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
754e6580ceeSShri Abhyankar 
755e6580ceeSShri Abhyankar         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
756e6580ceeSShri Abhyankar 
757e6580ceeSShri Abhyankar         /* Column 2, product is accumulated in XMM6 */
758e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
759e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
760e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM0)
761e6580ceeSShri Abhyankar 
762e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
763e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
764e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM1)
765e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
766e6580ceeSShri Abhyankar 
767e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
768e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
769e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM2)
770e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
771e6580ceeSShri Abhyankar 
772e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
773e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
774e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM3)
775e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
776e6580ceeSShri Abhyankar 
777e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
778e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
779e6580ceeSShri Abhyankar 
780e6580ceeSShri Abhyankar         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
781e6580ceeSShri Abhyankar         /* Column 3, product is accumulated in XMM0 */
782e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
783e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
784e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM0,XMM7)
785e6580ceeSShri Abhyankar 
786e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
787e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
788e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1,XMM7)
789e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM1)
790e6580ceeSShri Abhyankar 
791e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
792e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM1,XMM1,0x00)
793e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1,XMM2)
794e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM1)
795e6580ceeSShri Abhyankar 
796e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
797e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
798e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM3,XMM7)
799e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM3)
800e6580ceeSShri Abhyankar 
801e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
802e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
803e6580ceeSShri Abhyankar 
804e6580ceeSShri Abhyankar         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
805e6580ceeSShri Abhyankar         /* This is code to be maintained and read by humans afterall. */
806e6580ceeSShri Abhyankar         /* Copy Multiplier Col 3 into XMM3 */
807e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM3,XMM0)
808e6580ceeSShri Abhyankar         /* Copy Multiplier Col 2 into XMM2 */
809e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM2,XMM6)
810e6580ceeSShri Abhyankar         /* Copy Multiplier Col 1 into XMM1 */
811e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM1,XMM5)
812e6580ceeSShri Abhyankar         /* Copy Multiplier Col 0 into XMM0 */
813e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM0,XMM4)
814e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
815e6580ceeSShri Abhyankar 
816e6580ceeSShri Abhyankar         /* Update the row: */
817e6580ceeSShri Abhyankar         nz  = bi[row+1] - diag_offset[row] - 1;
818e6580ceeSShri Abhyankar         pv += 16;
819e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
820e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
821e6580ceeSShri Abhyankar           x = rtmp + 16*pj[j];
822e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
823e6580ceeSShri Abhyankar 
824e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
825e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
826e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
827e6580ceeSShri Abhyankar           /* Load First Column of X*/
828e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
829e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
830e6580ceeSShri Abhyankar 
831e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
832e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
833e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
834e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
835e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
836e6580ceeSShri Abhyankar 
837e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
838e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
839e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
840e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM6)
841e6580ceeSShri Abhyankar 
842e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
843e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
844e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
845e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM7)
846e6580ceeSShri Abhyankar 
847e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
848e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
849e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM3)
850e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
851e6580ceeSShri Abhyankar 
852e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
853e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
854e6580ceeSShri Abhyankar 
855e6580ceeSShri Abhyankar           /* Second Column */
856e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
857e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
858e6580ceeSShri Abhyankar 
859e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
860e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
861e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
862e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
863e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM6)
864e6580ceeSShri Abhyankar 
865e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
866e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
867e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
868e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM7)
869e6580ceeSShri Abhyankar 
870e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
871e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
872e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM2)
873e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM4)
874e6580ceeSShri Abhyankar 
875e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
876e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
877e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
878e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM6)
879e6580ceeSShri Abhyankar 
880e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
881e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
882e6580ceeSShri Abhyankar 
883e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
884e6580ceeSShri Abhyankar 
885e6580ceeSShri Abhyankar           /* Third Column */
886e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
887e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
888e6580ceeSShri Abhyankar 
889e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
890e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
891e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
892e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM0)
893e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM7)
894e6580ceeSShri Abhyankar 
895e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
896e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
897e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM1)
898e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM4)
899e6580ceeSShri Abhyankar 
900e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
901e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
902e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM2)
903e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM5)
904e6580ceeSShri Abhyankar 
905e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
906e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
907e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
908e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM7)
909e6580ceeSShri Abhyankar 
910e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
911e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
912e6580ceeSShri Abhyankar 
913e6580ceeSShri Abhyankar           /* Fourth Column */
914e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
915e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
916e6580ceeSShri Abhyankar 
917e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
918e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
919e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
920e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
921e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
922e6580ceeSShri Abhyankar 
923e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
924e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
925e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
926e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM6)
927e6580ceeSShri Abhyankar 
928e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
929e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
930e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
931e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM7)
932e6580ceeSShri Abhyankar 
933e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
934e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
935e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM3)
936e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
937e6580ceeSShri Abhyankar 
938e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
939e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
940e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
941e6580ceeSShri Abhyankar           pv += 16;
942e6580ceeSShri Abhyankar         }
943e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
944e6580ceeSShri Abhyankar       }
945e6580ceeSShri Abhyankar       row = *bjtmp++;
946e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
947e6580ceeSShri Abhyankar     }
948e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
949e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
950e6580ceeSShri Abhyankar     pj = bj + bi[i];
951e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
952e6580ceeSShri Abhyankar 
953e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
954e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
955e6580ceeSShri Abhyankar       x = rtmp+16*pj[j];
956e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
957e6580ceeSShri Abhyankar 
958e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
959e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
960e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
961e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
962e6580ceeSShri Abhyankar 
963e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
964e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
965e6580ceeSShri Abhyankar 
966e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
967e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
968e6580ceeSShri Abhyankar 
969e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
970e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
971e6580ceeSShri Abhyankar 
972e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
973e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
974e6580ceeSShri Abhyankar 
975e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
976e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
977e6580ceeSShri Abhyankar 
978e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
979e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
980e6580ceeSShri Abhyankar 
981e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
982e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
983e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
984e6580ceeSShri Abhyankar       pv += 16;
985e6580ceeSShri Abhyankar     }
986e6580ceeSShri Abhyankar     /* invert diagonal block */
987e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
988e6580ceeSShri Abhyankar     if (pivotinblocks) {
989*a455e926SHong Zhang       PetscBool allowzeropivot,zeropivotdetected;
990*a455e926SHong Zhang       allowzeropivot = PetscNot(A->erroriffailure);
991*a455e926SHong Zhang       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
9922e92ee13SHong Zhang       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
993e6580ceeSShri Abhyankar     } else {
99496b95a6bSBarry Smith       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
995e6580ceeSShri Abhyankar     }
99696b95a6bSBarry Smith /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
997e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
998e6580ceeSShri Abhyankar   }
999e6580ceeSShri Abhyankar 
1000e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
100126fbe8dcSKarl Rupp 
1002e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1003e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1004e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
100526fbe8dcSKarl Rupp 
1006766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
1007e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1008e6580ceeSShri Abhyankar   SSE_SCOPE_END;
1009e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
1010e6580ceeSShri Abhyankar }
1011e6580ceeSShri Abhyankar 
1012e6580ceeSShri Abhyankar #undef __FUNCT__
1013e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
1014e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1015e6580ceeSShri Abhyankar {
1016e6580ceeSShri Abhyankar   Mat            A  =C;
1017e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1018e6580ceeSShri Abhyankar   PetscErrorCode ierr;
1019e6580ceeSShri Abhyankar   int            i,j,n = a->mbs;
1020e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1021e6580ceeSShri Abhyankar   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1022e6580ceeSShri Abhyankar   unsigned int   row;
1023e6580ceeSShri Abhyankar   int            nz,*bi=b->i;
1024e6580ceeSShri Abhyankar   int            *diag_offset = b->diag,*ai=a->i;
1025e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1026e6580ceeSShri Abhyankar   MatScalar      *ba    = b->a,*aa = a->a;
1027e6580ceeSShri Abhyankar   int            nonzero=0;
1028*a455e926SHong Zhang   PetscBool      pivotinblocks = b->pivotinblocks;
1029182b8fbaSHong Zhang   PetscReal      shift = info->shiftamount;
1030e6580ceeSShri Abhyankar 
1031e6580ceeSShri Abhyankar   PetscFunctionBegin;
1032e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
1033e6580ceeSShri Abhyankar 
1034e32f2f54SBarry Smith   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1035e32f2f54SBarry Smith   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1036785e854fSJed Brown   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1037e32f2f54SBarry Smith   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1038e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
1039e6580ceeSShri Abhyankar /*      colscale = 4; */
1040e6580ceeSShri Abhyankar /*    } */
1041e6580ceeSShri Abhyankar 
1042e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
1043e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
1044e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
1045e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
1046e6580ceeSShri Abhyankar     /* zero out one register */
1047e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
1048e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
1049e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)bjtmp[j]);
1050e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
1051e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
1052e6580ceeSShri Abhyankar       /* Copy zero register to memory locations */
1053e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1054e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1055e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1056e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1057e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1058e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1059e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1060e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1061e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1062e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1063e6580ceeSShri Abhyankar     }
1064e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
1065e6580ceeSShri Abhyankar     nz    = ai[i+1] - ai[i];
1066e6580ceeSShri Abhyankar     ajtmp = aj + ai[i];
1067e6580ceeSShri Abhyankar     v     = aa + 16*ai[i];
1068e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1069e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)ajtmp[j]);
1070e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmp[j]; */
1071e6580ceeSShri Abhyankar       /* Copy v block into x block */
1072e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
1073e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1074e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1075e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1076e6580ceeSShri Abhyankar 
1077e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1078e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1079e6580ceeSShri Abhyankar 
1080e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1081e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1082e6580ceeSShri Abhyankar 
1083e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1084e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1085e6580ceeSShri Abhyankar 
1086e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1087e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1088e6580ceeSShri Abhyankar 
1089e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1090e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1091e6580ceeSShri Abhyankar 
1092e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1093e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1094e6580ceeSShri Abhyankar 
1095e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1096e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1097e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1098e6580ceeSShri Abhyankar 
1099e6580ceeSShri Abhyankar       v += 16;
1100e6580ceeSShri Abhyankar     }
1101e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
1102e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
1103e6580ceeSShri Abhyankar     while (row < i) {
1104e6580ceeSShri Abhyankar       pc = rtmp + 16*row;
1105e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
1106e6580ceeSShri Abhyankar       /* Load block from lower triangle */
1107e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1108e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1109e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1110e6580ceeSShri Abhyankar 
1111e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1112e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1113e6580ceeSShri Abhyankar 
1114e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1115e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1116e6580ceeSShri Abhyankar 
1117e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1118e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1119e6580ceeSShri Abhyankar 
1120e6580ceeSShri Abhyankar       /* Compare block to zero block */
1121e6580ceeSShri Abhyankar 
1122e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM4,XMM7)
1123e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM4,XMM0)
1124e6580ceeSShri Abhyankar 
1125e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM5,XMM7)
1126e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM5,XMM1)
1127e6580ceeSShri Abhyankar 
1128e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM6,XMM7)
1129e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM6,XMM2)
1130e6580ceeSShri Abhyankar 
1131e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM7,XMM3)
1132e6580ceeSShri Abhyankar 
1133e6580ceeSShri Abhyankar       /* Reduce the comparisons to one SSE register */
1134e6580ceeSShri Abhyankar       SSE_OR_PS(XMM6,XMM7)
1135e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5,XMM4)
1136e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5,XMM6)
1137e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1138e6580ceeSShri Abhyankar 
1139e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
1140e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1141e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
1142e6580ceeSShri Abhyankar 
1143e6580ceeSShri Abhyankar       /* If block is nonzero ... */
1144e6580ceeSShri Abhyankar       if (nonzero) {
1145e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
1146e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
1147e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
1148e6580ceeSShri Abhyankar 
1149e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1150e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1151e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
1152e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1153e6580ceeSShri Abhyankar 
1154e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
1155e6580ceeSShri Abhyankar         /* Column 0, product is accumulated in XMM4 */
1156e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1157e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM4,XMM4,0x00)
1158e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM4,XMM0)
1159e6580ceeSShri Abhyankar 
1160e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1161e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5,XMM5,0x00)
1162e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5,XMM1)
1163e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM5)
1164e6580ceeSShri Abhyankar 
1165e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1166e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1167e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM2)
1168e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM6)
1169e6580ceeSShri Abhyankar 
1170e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1171e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1172e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM3)
1173e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM7)
1174e6580ceeSShri Abhyankar 
1175e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1176e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1177e6580ceeSShri Abhyankar 
1178e6580ceeSShri Abhyankar         /* Column 1, product is accumulated in XMM5 */
1179e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1180e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5,XMM5,0x00)
1181e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5,XMM0)
1182e6580ceeSShri Abhyankar 
1183e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1184e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1185e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM1)
1186e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM6)
1187e6580ceeSShri Abhyankar 
1188e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1189e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1190e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM2)
1191e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM7)
1192e6580ceeSShri Abhyankar 
1193e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1194e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1195e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM3)
1196e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM6)
1197e6580ceeSShri Abhyankar 
1198e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1199e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1200e6580ceeSShri Abhyankar 
1201e6580ceeSShri Abhyankar         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1202e6580ceeSShri Abhyankar 
1203e6580ceeSShri Abhyankar         /* Column 2, product is accumulated in XMM6 */
1204e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1205e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1206e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM0)
1207e6580ceeSShri Abhyankar 
1208e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1209e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1210e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM1)
1211e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
1212e6580ceeSShri Abhyankar 
1213e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1214e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1215e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM2)
1216e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
1217e6580ceeSShri Abhyankar 
1218e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1219e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1220e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM3)
1221e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
1222e6580ceeSShri Abhyankar 
1223e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1224e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1225e6580ceeSShri Abhyankar 
1226e6580ceeSShri Abhyankar         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1227e6580ceeSShri Abhyankar         /* Column 3, product is accumulated in XMM0 */
1228e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1229e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1230e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM0,XMM7)
1231e6580ceeSShri Abhyankar 
1232e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1233e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1234e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1,XMM7)
1235e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM1)
1236e6580ceeSShri Abhyankar 
1237e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1238e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM1,XMM1,0x00)
1239e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1,XMM2)
1240e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM1)
1241e6580ceeSShri Abhyankar 
1242e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1243e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1244e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM3,XMM7)
1245e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM3)
1246e6580ceeSShri Abhyankar 
1247e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1248e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1249e6580ceeSShri Abhyankar 
1250e6580ceeSShri Abhyankar         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1251e6580ceeSShri Abhyankar         /* This is code to be maintained and read by humans afterall. */
1252e6580ceeSShri Abhyankar         /* Copy Multiplier Col 3 into XMM3 */
1253e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM3,XMM0)
1254e6580ceeSShri Abhyankar         /* Copy Multiplier Col 2 into XMM2 */
1255e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM2,XMM6)
1256e6580ceeSShri Abhyankar         /* Copy Multiplier Col 1 into XMM1 */
1257e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM1,XMM5)
1258e6580ceeSShri Abhyankar         /* Copy Multiplier Col 0 into XMM0 */
1259e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM0,XMM4)
1260e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
1261e6580ceeSShri Abhyankar 
1262e6580ceeSShri Abhyankar         /* Update the row: */
1263e6580ceeSShri Abhyankar         nz  = bi[row+1] - diag_offset[row] - 1;
1264e6580ceeSShri Abhyankar         pv += 16;
1265e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
1266e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1267e6580ceeSShri Abhyankar           x = rtmp + 16*((unsigned int)pj[j]);
1268e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
1269e6580ceeSShri Abhyankar 
1270e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1271e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1272e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
1273e6580ceeSShri Abhyankar           /* Load First Column of X*/
1274e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1275e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1276e6580ceeSShri Abhyankar 
1277e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1278e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1279e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1280e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
1281e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1282e6580ceeSShri Abhyankar 
1283e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1284e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1285e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
1286e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM6)
1287e6580ceeSShri Abhyankar 
1288e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1289e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1290e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1291e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM7)
1292e6580ceeSShri Abhyankar 
1293e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1294e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1295e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM3)
1296e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1297e6580ceeSShri Abhyankar 
1298e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1299e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1300e6580ceeSShri Abhyankar 
1301e6580ceeSShri Abhyankar           /* Second Column */
1302e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1303e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1304e6580ceeSShri Abhyankar 
1305e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1306e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1307e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1308e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
1309e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM6)
1310e6580ceeSShri Abhyankar 
1311e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1312e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1313e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
1314e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM7)
1315e6580ceeSShri Abhyankar 
1316e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1317e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
1318e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM2)
1319e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM4)
1320e6580ceeSShri Abhyankar 
1321e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1322e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1323e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
1324e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM6)
1325e6580ceeSShri Abhyankar 
1326e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1327e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1328e6580ceeSShri Abhyankar 
1329e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1330e6580ceeSShri Abhyankar 
1331e6580ceeSShri Abhyankar           /* Third Column */
1332e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1333e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1334e6580ceeSShri Abhyankar 
1335e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1336e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1337e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1338e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM0)
1339e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM7)
1340e6580ceeSShri Abhyankar 
1341e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1342e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
1343e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM1)
1344e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM4)
1345e6580ceeSShri Abhyankar 
1346e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1347e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1348e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM2)
1349e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM5)
1350e6580ceeSShri Abhyankar 
1351e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1352e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1353e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1354e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM7)
1355e6580ceeSShri Abhyankar 
1356e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1357e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1358e6580ceeSShri Abhyankar 
1359e6580ceeSShri Abhyankar           /* Fourth Column */
1360e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1361e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1362e6580ceeSShri Abhyankar 
1363e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1364e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1365e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1366e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
1367e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1368e6580ceeSShri Abhyankar 
1369e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1370e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1371e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
1372e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM6)
1373e6580ceeSShri Abhyankar 
1374e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1375e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1376e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1377e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM7)
1378e6580ceeSShri Abhyankar 
1379e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1380e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1381e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM3)
1382e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1383e6580ceeSShri Abhyankar 
1384e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1385e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1386e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1387e6580ceeSShri Abhyankar           pv += 16;
1388e6580ceeSShri Abhyankar         }
1389e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1390e6580ceeSShri Abhyankar       }
1391e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
1392e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
1393e6580ceeSShri Abhyankar /*        bjtmp++; */
1394e6580ceeSShri Abhyankar     }
1395e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1396e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
1397e6580ceeSShri Abhyankar     pj = bj + bi[i];
1398e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
1399e6580ceeSShri Abhyankar 
1400e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1401e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1402e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)pj[j]);
1403e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
1404e6580ceeSShri Abhyankar 
1405e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
1406e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1407e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1408e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1409e6580ceeSShri Abhyankar 
1410e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1411e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1412e6580ceeSShri Abhyankar 
1413e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1414e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1415e6580ceeSShri Abhyankar 
1416e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1417e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1418e6580ceeSShri Abhyankar 
1419e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1420e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1421e6580ceeSShri Abhyankar 
1422e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1423e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1424e6580ceeSShri Abhyankar 
1425e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1426e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1427e6580ceeSShri Abhyankar 
1428e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1429e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1430e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1431e6580ceeSShri Abhyankar       pv += 16;
1432e6580ceeSShri Abhyankar     }
1433e6580ceeSShri Abhyankar     /* invert diagonal block */
1434e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
1435e6580ceeSShri Abhyankar     if (pivotinblocks) {
1436*a455e926SHong Zhang       PetscBool allowzeropivot,zeropivotdetected;
1437*a455e926SHong Zhang       allowzeropivot = PetscNot(A->erroriffailure);
1438*a455e926SHong Zhang       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
14392e92ee13SHong Zhang       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1440e6580ceeSShri Abhyankar     } else {
144196b95a6bSBarry Smith       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1442e6580ceeSShri Abhyankar     }
144396b95a6bSBarry Smith /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1444e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1445e6580ceeSShri Abhyankar   }
1446e6580ceeSShri Abhyankar 
1447e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
144826fbe8dcSKarl Rupp 
1449e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1450e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1451e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
145226fbe8dcSKarl Rupp 
1453766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
1454e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1455e6580ceeSShri Abhyankar   SSE_SCOPE_END;
1456e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
1457e6580ceeSShri Abhyankar }
1458e6580ceeSShri Abhyankar 
1459e6580ceeSShri Abhyankar #undef __FUNCT__
1460e6580ceeSShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1461e6580ceeSShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1462e6580ceeSShri Abhyankar {
1463e6580ceeSShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1464e6580ceeSShri Abhyankar   PetscErrorCode ierr;
1465e6580ceeSShri Abhyankar   int            i,j,n = a->mbs;
1466e6580ceeSShri Abhyankar   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1467e6580ceeSShri Abhyankar   unsigned int   row;
1468e6580ceeSShri Abhyankar   int            *ajtmpold,nz,*bi=b->i;
1469e6580ceeSShri Abhyankar   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1470e6580ceeSShri Abhyankar   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1471e6580ceeSShri Abhyankar   MatScalar      *ba    = b->a,*aa = a->a;
1472e6580ceeSShri Abhyankar   int            nonzero=0;
1473*a455e926SHong Zhang   PetscBool      pivotinblocks = b->pivotinblocks;
1474182b8fbaSHong Zhang   PetscReal      shift = info->shiftamount;
1475e6580ceeSShri Abhyankar 
1476e6580ceeSShri Abhyankar   PetscFunctionBegin;
1477e6580ceeSShri Abhyankar   SSE_SCOPE_BEGIN;
1478e6580ceeSShri Abhyankar 
1479e32f2f54SBarry Smith   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1480e32f2f54SBarry Smith   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1481785e854fSJed Brown   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1482e32f2f54SBarry Smith   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1483e6580ceeSShri Abhyankar /*    if ((unsigned long)bj==(unsigned long)aj) { */
1484e6580ceeSShri Abhyankar /*      colscale = 4; */
1485e6580ceeSShri Abhyankar /*    } */
1486e6580ceeSShri Abhyankar   if ((unsigned long)bj==(unsigned long)aj) {
1487e6580ceeSShri Abhyankar     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1488e6580ceeSShri Abhyankar   }
1489e6580ceeSShri Abhyankar 
1490e6580ceeSShri Abhyankar   for (i=0; i<n; i++) {
1491e6580ceeSShri Abhyankar     nz    = bi[i+1] - bi[i];
1492e6580ceeSShri Abhyankar     bjtmp = bj + bi[i];
1493e6580ceeSShri Abhyankar     /* zero out the 4x4 block accumulators */
1494e6580ceeSShri Abhyankar     /* zero out one register */
1495e6580ceeSShri Abhyankar     XOR_PS(XMM7,XMM7);
1496e6580ceeSShri Abhyankar     for  (j=0; j<nz; j++) {
1497e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)bjtmp[j]);
1498e6580ceeSShri Abhyankar /*        x = rtmp+4*bjtmp[j]; */
1499e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(x)
1500e6580ceeSShri Abhyankar       /* Copy zero register to memory locations */
1501e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1502e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1503e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1504e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1505e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1506e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1507e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1508e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1509e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1510e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1511e6580ceeSShri Abhyankar     }
1512e6580ceeSShri Abhyankar     /* load in initial (unfactored row) */
1513e6580ceeSShri Abhyankar     nz       = ai[i+1] - ai[i];
1514e6580ceeSShri Abhyankar     ajtmpold = aj + ai[i];
1515e6580ceeSShri Abhyankar     v        = aa + 16*ai[i];
1516e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1517e6580ceeSShri Abhyankar       x = rtmp+16*ajtmpold[j];
1518e6580ceeSShri Abhyankar /*        x = rtmp+colscale*ajtmpold[j]; */
1519e6580ceeSShri Abhyankar       /* Copy v block into x block */
1520e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(v,x)
1521e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1522e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1523e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1524e6580ceeSShri Abhyankar 
1525e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1526e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1527e6580ceeSShri Abhyankar 
1528e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1529e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1530e6580ceeSShri Abhyankar 
1531e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1532e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1533e6580ceeSShri Abhyankar 
1534e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1535e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1536e6580ceeSShri Abhyankar 
1537e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1538e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1539e6580ceeSShri Abhyankar 
1540e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1541e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1542e6580ceeSShri Abhyankar 
1543e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1544e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1545e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1546e6580ceeSShri Abhyankar 
1547e6580ceeSShri Abhyankar       v += 16;
1548e6580ceeSShri Abhyankar     }
1549e6580ceeSShri Abhyankar /*      row = (*bjtmp++)/4; */
1550e6580ceeSShri Abhyankar     row = (unsigned int)(*bjtmp++);
1551e6580ceeSShri Abhyankar     while (row < i) {
1552e6580ceeSShri Abhyankar       pc = rtmp + 16*row;
1553e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_1(pc)
1554e6580ceeSShri Abhyankar       /* Load block from lower triangle */
1555e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1556e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1557e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1558e6580ceeSShri Abhyankar 
1559e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1560e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1561e6580ceeSShri Abhyankar 
1562e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1563e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1564e6580ceeSShri Abhyankar 
1565e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1566e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1567e6580ceeSShri Abhyankar 
1568e6580ceeSShri Abhyankar       /* Compare block to zero block */
1569e6580ceeSShri Abhyankar 
1570e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM4,XMM7)
1571e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM4,XMM0)
1572e6580ceeSShri Abhyankar 
1573e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM5,XMM7)
1574e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM5,XMM1)
1575e6580ceeSShri Abhyankar 
1576e6580ceeSShri Abhyankar       SSE_COPY_PS(XMM6,XMM7)
1577e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM6,XMM2)
1578e6580ceeSShri Abhyankar 
1579e6580ceeSShri Abhyankar       SSE_CMPNEQ_PS(XMM7,XMM3)
1580e6580ceeSShri Abhyankar 
1581e6580ceeSShri Abhyankar       /* Reduce the comparisons to one SSE register */
1582e6580ceeSShri Abhyankar       SSE_OR_PS(XMM6,XMM7)
1583e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5,XMM4)
1584e6580ceeSShri Abhyankar       SSE_OR_PS(XMM5,XMM6)
1585e6580ceeSShri Abhyankar       SSE_INLINE_END_1;
1586e6580ceeSShri Abhyankar 
1587e6580ceeSShri Abhyankar       /* Reduce the one SSE register to an integer register for branching */
1588e6580ceeSShri Abhyankar       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1589e6580ceeSShri Abhyankar       MOVEMASK(nonzero,XMM5);
1590e6580ceeSShri Abhyankar 
1591e6580ceeSShri Abhyankar       /* If block is nonzero ... */
1592e6580ceeSShri Abhyankar       if (nonzero) {
1593e6580ceeSShri Abhyankar         pv = ba + 16*diag_offset[row];
1594e6580ceeSShri Abhyankar         PREFETCH_L1(&pv[16]);
1595e6580ceeSShri Abhyankar         pj = bj + diag_offset[row] + 1;
1596e6580ceeSShri Abhyankar 
1597e6580ceeSShri Abhyankar         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1598e6580ceeSShri Abhyankar         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1599e6580ceeSShri Abhyankar         /* but the diagonal was inverted already */
1600e6580ceeSShri Abhyankar         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1601e6580ceeSShri Abhyankar 
1602e6580ceeSShri Abhyankar         SSE_INLINE_BEGIN_2(pv,pc)
1603e6580ceeSShri Abhyankar         /* Column 0, product is accumulated in XMM4 */
1604e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1605e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM4,XMM4,0x00)
1606e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM4,XMM0)
1607e6580ceeSShri Abhyankar 
1608e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1609e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5,XMM5,0x00)
1610e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5,XMM1)
1611e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM5)
1612e6580ceeSShri Abhyankar 
1613e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1614e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1615e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM2)
1616e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM6)
1617e6580ceeSShri Abhyankar 
1618e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1619e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1620e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM3)
1621e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM4,XMM7)
1622e6580ceeSShri Abhyankar 
1623e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1624e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1625e6580ceeSShri Abhyankar 
1626e6580ceeSShri Abhyankar         /* Column 1, product is accumulated in XMM5 */
1627e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1628e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM5,XMM5,0x00)
1629e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM5,XMM0)
1630e6580ceeSShri Abhyankar 
1631e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1632e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1633e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM1)
1634e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM6)
1635e6580ceeSShri Abhyankar 
1636e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1637e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1638e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM2)
1639e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM7)
1640e6580ceeSShri Abhyankar 
1641e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1642e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1643e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM3)
1644e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM5,XMM6)
1645e6580ceeSShri Abhyankar 
1646e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1647e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1648e6580ceeSShri Abhyankar 
1649e6580ceeSShri Abhyankar         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1650e6580ceeSShri Abhyankar 
1651e6580ceeSShri Abhyankar         /* Column 2, product is accumulated in XMM6 */
1652e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1653e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM6,XMM6,0x00)
1654e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM6,XMM0)
1655e6580ceeSShri Abhyankar 
1656e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1657e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1658e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM1)
1659e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
1660e6580ceeSShri Abhyankar 
1661e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1662e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1663e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM2)
1664e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
1665e6580ceeSShri Abhyankar 
1666e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1667e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1668e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM7,XMM3)
1669e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM6,XMM7)
1670e6580ceeSShri Abhyankar 
1671e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1672e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1673e6580ceeSShri Abhyankar 
1674e6580ceeSShri Abhyankar         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1675e6580ceeSShri Abhyankar         /* Column 3, product is accumulated in XMM0 */
1676e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1677e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1678e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM0,XMM7)
1679e6580ceeSShri Abhyankar 
1680e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1681e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1682e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1,XMM7)
1683e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM1)
1684e6580ceeSShri Abhyankar 
1685e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1686e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM1,XMM1,0x00)
1687e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM1,XMM2)
1688e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM1)
1689e6580ceeSShri Abhyankar 
1690e6580ceeSShri Abhyankar         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1691e6580ceeSShri Abhyankar         SSE_SHUFFLE(XMM7,XMM7,0x00)
1692e6580ceeSShri Abhyankar         SSE_MULT_PS(XMM3,XMM7)
1693e6580ceeSShri Abhyankar         SSE_ADD_PS(XMM0,XMM3)
1694e6580ceeSShri Abhyankar 
1695e6580ceeSShri Abhyankar         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1696e6580ceeSShri Abhyankar         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1697e6580ceeSShri Abhyankar 
1698e6580ceeSShri Abhyankar         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1699e6580ceeSShri Abhyankar         /* This is code to be maintained and read by humans afterall. */
1700e6580ceeSShri Abhyankar         /* Copy Multiplier Col 3 into XMM3 */
1701e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM3,XMM0)
1702e6580ceeSShri Abhyankar         /* Copy Multiplier Col 2 into XMM2 */
1703e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM2,XMM6)
1704e6580ceeSShri Abhyankar         /* Copy Multiplier Col 1 into XMM1 */
1705e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM1,XMM5)
1706e6580ceeSShri Abhyankar         /* Copy Multiplier Col 0 into XMM0 */
1707e6580ceeSShri Abhyankar         SSE_COPY_PS(XMM0,XMM4)
1708e6580ceeSShri Abhyankar         SSE_INLINE_END_2;
1709e6580ceeSShri Abhyankar 
1710e6580ceeSShri Abhyankar         /* Update the row: */
1711e6580ceeSShri Abhyankar         nz  = bi[row+1] - diag_offset[row] - 1;
1712e6580ceeSShri Abhyankar         pv += 16;
1713e6580ceeSShri Abhyankar         for (j=0; j<nz; j++) {
1714e6580ceeSShri Abhyankar           PREFETCH_L1(&pv[16]);
1715e6580ceeSShri Abhyankar           x = rtmp + 16*((unsigned int)pj[j]);
1716e6580ceeSShri Abhyankar /*            x = rtmp + 4*pj[j]; */
1717e6580ceeSShri Abhyankar 
1718e6580ceeSShri Abhyankar           /* X:=X-M*PV, One column at a time */
1719e6580ceeSShri Abhyankar           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1720e6580ceeSShri Abhyankar           SSE_INLINE_BEGIN_2(x,pv)
1721e6580ceeSShri Abhyankar           /* Load First Column of X*/
1722e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1723e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1724e6580ceeSShri Abhyankar 
1725e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1726e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1727e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1728e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
1729e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1730e6580ceeSShri Abhyankar 
1731e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1732e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1733e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
1734e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM6)
1735e6580ceeSShri Abhyankar 
1736e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1737e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1738e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1739e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM7)
1740e6580ceeSShri Abhyankar 
1741e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1742e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1743e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM3)
1744e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1745e6580ceeSShri Abhyankar 
1746e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1747e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1748e6580ceeSShri Abhyankar 
1749e6580ceeSShri Abhyankar           /* Second Column */
1750e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1751e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1752e6580ceeSShri Abhyankar 
1753e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1754e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1755e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1756e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM0)
1757e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM6)
1758e6580ceeSShri Abhyankar 
1759e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1760e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1761e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM1)
1762e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM7)
1763e6580ceeSShri Abhyankar 
1764e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1765e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
1766e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM2)
1767e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM4)
1768e6580ceeSShri Abhyankar 
1769e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1770e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1771e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM3)
1772e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM5,XMM6)
1773e6580ceeSShri Abhyankar 
1774e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1775e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1776e6580ceeSShri Abhyankar 
1777e6580ceeSShri Abhyankar           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1778e6580ceeSShri Abhyankar 
1779e6580ceeSShri Abhyankar           /* Third Column */
1780e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1781e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1782e6580ceeSShri Abhyankar 
1783e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1784e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1785e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1786e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM0)
1787e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM7)
1788e6580ceeSShri Abhyankar 
1789e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1790e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM4,XMM4,0x00)
1791e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM4,XMM1)
1792e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM4)
1793e6580ceeSShri Abhyankar 
1794e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1795e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1796e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM2)
1797e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM5)
1798e6580ceeSShri Abhyankar 
1799e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1800e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1801e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM3)
1802e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM6,XMM7)
1803e6580ceeSShri Abhyankar 
1804e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1805e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1806e6580ceeSShri Abhyankar 
1807e6580ceeSShri Abhyankar           /* Fourth Column */
1808e6580ceeSShri Abhyankar           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1809e6580ceeSShri Abhyankar           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1810e6580ceeSShri Abhyankar 
1811e6580ceeSShri Abhyankar           /* Matrix-Vector Product: */
1812e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1813e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1814e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM0)
1815e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1816e6580ceeSShri Abhyankar 
1817e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1818e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM6,XMM6,0x00)
1819e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM6,XMM1)
1820e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM6)
1821e6580ceeSShri Abhyankar 
1822e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1823e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM7,XMM7,0x00)
1824e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM7,XMM2)
1825e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM7)
1826e6580ceeSShri Abhyankar 
1827e6580ceeSShri Abhyankar           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1828e6580ceeSShri Abhyankar           SSE_SHUFFLE(XMM5,XMM5,0x00)
1829e6580ceeSShri Abhyankar           SSE_MULT_PS(XMM5,XMM3)
1830e6580ceeSShri Abhyankar           SSE_SUB_PS(XMM4,XMM5)
1831e6580ceeSShri Abhyankar 
1832e6580ceeSShri Abhyankar           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1833e6580ceeSShri Abhyankar           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1834e6580ceeSShri Abhyankar           SSE_INLINE_END_2;
1835e6580ceeSShri Abhyankar           pv += 16;
1836e6580ceeSShri Abhyankar         }
1837e6580ceeSShri Abhyankar         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1838e6580ceeSShri Abhyankar       }
1839e6580ceeSShri Abhyankar       row = (unsigned int)(*bjtmp++);
1840e6580ceeSShri Abhyankar /*        row = (*bjtmp++)/4; */
1841e6580ceeSShri Abhyankar /*        bjtmp++; */
1842e6580ceeSShri Abhyankar     }
1843e6580ceeSShri Abhyankar     /* finished row so stick it into b->a */
1844e6580ceeSShri Abhyankar     pv = ba + 16*bi[i];
1845e6580ceeSShri Abhyankar     pj = bj + bi[i];
1846e6580ceeSShri Abhyankar     nz = bi[i+1] - bi[i];
1847e6580ceeSShri Abhyankar 
1848e6580ceeSShri Abhyankar     /* Copy x block back into pv block */
1849e6580ceeSShri Abhyankar     for (j=0; j<nz; j++) {
1850e6580ceeSShri Abhyankar       x = rtmp+16*((unsigned int)pj[j]);
1851e6580ceeSShri Abhyankar /*        x  = rtmp+4*pj[j]; */
1852e6580ceeSShri Abhyankar 
1853e6580ceeSShri Abhyankar       SSE_INLINE_BEGIN_2(x,pv)
1854e6580ceeSShri Abhyankar       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1855e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1856e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1857e6580ceeSShri Abhyankar 
1858e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1859e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1860e6580ceeSShri Abhyankar 
1861e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1862e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1863e6580ceeSShri Abhyankar 
1864e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1865e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1866e6580ceeSShri Abhyankar 
1867e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1868e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1869e6580ceeSShri Abhyankar 
1870e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1871e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1872e6580ceeSShri Abhyankar 
1873e6580ceeSShri Abhyankar       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1874e6580ceeSShri Abhyankar       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1875e6580ceeSShri Abhyankar 
1876e6580ceeSShri Abhyankar       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1877e6580ceeSShri Abhyankar       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1878e6580ceeSShri Abhyankar       SSE_INLINE_END_2;
1879e6580ceeSShri Abhyankar       pv += 16;
1880e6580ceeSShri Abhyankar     }
1881e6580ceeSShri Abhyankar     /* invert diagonal block */
1882e6580ceeSShri Abhyankar     w = ba + 16*diag_offset[i];
1883e6580ceeSShri Abhyankar     if (pivotinblocks) {
1884*a455e926SHong Zhang       PetscBool allowzeropivot,zeropivotdetected;
1885*a455e926SHong Zhang       allowzeropivot = PetscNot(A->erroriffailure);
1886*a455e926SHong Zhang       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,,&zeropivotdetected);CHKERRQ(ierr);
18872e92ee13SHong Zhang       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1888e6580ceeSShri Abhyankar     } else {
188996b95a6bSBarry Smith       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1890e6580ceeSShri Abhyankar     }
189196b95a6bSBarry Smith /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1892e6580ceeSShri Abhyankar     /* Note: Using Kramer's rule, flop count below might be infairly high or low? */
1893e6580ceeSShri Abhyankar   }
1894e6580ceeSShri Abhyankar 
1895e6580ceeSShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
189626fbe8dcSKarl Rupp 
1897e6580ceeSShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1898e6580ceeSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1899e6580ceeSShri Abhyankar   C->assembled           = PETSC_TRUE;
190026fbe8dcSKarl Rupp 
1901766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
1902e6580ceeSShri Abhyankar   /* Flop Count from inverting diagonal blocks */
1903e6580ceeSShri Abhyankar   SSE_SCOPE_END;
1904e6580ceeSShri Abhyankar   PetscFunctionReturn(0);
1905e6580ceeSShri Abhyankar }
1906e6580ceeSShri Abhyankar 
1907e6580ceeSShri Abhyankar #endif
1908