
/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/* ------------------------------------------------------------*/
/*
      Version for when blocks are 4 by 4
*/
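
/*
   Sketch of the algorithm used below (the standard row-by-row block LU,
   specialized to 4x4 = 16-entry blocks; names refer to the code that follows):

     for each block row i:
       scatter block row i of A (permuted by r/ic) into the work array rtmp
       for each already-factored block row `row` appearing to the left of the diagonal:
         pc = rtmp + 16*row                     the candidate multiplier block
         if pc != 0:
           pc = pc * inv(U(row,row))            the diagonal block is stored inverted
           rtmp(:,pj[j]) -= pc * U(row,pj[j])   update the remainder of row i
       gather rtmp back into the factor (b->a)
       invert the diagonal block in place (optionally shifting a zero pivot)
*/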
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba           = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal      shift         = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv    = ba + 16*diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15   = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
          x     = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/
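
/*
   For reference, a plain-C equivalent of the hand-inlined 4x4 kernels named
   above (a sketch only, not used by the build; it assumes the 4x4 blocks are
   stored column-major, entry (r,c) at [r+4*c], which matches the
   p1..p16/x1..x16 indexing in the loops above).  It computes a := a*b,
   accumulating into the scratch block w first:

     static void Sketch_A_gets_A_times_B_4(MatScalar *a,const MatScalar *b,MatScalar *w)
     {
       PetscInt r,c,k;
       for (c=0; c<4; c++) {
         for (r=0; r<4; r++) {
           w[r+4*c] = 0.0;
           for (k=0; k<4; k++) w[r+4*c] += a[r+4*k]*b[k+4*c];
         }
       }
       for (k=0; k<16; k++) a[k] = w[k];
     }
*/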

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C     = B;
  Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      zeropivotdetected;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }
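  /* A note on the shift (an interpretation, based on how these kernels are
     used elsewhere in PETSc): with MAT_SHIFT_INBLOCKS the amount above is
     handed to PetscKernel_A_gets_inverse_A_4() below, which substitutes
     `shift` for a zero pivot encountered while inverting a 4x4 diagonal
     block rather than failing the factorization outright. */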

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
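    /* Layout note: bdiag[] decreases with i in this storage scheme, so U(i,:)
       occupies bj/ba positions bdiag[i+1]+1 .. bdiag[i], with the diagonal
       block stored last at bdiag[i].  That is why the count just above is
       bdiag[i]-bdiag[i+1] (diagonal included) while the U copy at the end of
       this loop uses bdiag[i]-bdiag[i+1]-1 (diagonal excluded). */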

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark the diagonal and invert it for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
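
/*
   These numeric-factorization routines are not called directly; they are
   reached through the generic factor interface once symbolic factorization
   has selected them.  A minimal usage sketch (error checking omitted;
   assumes `A` is an assembled bs=4 MATSEQBAIJ matrix and `b`, `x` are
   compatible vectors):

     Mat           F;
     IS            row,col;
     MatFactorInfo info;

     MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
     MatGetOrdering(A,MATORDERINGND,&row,&col);
     MatFactorInfoInitialize(&info);
     MatLUFactorSymbolic(F,A,row,col,&info);
     MatLUFactorNumeric(F,A,&info);   (dispatches to one of the routines here)
     MatSolve(F,b,x);
*/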

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default version for when blocks are 4 by 4, using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba           = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal      shift         = info->shiftamount;

  PetscFunctionBegin;
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv    = ba + 16*diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15   = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
        nz     = bi[row+1] - diag_offset[row] - 1;
        pv    += 16;
        for (j=0; j<nz; j++) {
          x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
          x     = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      zeropivotdetected;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark the diagonal and invert it for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
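
/*
   The SSE_* and XMM* macros below come from the header named by
   PETSC_HAVE_SSE and wrap single-precision SSE inline assembly.  The scheme
   used throughout: keep the four columns of the current 4x4 block in
   registers XMM0-XMM3 and form each result column by
   broadcast-multiply-accumulate.  As a rough intrinsics rendering of one
   product column (an illustration only, not what the macros expand to;
   m0..m3 hold the columns of M, x and y are column-major 4x4 float blocks):

     __m128 acc = _mm_mul_ps(_mm_set1_ps(x[4*c+0]),m0);
     acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(x[4*c+1]),m1));
     acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(x[4*c+2]),m2));
     acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(x[4*c+3]),m3));
     _mm_store_ps(&y[4*c],acc);

   i.e. y(:,c) = M*x(:,c), which is exactly the "Column 0..3" pattern below.
*/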
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C  = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
/*    int            nonzero=0,colscale = 16; */
  PetscBool pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal shift         = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
/*        row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

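/*
   The _usj variants below re-run the same algorithm with the column indices
   reinterpreted as unsigned short ("usj" presumably meaning unsigned-short
   j-indices), halving the index bandwidth for matrices with fewer than
   65536 block columns.  The _Inplace version additionally assumes the
   factor shares its column index array with A (bj == aj); the driver below
   detects that case and redirects to it.
*/
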
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
{
  Mat            A  = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
/*    int            nonzero=0,colscale = 16; */
  PetscBool pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal shift         = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
/*        x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
/*        row = (*bjtmp++)/4; */
/*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
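
/*
   For reference, a plain-C sketch of the block kernel the SSE macros above
   implement.  This is not part of the build and the names (Multiply4x4,
   SubtractProduct4x4) are illustrative only.  Each 4x4 block occupies 16
   consecutive MatScalars in column-major order, so entry (r,c) of a block
   starting at p is p[4*c+r].
*/
#if 0
/* z = x*y for column-major 4x4 blocks: used when forming the multiplier
   pc <- pc*inv(diag), since the diagonal blocks are stored inverted */
static void Multiply4x4(const MatScalar *x,const MatScalar *y,MatScalar *z)
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      z[4*c+r] = 0.0;
      for (k=0; k<4; k++) z[4*c+r] += x[4*k+r]*y[4*c+k];
    }
  }
}

/* x -= m*pv: the "X := X - M*PV" update applied to each block right of
   the diagonal in the row being factored */
static void SubtractProduct4x4(MatScalar *x,const MatScalar *m,const MatScalar *pv)
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) x[4*c+r] -= m[4*k+r]*pv[4*c+k];
    }
  }
}
#endif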

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned int   row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
/*    int            nonzero=0,colscale = 16; */
  PetscBool pivotinblocks = b->pivotinblocks,zeropivotdetected;
  PetscReal shift         = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  if ((unsigned long)bj==(unsigned long)aj) {
    ierr = PetscFree(rtmp);CHKERRQ(ierr); /* avoid leaking rtmp on the in-place path */
    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
  }
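
  /*
     Note on the "usj" variant: the column indices b->j are stored as
     unsigned short, which halves index-memory traffic but limits the
     matrix to fewer than 65536 block columns.  A hypothetical sketch of
     the narrowing (names sj, bj_int, and k are illustrative, not from
     this file):

       unsigned short *sj;
       ierr = PetscMalloc1(nz,&sj);CHKERRQ(ierr);
       for (k=0; k<nz; k++) sj[k] = (unsigned short)bj_int[k];
  */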

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
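    /*
       Scalar equivalent of the zeroing loop above (a sketch, not the
       build path):
         for (j=0; j<nz; j++) {
           x = rtmp + 16*bjtmp[j];
           for (k=0; k<16; k++) x[k] = 0.0;
         }
    */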
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
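    /*
       Per block, the load above is equivalent to
         PetscMemcpy(rtmp+16*ajtmpold[j],v,16*sizeof(MatScalar));
       the explicit LOADL/H-STOREL/H pairs simply stream the copy through
       XMM registers.
    */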
/*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, a full-width LOAD might be more efficient than LOADL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);
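      /*
         MOVEMASK gathers the sign bit of each of the four packed floats in
         XMM5 into the low four bits of the integer nonzero, so nonzero is
         set exactly when some CMPNEQ lane saw an entry that differs from
         zero, i.e. when this 4x4 block has at least one nonzero and the
         elimination step cannot be skipped.
      */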

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;
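        /*
           In scalar terms (sketch): with column-major 4x4 blocks this is
             pc_new(r,c) = sum_k pc(r,k)*pv(k,c),  pv = inv(L_jj),
           i.e. pc <- pc*inv(L_jj), the Multiply4x4 operation sketched
           after the previous function.
        */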

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
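        /*
           Flop accounting: forming the multiplier is one 4x4 product,
           4 columns x (16 multiplies + 12 adds) = 112 flops; each of the
           nz trailing blocks gets X := X - M*PV at 4 columns x
           (16 multiplies + 16 subtractions) = 128 flops, giving
           128*nz + 112.
        */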
      }
      row = (unsigned int)(*bjtmp++);
/*        row = (*bjtmp++)/4; */
/*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: The flop count below uses Cramer's rule for the block inverses, so it may be unfairly high or low. */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*64*b->mbs);CHKERRQ(ierr); /* (4/3)*bs^3 with bs = 4 */
  /* Flop Count from inverting diagonal blocks */
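  /*
     The factor 1.333333333333*64 is (4/3)*4^3, roughly 85 flops per 4x4
     diagonal-block inversion, applied to all b->mbs diagonal blocks; as
     noted above, this Cramer's-rule-based estimate is approximate.
  */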
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
