xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision a455e9261d77363e13c8b22a0e0ea831d71fb0ff)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 /* ------------------------------------------------------------*/
9 /*
10       Version for when blocks are 4 by 4
11 */
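/*
      Each 4 by 4 block occupies 16 consecutive MatScalar entries, indexed in the
      unrolled arithmetic below as M(row,col) = M[4*col+row] (column-major within
      the block); an expression such as pc[0] = p1*x1 + p5*x2 + p9*x3 + p13*x4 is
      therefore one entry of the 4x4 block product pc := pc * pv.
*/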
12 #undef __FUNCT__
13 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
14 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
15 {
16   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
17   IS             isrow = b->row,isicol = b->icol;
18   PetscErrorCode ierr;
19   const PetscInt *r,*ic;
20   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21   PetscInt       *ajtmpold,*ajtmp,nz,row;
22   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
23   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
24   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
25   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
26   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
27   MatScalar      m13,m14,m15,m16;
28   MatScalar      *ba           = b->a,*aa = a->a;
29   PetscBool      pivotinblocks = b->pivotinblocks;
30   PetscReal      shift         = info->shiftamount;
31 
32   PetscFunctionBegin;
33   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
34   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
35   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
36 
37   for (i=0; i<n; i++) {
38     nz    = bi[i+1] - bi[i];
39     ajtmp = bj + bi[i];
40     for  (j=0; j<nz; j++) {
41       x     = rtmp+16*ajtmp[j];
42       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
43       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44     }
45     /* load in the initial (unfactored) row */
46     idx      = r[i];
47     nz       = ai[idx+1] - ai[idx];
48     ajtmpold = aj + ai[idx];
49     v        = aa + 16*ai[idx];
50     for (j=0; j<nz; j++) {
51       x     = rtmp+16*ic[ajtmpold[j]];
52       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
53       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
54       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55       x[14] = v[14]; x[15] = v[15];
56       v    += 16;
57     }
58     row = *ajtmp++;
59     while (row < i) {
60       pc  = rtmp + 16*row;
61       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
62       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
63       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64       p15 = pc[14]; p16 = pc[15];
65       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68           || p16 != 0.0) {
69         pv    = ba + 16*diag_offset[row];
70         pj    = bj + diag_offset[row] + 1;
71         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
72         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
73         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74         x15   = pv[14]; x16 = pv[15];
75         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
76         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
77         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
78         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
79 
80         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
81         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
82         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
83         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
84 
85         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
86         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
87         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
88         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
89 
90         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
91         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
92         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
93         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
94 
95         nz  = bi[row+1] - diag_offset[row] - 1;
96         pv += 16;
97         for (j=0; j<nz; j++) {
98           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
99           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
102           x     = rtmp + 16*pj[j];
103           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
107 
108           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
112 
113           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
117 
118           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
122 
123           pv += 16;
124         }
125         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
126       }
127       row = *ajtmp++;
128     }
129     /* finished row so stick it into b->a */
130     pv = ba + 16*bi[i];
131     pj = bj + bi[i];
132     nz = bi[i+1] - bi[i];
133     for (j=0; j<nz; j++) {
134       x      = rtmp+16*pj[j];
135       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139       pv    += 16;
140     }
141     /* invert diagonal block */
142     w = ba + 16*diag_offset[i];
143     if (pivotinblocks) {
144       PetscBool allowzeropivot,zeropivotdetected;
145       allowzeropivot = PetscNot(A->erroriffailure);
146       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
147       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
148     } else {
149       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
150     }
151   }
152 
153   ierr = PetscFree(rtmp);CHKERRQ(ierr);
154   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
155   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
156 
157   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
158   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
159   C->assembled           = PETSC_TRUE;
160 
161   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
162   PetscFunctionReturn(0);
163 }
164 
165 /* MatLUFactorNumeric_SeqBAIJ_4 -
166      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
167        PetscKernel_A_gets_A_times_B()
168        PetscKernel_A_gets_A_minus_B_times_C()
169        PetscKernel_A_gets_inverse_A()
170 */
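
/*
   A plain-C sketch of what the first two kernels named above compute on 4 by 4
   column-major blocks. The Sketch_* names are illustrative only (not PETSc API);
   the real kernels are supplied by PETSc's private block-kernel headers.
*/
#if 0
static void Sketch_A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt r,c,k;
  for (k=0; k<16; k++) W[k] = A[k];             /* W := A (work copy) */
  for (c=0; c<4; c++) {                         /* A := W*B, blocks stored column-major */
    for (r=0; r<4; r++) {
      A[4*c+r] = 0.0;
      for (k=0; k<4; k++) A[4*c+r] += W[4*k+r]*B[4*c+k];
    }
  }
}

static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {                         /* A := A - B*C on 4 by 4 blocks */
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) A[4*c+r] -= B[4*k+r]*C[4*c+k];
    }
  }
}
#endif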
171 
172 #undef __FUNCT__
173 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
174 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
175 {
176   Mat            C     = B;
177   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
178   IS             isrow = b->row,isicol = b->icol;
179   PetscErrorCode ierr;
180   const PetscInt *r,*ic;
181   PetscInt       i,j,k,nz,nzL,row;
182   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
183   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
184   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
185   PetscInt       flg;
186   PetscReal      shift;
187   PetscBool      allowzeropivot,zeropivotdetected;
188 
189   PetscFunctionBegin;
190   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
191   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
192 
193   if (info->shifttype == MAT_SHIFT_NONE) {
194     shift = 0;
195   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
196     shift = info->shiftamount;
197   }
198 
199   /* generate work space needed by the factorization */
200   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
201   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
202 
203   for (i=0; i<n; i++) {
204     /* zero rtmp */
205     /* L part */
206     nz    = bi[i+1] - bi[i];
207     bjtmp = bj + bi[i];
208     for  (j=0; j<nz; j++) {
209       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
210     }
211 
212     /* U part */
213     nz    = bdiag[i]-bdiag[i+1];
214     bjtmp = bj + bdiag[i+1]+1;
215     for  (j=0; j<nz; j++) {
216       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
217     }
218 
219     /* load in the initial (unfactored) row */
220     nz    = ai[r[i]+1] - ai[r[i]];
221     ajtmp = aj + ai[r[i]];
222     v     = aa + bs2*ai[r[i]];
223     for (j=0; j<nz; j++) {
224       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
225     }
226 
227     /* elimination */
228     bjtmp = bj + bi[i];
229     nzL   = bi[i+1] - bi[i];
230     for (k=0; k < nzL; k++) {
231       row = bjtmp[k];
232       pc  = rtmp + bs2*row;
233       for (flg=0,j=0; j<bs2; j++) {
234         if (pc[j]!=0.0) {
235           flg = 1;
236           break;
237         }
238       }
239       if (flg) {
240         pv = b->a + bs2*bdiag[row];
241         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
242         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
243 
244         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
245         pv = b->a + bs2*(bdiag[row+1]+1);
246         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
247         for (j=0; j<nz; j++) {
248           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
249           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
250           v    = rtmp + bs2*pj[j];
251           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
252           pv  += bs2;
253         }
254         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
255       }
256     }
257 
258     /* finished row so stick it into b->a */
259     /* L part */
260     pv = b->a + bs2*bi[i];
261     pj = b->j + bi[i];
262     nz = bi[i+1] - bi[i];
263     for (j=0; j<nz; j++) {
264       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
265     }
266 
267     /* Mark diagonal and invert diagonal for simpler triangular solves */
268     pv   = b->a + bs2*bdiag[i];
269     pj   = b->j + bdiag[i];
270     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
271     allowzeropivot = PetscNot(A->erroriffailure);
272     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
273     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
274 
275     /* U part */
276     pv = b->a + bs2*(bdiag[i+1]+1);
277     pj = b->j + bdiag[i+1]+1;
278     nz = bdiag[i] - bdiag[i+1] - 1;
279     for (j=0; j<nz; j++) {
280       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
281     }
282   }
283 
284   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
285   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
286   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
287 
288   C->ops->solve          = MatSolve_SeqBAIJ_4;
289   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
290   C->assembled           = PETSC_TRUE;
291 
292   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
293   PetscFunctionReturn(0);
294 }
295 
296 #undef __FUNCT__
297 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
298 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
299 {
300 /*
301     Default Version for when blocks are 4 by 4 Using natural ordering
302 */
303   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
304   PetscErrorCode ierr;
305   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
306   PetscInt       *ajtmpold,*ajtmp,nz,row;
307   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
308   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
309   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
310   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
311   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
312   MatScalar      m13,m14,m15,m16;
313   MatScalar      *ba           = b->a,*aa = a->a;
314   PetscBool      pivotinblocks = b->pivotinblocks;
315   PetscReal      shift         = info->shiftamount;
316 
317   PetscFunctionBegin;
318   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
319 
320   for (i=0; i<n; i++) {
321     nz    = bi[i+1] - bi[i];
322     ajtmp = bj + bi[i];
323     for  (j=0; j<nz; j++) {
324       x     = rtmp+16*ajtmp[j];
325       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
326       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
327     }
328     /* load in the initial (unfactored) row */
329     nz       = ai[i+1] - ai[i];
330     ajtmpold = aj + ai[i];
331     v        = aa + 16*ai[i];
332     for (j=0; j<nz; j++) {
333       x     = rtmp+16*ajtmpold[j];
334       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
335       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
336       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
337       x[14] = v[14]; x[15] = v[15];
338       v    += 16;
339     }
340     row = *ajtmp++;
341     while (row < i) {
342       pc  = rtmp + 16*row;
343       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
344       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
345       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
346       p15 = pc[14]; p16 = pc[15];
347       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
348           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
349           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
350           || p16 != 0.0) {
351         pv    = ba + 16*diag_offset[row];
352         pj    = bj + diag_offset[row] + 1;
353         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
354         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
355         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
356         x15   = pv[14]; x16 = pv[15];
357         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
358         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
359         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
360         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
361 
362         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
363         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
364         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
365         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
366 
367         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
368         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
369         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
370         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
371 
372         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
373         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
374         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
375         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
376         nz     = bi[row+1] - diag_offset[row] - 1;
377         pv    += 16;
378         for (j=0; j<nz; j++) {
379           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
380           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
381           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
382           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
383           x     = rtmp + 16*pj[j];
384           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
385           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
386           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
387           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
388 
389           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
390           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
391           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
392           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
393 
394           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
395           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
396           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
397           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
398 
399           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
400           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
401           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
402           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
403 
404           pv += 16;
405         }
406         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
407       }
408       row = *ajtmp++;
409     }
410     /* finished row so stick it into b->a */
411     pv = ba + 16*bi[i];
412     pj = bj + bi[i];
413     nz = bi[i+1] - bi[i];
414     for (j=0; j<nz; j++) {
415       x      = rtmp+16*pj[j];
416       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
417       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
418       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
419       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
420       pv    += 16;
421     }
422     /* invert diagonal block */
423     w = ba + 16*diag_offset[i];
424     if (pivotinblocks) {
425       PetscBool allowzeropivot,zeropivotdetected;
426       allowzeropivot = PetscNot(A->erroriffailure);
427       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
428       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
429     } else {
430       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
431     }
432   }
433 
434   ierr = PetscFree(rtmp);CHKERRQ(ierr);
435 
436   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
437   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
438   C->assembled           = PETSC_TRUE;
439 
440   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
441   PetscFunctionReturn(0);
442 }
443 
444 /*
445   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
446     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
447 */
448 #undef __FUNCT__
449 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
450 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
451 {
452   Mat            C =B;
453   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
454   PetscErrorCode ierr;
455   PetscInt       i,j,k,nz,nzL,row;
456   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
457   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
458   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
459   PetscInt       flg;
460   PetscReal      shift;
461   PetscBool      allowzeropivot,zeropivotdetected;
462 
463   PetscFunctionBegin;
464   /* generate work space needed by the factorization */
465   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
466   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
467 
468   if (info->shifttype == MAT_SHIFT_NONE) {
469     shift = 0;
470   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
471     shift = info->shiftamount;
472   }
473 
474   for (i=0; i<n; i++) {
475     /* zero rtmp */
476     /* L part */
477     nz    = bi[i+1] - bi[i];
478     bjtmp = bj + bi[i];
479     for  (j=0; j<nz; j++) {
480       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
481     }
482 
483     /* U part */
484     nz    = bdiag[i] - bdiag[i+1];
485     bjtmp = bj + bdiag[i+1]+1;
486     for  (j=0; j<nz; j++) {
487       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
488     }
489 
490     /* load in the initial (unfactored) row */
491     nz    = ai[i+1] - ai[i];
492     ajtmp = aj + ai[i];
493     v     = aa + bs2*ai[i];
494     for (j=0; j<nz; j++) {
495       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
496     }
497 
498     /* elimination */
499     bjtmp = bj + bi[i];
500     nzL   = bi[i+1] - bi[i];
501     for (k=0; k < nzL; k++) {
502       row = bjtmp[k];
503       pc  = rtmp + bs2*row;
504       for (flg=0,j=0; j<bs2; j++) {
505         if (pc[j]!=0.0) {
506           flg = 1;
507           break;
508         }
509       }
510       if (flg) {
511         pv = b->a + bs2*bdiag[row];
512         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
513         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
514 
515         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
516         pv = b->a + bs2*(bdiag[row+1]+1);
517         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
518         for (j=0; j<nz; j++) {
519           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
520           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
521           v    = rtmp + bs2*pj[j];
522           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
523           pv  += bs2;
524         }
525         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
526       }
527     }
528 
529     /* finished row so stick it into b->a */
530     /* L part */
531     pv = b->a + bs2*bi[i];
532     pj = b->j + bi[i];
533     nz = bi[i+1] - bi[i];
534     for (j=0; j<nz; j++) {
535       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
536     }
537 
538     /* Mark diagonal and invert diagonal for simpler triangular solves */
539     pv   = b->a + bs2*bdiag[i];
540     pj   = b->j + bdiag[i];
541     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
542     allowzeropivot = PetscNot(A->erroriffailure);
543     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
544     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
545 
546     /* U part */
547     pv = b->a + bs2*(bdiag[i+1]+1);
548     pj = b->j + bdiag[i+1]+1;
549     nz = bdiag[i] - bdiag[i+1] - 1;
550     for (j=0; j<nz; j++) {
551       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
552     }
553   }
554   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
555 
556   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
557   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
558   C->assembled           = PETSC_TRUE;
559 
560   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
561   PetscFunctionReturn(0);
562 }
563 
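/*
   A minimal usage sketch (an illustration, not part of this file): these numeric
   factorization routines are normally reached through the public factorization
   interface on a MATSEQBAIJ matrix with block size 4. The ordering, solver type,
   and fill factor below are example choices.
*/
#if 0
static PetscErrorCode ExampleLUFactorAndSolve(Mat A,Vec rhs,Vec x)
{
  Mat            F;
  IS             isrow,iscol;
  MatFactorInfo  info;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetOrdering(A,MATORDERINGNATURAL,&isrow,&iscol);CHKERRQ(ierr);
  ierr = MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
  ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
  info.fill = 2.0;
  ierr = MatLUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);  /* for a bs=4 SeqBAIJ matrix this dispatches to a routine in this file */
  ierr = MatSolve(F,rhs,x);CHKERRQ(ierr);
  ierr = ISDestroy(&isrow);CHKERRQ(ierr);
  ierr = ISDestroy(&iscol);CHKERRQ(ierr);
  ierr = MatDestroy(&F);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
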
564 #if defined(PETSC_HAVE_SSE)
565 
566 #include PETSC_HAVE_SSE
567 
568 /* SSE version for when blocks are 4 by 4, using the natural ordering */
569 #undef __FUNCT__
570 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
571 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
572 {
  Mat            C  = B;
573   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
574   PetscErrorCode ierr;
575   int            i,j,n = a->mbs;
576   int            *bj = b->j,*bjtmp,*pj;
577   int            row;
578   int            *ajtmpold,nz,*bi=b->i;
579   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
580   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
581   MatScalar      *ba    = b->a,*aa = a->a;
582   int            nonzero=0;
583   PetscBool pivotinblocks = b->pivotinblocks;
584   PetscReal shift         = info->shiftamount;
585 
586   PetscFunctionBegin;
587   SSE_SCOPE_BEGIN;
588 
589   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
590   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
591   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
592   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
593 /*    if ((unsigned long)bj==(unsigned long)aj) { */
594 /*      colscale = 4; */
595 /*    } */
596   for (i=0; i<n; i++) {
597     nz    = bi[i+1] - bi[i];
598     bjtmp = bj + bi[i];
599     /* zero out the 4x4 block accumulators */
600     /* zero out one register */
601     XOR_PS(XMM7,XMM7);
602     for  (j=0; j<nz; j++) {
603       x = rtmp+16*bjtmp[j];
604 /*        x = rtmp+4*bjtmp[j]; */
605       SSE_INLINE_BEGIN_1(x)
606       /* Copy zero register to memory locations */
607       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
608       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
609       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
610       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
611       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
612       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
613       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
614       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
615       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
616       SSE_INLINE_END_1;
617     }
618     /* load in the initial (unfactored) row */
619     nz       = ai[i+1] - ai[i];
620     ajtmpold = aj + ai[i];
621     v        = aa + 16*ai[i];
622     for (j=0; j<nz; j++) {
623       x = rtmp+16*ajtmpold[j];
624 /*        x = rtmp+colscale*ajtmpold[j]; */
625       /* Copy v block into x block */
626       SSE_INLINE_BEGIN_2(v,x)
627       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
628       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
629       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
630 
631       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
632       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
633 
634       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
635       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
636 
637       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
638       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
639 
640       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
641       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
642 
643       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
644       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
645 
646       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
647       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
648 
649       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
650       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
651       SSE_INLINE_END_2;
652 
653       v += 16;
654     }
655 /*      row = (*bjtmp++)/4; */
656     row = *bjtmp++;
657     while (row < i) {
658       pc = rtmp + 16*row;
659       SSE_INLINE_BEGIN_1(pc)
660       /* Load block from lower triangle */
661       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
662       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
663       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
664 
665       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
666       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
667 
668       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
669       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
670 
671       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
672       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
673 
674       /* Compare block to zero block */
675 
676       SSE_COPY_PS(XMM4,XMM7)
677       SSE_CMPNEQ_PS(XMM4,XMM0)
678 
679       SSE_COPY_PS(XMM5,XMM7)
680       SSE_CMPNEQ_PS(XMM5,XMM1)
681 
682       SSE_COPY_PS(XMM6,XMM7)
683       SSE_CMPNEQ_PS(XMM6,XMM2)
684 
685       SSE_CMPNEQ_PS(XMM7,XMM3)
686 
687       /* Reduce the comparisons to one SSE register */
688       SSE_OR_PS(XMM6,XMM7)
689       SSE_OR_PS(XMM5,XMM4)
690       SSE_OR_PS(XMM5,XMM6)
691       SSE_INLINE_END_1;
692 
693       /* Reduce the one SSE register to an integer register for branching */
694       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
695       MOVEMASK(nonzero,XMM5);
696 
697       /* If block is nonzero ... */
698       if (nonzero) {
699         pv = ba + 16*diag_offset[row];
700         PREFETCH_L1(&pv[16]);
701         pj = bj + diag_offset[row] + 1;
702 
703         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
704         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
705         /* but the diagonal was inverted already */
706         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
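        /* Scalar equivalent (the same update the non-SSE versions above perform): with
           column-major 4x4 blocks, pc(r,c) = sum_k pc_old(r,k)*pv(k,c), i.e. pc := pc * pv,
           where pv already holds the inverted diagonal block */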
707 
708         SSE_INLINE_BEGIN_2(pv,pc)
709         /* Column 0, product is accumulated in XMM4 */
710         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
711         SSE_SHUFFLE(XMM4,XMM4,0x00)
712         SSE_MULT_PS(XMM4,XMM0)
713 
714         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
715         SSE_SHUFFLE(XMM5,XMM5,0x00)
716         SSE_MULT_PS(XMM5,XMM1)
717         SSE_ADD_PS(XMM4,XMM5)
718 
719         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
720         SSE_SHUFFLE(XMM6,XMM6,0x00)
721         SSE_MULT_PS(XMM6,XMM2)
722         SSE_ADD_PS(XMM4,XMM6)
723 
724         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
725         SSE_SHUFFLE(XMM7,XMM7,0x00)
726         SSE_MULT_PS(XMM7,XMM3)
727         SSE_ADD_PS(XMM4,XMM7)
728 
729         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
730         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
731 
732         /* Column 1, product is accumulated in XMM5 */
733         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
734         SSE_SHUFFLE(XMM5,XMM5,0x00)
735         SSE_MULT_PS(XMM5,XMM0)
736 
737         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
738         SSE_SHUFFLE(XMM6,XMM6,0x00)
739         SSE_MULT_PS(XMM6,XMM1)
740         SSE_ADD_PS(XMM5,XMM6)
741 
742         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
743         SSE_SHUFFLE(XMM7,XMM7,0x00)
744         SSE_MULT_PS(XMM7,XMM2)
745         SSE_ADD_PS(XMM5,XMM7)
746 
747         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
748         SSE_SHUFFLE(XMM6,XMM6,0x00)
749         SSE_MULT_PS(XMM6,XMM3)
750         SSE_ADD_PS(XMM5,XMM6)
751 
752         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
753         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
754 
755         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
756 
757         /* Column 2, product is accumulated in XMM6 */
758         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
759         SSE_SHUFFLE(XMM6,XMM6,0x00)
760         SSE_MULT_PS(XMM6,XMM0)
761 
762         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
763         SSE_SHUFFLE(XMM7,XMM7,0x00)
764         SSE_MULT_PS(XMM7,XMM1)
765         SSE_ADD_PS(XMM6,XMM7)
766 
767         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
768         SSE_SHUFFLE(XMM7,XMM7,0x00)
769         SSE_MULT_PS(XMM7,XMM2)
770         SSE_ADD_PS(XMM6,XMM7)
771 
772         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
773         SSE_SHUFFLE(XMM7,XMM7,0x00)
774         SSE_MULT_PS(XMM7,XMM3)
775         SSE_ADD_PS(XMM6,XMM7)
776 
777         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
778         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
779 
780         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
781         /* Column 3, product is accumulated in XMM0 */
782         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
783         SSE_SHUFFLE(XMM7,XMM7,0x00)
784         SSE_MULT_PS(XMM0,XMM7)
785 
786         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
787         SSE_SHUFFLE(XMM7,XMM7,0x00)
788         SSE_MULT_PS(XMM1,XMM7)
789         SSE_ADD_PS(XMM0,XMM1)
790 
791         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
792         SSE_SHUFFLE(XMM1,XMM1,0x00)
793         SSE_MULT_PS(XMM1,XMM2)
794         SSE_ADD_PS(XMM0,XMM1)
795 
796         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
797         SSE_SHUFFLE(XMM7,XMM7,0x00)
798         SSE_MULT_PS(XMM3,XMM7)
799         SSE_ADD_PS(XMM0,XMM3)
800 
801         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
802         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
803 
804         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
805         /* This is code to be maintained and read by humans after all. */
806         /* Copy Multiplier Col 3 into XMM3 */
807         SSE_COPY_PS(XMM3,XMM0)
808         /* Copy Multiplier Col 2 into XMM2 */
809         SSE_COPY_PS(XMM2,XMM6)
810         /* Copy Multiplier Col 1 into XMM1 */
811         SSE_COPY_PS(XMM1,XMM5)
812         /* Copy Multiplier Col 0 into XMM0 */
813         SSE_COPY_PS(XMM0,XMM4)
814         SSE_INLINE_END_2;
815 
816         /* Update the row: */
817         nz  = bi[row+1] - diag_offset[row] - 1;
818         pv += 16;
819         for (j=0; j<nz; j++) {
820           PREFETCH_L1(&pv[16]);
821           x = rtmp + 16*pj[j];
822 /*            x = rtmp + 4*pj[j]; */
823 
824           /* X:=X-M*PV, One column at a time */
825           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
826           SSE_INLINE_BEGIN_2(x,pv)
827           /* Load First Column of X*/
828           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
829           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
830 
831           /* Matrix-Vector Product: */
832           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
833           SSE_SHUFFLE(XMM5,XMM5,0x00)
834           SSE_MULT_PS(XMM5,XMM0)
835           SSE_SUB_PS(XMM4,XMM5)
836 
837           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
838           SSE_SHUFFLE(XMM6,XMM6,0x00)
839           SSE_MULT_PS(XMM6,XMM1)
840           SSE_SUB_PS(XMM4,XMM6)
841 
842           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
843           SSE_SHUFFLE(XMM7,XMM7,0x00)
844           SSE_MULT_PS(XMM7,XMM2)
845           SSE_SUB_PS(XMM4,XMM7)
846 
847           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
848           SSE_SHUFFLE(XMM5,XMM5,0x00)
849           SSE_MULT_PS(XMM5,XMM3)
850           SSE_SUB_PS(XMM4,XMM5)
851 
852           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
853           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
854 
855           /* Second Column */
856           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
857           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
858 
859           /* Matrix-Vector Product: */
860           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
861           SSE_SHUFFLE(XMM6,XMM6,0x00)
862           SSE_MULT_PS(XMM6,XMM0)
863           SSE_SUB_PS(XMM5,XMM6)
864 
865           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
866           SSE_SHUFFLE(XMM7,XMM7,0x00)
867           SSE_MULT_PS(XMM7,XMM1)
868           SSE_SUB_PS(XMM5,XMM7)
869 
870           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
871           SSE_SHUFFLE(XMM4,XMM4,0x00)
872           SSE_MULT_PS(XMM4,XMM2)
873           SSE_SUB_PS(XMM5,XMM4)
874 
875           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
876           SSE_SHUFFLE(XMM6,XMM6,0x00)
877           SSE_MULT_PS(XMM6,XMM3)
878           SSE_SUB_PS(XMM5,XMM6)
879 
880           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
881           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
882 
883           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
884 
885           /* Third Column */
886           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
887           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
888 
889           /* Matrix-Vector Product: */
890           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
891           SSE_SHUFFLE(XMM7,XMM7,0x00)
892           SSE_MULT_PS(XMM7,XMM0)
893           SSE_SUB_PS(XMM6,XMM7)
894 
895           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
896           SSE_SHUFFLE(XMM4,XMM4,0x00)
897           SSE_MULT_PS(XMM4,XMM1)
898           SSE_SUB_PS(XMM6,XMM4)
899 
900           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
901           SSE_SHUFFLE(XMM5,XMM5,0x00)
902           SSE_MULT_PS(XMM5,XMM2)
903           SSE_SUB_PS(XMM6,XMM5)
904 
905           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
906           SSE_SHUFFLE(XMM7,XMM7,0x00)
907           SSE_MULT_PS(XMM7,XMM3)
908           SSE_SUB_PS(XMM6,XMM7)
909 
910           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
911           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
912 
913           /* Fourth Column */
914           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
915           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
916 
917           /* Matrix-Vector Product: */
918           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
919           SSE_SHUFFLE(XMM5,XMM5,0x00)
920           SSE_MULT_PS(XMM5,XMM0)
921           SSE_SUB_PS(XMM4,XMM5)
922 
923           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
924           SSE_SHUFFLE(XMM6,XMM6,0x00)
925           SSE_MULT_PS(XMM6,XMM1)
926           SSE_SUB_PS(XMM4,XMM6)
927 
928           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
929           SSE_SHUFFLE(XMM7,XMM7,0x00)
930           SSE_MULT_PS(XMM7,XMM2)
931           SSE_SUB_PS(XMM4,XMM7)
932 
933           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
934           SSE_SHUFFLE(XMM5,XMM5,0x00)
935           SSE_MULT_PS(XMM5,XMM3)
936           SSE_SUB_PS(XMM4,XMM5)
937 
938           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
939           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
940           SSE_INLINE_END_2;
941           pv += 16;
942         }
943         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
944       }
945       row = *bjtmp++;
946 /*        row = (*bjtmp++)/4; */
947     }
948     /* finished row so stick it into b->a */
949     pv = ba + 16*bi[i];
950     pj = bj + bi[i];
951     nz = bi[i+1] - bi[i];
952 
953     /* Copy x block back into pv block */
954     for (j=0; j<nz; j++) {
955       x = rtmp+16*pj[j];
956 /*        x  = rtmp+4*pj[j]; */
957 
958       SSE_INLINE_BEGIN_2(x,pv)
959       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
960       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
961       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
962 
963       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
964       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
965 
966       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
967       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
968 
969       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
970       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
971 
972       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
973       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
974 
975       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
976       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
977 
978       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
979       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
980 
981       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
982       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
983       SSE_INLINE_END_2;
984       pv += 16;
985     }
986     /* invert diagonal block */
987     w = ba + 16*diag_offset[i];
988     if (pivotinblocks) {
989       PetscBool allowzeropivot,zeropivotdetected;
990       allowzeropivot = PetscNot(A->erroriffailure);
991       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
992       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
993     } else {
994       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
995     }
996 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
997     /* Note: with Cramer's rule, the flop count below might be unfairly high or low */
998   }
999 
1000   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1001 
1002   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1003   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1004   C->assembled           = PETSC_TRUE;
1005 
1006   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
1007   /* Flop Count from inverting diagonal blocks */
1008   SSE_SCOPE_END;
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 #undef __FUNCT__
1013 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
1014 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1015 {
1016   Mat            A  =C;
1017   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1018   PetscErrorCode ierr;
1019   int            i,j,n = a->mbs;
1020   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1021   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1022   unsigned int   row;
1023   int            nz,*bi=b->i;
1024   int            *diag_offset = b->diag,*ai=a->i;
1025   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1026   MatScalar      *ba    = b->a,*aa = a->a;
1027   int            nonzero=0;
1028   PetscBool      pivotinblocks = b->pivotinblocks;
1029   PetscReal      shift = info->shiftamount;
1030 
1031   PetscFunctionBegin;
1032   SSE_SCOPE_BEGIN;
1033 
1034   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1035   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1036   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1037   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1038 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1039 /*      colscale = 4; */
1040 /*    } */
1041 
1042   for (i=0; i<n; i++) {
1043     nz    = bi[i+1] - bi[i];
1044     bjtmp = bj + bi[i];
1045     /* zero out the 4x4 block accumulators */
1046     /* zero out one register */
1047     XOR_PS(XMM7,XMM7);
1048     for  (j=0; j<nz; j++) {
1049       x = rtmp+16*((unsigned int)bjtmp[j]);
1050 /*        x = rtmp+4*bjtmp[j]; */
1051       SSE_INLINE_BEGIN_1(x)
1052       /* Copy zero register to memory locations */
1053       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1054       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1055       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1056       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1057       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1058       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1059       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1060       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1061       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1062       SSE_INLINE_END_1;
1063     }
1064     /* load in the initial (unfactored) row */
1065     nz    = ai[i+1] - ai[i];
1066     ajtmp = aj + ai[i];
1067     v     = aa + 16*ai[i];
1068     for (j=0; j<nz; j++) {
1069       x = rtmp+16*((unsigned int)ajtmp[j]);
1070 /*        x = rtmp+colscale*ajtmp[j]; */
1071       /* Copy v block into x block */
1072       SSE_INLINE_BEGIN_2(v,x)
1073       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1074       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1075       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1076 
1077       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1078       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1079 
1080       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1081       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1082 
1083       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1084       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1085 
1086       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1087       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1088 
1089       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1090       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1091 
1092       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1093       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1094 
1095       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1096       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1097       SSE_INLINE_END_2;
1098 
1099       v += 16;
1100     }
1101 /*      row = (*bjtmp++)/4; */
1102     row = (unsigned int)(*bjtmp++);
1103     while (row < i) {
1104       pc = rtmp + 16*row;
1105       SSE_INLINE_BEGIN_1(pc)
1106       /* Load block from lower triangle */
1107       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1108       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1109       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1110 
1111       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1112       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1113 
1114       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1115       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1116 
1117       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1118       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1119 
1120       /* Compare block to zero block */
1121 
1122       SSE_COPY_PS(XMM4,XMM7)
1123       SSE_CMPNEQ_PS(XMM4,XMM0)
1124 
1125       SSE_COPY_PS(XMM5,XMM7)
1126       SSE_CMPNEQ_PS(XMM5,XMM1)
1127 
1128       SSE_COPY_PS(XMM6,XMM7)
1129       SSE_CMPNEQ_PS(XMM6,XMM2)
1130 
1131       SSE_CMPNEQ_PS(XMM7,XMM3)
1132 
1133       /* Reduce the comparisons to one SSE register */
1134       SSE_OR_PS(XMM6,XMM7)
1135       SSE_OR_PS(XMM5,XMM4)
1136       SSE_OR_PS(XMM5,XMM6)
1137       SSE_INLINE_END_1;
1138 
1139       /* Reduce the one SSE register to an integer register for branching */
1140       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1141       MOVEMASK(nonzero,XMM5);
1142 
1143       /* If block is nonzero ... */
1144       if (nonzero) {
1145         pv = ba + 16*diag_offset[row];
1146         PREFETCH_L1(&pv[16]);
1147         pj = bj + diag_offset[row] + 1;
1148 
1149         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1150         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1151         /* but the diagonal was inverted already */
1152         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1153 
1154         SSE_INLINE_BEGIN_2(pv,pc)
1155         /* Column 0, product is accumulated in XMM4 */
1156         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1157         SSE_SHUFFLE(XMM4,XMM4,0x00)
1158         SSE_MULT_PS(XMM4,XMM0)
1159 
1160         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1161         SSE_SHUFFLE(XMM5,XMM5,0x00)
1162         SSE_MULT_PS(XMM5,XMM1)
1163         SSE_ADD_PS(XMM4,XMM5)
1164 
1165         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1166         SSE_SHUFFLE(XMM6,XMM6,0x00)
1167         SSE_MULT_PS(XMM6,XMM2)
1168         SSE_ADD_PS(XMM4,XMM6)
1169 
1170         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1171         SSE_SHUFFLE(XMM7,XMM7,0x00)
1172         SSE_MULT_PS(XMM7,XMM3)
1173         SSE_ADD_PS(XMM4,XMM7)
1174 
1175         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1176         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1177 
1178         /* Column 1, product is accumulated in XMM5 */
1179         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1180         SSE_SHUFFLE(XMM5,XMM5,0x00)
1181         SSE_MULT_PS(XMM5,XMM0)
1182 
1183         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1184         SSE_SHUFFLE(XMM6,XMM6,0x00)
1185         SSE_MULT_PS(XMM6,XMM1)
1186         SSE_ADD_PS(XMM5,XMM6)
1187 
1188         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1189         SSE_SHUFFLE(XMM7,XMM7,0x00)
1190         SSE_MULT_PS(XMM7,XMM2)
1191         SSE_ADD_PS(XMM5,XMM7)
1192 
1193         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1194         SSE_SHUFFLE(XMM6,XMM6,0x00)
1195         SSE_MULT_PS(XMM6,XMM3)
1196         SSE_ADD_PS(XMM5,XMM6)
1197 
1198         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1199         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1200 
1201         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1202 
1203         /* Column 2, product is accumulated in XMM6 */
1204         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1205         SSE_SHUFFLE(XMM6,XMM6,0x00)
1206         SSE_MULT_PS(XMM6,XMM0)
1207 
1208         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1209         SSE_SHUFFLE(XMM7,XMM7,0x00)
1210         SSE_MULT_PS(XMM7,XMM1)
1211         SSE_ADD_PS(XMM6,XMM7)
1212 
1213         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1214         SSE_SHUFFLE(XMM7,XMM7,0x00)
1215         SSE_MULT_PS(XMM7,XMM2)
1216         SSE_ADD_PS(XMM6,XMM7)
1217 
1218         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1219         SSE_SHUFFLE(XMM7,XMM7,0x00)
1220         SSE_MULT_PS(XMM7,XMM3)
1221         SSE_ADD_PS(XMM6,XMM7)
1222 
1223         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1224         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1225 
1226         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1227         /* Column 3, product is accumulated in XMM0 */
1228         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1229         SSE_SHUFFLE(XMM7,XMM7,0x00)
1230         SSE_MULT_PS(XMM0,XMM7)
1231 
1232         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1233         SSE_SHUFFLE(XMM7,XMM7,0x00)
1234         SSE_MULT_PS(XMM1,XMM7)
1235         SSE_ADD_PS(XMM0,XMM1)
1236 
1237         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1238         SSE_SHUFFLE(XMM1,XMM1,0x00)
1239         SSE_MULT_PS(XMM1,XMM2)
1240         SSE_ADD_PS(XMM0,XMM1)
1241 
1242         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1243         SSE_SHUFFLE(XMM7,XMM7,0x00)
1244         SSE_MULT_PS(XMM3,XMM7)
1245         SSE_ADD_PS(XMM0,XMM3)
1246 
1247         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1248         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1249 
1250         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1251         /* This is code to be maintained and read by humans after all. */
1252         /* Copy Multiplier Col 3 into XMM3 */
1253         SSE_COPY_PS(XMM3,XMM0)
1254         /* Copy Multiplier Col 2 into XMM2 */
1255         SSE_COPY_PS(XMM2,XMM6)
1256         /* Copy Multiplier Col 1 into XMM1 */
1257         SSE_COPY_PS(XMM1,XMM5)
1258         /* Copy Multiplier Col 0 into XMM0 */
1259         SSE_COPY_PS(XMM0,XMM4)
1260         SSE_INLINE_END_2;
1261 
1262         /* Update the row: */
1263         nz  = bi[row+1] - diag_offset[row] - 1;
1264         pv += 16;
1265         for (j=0; j<nz; j++) {
1266           PREFETCH_L1(&pv[16]);
1267           x = rtmp + 16*((unsigned int)pj[j]);
1268 /*            x = rtmp + 4*pj[j]; */
1269 
1270           /* X:=X-M*PV, One column at a time */
1271           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1272           SSE_INLINE_BEGIN_2(x,pv)
1273           /* Load First Column of X*/
1274           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1275           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1276 
1277           /* Matrix-Vector Product: */
1278           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1279           SSE_SHUFFLE(XMM5,XMM5,0x00)
1280           SSE_MULT_PS(XMM5,XMM0)
1281           SSE_SUB_PS(XMM4,XMM5)
1282 
1283           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1284           SSE_SHUFFLE(XMM6,XMM6,0x00)
1285           SSE_MULT_PS(XMM6,XMM1)
1286           SSE_SUB_PS(XMM4,XMM6)
1287 
1288           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1289           SSE_SHUFFLE(XMM7,XMM7,0x00)
1290           SSE_MULT_PS(XMM7,XMM2)
1291           SSE_SUB_PS(XMM4,XMM7)
1292 
1293           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1294           SSE_SHUFFLE(XMM5,XMM5,0x00)
1295           SSE_MULT_PS(XMM5,XMM3)
1296           SSE_SUB_PS(XMM4,XMM5)
1297 
1298           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1299           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1300 
1301           /* Second Column */
1302           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1303           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1304 
1305           /* Matrix-Vector Product: */
1306           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1307           SSE_SHUFFLE(XMM6,XMM6,0x00)
1308           SSE_MULT_PS(XMM6,XMM0)
1309           SSE_SUB_PS(XMM5,XMM6)
1310 
1311           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1312           SSE_SHUFFLE(XMM7,XMM7,0x00)
1313           SSE_MULT_PS(XMM7,XMM1)
1314           SSE_SUB_PS(XMM5,XMM7)
1315 
1316           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1317           SSE_SHUFFLE(XMM4,XMM4,0x00)
1318           SSE_MULT_PS(XMM4,XMM2)
1319           SSE_SUB_PS(XMM5,XMM4)
1320 
1321           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1322           SSE_SHUFFLE(XMM6,XMM6,0x00)
1323           SSE_MULT_PS(XMM6,XMM3)
1324           SSE_SUB_PS(XMM5,XMM6)
1325 
1326           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1327           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1328 
1329           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1330 
1331           /* Third Column */
1332           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1333           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1334 
1335           /* Matrix-Vector Product: */
1336           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1337           SSE_SHUFFLE(XMM7,XMM7,0x00)
1338           SSE_MULT_PS(XMM7,XMM0)
1339           SSE_SUB_PS(XMM6,XMM7)
1340 
1341           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1342           SSE_SHUFFLE(XMM4,XMM4,0x00)
1343           SSE_MULT_PS(XMM4,XMM1)
1344           SSE_SUB_PS(XMM6,XMM4)
1345 
1346           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1347           SSE_SHUFFLE(XMM5,XMM5,0x00)
1348           SSE_MULT_PS(XMM5,XMM2)
1349           SSE_SUB_PS(XMM6,XMM5)
1350 
1351           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1352           SSE_SHUFFLE(XMM7,XMM7,0x00)
1353           SSE_MULT_PS(XMM7,XMM3)
1354           SSE_SUB_PS(XMM6,XMM7)
1355 
1356           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1357           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1358 
1359           /* Fourth Column */
1360           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1361           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1362 
1363           /* Matrix-Vector Product: */
1364           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1365           SSE_SHUFFLE(XMM5,XMM5,0x00)
1366           SSE_MULT_PS(XMM5,XMM0)
1367           SSE_SUB_PS(XMM4,XMM5)
1368 
1369           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1370           SSE_SHUFFLE(XMM6,XMM6,0x00)
1371           SSE_MULT_PS(XMM6,XMM1)
1372           SSE_SUB_PS(XMM4,XMM6)
1373 
1374           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1375           SSE_SHUFFLE(XMM7,XMM7,0x00)
1376           SSE_MULT_PS(XMM7,XMM2)
1377           SSE_SUB_PS(XMM4,XMM7)
1378 
1379           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1380           SSE_SHUFFLE(XMM5,XMM5,0x00)
1381           SSE_MULT_PS(XMM5,XMM3)
1382           SSE_SUB_PS(XMM4,XMM5)
1383 
1384           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1385           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1386           SSE_INLINE_END_2;
1387           pv += 16;
1388         }
1389         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1390       }
1391       row = (unsigned int)(*bjtmp++);
1392 /*        row = (*bjtmp++)/4; */
1393 /*        bjtmp++; */
1394     }
1395     /* finished row so stick it into b->a */
1396     pv = ba + 16*bi[i];
1397     pj = bj + bi[i];
1398     nz = bi[i+1] - bi[i];
1399 
1400     /* Copy x block back into pv block */
1401     for (j=0; j<nz; j++) {
1402       x = rtmp+16*((unsigned int)pj[j]);
1403 /*        x  = rtmp+4*pj[j]; */
1404 
1405       SSE_INLINE_BEGIN_2(x,pv)
1406       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1407       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1408       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1409 
1410       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1411       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1412 
1413       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1414       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1415 
1416       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1417       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1418 
1419       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1420       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1421 
1422       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1423       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1424 
1425       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1426       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1427 
1428       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1429       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1430       SSE_INLINE_END_2;
1431       pv += 16;
1432     }
1433     /* invert diagonal block */
1434     w = ba + 16*diag_offset[i];
1435     if (pivotinblocks) {
1436       PetscBool allowzeropivot,zeropivotdetected;
1437       allowzeropivot = PetscNot(A->erroriffailure);
1438       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1439       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1440     } else {
1441       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1442     }
1443 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1444     /* Note: with Cramer's rule, the flop count below might be unfairly high or low */
1445   }
1446 
1447   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1448 
1449   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1450   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1451   C->assembled           = PETSC_TRUE;
1452 
1453   ierr = PetscLogFlops(1.333333333333*4.0*4.0*4.0*b->mbs);CHKERRQ(ierr); /* bs = 4 */
1454   /* Flop Count from inverting diagonal blocks */
1455   SSE_SCOPE_END;
1456   PetscFunctionReturn(0);
1457 }
1458 
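/*
   The _usj variant below performs the same factorization as the routine above, except that the
   column indices of the factor are accessed as unsigned short (b->j is cast to unsigned short*),
   roughly halving the index traffic per block row. A minimal sketch of the assumed index
   convention (illustrative only, not part of the source):

       unsigned short *bj16 = (unsigned short*)b->j;    16-bit block-column indices
       x = rtmp + 16*(unsigned int)bj16[j];             widen before scaling by the block size

   which presumably requires the number of block rows/columns to fit in 16 bits.
*/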
1459 #undef __FUNCT__
1460 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1461 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1462 {
1463   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1464   PetscErrorCode ierr;
1465   int            i,j,n = a->mbs;
1466   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1467   unsigned int   row;
1468   int            *ajtmpold,nz,*bi=b->i;
1469   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1470   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1471   MatScalar      *ba    = b->a,*aa = a->a;
1472   int            nonzero=0;
1473   PetscBool      pivotinblocks = b->pivotinblocks;
1474   PetscReal      shift = info->shiftamount;
1475 
1476   PetscFunctionBegin;
1477   SSE_SCOPE_BEGIN;
1478 
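  /* The SSE kernels below assume 16-byte aligned block storage (presumably so the packed
     loads/stores and prefetches can treat each 4x4 block as aligned 128-bit data), hence the
     explicit alignment checks on aa, ba and rtmp. */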
1479   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1480   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1481   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1482   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1483 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1484 /*      colscale = 4; */
1485 /*    } */
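  /* If the factor shares its column index array with A (bj == aj), the two sparsity patterns
     coincide and the factorization can be carried out directly in A's storage, so dispatch to
     the in-place variant. */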
1486   if ((unsigned long)bj==(unsigned long)aj) {
1487     ierr = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C);CHKERRQ(ierr); PetscFunctionReturn(0);
1488   }
1489 
1490   for (i=0; i<n; i++) {
1491     nz    = bi[i+1] - bi[i];
1492     bjtmp = bj + bi[i];
1493     /* zero out the 4x4 block accumulators */
1494     /* zero out one register */
1495     XOR_PS(XMM7,XMM7);
1496     for  (j=0; j<nz; j++) {
1497       x = rtmp+16*((unsigned int)bjtmp[j]);
1498 /*        x = rtmp+4*bjtmp[j]; */
1499       SSE_INLINE_BEGIN_1(x)
1500       /* Copy zero register to memory locations */
1501       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1502       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1503       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1504       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1505       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1506       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1507       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1508       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1509       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1510       SSE_INLINE_END_1;
1511     }
1512     /* load in initial (unfactored row) */
1513     nz       = ai[i+1] - ai[i];
1514     ajtmpold = aj + ai[i];
1515     v        = aa + 16*ai[i];
1516     for (j=0; j<nz; j++) {
1517       x = rtmp+16*ajtmpold[j];
1518 /*        x = rtmp+colscale*ajtmpold[j]; */
1519       /* Copy v block into x block */
1520       SSE_INLINE_BEGIN_2(v,x)
1521       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1522       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1523       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1524 
1525       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1526       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1527 
1528       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1529       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1530 
1531       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1532       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1533 
1534       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1535       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1536 
1537       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1538       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1539 
1540       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1541       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1542 
1543       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1544       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1545       SSE_INLINE_END_2;
1546 
1547       v += 16;
1548     }
1549 /*      row = (*bjtmp++)/4; */
1550     row = (unsigned int)(*bjtmp++);
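    /* Eliminate with every already-factored block row whose block column appears before the
       diagonal of row i (the L part of row i). */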
1551     while (row < i) {
1552       pc = rtmp + 16*row;
1553       SSE_INLINE_BEGIN_1(pc)
1554       /* Load block from lower triangle */
1555       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1556       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1557       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1558 
1559       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1560       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1561 
1562       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1563       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1564 
1565       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1566       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1567 
1568       /* Compare block to zero block */
1569 
1570       SSE_COPY_PS(XMM4,XMM7)
1571       SSE_CMPNEQ_PS(XMM4,XMM0)
1572 
1573       SSE_COPY_PS(XMM5,XMM7)
1574       SSE_CMPNEQ_PS(XMM5,XMM1)
1575 
1576       SSE_COPY_PS(XMM6,XMM7)
1577       SSE_CMPNEQ_PS(XMM6,XMM2)
1578 
1579       SSE_CMPNEQ_PS(XMM7,XMM3)
1580 
1581       /* Reduce the comparisons to one SSE register */
1582       SSE_OR_PS(XMM6,XMM7)
1583       SSE_OR_PS(XMM5,XMM4)
1584       SSE_OR_PS(XMM5,XMM6)
1585       SSE_INLINE_END_1;
1586 
1587       /* Reduce the one SSE register to an integer register for branching */
1588       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1589       MOVEMASK(nonzero,XMM5);
1590 
1591       /* If block is nonzero ... */
1592       if (nonzero) {
1593         pv = ba + 16*diag_offset[row];
1594         PREFETCH_L1(&pv[16]);
1595         pj = bj + diag_offset[row] + 1;
1596 
1597         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1598         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1599         /* but the diagonal was inverted already */
1600         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1601 
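        /* A scalar sketch (not compiled; assumes column-major 4x4 blocks, pv pointing at the
           already-inverted diagonal block of row `row`, pc at the corresponding block of the
           work row) of what the SSE sequence below computes, i.e. the multiplier M = pc*inv(diag):

             MatScalar m[16];
             for (c=0; c<4; c++) {
               for (r=0; r<4; r++) {
                 m[4*c+r] = pc[0+r]*pv[4*c+0] + pc[4+r]*pv[4*c+1]
                          + pc[8+r]*pv[4*c+2] + pc[12+r]*pv[4*c+3];
               }
             }
             for (r=0; r<16; r++) pc[r] = m[r];

           The SSE version can overwrite pc column by column because the original columns stay
           live in XMM0-XMM3. */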
1602         SSE_INLINE_BEGIN_2(pv,pc)
1603         /* Column 0, product is accumulated in XMM4 */
1604         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1605         SSE_SHUFFLE(XMM4,XMM4,0x00)
1606         SSE_MULT_PS(XMM4,XMM0)
1607 
1608         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1609         SSE_SHUFFLE(XMM5,XMM5,0x00)
1610         SSE_MULT_PS(XMM5,XMM1)
1611         SSE_ADD_PS(XMM4,XMM5)
1612 
1613         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1614         SSE_SHUFFLE(XMM6,XMM6,0x00)
1615         SSE_MULT_PS(XMM6,XMM2)
1616         SSE_ADD_PS(XMM4,XMM6)
1617 
1618         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1619         SSE_SHUFFLE(XMM7,XMM7,0x00)
1620         SSE_MULT_PS(XMM7,XMM3)
1621         SSE_ADD_PS(XMM4,XMM7)
1622 
1623         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1624         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1625 
1626         /* Column 1, product is accumulated in XMM5 */
1627         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1628         SSE_SHUFFLE(XMM5,XMM5,0x00)
1629         SSE_MULT_PS(XMM5,XMM0)
1630 
1631         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1632         SSE_SHUFFLE(XMM6,XMM6,0x00)
1633         SSE_MULT_PS(XMM6,XMM1)
1634         SSE_ADD_PS(XMM5,XMM6)
1635 
1636         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1637         SSE_SHUFFLE(XMM7,XMM7,0x00)
1638         SSE_MULT_PS(XMM7,XMM2)
1639         SSE_ADD_PS(XMM5,XMM7)
1640 
1641         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1642         SSE_SHUFFLE(XMM6,XMM6,0x00)
1643         SSE_MULT_PS(XMM6,XMM3)
1644         SSE_ADD_PS(XMM5,XMM6)
1645 
1646         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1647         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1648 
1649         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1650 
1651         /* Column 2, product is accumulated in XMM6 */
1652         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1653         SSE_SHUFFLE(XMM6,XMM6,0x00)
1654         SSE_MULT_PS(XMM6,XMM0)
1655 
1656         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1657         SSE_SHUFFLE(XMM7,XMM7,0x00)
1658         SSE_MULT_PS(XMM7,XMM1)
1659         SSE_ADD_PS(XMM6,XMM7)
1660 
1661         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1662         SSE_SHUFFLE(XMM7,XMM7,0x00)
1663         SSE_MULT_PS(XMM7,XMM2)
1664         SSE_ADD_PS(XMM6,XMM7)
1665 
1666         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1667         SSE_SHUFFLE(XMM7,XMM7,0x00)
1668         SSE_MULT_PS(XMM7,XMM3)
1669         SSE_ADD_PS(XMM6,XMM7)
1670 
1671         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1672         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1673 
1674         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1675         /* Column 3, product is accumulated in XMM0 */
1676         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1677         SSE_SHUFFLE(XMM7,XMM7,0x00)
1678         SSE_MULT_PS(XMM0,XMM7)
1679 
1680         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1681         SSE_SHUFFLE(XMM7,XMM7,0x00)
1682         SSE_MULT_PS(XMM1,XMM7)
1683         SSE_ADD_PS(XMM0,XMM1)
1684 
1685         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1686         SSE_SHUFFLE(XMM1,XMM1,0x00)
1687         SSE_MULT_PS(XMM1,XMM2)
1688         SSE_ADD_PS(XMM0,XMM1)
1689 
1690         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1691         SSE_SHUFFLE(XMM7,XMM7,0x00)
1692         SSE_MULT_PS(XMM3,XMM7)
1693         SSE_ADD_PS(XMM0,XMM3)
1694 
1695         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1696         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1697 
1698         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1699         /* This is code to be maintained and read by humans after all. */
1700         /* Copy Multiplier Col 3 into XMM3 */
1701         SSE_COPY_PS(XMM3,XMM0)
1702         /* Copy Multiplier Col 2 into XMM2 */
1703         SSE_COPY_PS(XMM2,XMM6)
1704         /* Copy Multiplier Col 1 into XMM1 */
1705         SSE_COPY_PS(XMM1,XMM5)
1706         /* Copy Multiplier Col 0 into XMM0 */
1707         SSE_COPY_PS(XMM0,XMM4)
1708         SSE_INLINE_END_2;
1709 
1710         /* Update the row: */
1711         nz  = bi[row+1] - diag_offset[row] - 1;
1712         pv += 16;
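        /* nz counts the blocks to the right of the diagonal in pivot row `row` (its U part);
           pv now points at the first of those blocks and pj at their block columns. */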
1713         for (j=0; j<nz; j++) {
1714           PREFETCH_L1(&pv[16]);
1715           x = rtmp + 16*((unsigned int)pj[j]);
1716 /*            x = rtmp + 4*pj[j]; */
1717 
1718           /* X:=X-M*PV, One column at a time */
1719           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
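          /* Scalar sketch (not compiled; M denotes the multiplier whose columns sit in
             XMM0-XMM3, x and pv are column-major 4x4 blocks) of the update performed below:

               for (c=0; c<4; c++)
                 for (r=0; r<4; r++)
                   x[4*c+r] -= M[4*0+r]*pv[4*c+0] + M[4*1+r]*pv[4*c+1]
                             + M[4*2+r]*pv[4*c+2] + M[4*3+r]*pv[4*c+3];
          */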
1720           SSE_INLINE_BEGIN_2(x,pv)
1721           /* Load First Column of X */
1722           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1723           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1724 
1725           /* Matrix-Vector Product: */
1726           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1727           SSE_SHUFFLE(XMM5,XMM5,0x00)
1728           SSE_MULT_PS(XMM5,XMM0)
1729           SSE_SUB_PS(XMM4,XMM5)
1730 
1731           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1732           SSE_SHUFFLE(XMM6,XMM6,0x00)
1733           SSE_MULT_PS(XMM6,XMM1)
1734           SSE_SUB_PS(XMM4,XMM6)
1735 
1736           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1737           SSE_SHUFFLE(XMM7,XMM7,0x00)
1738           SSE_MULT_PS(XMM7,XMM2)
1739           SSE_SUB_PS(XMM4,XMM7)
1740 
1741           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1742           SSE_SHUFFLE(XMM5,XMM5,0x00)
1743           SSE_MULT_PS(XMM5,XMM3)
1744           SSE_SUB_PS(XMM4,XMM5)
1745 
1746           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1747           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1748 
1749           /* Second Column */
1750           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1751           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1752 
1753           /* Matrix-Vector Product: */
1754           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1755           SSE_SHUFFLE(XMM6,XMM6,0x00)
1756           SSE_MULT_PS(XMM6,XMM0)
1757           SSE_SUB_PS(XMM5,XMM6)
1758 
1759           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1760           SSE_SHUFFLE(XMM7,XMM7,0x00)
1761           SSE_MULT_PS(XMM7,XMM1)
1762           SSE_SUB_PS(XMM5,XMM7)
1763 
1764           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1765           SSE_SHUFFLE(XMM4,XMM4,0x00)
1766           SSE_MULT_PS(XMM4,XMM2)
1767           SSE_SUB_PS(XMM5,XMM4)
1768 
1769           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1770           SSE_SHUFFLE(XMM6,XMM6,0x00)
1771           SSE_MULT_PS(XMM6,XMM3)
1772           SSE_SUB_PS(XMM5,XMM6)
1773 
1774           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1775           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1776 
1777           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1778 
1779           /* Third Column */
1780           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1781           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1782 
1783           /* Matrix-Vector Product: */
1784           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1785           SSE_SHUFFLE(XMM7,XMM7,0x00)
1786           SSE_MULT_PS(XMM7,XMM0)
1787           SSE_SUB_PS(XMM6,XMM7)
1788 
1789           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1790           SSE_SHUFFLE(XMM4,XMM4,0x00)
1791           SSE_MULT_PS(XMM4,XMM1)
1792           SSE_SUB_PS(XMM6,XMM4)
1793 
1794           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1795           SSE_SHUFFLE(XMM5,XMM5,0x00)
1796           SSE_MULT_PS(XMM5,XMM2)
1797           SSE_SUB_PS(XMM6,XMM5)
1798 
1799           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1800           SSE_SHUFFLE(XMM7,XMM7,0x00)
1801           SSE_MULT_PS(XMM7,XMM3)
1802           SSE_SUB_PS(XMM6,XMM7)
1803 
1804           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1805           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1806 
1807           /* Fourth Column */
1808           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1809           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1810 
1811           /* Matrix-Vector Product: */
1812           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1813           SSE_SHUFFLE(XMM5,XMM5,0x00)
1814           SSE_MULT_PS(XMM5,XMM0)
1815           SSE_SUB_PS(XMM4,XMM5)
1816 
1817           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1818           SSE_SHUFFLE(XMM6,XMM6,0x00)
1819           SSE_MULT_PS(XMM6,XMM1)
1820           SSE_SUB_PS(XMM4,XMM6)
1821 
1822           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1823           SSE_SHUFFLE(XMM7,XMM7,0x00)
1824           SSE_MULT_PS(XMM7,XMM2)
1825           SSE_SUB_PS(XMM4,XMM7)
1826 
1827           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1828           SSE_SHUFFLE(XMM5,XMM5,0x00)
1829           SSE_MULT_PS(XMM5,XMM3)
1830           SSE_SUB_PS(XMM4,XMM5)
1831 
1832           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1833           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1834           SSE_INLINE_END_2;
1835           pv += 16;
1836         }
1837         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1838       }
1839       row = (unsigned int)(*bjtmp++);
1840 /*        row = (*bjtmp++)/4; */
1841 /*        bjtmp++; */
1842     }
1843     /* finished row so stick it into b->a */
1844     pv = ba + 16*bi[i];
1845     pj = bj + bi[i];
1846     nz = bi[i+1] - bi[i];
1847 
1848     /* Copy x block back into pv block */
1849     for (j=0; j<nz; j++) {
1850       x = rtmp+16*((unsigned int)pj[j]);
1851 /*        x  = rtmp+4*pj[j]; */
1852 
1853       SSE_INLINE_BEGIN_2(x,pv)
1854       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1855       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1856       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1857 
1858       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1859       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1860 
1861       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1862       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1863 
1864       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1865       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1866 
1867       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1868       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1869 
1870       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1871       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1872 
1873       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1874       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1875 
1876       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1877       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1878       SSE_INLINE_END_2;
1879       pv += 16;
1880     }
1881     /* invert diagonal block */
1882     w = ba + 16*diag_offset[i];
1883     if (pivotinblocks) {
1884       PetscBool allowzeropivot,zeropivotdetected;
1885       allowzeropivot = PetscNot(A->erroriffailure);
1886       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1887       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1888     } else {
1889       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1890     }
1891 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1892     /* Note: since the 4x4 inverse uses Cramer's rule, the flop count below may be unfairly high or low */
1893   }
1894 
1895   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1896 
1897   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1898   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1899   C->assembled           = PETSC_TRUE;
1900 
1901   ierr = PetscLogFlops(1.333333333333*4.0*4.0*4.0*b->mbs);CHKERRQ(ierr); /* bs = 4 */
1902   /* Flop Count from inverting diagonal blocks */
1903   SSE_SCOPE_END;
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 #endif
1908