/* xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision 6abefa4f446b2ea91915da4e25ea0defa83db48e) */
#define PETSCMAT_DLL

/*
    Factorization code for BAIJ format.
*/
#include "../src/mat/impls/baij/seq/baij.h"
#include "../src/mat/blockinvert.h"

/* ------------------------------------------------------------*/
/*
      Version for when blocks are 4 by 4
*/
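
/*
   Illustrative sketch (added commentary, not part of the original PETSc
   source): each 4 by 4 block is stored as 16 contiguous MatScalars in
   column-major order, so block entry (r,c) lives at offset 4*c+r.  The
   hand-unrolled kernels below all assume this layout.  A plain-C block
   multiply written against the same convention, for reference only:
*/
static void MatBlockMultiply_4_Sketch(const MatScalar *A,const MatScalar *B,MatScalar *C)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      C[4*c+r] = 0.0;                                    /* C(r,c) = 0          */
      for (k=0; k<4; k++) C[4*c+r] += A[4*k+r]*B[4*c+k]; /* C(r,c) += A(r,k)*B(k,c) */
    }
  }
}
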
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x    = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
          x    = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv   += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv   += 16;
    }
    /* invert diagonal block */
    w    = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
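
/*
   Reference note (added, not in the original source): the long runs of
   "pc[..] = m.. = ..." and "x[..] -= m..*x.." statements above are the
   4x4 block operations PC = PC*PV and X = X - M*V unrolled by hand over
   the column-major layout.  The update, written as a plain triple loop:
*/
static void MatBlockMultSubtract_4_Sketch(MatScalar *X,const MatScalar *M,const MatScalar *V)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) X[4*c+r] -= M[4*k+r]*V[4*c+k]; /* X(r,c) -= M(r,k)*V(k,c) */
    }
  }
}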

/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       Kernel_A_gets_A_times_B()
       Kernel_A_gets_A_minus_B_times_C()
       Kernel_A_gets_inverse_A()
*/
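
/*
   For orientation (added note): on 4x4 column-major blocks the three
   kernels named above compute
     Kernel_A_gets_A_times_B_4(a,b,w)         :  A = A*B   (w is 16-entry scratch)
     Kernel_A_gets_A_minus_B_times_C_4(a,b,c) :  A = A - B*C
     Kernel_A_gets_inverse_A_4(a,shift)       :  A = inv(A), with "shift" used
                                                 to help survive zero pivots
   The real, unrolled implementations live in ../src/mat/blockinvert.h;
   the loop form below is an illustrative sketch of the first kernel's
   semantics only:
*/
static void Kernel_A_gets_A_times_B_4_Sketch(MatScalar *a,const MatScalar *b,MatScalar *w)
{
  PetscInt r,c,k;
  for (k=0; k<16; k++) w[k] = a[k];  /* w = old A */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      a[4*c+r] = 0.0;
      for (k=0; k<4; k++) a[4*c+r] += w[4*k+r]*b[4*c+k];
    }
  }
}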

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic,*ics;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
  ics  = ic;

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for  (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for  (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for(k=0;k < nzL;k++) {
      row = bjtmp[k];
      pc = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv   = b->a + bs2*bi[i];
    pj   = b->j + bi[i];
    nz   = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
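
/*
   Usage sketch (illustrative only, not from this file; exact type and
   ordering names can vary between PETSc versions): these numeric
   routines are installed as function pointers on the factor matrix and
   are normally reached through the generic factorization interface,
   roughly as follows for a bs=4 BAIJ matrix A:
*/
static PetscErrorCode MatLUFactor_BAIJ4_UsageSketch(Mat A)
{
  Mat            F;
  IS             rowperm,colperm;
  MatFactorInfo  info;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
  info.shiftamount = 0.0;
  ierr = MatGetOrdering(A,"natural",&rowperm,&colperm);CHKERRQ(ierr);
  ierr = MatGetFactor(A,"petsc",MAT_FACTOR_LU,&F);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic(F,A,rowperm,colperm,&info);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr); /* dispatches to a routine in this file */
  ierr = ISDestroy(rowperm);CHKERRQ(ierr);
  ierr = ISDestroy(colperm);CHKERRQ(ierr);
  ierr = MatDestroy(F);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}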

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x    = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv = ba + 16*diag_offset[row];
        pj = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
          x    = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv   += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv   += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
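
/*
   Layout note (added): in this non-inplace factor the L blocks of row i
   occupy bj[bi[i]..bi[i+1]) while U is stored from the end of bj
   backwards, so bdiag[] decreases with i; the strictly-upper blocks of
   row i are bj[bdiag[i+1]+1..bdiag[i]-1] and the (inverted) diagonal
   block sits at bdiag[i].  An illustrative sketch of counting the
   blocks of one factored row under that layout:
*/
static PetscInt MatFactoredRowBlockCount_Sketch(const PetscInt *bi,const PetscInt *bdiag,PetscInt i)
{
  return (bi[i+1]-bi[i])          /* L blocks                 */
       + (bdiag[i]-bdiag[i+1]-1)  /* strictly-upper U blocks  */
       + 1;                       /* diagonal block           */
}
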
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C=B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++){
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for  (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for  (j=0; j<nz; j++){
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for(k=0;k < nzL;k++) {
      row = bjtmp[k];
      pc = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv   = b->a + bs2*bi[i];
    pj   = b->j + bi[i];
    nz   = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
    ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++){
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
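
/*
   Added note: the SSE_* macros below come from the header named by the
   PETSC_HAVE_SSE configure macro and wrap single-precision SSE
   instructions on the named XMM registers.  As an illustrative sketch
   only (assuming 4-byte scalars and the 16-byte alignment enforced
   below), the "zero one 4x4 block" idiom of the first loop corresponds
   to the following compiler intrinsics:
*/
#include <xmmintrin.h>
static void MatZeroBlock_4_SSE_Sketch(float *x)
{
  __m128 z = _mm_setzero_ps(); /* analogue of XOR_PS(XMM7,XMM7)       */
  _mm_store_ps(x,z);           /* analogue of the STOREL/STOREH pairs */
  _mm_store_ps(x+4,z);
  _mm_store_ps(x+8,z);
  _mm_store_ps(x+12,z);
}
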
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
/*    int            nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X*/
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv   += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
/*        row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x  = rtmp+16*pj[j];
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule here, the flop count below might be unfairly high or low. */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
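
/*
   Added note: the COPY/CMPNEQ/OR/MOVEMASK sequence above tests a whole
   4x4 block against zero with four packed compares.  An illustrative
   intrinsics sketch of the same test (same single-precision and
   alignment assumptions as the sketch above):
*/
static int MatBlockIsNonzero_4_SSE_Sketch(const float *pc)
{
  __m128 z  = _mm_setzero_ps();
  __m128 m0 = _mm_cmpneq_ps(_mm_load_ps(pc),z);
  __m128 m1 = _mm_cmpneq_ps(_mm_load_ps(pc+4),z);
  __m128 m2 = _mm_cmpneq_ps(_mm_load_ps(pc+8),z);
  __m128 m3 = _mm_cmpneq_ps(_mm_load_ps(pc+12),z);
  /* nonzero iff any lane of any column differed from zero */
  return _mm_movemask_ps(_mm_or_ps(_mm_or_ps(m0,m1),_mm_or_ps(m2,m3)));
}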

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
{
  Mat            A=C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
/*    int            nonzero=0,colscale = 16; */
  PetscTruth     pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
        /* Copy zero register to memory locations */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
/*        x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc  = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
        /* Load block from lower triangle */
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

        /* Compare block to zero block */

        SSE_COPY_PS(XMM4,XMM7)
        SSE_CMPNEQ_PS(XMM4,XMM0)

        SSE_COPY_PS(XMM5,XMM7)
        SSE_CMPNEQ_PS(XMM5,XMM1)

        SSE_COPY_PS(XMM6,XMM7)
        SSE_CMPNEQ_PS(XMM6,XMM2)

        SSE_CMPNEQ_PS(XMM7,XMM3)

        /* Reduce the comparisons to one SSE register */
        SSE_OR_PS(XMM6,XMM7)
        SSE_OR_PS(XMM5,XMM4)
        SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
          /* Column 0, product is accumulated in XMM4 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM1)
          SSE_ADD_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM2)
          SSE_ADD_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM4,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

          /* Column 1, product is accumulated in XMM5 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_ADD_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

          /* Column 2, product is accumulated in XMM6 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_ADD_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

          /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
          /* Column 3, product is accumulated in XMM0 */
          SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM0,XMM7)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM1,XMM7)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
          SSE_SHUFFLE(XMM1,XMM1,0x00)
          SSE_MULT_PS(XMM1,XMM2)
          SSE_ADD_PS(XMM0,XMM1)

          SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM3,XMM7)
          SSE_ADD_PS(XMM0,XMM3)

          SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
          SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

          /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
          /* Copy Multiplier Col 3 into XMM3 */
          SSE_COPY_PS(XMM3,XMM0)
          /* Copy Multiplier Col 2 into XMM2 */
          SSE_COPY_PS(XMM2,XMM6)
          /* Copy Multiplier Col 1 into XMM1 */
          SSE_COPY_PS(XMM1,XMM5)
          /* Copy Multiplier Col 0 into XMM0 */
          SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
            /* Load First Column of X*/
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

            /* Second Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM0)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM1)
            SSE_SUB_PS(XMM5,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM2)
            SSE_SUB_PS(XMM5,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM3)
            SSE_SUB_PS(XMM5,XMM6)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

            SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

            /* Third Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM0)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
            SSE_SHUFFLE(XMM4,XMM4,0x00)
            SSE_MULT_PS(XMM4,XMM1)
            SSE_SUB_PS(XMM6,XMM4)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM2)
            SSE_SUB_PS(XMM6,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM3)
            SSE_SUB_PS(XMM6,XMM7)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

            /* Fourth Column */
            SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

            /* Matrix-Vector Product: */
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM0)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
            SSE_SHUFFLE(XMM6,XMM6,0x00)
            SSE_MULT_PS(XMM6,XMM1)
            SSE_SUB_PS(XMM4,XMM6)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv   += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
/*        row = (*bjtmp++)/4; */
/*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x  = rtmp+16*((unsigned int)pj[j]);
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule here, the flop count below might be unfairly high or low. */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
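
/*
   Added note: the "usj" variants reinterpret the column-index arrays as
   unsigned short to halve index bandwidth, which is only valid while
   every block-column index fits in 16 bits.  A guard a caller could
   apply before choosing this path (illustrative sketch):
*/
static PetscTruth MatUsjIndicesFit_Sketch(PetscInt mbs)
{
  return (mbs <= 65535) ? PETSC_TRUE : PETSC_FALSE;
}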
1412 
1413 #undef __FUNCT__
1414 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1415 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1416 {
1417   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1418   PetscErrorCode ierr;
1419   int  i,j,n = a->mbs;
1420   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1421   unsigned int   row;
1422   int            *ajtmpold,nz,*bi=b->i;
1423   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1424   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1425   MatScalar      *ba = b->a,*aa = a->a;
1426   int            nonzero=0;
1427 /*    int            nonzero=0,colscale = 16; */
1428   PetscTruth     pivotinblocks = b->pivotinblocks;
1429   PetscReal      shift = info->shiftamount;
1430 
1431   PetscFunctionBegin;
1432   SSE_SCOPE_BEGIN;
1433 
1434   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16-byte aligned.  SSE will not work.");
1435   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16-byte aligned.  SSE will not work.");
1436   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1437   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16-byte aligned.  SSE will not work.");
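  /* The SSE kernels assume 16-byte aligned block storage, hence the explicit
     checks above; rtmp holds one full block row as dense 4x4 blocks
     (16 scalars each), addressed directly by block column index. */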
1438 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1439 /*      colscale = 4; */
1440 /*    } */
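  /* If the factor shares its column index array with A (an in-place
     ILU(0)-style factorization), dispatch to the dedicated in-place kernel. */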
1441   if ((unsigned long)bj==(unsigned long)aj) {
1442     PetscFunctionReturn(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1443   }
1444 
1445   for (i=0; i<n; i++) {
1446     nz    = bi[i+1] - bi[i];
1447     bjtmp = bj + bi[i];
1448     /* zero out the 4x4 block accumulators */
1449     /* zero out one register */
1450     XOR_PS(XMM7,XMM7);
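    /* XORing XMM7 with itself produces a register of zeros without touching memory */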
1451     for  (j=0; j<nz; j++) {
1452       x = rtmp+16*((unsigned int)bjtmp[j]);
1453 /*        x = rtmp+4*bjtmp[j]; */
1454       SSE_INLINE_BEGIN_1(x)
1455         /* Copy zero register to memory locations */
1456         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1457         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1458         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1459         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1460         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1461         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1462         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1463         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1464         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1465       SSE_INLINE_END_1;
1466     }
1467     /* load in initial (unfactored row) */
1468     nz       = ai[i+1] - ai[i];
1469     ajtmpold = aj + ai[i];
1470     v        = aa + 16*ai[i];
1471     for (j=0; j<nz; j++) {
1472       x = rtmp+16*ajtmpold[j];
1473 /*        x = rtmp+colscale*ajtmpold[j]; */
1474       /* Copy v block into x block */
1475       SSE_INLINE_BEGIN_2(v,x)
1476         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1477         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1478         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1479 
1480         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1481         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1482 
1483         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1484         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1485 
1486         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1487         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1488 
1489         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1490         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1491 
1492         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1493         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1494 
1495         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1496         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1497 
1498         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1499         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1500       SSE_INLINE_END_2;
1501 
1502       v += 16;
1503     }
1504 /*      row = (*bjtmp++)/4; */
1505     row = (unsigned int)(*bjtmp++);
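    /* Eliminate with each previously factored row whose block column appears
       to the left of the diagonal block in row i. */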
1506     while (row < i) {
1507       pc  = rtmp + 16*row;
1508       SSE_INLINE_BEGIN_1(pc)
1509         /* Load block from lower triangle */
1510         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1511         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1512         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1513 
1514         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1515         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1516 
1517         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1518         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1519 
1520         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1521         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1522 
1523         /* Compare block to zero block */
1524 
1525         SSE_COPY_PS(XMM4,XMM7)
1526         SSE_CMPNEQ_PS(XMM4,XMM0)
1527 
1528         SSE_COPY_PS(XMM5,XMM7)
1529         SSE_CMPNEQ_PS(XMM5,XMM1)
1530 
1531         SSE_COPY_PS(XMM6,XMM7)
1532         SSE_CMPNEQ_PS(XMM6,XMM2)
1533 
1534         SSE_CMPNEQ_PS(XMM7,XMM3)
1535 
1536         /* Reduce the comparisons to one SSE register */
1537         SSE_OR_PS(XMM6,XMM7)
1538         SSE_OR_PS(XMM5,XMM4)
1539         SSE_OR_PS(XMM5,XMM6)
1540       SSE_INLINE_END_1;
1541 
1542       /* Reduce the one SSE register to an integer register for branching */
1543       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1544       MOVEMASK(nonzero,XMM5);
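      /* nonzero now holds the four per-lane sign bits of the OR-reduced
         compare mask: it is 0 exactly when all 16 entries of the block are
         zero, so numerically zero (but structurally present) blocks skip
         the elimination work below. */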
1545 
1546       /* If block is nonzero ... */
1547       if (nonzero) {
1548         pv = ba + 16*diag_offset[row];
1549         PREFETCH_L1(&pv[16]);
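        /* prefetch the block just past the diagonal of row `row' into L1;
           the update loop below reads it first */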
1550         pj = bj + diag_offset[row] + 1;
1551 
1552         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1553         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)), */
1554         /* but the diagonal block was already inverted */
1555         /* and L_ij^(k) is already loaded columnwise into registers XMM0-XMM3 */
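        /* A scalar sketch of the product computed below, assuming the 4x4
           blocks are stored column-major (pc_old names the block before the
           overwrite; the names here are illustrative only):
             for (col=0; col<4; col++)
               for (k=0; k<4; k++)
                 pc[4*col+k] = pc_old[k]    * pv[4*col]
                             + pc_old[4+k]  * pv[4*col+1]
                             + pc_old[8+k]  * pv[4*col+2]
                             + pc_old[12+k] * pv[4*col+3];
        */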
1556 
1557         SSE_INLINE_BEGIN_2(pv,pc)
1558           /* Column 0, product is accumulated in XMM4 */
1559           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1560           SSE_SHUFFLE(XMM4,XMM4,0x00)
1561           SSE_MULT_PS(XMM4,XMM0)
1562 
1563           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1564           SSE_SHUFFLE(XMM5,XMM5,0x00)
1565           SSE_MULT_PS(XMM5,XMM1)
1566           SSE_ADD_PS(XMM4,XMM5)
1567 
1568           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1569           SSE_SHUFFLE(XMM6,XMM6,0x00)
1570           SSE_MULT_PS(XMM6,XMM2)
1571           SSE_ADD_PS(XMM4,XMM6)
1572 
1573           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1574           SSE_SHUFFLE(XMM7,XMM7,0x00)
1575           SSE_MULT_PS(XMM7,XMM3)
1576           SSE_ADD_PS(XMM4,XMM7)
1577 
1578           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1579           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1580 
1581           /* Column 1, product is accumulated in XMM5 */
1582           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1583           SSE_SHUFFLE(XMM5,XMM5,0x00)
1584           SSE_MULT_PS(XMM5,XMM0)
1585 
1586           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1587           SSE_SHUFFLE(XMM6,XMM6,0x00)
1588           SSE_MULT_PS(XMM6,XMM1)
1589           SSE_ADD_PS(XMM5,XMM6)
1590 
1591           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1592           SSE_SHUFFLE(XMM7,XMM7,0x00)
1593           SSE_MULT_PS(XMM7,XMM2)
1594           SSE_ADD_PS(XMM5,XMM7)
1595 
1596           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1597           SSE_SHUFFLE(XMM6,XMM6,0x00)
1598           SSE_MULT_PS(XMM6,XMM3)
1599           SSE_ADD_PS(XMM5,XMM6)
1600 
1601           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1602           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1603 
1604           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1605 
1606           /* Column 2, product is accumulated in XMM6 */
1607           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1608           SSE_SHUFFLE(XMM6,XMM6,0x00)
1609           SSE_MULT_PS(XMM6,XMM0)
1610 
1611           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1612           SSE_SHUFFLE(XMM7,XMM7,0x00)
1613           SSE_MULT_PS(XMM7,XMM1)
1614           SSE_ADD_PS(XMM6,XMM7)
1615 
1616           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1617           SSE_SHUFFLE(XMM7,XMM7,0x00)
1618           SSE_MULT_PS(XMM7,XMM2)
1619           SSE_ADD_PS(XMM6,XMM7)
1620 
1621           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1622           SSE_SHUFFLE(XMM7,XMM7,0x00)
1623           SSE_MULT_PS(XMM7,XMM3)
1624           SSE_ADD_PS(XMM6,XMM7)
1625 
1626           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1627           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1628 
1629           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1630           /* Column 3, product is accumulated in XMM0 */
1631           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1632           SSE_SHUFFLE(XMM7,XMM7,0x00)
1633           SSE_MULT_PS(XMM0,XMM7)
1634 
1635           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1636           SSE_SHUFFLE(XMM7,XMM7,0x00)
1637           SSE_MULT_PS(XMM1,XMM7)
1638           SSE_ADD_PS(XMM0,XMM1)
1639 
1640           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1641           SSE_SHUFFLE(XMM1,XMM1,0x00)
1642           SSE_MULT_PS(XMM1,XMM2)
1643           SSE_ADD_PS(XMM0,XMM1)
1644 
1645           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1646           SSE_SHUFFLE(XMM7,XMM7,0x00)
1647           SSE_MULT_PS(XMM3,XMM7)
1648           SSE_ADD_PS(XMM0,XMM3)
1649 
1650           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1651           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1652 
1653           /* Simplify bookkeeping -- these copies are completely unnecessary instructions, */
1654           /* but this is code to be maintained and read by humans, after all. */
1655           /* Copy Multiplier Col 3 into XMM3 */
1656           SSE_COPY_PS(XMM3,XMM0)
1657           /* Copy Multiplier Col 2 into XMM2 */
1658           SSE_COPY_PS(XMM2,XMM6)
1659           /* Copy Multiplier Col 1 into XMM1 */
1660           SSE_COPY_PS(XMM1,XMM5)
1661           /* Copy Multiplier Col 0 into XMM0 */
1662           SSE_COPY_PS(XMM0,XMM4)
1663         SSE_INLINE_END_2;
1664 
1665         /* Update the row: */
1666         nz = bi[row+1] - diag_offset[row] - 1;
1667         pv += 16;
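        /* nz counts the blocks of U strictly past the diagonal in row `row';
           pv now points just beyond the already-inverted diagonal block */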
1668         for (j=0; j<nz; j++) {
1669           PREFETCH_L1(&pv[16]);
1670           x = rtmp + 16*((unsigned int)pj[j]);
1671 /*            x = rtmp + 4*pj[j]; */
1672 
1673           /* X:=X-M*PV, One column at a time */
1674           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
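          /* A scalar sketch of the update below, assuming column-major 4x4
             blocks (m denotes the multiplier held in XMM0-XMM3):
               for (col=0; col<4; col++)
                 for (k=0; k<4; k++)
                   x[4*col+k] -= m[k]    * pv[4*col]
                               + m[4+k]  * pv[4*col+1]
                               + m[8+k]  * pv[4*col+2]
                               + m[12+k] * pv[4*col+3];
          */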
1675           SSE_INLINE_BEGIN_2(x,pv)
1676             /* Load First Column of X*/
1677             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1678             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1679 
1680             /* Matrix-Vector Product: */
1681             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1682             SSE_SHUFFLE(XMM5,XMM5,0x00)
1683             SSE_MULT_PS(XMM5,XMM0)
1684             SSE_SUB_PS(XMM4,XMM5)
1685 
1686             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1687             SSE_SHUFFLE(XMM6,XMM6,0x00)
1688             SSE_MULT_PS(XMM6,XMM1)
1689             SSE_SUB_PS(XMM4,XMM6)
1690 
1691             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1692             SSE_SHUFFLE(XMM7,XMM7,0x00)
1693             SSE_MULT_PS(XMM7,XMM2)
1694             SSE_SUB_PS(XMM4,XMM7)
1695 
1696             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1697             SSE_SHUFFLE(XMM5,XMM5,0x00)
1698             SSE_MULT_PS(XMM5,XMM3)
1699             SSE_SUB_PS(XMM4,XMM5)
1700 
1701             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1702             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1703 
1704             /* Second Column */
1705             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1706             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1707 
1708             /* Matrix-Vector Product: */
1709             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1710             SSE_SHUFFLE(XMM6,XMM6,0x00)
1711             SSE_MULT_PS(XMM6,XMM0)
1712             SSE_SUB_PS(XMM5,XMM6)
1713 
1714             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1715             SSE_SHUFFLE(XMM7,XMM7,0x00)
1716             SSE_MULT_PS(XMM7,XMM1)
1717             SSE_SUB_PS(XMM5,XMM7)
1718 
1719             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1720             SSE_SHUFFLE(XMM4,XMM4,0x00)
1721             SSE_MULT_PS(XMM4,XMM2)
1722             SSE_SUB_PS(XMM5,XMM4)
1723 
1724             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1725             SSE_SHUFFLE(XMM6,XMM6,0x00)
1726             SSE_MULT_PS(XMM6,XMM3)
1727             SSE_SUB_PS(XMM5,XMM6)
1728 
1729             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1730             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1731 
1732             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1733 
1734             /* Third Column */
1735             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1736             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1737 
1738             /* Matrix-Vector Product: */
1739             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1740             SSE_SHUFFLE(XMM7,XMM7,0x00)
1741             SSE_MULT_PS(XMM7,XMM0)
1742             SSE_SUB_PS(XMM6,XMM7)
1743 
1744             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1745             SSE_SHUFFLE(XMM4,XMM4,0x00)
1746             SSE_MULT_PS(XMM4,XMM1)
1747             SSE_SUB_PS(XMM6,XMM4)
1748 
1749             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1750             SSE_SHUFFLE(XMM5,XMM5,0x00)
1751             SSE_MULT_PS(XMM5,XMM2)
1752             SSE_SUB_PS(XMM6,XMM5)
1753 
1754             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1755             SSE_SHUFFLE(XMM7,XMM7,0x00)
1756             SSE_MULT_PS(XMM7,XMM3)
1757             SSE_SUB_PS(XMM6,XMM7)
1758 
1759             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1760             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1761 
1762             /* Fourth Column */
1763             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1764             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1765 
1766             /* Matrix-Vector Product: */
1767             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1768             SSE_SHUFFLE(XMM5,XMM5,0x00)
1769             SSE_MULT_PS(XMM5,XMM0)
1770             SSE_SUB_PS(XMM4,XMM5)
1771 
1772             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1773             SSE_SHUFFLE(XMM6,XMM6,0x00)
1774             SSE_MULT_PS(XMM6,XMM1)
1775             SSE_SUB_PS(XMM4,XMM6)
1776 
1777             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1778             SSE_SHUFFLE(XMM7,XMM7,0x00)
1779             SSE_MULT_PS(XMM7,XMM2)
1780             SSE_SUB_PS(XMM4,XMM7)
1781 
1782             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1783             SSE_SHUFFLE(XMM5,XMM5,0x00)
1784             SSE_MULT_PS(XMM5,XMM3)
1785             SSE_SUB_PS(XMM4,XMM5)
1786 
1787             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1788             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1789           SSE_INLINE_END_2;
1790           pv   += 16;
1791         }
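        /* as above: 112 flops to form the multiplier, 128 per updated block */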
1792         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1793       }
1794       row = (unsigned int)(*bjtmp++);
1795 /*        row = (*bjtmp++)/4; */
1796 /*        bjtmp++; */
1797     }
1798     /* finished the row, so copy it back into b->a */
1799     pv = ba + 16*bi[i];
1800     pj = bj + bi[i];
1801     nz = bi[i+1] - bi[i];
1802 
1803     /* Copy x block back into pv block */
1804     for (j=0; j<nz; j++) {
1805       x  = rtmp+16*((unsigned int)pj[j]);
1806 /*        x  = rtmp+4*pj[j]; */
1807 
1808       SSE_INLINE_BEGIN_2(x,pv)
1809         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1810         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1811         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1812 
1813         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1814         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1815 
1816         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1817         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1818 
1819         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1820         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1821 
1822         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1823         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1824 
1825         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1826         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1827 
1828         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1829         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1830 
1831         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1832         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1833       SSE_INLINE_END_2;
1834       pv += 16;
1835     }
1836     /* invert diagonal block */
1837     w = ba + 16*diag_offset[i];
1838     if (pivotinblocks) {
1839       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1840     } else {
1841       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1842     }
1843 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1844     /* Note: since the inversion uses Cramer's rule, the flop count below may be unfairly high or low */
1845   }
1846 
1847   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1848   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1849   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1850   C->assembled = PETSC_TRUE;
1851   /* flop count from inverting the 4x4 diagonal blocks (bs = 4, bs2 = 16) */
1852   ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr);
1853   SSE_SCOPE_END;
1854   PetscFunctionReturn(0);
1855 }
1856 
1857 #endif
1858