xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision 84df9cb40eca90ea9b18a456fab7a4ecc7f6c1a4)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
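/*
    For orientation: the numeric factorization routines in this file are not called directly;
    they are reached through the generic PETSc factorization interface once a SeqBAIJ matrix
    with block size 4 has been factored symbolically.  A minimal sketch using the public API
    (error checking with CHKERRQ() omitted; with the natural ordering the *_NaturalOrdering
    variants below are typically selected):

       Mat           F;
       IS            row,col;
       MatFactorInfo info;

       MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
       MatGetOrdering(A,MATORDERINGNATURAL,&row,&col);
       MatFactorInfoInitialize(&info);
       MatLUFactorSymbolic(F,A,row,col,&info);
       MatLUFactorNumeric(F,A,&info);      (dispatches to one of the routines in this file)
*/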
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <../src/mat/blockinvert.h>
7 
8 /* ------------------------------------------------------------*/
9 /*
10       Version for when blocks are 4 by 4
11 */
12 #undef __FUNCT__
13 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
14 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
15 {
16   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
17   IS             isrow = b->row,isicol = b->icol;
18   PetscErrorCode ierr;
19   const PetscInt *r,*ic;
20   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21   PetscInt       *ajtmpold,*ajtmp,nz,row;
22   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
23   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
24   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
25   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
26   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
27   MatScalar      m13,m14,m15,m16;
28   MatScalar      *ba = b->a,*aa = a->a;
29   PetscBool      pivotinblocks = b->pivotinblocks;
30   PetscReal      shift = info->shiftamount;
31 
32   PetscFunctionBegin;
33   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
34   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
35   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
36 
37   for (i=0; i<n; i++) {
38     nz    = bi[i+1] - bi[i];
39     ajtmp = bj + bi[i];
40     for  (j=0; j<nz; j++) {
41       x = rtmp+16*ajtmp[j];
42       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
43       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44     }
45     /* load in initial (unfactored) row */
46     idx      = r[i];
47     nz       = ai[idx+1] - ai[idx];
48     ajtmpold = aj + ai[idx];
49     v        = aa + 16*ai[idx];
50     for (j=0; j<nz; j++) {
51       x    = rtmp+16*ic[ajtmpold[j]];
52       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
53       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
54       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55       x[14] = v[14]; x[15] = v[15];
56       v    += 16;
57     }
58     row = *ajtmp++;
59     while (row < i) {
60       pc  = rtmp + 16*row;
61       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
62       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
63       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64       p15 = pc[14]; p16 = pc[15];
65       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68           || p16 != 0.0) {
69         pv = ba + 16*diag_offset[row];
70         pj = bj + diag_offset[row] + 1;
71         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
72         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
73         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74         x15 = pv[14]; x16 = pv[15];
75         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
76         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
77         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
78         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
79 
80         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
81         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
82         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
83         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
84 
85         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
86         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
87         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
88         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
89 
90         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
91         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
92         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
93         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
94 
95         nz = bi[row+1] - diag_offset[row] - 1;
96         pv += 16;
97         for (j=0; j<nz; j++) {
98           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
99           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
102           x    = rtmp + 16*pj[j];
103           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
107 
108           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
112 
113           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
117 
118           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
122 
123           pv   += 16;
124         }
125         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
126       }
127       row = *ajtmp++;
128     }
129     /* finished row so stick it into b->a */
130     pv = ba + 16*bi[i];
131     pj = bj + bi[i];
132     nz = bi[i+1] - bi[i];
133     for (j=0; j<nz; j++) {
134       x      = rtmp+16*pj[j];
135       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139       pv   += 16;
140     }
141     /* invert diagonal block */
142     w    = ba + 16*diag_offset[i];
143     if (pivotinblocks) {
144       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
145     } else {
146       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
147     }
148   }
149 
150   ierr = PetscFree(rtmp);CHKERRQ(ierr);
151   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
153   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
154   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
155   C->assembled = PETSC_TRUE;
156   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
157   PetscFunctionReturn(0);
158 }
159 
160 /* MatLUFactorNumeric_SeqBAIJ_4 -
161      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
162        Kernel_A_gets_A_times_B()
163        Kernel_A_gets_A_minus_B_times_C()
164        Kernel_A_gets_inverse_A()
165 */
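/*
   The first two of those block kernels act on 4x4 blocks stored column-major in arrays of
   16 MatScalar (the layout of b->a and of the work row rtmp below).  A plain-C sketch of
   their semantics, kept out of the build: the hand-unrolled kernels called below behave
   like this, but the helper names here are illustrative only.
*/
#if 0
static void Sketch_A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt i,j,k;
  for (j=0; j<4; j++) {                                   /* column j of the product */
    for (i=0; i<4; i++) {
      W[i+4*j] = 0.0;
      for (k=0; k<4; k++) W[i+4*j] += A[i+4*k]*B[k+4*j];
    }
  }
  for (i=0; i<16; i++) A[i] = W[i];                       /* A <- A*B */
}

static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt i,j,k;
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      for (k=0; k<4; k++) A[i+4*j] -= B[i+4*k]*C[k+4*j];  /* A <- A - B*C */
    }
  }
}
#endif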
166 
167 #undef __FUNCT__
168 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
169 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
170 {
171   Mat            C=B;
172   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
173   IS             isrow = b->row,isicol = b->icol;
174   PetscErrorCode ierr;
175   const PetscInt *r,*ic;
176   PetscInt       i,j,k,nz,nzL,row;
177   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
178   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
179   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
180   PetscInt       flg;
181   PetscReal      shift;
182 
183   PetscFunctionBegin;
184   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
185   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
186 
187   if (info->shifttype == MAT_SHIFT_NONE){
188     shift = 0;
189   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
190     shift = info->shiftamount;
191   }
192 
193   /* generate work space needed by the factorization */
194   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
195   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
196 
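  /*
     Storage layout used below: for block row i the L part occupies bj/b->a positions
     bi[i] .. bi[i+1]-1, the diagonal block (stored inverted once the row is finished) sits
     at bdiag[i], and the strictly upper part U(i,:) occupies bdiag[i+1]+1 .. bdiag[i]-1,
     giving nz = bdiag[i] - bdiag[i+1] - 1 off-diagonal U blocks.  For example, if
     bdiag[i] = 20 and bdiag[i+1] = 16, row i has 3 U blocks at bj[17..19] and its diagonal
     block at b->a + bs2*20.
  */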
197   for (i=0; i<n; i++){
198     /* zero rtmp */
199     /* L part */
200     nz    = bi[i+1] - bi[i];
201     bjtmp = bj + bi[i];
202     for  (j=0; j<nz; j++){
203       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
204     }
205 
206     /* U part */
207     nz = bdiag[i]-bdiag[i+1];
208     bjtmp = bj + bdiag[i+1]+1;
209     for  (j=0; j<nz; j++){
210       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
211     }
212 
213     /* load in initial (unfactored) row */
214     nz    = ai[r[i]+1] - ai[r[i]];
215     ajtmp = aj + ai[r[i]];
216     v     = aa + bs2*ai[r[i]];
217     for (j=0; j<nz; j++) {
218       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
219     }
220 
221     /* elimination */
222     bjtmp = bj + bi[i];
223     nzL   = bi[i+1] - bi[i];
224     for(k=0;k < nzL;k++) {
225       row = bjtmp[k];
226       pc = rtmp + bs2*row;
227       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
228       if (flg) {
229         pv = b->a + bs2*bdiag[row];
230         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
231         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
232 
233         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
234         pv = b->a + bs2*(bdiag[row+1]+1);
235         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
236         for (j=0; j<nz; j++) {
237           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
238           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
239           v    = rtmp + bs2*pj[j];
240           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
241           pv  += bs2;
242         }
243         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs = 4 */
244       }
245     }
246 
247     /* finished row so stick it into b->a */
248     /* L part */
249     pv   = b->a + bs2*bi[i] ;
250     pj   = b->j + bi[i] ;
251     nz   = bi[i+1] - bi[i];
252     for (j=0; j<nz; j++) {
253       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
254     }
255 
256     /* Mark diagonal and invert diagonal for simpler triangular solves */
257     pv   = b->a + bs2*bdiag[i];
258     pj   = b->j + bdiag[i];
259     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
260     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
261 
262     /* U part */
263     pv = b->a + bs2*(bdiag[i+1]+1);
264     pj = b->j + bdiag[i+1]+1;
265     nz = bdiag[i] - bdiag[i+1] - 1;
266     for (j=0; j<nz; j++){
267       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
268     }
269   }
270 
271   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
272   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
273   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
274   C->ops->solve          = MatSolve_SeqBAIJ_4;
275   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
276   C->assembled = PETSC_TRUE;
277   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
278   PetscFunctionReturn(0);
279 }
280 
281 #undef __FUNCT__
282 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
283 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
284 {
285 /*
286     Default version for 4 by 4 blocks using the natural ordering
287 */
288   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
289   PetscErrorCode ierr;
290   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
291   PetscInt       *ajtmpold,*ajtmp,nz,row;
292   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
293   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
294   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
295   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
296   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
297   MatScalar      m13,m14,m15,m16;
298   MatScalar      *ba = b->a,*aa = a->a;
299   PetscBool      pivotinblocks = b->pivotinblocks;
300   PetscReal      shift = info->shiftamount;
301 
302   PetscFunctionBegin;
303   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
304 
305   for (i=0; i<n; i++) {
306     nz    = bi[i+1] - bi[i];
307     ajtmp = bj + bi[i];
308     for  (j=0; j<nz; j++) {
309       x = rtmp+16*ajtmp[j];
310       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
311       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
312     }
313     /* load in initial (unfactored) row */
314     nz       = ai[i+1] - ai[i];
315     ajtmpold = aj + ai[i];
316     v        = aa + 16*ai[i];
317     for (j=0; j<nz; j++) {
318       x    = rtmp+16*ajtmpold[j];
319       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
320       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
321       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
322       x[14] = v[14]; x[15] = v[15];
323       v    += 16;
324     }
325     row = *ajtmp++;
326     while (row < i) {
327       pc  = rtmp + 16*row;
328       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
329       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
330       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
331       p15 = pc[14]; p16 = pc[15];
332       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
333           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
334           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
335           || p16 != 0.0) {
336         pv = ba + 16*diag_offset[row];
337         pj = bj + diag_offset[row] + 1;
338         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
339         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
340         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
341         x15 = pv[14]; x16 = pv[15];
342         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
343         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
344         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
345         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
346 
347         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
348         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
349         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
350         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
351 
352         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
353         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
354         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
355         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
356 
357         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
358         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
359         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
360         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
361         nz = bi[row+1] - diag_offset[row] - 1;
362         pv += 16;
363         for (j=0; j<nz; j++) {
364           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
365           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
366           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
367           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
368           x    = rtmp + 16*pj[j];
369           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
370           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
371           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
372           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
373 
374           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
375           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
376           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
377           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
378 
379           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
380           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
381           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
382           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
383 
384           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
385           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
386           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
387           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
388 
389           pv   += 16;
390         }
391         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
392       }
393       row = *ajtmp++;
394     }
395     /* finished row so stick it into b->a */
396     pv = ba + 16*bi[i];
397     pj = bj + bi[i];
398     nz = bi[i+1] - bi[i];
399     for (j=0; j<nz; j++) {
400       x      = rtmp+16*pj[j];
401       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
402       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
403       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
404       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
405       pv   += 16;
406     }
407     /* invert diagonal block */
408     w = ba + 16*diag_offset[i];
409     if (pivotinblocks) {
410       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
411     } else {
412       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
413     }
414   }
415 
416   ierr = PetscFree(rtmp);CHKERRQ(ierr);
417   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
418   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
419   C->assembled = PETSC_TRUE;
420   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
421   PetscFunctionReturn(0);
422 }
423 
424 /*
425   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
426     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
427 */
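/*
  With the natural ordering the row permutation r[] and the inverse column permutation ic[]
  used in MatLUFactorNumeric_SeqBAIJ_4() are identity maps, so the permuted accesses

     v = aa + bs2*ai[r[i]];      ...      rtmp + bs2*ic[ajtmp[j]]

  reduce to the direct accesses used below:

     v = aa + bs2*ai[i];         ...      rtmp + bs2*ajtmp[j]

  Otherwise the algorithm is the same.
*/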
428 #undef __FUNCT__
429 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
430 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
431 {
432   Mat            C=B;
433   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
434   PetscErrorCode ierr;
435   PetscInt       i,j,k,nz,nzL,row;
436   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
437   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
438   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
439   PetscInt       flg;
440   PetscReal      shift;
441 
442   PetscFunctionBegin;
443   /* generate work space needed by the factorization */
444   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
445   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
446 
447   if (info->shifttype == MAT_SHIFT_NONE){
448     shift = 0;
449   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
450     shift = info->shiftamount;
451   }
452 
453   for (i=0; i<n; i++){
454     /* zero rtmp */
455     /* L part */
456     nz    = bi[i+1] - bi[i];
457     bjtmp = bj + bi[i];
458     for  (j=0; j<nz; j++){
459       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
460     }
461 
462     /* U part */
463     nz = bdiag[i] - bdiag[i+1];
464     bjtmp = bj + bdiag[i+1]+1;
465     for  (j=0; j<nz; j++){
466       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
467     }
468 
469     /* load in initial (unfactored) row */
470     nz    = ai[i+1] - ai[i];
471     ajtmp = aj + ai[i];
472     v     = aa + bs2*ai[i];
473     for (j=0; j<nz; j++) {
474       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
475     }
476 
477     /* elimination */
478     bjtmp = bj + bi[i];
479     nzL   = bi[i+1] - bi[i];
480     for(k=0;k < nzL;k++) {
481       row = bjtmp[k];
482       pc = rtmp + bs2*row;
483       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
484       if (flg) {
485         pv = b->a + bs2*bdiag[row];
486         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
487         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
488 
489         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
490         pv = b->a + bs2*(bdiag[row+1]+1);
491         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
492         for (j=0; j<nz; j++) {
493           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
494           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
495           v    = rtmp + bs2*pj[j];
496           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
497           pv  += bs2;
498         }
499         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs = 4 */
500       }
501     }
502 
503     /* finished row so stick it into b->a */
504     /* L part */
505     pv   = b->a + bs2*bi[i] ;
506     pj   = b->j + bi[i] ;
507     nz   = bi[i+1] - bi[i];
508     for (j=0; j<nz; j++) {
509       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
510     }
511 
512     /* Mark diagonal and invert diagonal for simpler triangular solves */
513     pv   = b->a + bs2*bdiag[i];
514     pj   = b->j + bdiag[i];
515     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
516     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
517 
518     /* U part */
519     pv = b->a + bs2*(bdiag[i+1]+1);
520     pj = b->j + bdiag[i+1]+1;
521     nz = bdiag[i] - bdiag[i+1] - 1;
522     for (j=0; j<nz; j++){
523       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
524     }
525   }
526   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
527   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
528   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
529   C->assembled = PETSC_TRUE;
530   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
531   PetscFunctionReturn(0);
532 }
533 
534 #if defined(PETSC_HAVE_SSE)
535 
536 #include PETSC_HAVE_SSE
537 
538 /* SSE version for 4 by 4 blocks using the natural ordering */
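/*
   The SSE_* macros below expand to hand-scheduled SSE, so this path relies on MatScalar
   being a single precision (4-byte) float, four values per 128-bit XMM register, and on
   the 16-byte aligned block storage that the pointer checks at the top of each routine
   enforce.  A rough sketch of the multiplier formation using standard intrinsics,
   assuming that layout (kept out of the build; the helper name is illustrative and the
   real code uses the portable SSE_* macro layer instead):
*/
#if 0
#include <xmmintrin.h>
static void Sketch_FormMultiplier4_SSE(float *pc,const float *pv)
{
  __m128 c0 = _mm_load_ps(pc),c1 = _mm_load_ps(pc+4),c2 = _mm_load_ps(pc+8),c3 = _mm_load_ps(pc+12);
  int    j;
  for (j=0; j<4; j++) {                                   /* one column of inv(L_jj) at a time */
    __m128 acc = _mm_mul_ps(_mm_set1_ps(pv[4*j]),c0);
    acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(pv[4*j+1]),c1));
    acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(pv[4*j+2]),c2));
    acc = _mm_add_ps(acc,_mm_mul_ps(_mm_set1_ps(pv[4*j+3]),c3));
    _mm_store_ps(pc+4*j,acc);                             /* L_ij <- L_ij * inv(L_jj) */
  }
}
#endif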
539 #undef __FUNCT__
540 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
541 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
542 {
543   Mat            C = B;
      Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
544   PetscErrorCode ierr;
545   int i,j,n = a->mbs;
546   int         *bj = b->j,*bjtmp,*pj;
547   int         row;
548   int         *ajtmpold,nz,*bi=b->i;
549   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
550   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
551   MatScalar   *ba = b->a,*aa = a->a;
552   int         nonzero=0;
553 /*    int            nonzero=0,colscale = 16; */
554   PetscBool   pivotinblocks = b->pivotinblocks;
555   PetscReal      shift = info->shiftamount;
556 
557   PetscFunctionBegin;
558   SSE_SCOPE_BEGIN;
559 
560   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
561   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
562   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
563   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
564 /*    if ((unsigned long)bj==(unsigned long)aj) { */
565 /*      colscale = 4; */
566 /*    } */
567   for (i=0; i<n; i++) {
568     nz    = bi[i+1] - bi[i];
569     bjtmp = bj + bi[i];
570     /* zero out the 4x4 block accumulators */
571     /* zero out one register */
572     XOR_PS(XMM7,XMM7);
573     for  (j=0; j<nz; j++) {
574       x = rtmp+16*bjtmp[j];
575 /*        x = rtmp+4*bjtmp[j]; */
576       SSE_INLINE_BEGIN_1(x)
577         /* Copy zero register to memory locations */
578         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
579         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
580         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
581         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
582         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
583         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
584         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
585         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
586         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
587       SSE_INLINE_END_1;
588     }
589     /* load in initial (unfactored) row */
590     nz       = ai[i+1] - ai[i];
591     ajtmpold = aj + ai[i];
592     v        = aa + 16*ai[i];
593     for (j=0; j<nz; j++) {
594       x = rtmp+16*ajtmpold[j];
595 /*        x = rtmp+colscale*ajtmpold[j]; */
596       /* Copy v block into x block */
597       SSE_INLINE_BEGIN_2(v,x)
598         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
599         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
600         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
601 
602         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
603         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
604 
605         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
606         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
607 
608         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
609         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
610 
611         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
612         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
613 
614         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
615         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
616 
617         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
618         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
619 
620         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
621         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
622       SSE_INLINE_END_2;
623 
624       v += 16;
625     }
626 /*      row = (*bjtmp++)/4; */
627     row = *bjtmp++;
628     while (row < i) {
629       pc  = rtmp + 16*row;
630       SSE_INLINE_BEGIN_1(pc)
631         /* Load block from lower triangle */
632         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
633         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
634         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
635 
636         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
637         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
638 
639         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
640         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
641 
642         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
643         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
644 
645         /* Compare block to zero block */
646 
647         SSE_COPY_PS(XMM4,XMM7)
648         SSE_CMPNEQ_PS(XMM4,XMM0)
649 
650         SSE_COPY_PS(XMM5,XMM7)
651         SSE_CMPNEQ_PS(XMM5,XMM1)
652 
653         SSE_COPY_PS(XMM6,XMM7)
654         SSE_CMPNEQ_PS(XMM6,XMM2)
655 
656         SSE_CMPNEQ_PS(XMM7,XMM3)
657 
658         /* Reduce the comparisons to one SSE register */
659         SSE_OR_PS(XMM6,XMM7)
660         SSE_OR_PS(XMM5,XMM4)
661         SSE_OR_PS(XMM5,XMM6)
662       SSE_INLINE_END_1;
663 
664       /* Reduce the one SSE register to an integer register for branching */
665       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
666       MOVEMASK(nonzero,XMM5);
667 
668       /* If block is nonzero ... */
669       if (nonzero) {
670         pv = ba + 16*diag_offset[row];
671         PREFETCH_L1(&pv[16]);
672         pj = bj + diag_offset[row] + 1;
673 
674         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
675         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
676         /* but the diagonal was inverted already */
677         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
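        /* Concretely: pv points at the already-inverted diagonal block inv(L_jj), stored
           column-major, so each scalar pv[4*c+k] is broadcast and multiplied against
           column k of L_ij (held in XMM0-XMM3); the four accumulated columns of
           L_ij*inv(L_jj) are then written back over pc. */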
678 
679         SSE_INLINE_BEGIN_2(pv,pc)
680           /* Column 0, product is accumulated in XMM4 */
681           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
682           SSE_SHUFFLE(XMM4,XMM4,0x00)
683           SSE_MULT_PS(XMM4,XMM0)
684 
685           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
686           SSE_SHUFFLE(XMM5,XMM5,0x00)
687           SSE_MULT_PS(XMM5,XMM1)
688           SSE_ADD_PS(XMM4,XMM5)
689 
690           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
691           SSE_SHUFFLE(XMM6,XMM6,0x00)
692           SSE_MULT_PS(XMM6,XMM2)
693           SSE_ADD_PS(XMM4,XMM6)
694 
695           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
696           SSE_SHUFFLE(XMM7,XMM7,0x00)
697           SSE_MULT_PS(XMM7,XMM3)
698           SSE_ADD_PS(XMM4,XMM7)
699 
700           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
701           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
702 
703           /* Column 1, product is accumulated in XMM5 */
704           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
705           SSE_SHUFFLE(XMM5,XMM5,0x00)
706           SSE_MULT_PS(XMM5,XMM0)
707 
708           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
709           SSE_SHUFFLE(XMM6,XMM6,0x00)
710           SSE_MULT_PS(XMM6,XMM1)
711           SSE_ADD_PS(XMM5,XMM6)
712 
713           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
714           SSE_SHUFFLE(XMM7,XMM7,0x00)
715           SSE_MULT_PS(XMM7,XMM2)
716           SSE_ADD_PS(XMM5,XMM7)
717 
718           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
719           SSE_SHUFFLE(XMM6,XMM6,0x00)
720           SSE_MULT_PS(XMM6,XMM3)
721           SSE_ADD_PS(XMM5,XMM6)
722 
723           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
724           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
725 
726           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
727 
728           /* Column 2, product is accumulated in XMM6 */
729           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
730           SSE_SHUFFLE(XMM6,XMM6,0x00)
731           SSE_MULT_PS(XMM6,XMM0)
732 
733           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
734           SSE_SHUFFLE(XMM7,XMM7,0x00)
735           SSE_MULT_PS(XMM7,XMM1)
736           SSE_ADD_PS(XMM6,XMM7)
737 
738           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
739           SSE_SHUFFLE(XMM7,XMM7,0x00)
740           SSE_MULT_PS(XMM7,XMM2)
741           SSE_ADD_PS(XMM6,XMM7)
742 
743           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
744           SSE_SHUFFLE(XMM7,XMM7,0x00)
745           SSE_MULT_PS(XMM7,XMM3)
746           SSE_ADD_PS(XMM6,XMM7)
747 
748           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
749           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
750 
751           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
752           /* Column 3, product is accumulated in XMM0 */
753           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
754           SSE_SHUFFLE(XMM7,XMM7,0x00)
755           SSE_MULT_PS(XMM0,XMM7)
756 
757           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
758           SSE_SHUFFLE(XMM7,XMM7,0x00)
759           SSE_MULT_PS(XMM1,XMM7)
760           SSE_ADD_PS(XMM0,XMM1)
761 
762           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
763           SSE_SHUFFLE(XMM1,XMM1,0x00)
764           SSE_MULT_PS(XMM1,XMM2)
765           SSE_ADD_PS(XMM0,XMM1)
766 
767           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
768           SSE_SHUFFLE(XMM7,XMM7,0x00)
769           SSE_MULT_PS(XMM3,XMM7)
770           SSE_ADD_PS(XMM0,XMM3)
771 
772           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
773           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
774 
775           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
776           /* This is code to be maintained and read by humans after all. */
777           /* Copy Multiplier Col 3 into XMM3 */
778           SSE_COPY_PS(XMM3,XMM0)
779           /* Copy Multiplier Col 2 into XMM2 */
780           SSE_COPY_PS(XMM2,XMM6)
781           /* Copy Multiplier Col 1 into XMM1 */
782           SSE_COPY_PS(XMM1,XMM5)
783           /* Copy Multiplier Col 0 into XMM0 */
784           SSE_COPY_PS(XMM0,XMM4)
785         SSE_INLINE_END_2;
786 
787         /* Update the row: */
788         nz = bi[row+1] - diag_offset[row] - 1;
789         pv += 16;
790         for (j=0; j<nz; j++) {
791           PREFETCH_L1(&pv[16]);
792           x = rtmp + 16*pj[j];
793 /*            x = rtmp + 4*pj[j]; */
794 
795           /* X:=X-M*PV, One column at a time */
796           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
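          /* As with the multiplier above: each scalar pv[4*c+k] of this U block is
             broadcast, multiplied against column k of M (XMM0-XMM3), and the result is
             subtracted from column c of the target block x. */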
797           SSE_INLINE_BEGIN_2(x,pv)
798             /* Load First Column of X*/
799             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
800             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
801 
802             /* Matrix-Vector Product: */
803             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
804             SSE_SHUFFLE(XMM5,XMM5,0x00)
805             SSE_MULT_PS(XMM5,XMM0)
806             SSE_SUB_PS(XMM4,XMM5)
807 
808             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
809             SSE_SHUFFLE(XMM6,XMM6,0x00)
810             SSE_MULT_PS(XMM6,XMM1)
811             SSE_SUB_PS(XMM4,XMM6)
812 
813             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
814             SSE_SHUFFLE(XMM7,XMM7,0x00)
815             SSE_MULT_PS(XMM7,XMM2)
816             SSE_SUB_PS(XMM4,XMM7)
817 
818             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
819             SSE_SHUFFLE(XMM5,XMM5,0x00)
820             SSE_MULT_PS(XMM5,XMM3)
821             SSE_SUB_PS(XMM4,XMM5)
822 
823             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
824             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
825 
826             /* Second Column */
827             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
828             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
829 
830             /* Matrix-Vector Product: */
831             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
832             SSE_SHUFFLE(XMM6,XMM6,0x00)
833             SSE_MULT_PS(XMM6,XMM0)
834             SSE_SUB_PS(XMM5,XMM6)
835 
836             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
837             SSE_SHUFFLE(XMM7,XMM7,0x00)
838             SSE_MULT_PS(XMM7,XMM1)
839             SSE_SUB_PS(XMM5,XMM7)
840 
841             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
842             SSE_SHUFFLE(XMM4,XMM4,0x00)
843             SSE_MULT_PS(XMM4,XMM2)
844             SSE_SUB_PS(XMM5,XMM4)
845 
846             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
847             SSE_SHUFFLE(XMM6,XMM6,0x00)
848             SSE_MULT_PS(XMM6,XMM3)
849             SSE_SUB_PS(XMM5,XMM6)
850 
851             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
852             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
853 
854             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
855 
856             /* Third Column */
857             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
858             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
859 
860             /* Matrix-Vector Product: */
861             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
862             SSE_SHUFFLE(XMM7,XMM7,0x00)
863             SSE_MULT_PS(XMM7,XMM0)
864             SSE_SUB_PS(XMM6,XMM7)
865 
866             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
867             SSE_SHUFFLE(XMM4,XMM4,0x00)
868             SSE_MULT_PS(XMM4,XMM1)
869             SSE_SUB_PS(XMM6,XMM4)
870 
871             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
872             SSE_SHUFFLE(XMM5,XMM5,0x00)
873             SSE_MULT_PS(XMM5,XMM2)
874             SSE_SUB_PS(XMM6,XMM5)
875 
876             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
877             SSE_SHUFFLE(XMM7,XMM7,0x00)
878             SSE_MULT_PS(XMM7,XMM3)
879             SSE_SUB_PS(XMM6,XMM7)
880 
881             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
882             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
883 
884             /* Fourth Column */
885             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
886             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
887 
888             /* Matrix-Vector Product: */
889             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
890             SSE_SHUFFLE(XMM5,XMM5,0x00)
891             SSE_MULT_PS(XMM5,XMM0)
892             SSE_SUB_PS(XMM4,XMM5)
893 
894             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
895             SSE_SHUFFLE(XMM6,XMM6,0x00)
896             SSE_MULT_PS(XMM6,XMM1)
897             SSE_SUB_PS(XMM4,XMM6)
898 
899             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
900             SSE_SHUFFLE(XMM7,XMM7,0x00)
901             SSE_MULT_PS(XMM7,XMM2)
902             SSE_SUB_PS(XMM4,XMM7)
903 
904             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
905             SSE_SHUFFLE(XMM5,XMM5,0x00)
906             SSE_MULT_PS(XMM5,XMM3)
907             SSE_SUB_PS(XMM4,XMM5)
908 
909             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
910             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
911           SSE_INLINE_END_2;
912           pv   += 16;
913         }
914         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
915       }
916       row = *bjtmp++;
917 /*        row = (*bjtmp++)/4; */
918     }
919     /* finished row so stick it into b->a */
920     pv = ba + 16*bi[i];
921     pj = bj + bi[i];
922     nz = bi[i+1] - bi[i];
923 
924     /* Copy x block back into pv block */
925     for (j=0; j<nz; j++) {
926       x  = rtmp+16*pj[j];
927 /*        x  = rtmp+4*pj[j]; */
928 
929       SSE_INLINE_BEGIN_2(x,pv)
930         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
931         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
932         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
933 
934         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
935         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
936 
937         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
938         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
939 
940         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
941         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
942 
943         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
944         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
945 
946         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
947         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
948 
949         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
950         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
951 
952         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
953         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
954       SSE_INLINE_END_2;
955       pv += 16;
956     }
957     /* invert diagonal block */
958     w = ba + 16*diag_offset[i];
959     if (pivotinblocks) {
960       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
961     } else {
962       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
963     }
964 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
965     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
966   }
967 
968   ierr = PetscFree(rtmp);CHKERRQ(ierr);
969   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
970   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
971   C->assembled = PETSC_TRUE;
972   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
973   /* Flop Count from inverting diagonal blocks */
974   SSE_SCOPE_END;
975   PetscFunctionReturn(0);
976 }
977 
978 #undef __FUNCT__
979 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
980 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
981 {
982   Mat            A=C;
983   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
984   PetscErrorCode ierr;
985   int i,j,n = a->mbs;
986   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
987   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
988   unsigned int   row;
989   int            nz,*bi=b->i;
990   int            *diag_offset = b->diag,*ai=a->i;
991   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
992   MatScalar      *ba = b->a,*aa = a->a;
993   int            nonzero=0;
994 /*    int            nonzero=0,colscale = 16; */
995   PetscBool      pivotinblocks = b->pivotinblocks;
996   PetscReal      shift = info->shiftamount;
997 
998   PetscFunctionBegin;
999   SSE_SCOPE_BEGIN;
1000 
1001   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1002   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1003   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1004   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1005 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1006 /*      colscale = 4; */
1007 /*    } */
1008 
1009   for (i=0; i<n; i++) {
1010     nz    = bi[i+1] - bi[i];
1011     bjtmp = bj + bi[i];
1012     /* zero out the 4x4 block accumulators */
1013     /* zero out one register */
1014     XOR_PS(XMM7,XMM7);
1015     for  (j=0; j<nz; j++) {
1016       x = rtmp+16*((unsigned int)bjtmp[j]);
1017 /*        x = rtmp+4*bjtmp[j]; */
1018       SSE_INLINE_BEGIN_1(x)
1019         /* Copy zero register to memory locations */
1020         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1021         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1022         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1023         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1024         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1025         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1026         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1027         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1028         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1029       SSE_INLINE_END_1;
1030     }
1031     /* load in initial (unfactored) row */
1032     nz    = ai[i+1] - ai[i];
1033     ajtmp = aj + ai[i];
1034     v     = aa + 16*ai[i];
1035     for (j=0; j<nz; j++) {
1036       x = rtmp+16*((unsigned int)ajtmp[j]);
1037 /*        x = rtmp+colscale*ajtmp[j]; */
1038       /* Copy v block into x block */
1039       SSE_INLINE_BEGIN_2(v,x)
1040         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1042         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1043 
1044         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1045         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1046 
1047         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1048         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1049 
1050         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1051         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1052 
1053         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1054         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1055 
1056         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1057         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1058 
1059         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1060         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1061 
1062         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1063         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1064       SSE_INLINE_END_2;
1065 
1066       v += 16;
1067     }
1068 /*      row = (*bjtmp++)/4; */
1069     row = (unsigned int)(*bjtmp++);
1070     while (row < i) {
1071       pc  = rtmp + 16*row;
1072       SSE_INLINE_BEGIN_1(pc)
1073         /* Load block from lower triangle */
1074         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1075         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1076         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1077 
1078         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1079         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1080 
1081         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1082         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1083 
1084         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1085         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1086 
1087         /* Compare block to zero block */
1088 
1089         SSE_COPY_PS(XMM4,XMM7)
1090         SSE_CMPNEQ_PS(XMM4,XMM0)
1091 
1092         SSE_COPY_PS(XMM5,XMM7)
1093         SSE_CMPNEQ_PS(XMM5,XMM1)
1094 
1095         SSE_COPY_PS(XMM6,XMM7)
1096         SSE_CMPNEQ_PS(XMM6,XMM2)
1097 
1098         SSE_CMPNEQ_PS(XMM7,XMM3)
1099 
1100         /* Reduce the comparisons to one SSE register */
1101         SSE_OR_PS(XMM6,XMM7)
1102         SSE_OR_PS(XMM5,XMM4)
1103         SSE_OR_PS(XMM5,XMM6)
1104       SSE_INLINE_END_1;
1105 
1106       /* Reduce the one SSE register to an integer register for branching */
1107       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1108       MOVEMASK(nonzero,XMM5);
1109 
1110       /* If block is nonzero ... */
1111       if (nonzero) {
1112         pv = ba + 16*diag_offset[row];
1113         PREFETCH_L1(&pv[16]);
1114         pj = bj + diag_offset[row] + 1;
1115 
1116         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1117         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1118         /* but the diagonal was inverted already */
1119         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1120 
1121         SSE_INLINE_BEGIN_2(pv,pc)
1122           /* Column 0, product is accumulated in XMM4 */
1123           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1124           SSE_SHUFFLE(XMM4,XMM4,0x00)
1125           SSE_MULT_PS(XMM4,XMM0)
1126 
1127           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1128           SSE_SHUFFLE(XMM5,XMM5,0x00)
1129           SSE_MULT_PS(XMM5,XMM1)
1130           SSE_ADD_PS(XMM4,XMM5)
1131 
1132           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1133           SSE_SHUFFLE(XMM6,XMM6,0x00)
1134           SSE_MULT_PS(XMM6,XMM2)
1135           SSE_ADD_PS(XMM4,XMM6)
1136 
1137           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1138           SSE_SHUFFLE(XMM7,XMM7,0x00)
1139           SSE_MULT_PS(XMM7,XMM3)
1140           SSE_ADD_PS(XMM4,XMM7)
1141 
1142           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1143           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1144 
1145           /* Column 1, product is accumulated in XMM5 */
1146           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1147           SSE_SHUFFLE(XMM5,XMM5,0x00)
1148           SSE_MULT_PS(XMM5,XMM0)
1149 
1150           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1151           SSE_SHUFFLE(XMM6,XMM6,0x00)
1152           SSE_MULT_PS(XMM6,XMM1)
1153           SSE_ADD_PS(XMM5,XMM6)
1154 
1155           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1156           SSE_SHUFFLE(XMM7,XMM7,0x00)
1157           SSE_MULT_PS(XMM7,XMM2)
1158           SSE_ADD_PS(XMM5,XMM7)
1159 
1160           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1161           SSE_SHUFFLE(XMM6,XMM6,0x00)
1162           SSE_MULT_PS(XMM6,XMM3)
1163           SSE_ADD_PS(XMM5,XMM6)
1164 
1165           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1166           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1167 
1168           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1169 
1170           /* Column 2, product is accumulated in XMM6 */
1171           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1172           SSE_SHUFFLE(XMM6,XMM6,0x00)
1173           SSE_MULT_PS(XMM6,XMM0)
1174 
1175           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1176           SSE_SHUFFLE(XMM7,XMM7,0x00)
1177           SSE_MULT_PS(XMM7,XMM1)
1178           SSE_ADD_PS(XMM6,XMM7)
1179 
1180           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1181           SSE_SHUFFLE(XMM7,XMM7,0x00)
1182           SSE_MULT_PS(XMM7,XMM2)
1183           SSE_ADD_PS(XMM6,XMM7)
1184 
1185           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1186           SSE_SHUFFLE(XMM7,XMM7,0x00)
1187           SSE_MULT_PS(XMM7,XMM3)
1188           SSE_ADD_PS(XMM6,XMM7)
1189 
1190           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1191           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1192 
1193           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1194           /* Column 3, product is accumulated in XMM0 */
1195           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1196           SSE_SHUFFLE(XMM7,XMM7,0x00)
1197           SSE_MULT_PS(XMM0,XMM7)
1198 
1199           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1200           SSE_SHUFFLE(XMM7,XMM7,0x00)
1201           SSE_MULT_PS(XMM1,XMM7)
1202           SSE_ADD_PS(XMM0,XMM1)
1203 
1204           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1205           SSE_SHUFFLE(XMM1,XMM1,0x00)
1206           SSE_MULT_PS(XMM1,XMM2)
1207           SSE_ADD_PS(XMM0,XMM1)
1208 
1209           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1210           SSE_SHUFFLE(XMM7,XMM7,0x00)
1211           SSE_MULT_PS(XMM3,XMM7)
1212           SSE_ADD_PS(XMM0,XMM3)
1213 
1214           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1215           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1216 
1217           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1218           /* This is code to be maintained and read by humans after all. */
1219           /* Copy Multiplier Col 3 into XMM3 */
1220           SSE_COPY_PS(XMM3,XMM0)
1221           /* Copy Multiplier Col 2 into XMM2 */
1222           SSE_COPY_PS(XMM2,XMM6)
1223           /* Copy Multiplier Col 1 into XMM1 */
1224           SSE_COPY_PS(XMM1,XMM5)
1225           /* Copy Multiplier Col 0 into XMM0 */
1226           SSE_COPY_PS(XMM0,XMM4)
1227         SSE_INLINE_END_2;
1228 
1229         /* Update the row: */
1230         nz = bi[row+1] - diag_offset[row] - 1;
1231         pv += 16;
1232         for (j=0; j<nz; j++) {
1233           PREFETCH_L1(&pv[16]);
1234           x = rtmp + 16*((unsigned int)pj[j]);
1235 /*            x = rtmp + 4*pj[j]; */
1236 
1237           /* X:=X-M*PV, One column at a time */
1238           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1239           SSE_INLINE_BEGIN_2(x,pv)
1240             /* Load First Column of X*/
1241             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1242             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1243 
1244             /* Matrix-Vector Product: */
1245             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1246             SSE_SHUFFLE(XMM5,XMM5,0x00)
1247             SSE_MULT_PS(XMM5,XMM0)
1248             SSE_SUB_PS(XMM4,XMM5)
1249 
1250             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1251             SSE_SHUFFLE(XMM6,XMM6,0x00)
1252             SSE_MULT_PS(XMM6,XMM1)
1253             SSE_SUB_PS(XMM4,XMM6)
1254 
1255             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1256             SSE_SHUFFLE(XMM7,XMM7,0x00)
1257             SSE_MULT_PS(XMM7,XMM2)
1258             SSE_SUB_PS(XMM4,XMM7)
1259 
1260             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1261             SSE_SHUFFLE(XMM5,XMM5,0x00)
1262             SSE_MULT_PS(XMM5,XMM3)
1263             SSE_SUB_PS(XMM4,XMM5)
1264 
1265             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1266             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1267 
1268             /* Second Column */
1269             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1270             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1271 
1272             /* Matrix-Vector Product: */
1273             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1274             SSE_SHUFFLE(XMM6,XMM6,0x00)
1275             SSE_MULT_PS(XMM6,XMM0)
1276             SSE_SUB_PS(XMM5,XMM6)
1277 
1278             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1279             SSE_SHUFFLE(XMM7,XMM7,0x00)
1280             SSE_MULT_PS(XMM7,XMM1)
1281             SSE_SUB_PS(XMM5,XMM7)
1282 
1283             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1284             SSE_SHUFFLE(XMM4,XMM4,0x00)
1285             SSE_MULT_PS(XMM4,XMM2)
1286             SSE_SUB_PS(XMM5,XMM4)
1287 
1288             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1289             SSE_SHUFFLE(XMM6,XMM6,0x00)
1290             SSE_MULT_PS(XMM6,XMM3)
1291             SSE_SUB_PS(XMM5,XMM6)
1292 
1293             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1294             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1295 
1296             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1297 
1298             /* Third Column */
1299             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1300             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1301 
1302             /* Matrix-Vector Product: */
1303             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1304             SSE_SHUFFLE(XMM7,XMM7,0x00)
1305             SSE_MULT_PS(XMM7,XMM0)
1306             SSE_SUB_PS(XMM6,XMM7)
1307 
1308             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1309             SSE_SHUFFLE(XMM4,XMM4,0x00)
1310             SSE_MULT_PS(XMM4,XMM1)
1311             SSE_SUB_PS(XMM6,XMM4)
1312 
1313             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1314             SSE_SHUFFLE(XMM5,XMM5,0x00)
1315             SSE_MULT_PS(XMM5,XMM2)
1316             SSE_SUB_PS(XMM6,XMM5)
1317 
1318             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1319             SSE_SHUFFLE(XMM7,XMM7,0x00)
1320             SSE_MULT_PS(XMM7,XMM3)
1321             SSE_SUB_PS(XMM6,XMM7)
1322 
1323             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1324             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1325 
1326             /* Fourth Column */
1327             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1328             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1329 
1330             /* Matrix-Vector Product: */
1331             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1332             SSE_SHUFFLE(XMM5,XMM5,0x00)
1333             SSE_MULT_PS(XMM5,XMM0)
1334             SSE_SUB_PS(XMM4,XMM5)
1335 
1336             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1337             SSE_SHUFFLE(XMM6,XMM6,0x00)
1338             SSE_MULT_PS(XMM6,XMM1)
1339             SSE_SUB_PS(XMM4,XMM6)
1340 
1341             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1342             SSE_SHUFFLE(XMM7,XMM7,0x00)
1343             SSE_MULT_PS(XMM7,XMM2)
1344             SSE_SUB_PS(XMM4,XMM7)
1345 
1346             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1347             SSE_SHUFFLE(XMM5,XMM5,0x00)
1348             SSE_MULT_PS(XMM5,XMM3)
1349             SSE_SUB_PS(XMM4,XMM5)
1350 
1351             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1352             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1353           SSE_INLINE_END_2;
1354           pv   += 16;
1355         }
1356         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1357       }
1358       row = (unsigned int)(*bjtmp++);
1359 /*        row = (*bjtmp++)/4; */
1360 /*        bjtmp++; */
1361     }
1362     /* finished row so stick it into b->a */
1363     pv = ba + 16*bi[i];
1364     pj = bj + bi[i];
1365     nz = bi[i+1] - bi[i];
1366 
1367     /* Copy x block back into pv block */
1368     for (j=0; j<nz; j++) {
1369       x  = rtmp+16*((unsigned int)pj[j]);
1370 /*        x  = rtmp+4*pj[j]; */
1371 
1372       SSE_INLINE_BEGIN_2(x,pv)
1373         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1374         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1375         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1376 
1377         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1378         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1379 
1380         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1381         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1382 
1383         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1384         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1385 
1386         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1387         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1388 
1389         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1390         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1391 
1392         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1393         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1394 
1395         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1396         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1397       SSE_INLINE_END_2;
1398       pv += 16;
1399     }
1400     /* invert diagonal block */
1401     w = ba + 16*diag_offset[i];
1402     if (pivotinblocks) {
1403       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1404     } else {
1405       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1406     }
1407 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1408     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1409   }
1410 
1411   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1412   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1413   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1414   C->assembled = PETSC_TRUE;
1415   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
1416   /* Flop Count from inverting diagonal blocks */
1417   SSE_SCOPE_END;
1418   PetscFunctionReturn(0);
1419 }
1420 
1421 #undef __FUNCT__
1422 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1423 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1424 {
1425   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1426   PetscErrorCode ierr;
1427   int  i,j,n = a->mbs;
1428   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1429   unsigned int   row;
1430   int            *ajtmpold,nz,*bi=b->i;
1431   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1432   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1433   MatScalar      *ba = b->a,*aa = a->a;
1434   int            nonzero=0;
1435 /*    int            nonzero=0,colscale = 16; */
1436   PetscBool      pivotinblocks = b->pivotinblocks;
1437   PetscReal      shift = info->shiftamount;
1438 
1439   PetscFunctionBegin;
1440   SSE_SCOPE_BEGIN;
1441 
1442   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16-byte aligned.  SSE will not work.");
1443   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16-byte aligned.  SSE will not work.");
1444   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1445   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16-byte aligned.  SSE will not work.");
1446 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1447 /*      colscale = 4; */
1448 /*    } */
1449   if ((unsigned long)bj==(unsigned long)aj) {
         ierr = PetscFree(rtmp);CHKERRQ(ierr); /* rtmp is not needed by the in-place variant; free it to avoid a leak */
1450     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1451   }
1452 
1453   for (i=0; i<n; i++) {
1454     nz    = bi[i+1] - bi[i];
1455     bjtmp = bj + bi[i];
1456     /* zero out the 4x4 block accumulators */
1457     /* zero out one register */
1458     XOR_PS(XMM7,XMM7);
1459     for  (j=0; j<nz; j++) {
1460       x = rtmp+16*((unsigned int)bjtmp[j]);
1461 /*        x = rtmp+4*bjtmp[j]; */
1462       SSE_INLINE_BEGIN_1(x)
1463         /* Copy zero register to memory locations */
1464         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1465         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1466         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1467         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1468         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1469         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1470         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1471         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1472         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1473       SSE_INLINE_END_1;
1474     }
1475     /* load in the initial (unfactored) row of A */
1476     nz       = ai[i+1] - ai[i];
1477     ajtmpold = aj + ai[i];
1478     v        = aa + 16*ai[i];
1479     for (j=0; j<nz; j++) {
1480       x = rtmp+16*ajtmpold[j];
1481 /*        x = rtmp+colscale*ajtmpold[j]; */
1482       /* Copy v block into x block */
1483       SSE_INLINE_BEGIN_2(v,x)
1484         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1485         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1486         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1487 
1488         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1489         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1490 
1491         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1492         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1493 
1494         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1495         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1496 
1497         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1498         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1499 
1500         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1501         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1502 
1503         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1504         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1505 
1506         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1507         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1508       SSE_INLINE_END_2;
1509 
1510       v += 16;
1511     }
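       /* Elimination sweep: for each already-factored block row 'row' (< i) in row i's pattern,
          form the multiplier using the previously inverted diagonal block of 'row' and update the
          remaining blocks of row i held in rtmp. */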
1512 /*      row = (*bjtmp++)/4; */
1513     row = (unsigned int)(*bjtmp++);
1514     while (row < i) {
1515       pc  = rtmp + 16*row;
1516       SSE_INLINE_BEGIN_1(pc)
1517         /* Load block from lower triangle */
1518         /* Note: on future SSE architectures, a single aligned LOAD might be more efficient than LOADL/H */
1519         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1520         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1521 
1522         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1523         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1524 
1525         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1526         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1527 
1528         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1529         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1530 
1531         /* Compare block to zero block */
1532 
1533         SSE_COPY_PS(XMM4,XMM7)
1534         SSE_CMPNEQ_PS(XMM4,XMM0)
1535 
1536         SSE_COPY_PS(XMM5,XMM7)
1537         SSE_CMPNEQ_PS(XMM5,XMM1)
1538 
1539         SSE_COPY_PS(XMM6,XMM7)
1540         SSE_CMPNEQ_PS(XMM6,XMM2)
1541 
1542         SSE_CMPNEQ_PS(XMM7,XMM3)
1543 
1544         /* Reduce the comparisons to one SSE register */
1545         SSE_OR_PS(XMM6,XMM7)
1546         SSE_OR_PS(XMM5,XMM4)
1547         SSE_OR_PS(XMM5,XMM6)
1548       SSE_INLINE_END_1;
1549 
1550       /* Reduce the one SSE register to an integer register for branching */
1551       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1552       MOVEMASK(nonzero,XMM5);
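            /* MOVEMASK (presumably movmskps) collapses the sign bits of the combined comparison mask
               into an int, so nonzero is set iff some entry of the pc block differs from zero. */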
1553 
1554       /* If block is nonzero ... */
1555       if (nonzero) {
1556         pv = ba + 16*diag_offset[row];
1557         PREFETCH_L1(&pv[16]);
1558         pj = bj + diag_offset[row] + 1;
1559 
1560         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1561         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1562         /* but the diagonal was inverted already */
1563         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1564 
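              /* XMM0-XMM3 hold pc columnwise (one 4-vector per column of L_ij); each column of the
                 product is built by broadcasting one scalar of the inverted diagonal block from pv
                 and accumulating the scaled pc columns. */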
1565         SSE_INLINE_BEGIN_2(pv,pc)
1566           /* Column 0, product is accumulated in XMM4 */
1567           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1568           SSE_SHUFFLE(XMM4,XMM4,0x00)
1569           SSE_MULT_PS(XMM4,XMM0)
1570 
1571           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1572           SSE_SHUFFLE(XMM5,XMM5,0x00)
1573           SSE_MULT_PS(XMM5,XMM1)
1574           SSE_ADD_PS(XMM4,XMM5)
1575 
1576           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1577           SSE_SHUFFLE(XMM6,XMM6,0x00)
1578           SSE_MULT_PS(XMM6,XMM2)
1579           SSE_ADD_PS(XMM4,XMM6)
1580 
1581           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1582           SSE_SHUFFLE(XMM7,XMM7,0x00)
1583           SSE_MULT_PS(XMM7,XMM3)
1584           SSE_ADD_PS(XMM4,XMM7)
1585 
1586           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1587           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1588 
1589           /* Column 1, product is accumulated in XMM5 */
1590           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1591           SSE_SHUFFLE(XMM5,XMM5,0x00)
1592           SSE_MULT_PS(XMM5,XMM0)
1593 
1594           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1595           SSE_SHUFFLE(XMM6,XMM6,0x00)
1596           SSE_MULT_PS(XMM6,XMM1)
1597           SSE_ADD_PS(XMM5,XMM6)
1598 
1599           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1600           SSE_SHUFFLE(XMM7,XMM7,0x00)
1601           SSE_MULT_PS(XMM7,XMM2)
1602           SSE_ADD_PS(XMM5,XMM7)
1603 
1604           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1605           SSE_SHUFFLE(XMM6,XMM6,0x00)
1606           SSE_MULT_PS(XMM6,XMM3)
1607           SSE_ADD_PS(XMM5,XMM6)
1608 
1609           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1610           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1611 
1612           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1613 
1614           /* Column 2, product is accumulated in XMM6 */
1615           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1616           SSE_SHUFFLE(XMM6,XMM6,0x00)
1617           SSE_MULT_PS(XMM6,XMM0)
1618 
1619           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1620           SSE_SHUFFLE(XMM7,XMM7,0x00)
1621           SSE_MULT_PS(XMM7,XMM1)
1622           SSE_ADD_PS(XMM6,XMM7)
1623 
1624           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1625           SSE_SHUFFLE(XMM7,XMM7,0x00)
1626           SSE_MULT_PS(XMM7,XMM2)
1627           SSE_ADD_PS(XMM6,XMM7)
1628 
1629           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1630           SSE_SHUFFLE(XMM7,XMM7,0x00)
1631           SSE_MULT_PS(XMM7,XMM3)
1632           SSE_ADD_PS(XMM6,XMM7)
1633 
1634           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1635           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1636 
1637           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1638           /* Column 3, product is accumulated in XMM0 */
1639           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1640           SSE_SHUFFLE(XMM7,XMM7,0x00)
1641           SSE_MULT_PS(XMM0,XMM7)
1642 
1643           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1644           SSE_SHUFFLE(XMM7,XMM7,0x00)
1645           SSE_MULT_PS(XMM1,XMM7)
1646           SSE_ADD_PS(XMM0,XMM1)
1647 
1648           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1649           SSE_SHUFFLE(XMM1,XMM1,0x00)
1650           SSE_MULT_PS(XMM1,XMM2)
1651           SSE_ADD_PS(XMM0,XMM1)
1652 
1653           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1654           SSE_SHUFFLE(XMM7,XMM7,0x00)
1655           SSE_MULT_PS(XMM3,XMM7)
1656           SSE_ADD_PS(XMM0,XMM3)
1657 
1658           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1659           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1660 
1661           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1662           /* This is code to be maintained and read by humans, after all. */
1663           /* Copy Multiplier Col 3 into XMM3 */
1664           SSE_COPY_PS(XMM3,XMM0)
1665           /* Copy Multiplier Col 2 into XMM2 */
1666           SSE_COPY_PS(XMM2,XMM6)
1667           /* Copy Multiplier Col 1 into XMM1 */
1668           SSE_COPY_PS(XMM1,XMM5)
1669           /* Copy Multiplier Col 0 into XMM0 */
1670           SSE_COPY_PS(XMM0,XMM4)
1671         SSE_INLINE_END_2;
1672 
1673         /* Update row i: for each block right of the diagonal in pivot row 'row', compute X -= M*U */
1674         nz = bi[row+1] - diag_offset[row] - 1; /* number of blocks after the diagonal in row 'row' */
1675         pv += 16;
1676         for (j=0; j<nz; j++) {
1677           PREFETCH_L1(&pv[16]);
1678           x = rtmp + 16*((unsigned int)pj[j]);
1679 /*            x = rtmp + 4*pj[j]; */
1680 
1681           /* X:=X-M*PV, One column at a time */
1682           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1683           SSE_INLINE_BEGIN_2(x,pv)
1684             /* Load First Column of X */
1685             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1686             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1687 
1688             /* Matrix-Vector Product: */
1689             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1690             SSE_SHUFFLE(XMM5,XMM5,0x00)
1691             SSE_MULT_PS(XMM5,XMM0)
1692             SSE_SUB_PS(XMM4,XMM5)
1693 
1694             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1695             SSE_SHUFFLE(XMM6,XMM6,0x00)
1696             SSE_MULT_PS(XMM6,XMM1)
1697             SSE_SUB_PS(XMM4,XMM6)
1698 
1699             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1700             SSE_SHUFFLE(XMM7,XMM7,0x00)
1701             SSE_MULT_PS(XMM7,XMM2)
1702             SSE_SUB_PS(XMM4,XMM7)
1703 
1704             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1705             SSE_SHUFFLE(XMM5,XMM5,0x00)
1706             SSE_MULT_PS(XMM5,XMM3)
1707             SSE_SUB_PS(XMM4,XMM5)
1708 
1709             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1710             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1711 
1712             /* Second Column */
1713             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1714             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1715 
1716             /* Matrix-Vector Product: */
1717             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1718             SSE_SHUFFLE(XMM6,XMM6,0x00)
1719             SSE_MULT_PS(XMM6,XMM0)
1720             SSE_SUB_PS(XMM5,XMM6)
1721 
1722             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1723             SSE_SHUFFLE(XMM7,XMM7,0x00)
1724             SSE_MULT_PS(XMM7,XMM1)
1725             SSE_SUB_PS(XMM5,XMM7)
1726 
1727             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1728             SSE_SHUFFLE(XMM4,XMM4,0x00)
1729             SSE_MULT_PS(XMM4,XMM2)
1730             SSE_SUB_PS(XMM5,XMM4)
1731 
1732             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1733             SSE_SHUFFLE(XMM6,XMM6,0x00)
1734             SSE_MULT_PS(XMM6,XMM3)
1735             SSE_SUB_PS(XMM5,XMM6)
1736 
1737             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1738             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1739 
1740             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1741 
1742             /* Third Column */
1743             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1744             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1745 
1746             /* Matrix-Vector Product: */
1747             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1748             SSE_SHUFFLE(XMM7,XMM7,0x00)
1749             SSE_MULT_PS(XMM7,XMM0)
1750             SSE_SUB_PS(XMM6,XMM7)
1751 
1752             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1753             SSE_SHUFFLE(XMM4,XMM4,0x00)
1754             SSE_MULT_PS(XMM4,XMM1)
1755             SSE_SUB_PS(XMM6,XMM4)
1756 
1757             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1758             SSE_SHUFFLE(XMM5,XMM5,0x00)
1759             SSE_MULT_PS(XMM5,XMM2)
1760             SSE_SUB_PS(XMM6,XMM5)
1761 
1762             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1763             SSE_SHUFFLE(XMM7,XMM7,0x00)
1764             SSE_MULT_PS(XMM7,XMM3)
1765             SSE_SUB_PS(XMM6,XMM7)
1766 
1767             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1768             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1769 
1770             /* Fourth Column */
1771             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1772             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1773 
1774             /* Matrix-Vector Product: */
1775             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1776             SSE_SHUFFLE(XMM5,XMM5,0x00)
1777             SSE_MULT_PS(XMM5,XMM0)
1778             SSE_SUB_PS(XMM4,XMM5)
1779 
1780             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1781             SSE_SHUFFLE(XMM6,XMM6,0x00)
1782             SSE_MULT_PS(XMM6,XMM1)
1783             SSE_SUB_PS(XMM4,XMM6)
1784 
1785             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1786             SSE_SHUFFLE(XMM7,XMM7,0x00)
1787             SSE_MULT_PS(XMM7,XMM2)
1788             SSE_SUB_PS(XMM4,XMM7)
1789 
1790             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1791             SSE_SHUFFLE(XMM5,XMM5,0x00)
1792             SSE_MULT_PS(XMM5,XMM3)
1793             SSE_SUB_PS(XMM4,XMM5)
1794 
1795             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1796             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1797           SSE_INLINE_END_2;
1798           pv   += 16;
1799         }
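              /* Same flop accounting as in the non-usj kernel above: 112 flops to form the
                 multiplier plus 128 flops per updated block. */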
1800         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1801       }
1802       row = (unsigned int)(*bjtmp++);
1803 /*        row = (*bjtmp++)/4; */
1804 /*        bjtmp++; */
1805     }
1806     /* finished row so stick it into b->a */
1807     pv = ba + 16*bi[i];
1808     pj = bj + bi[i];
1809     nz = bi[i+1] - bi[i];
1810 
1811     /* Copy x block back into pv block */
1812     for (j=0; j<nz; j++) {
1813       x  = rtmp+16*((unsigned int)pj[j]);
1814 /*        x  = rtmp+4*pj[j]; */
1815 
1816       SSE_INLINE_BEGIN_2(x,pv)
1817         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1818         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1819         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1820 
1821         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1822         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1823 
1824         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1825         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1826 
1827         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1828         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1829 
1830         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1831         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1832 
1833         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1834         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1835 
1836         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1837         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1838 
1839         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1840         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1841       SSE_INLINE_END_2;
1842       pv += 16;
1843     }
1844     /* invert diagonal block */
1845     w = ba + 16*diag_offset[i];
1846     if (pivotinblocks) {
1847       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1848     } else {
1849       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1850     }
1851 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1852     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1853   }
1854 
1855   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1856   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1857   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1858   C->assembled = PETSC_TRUE;
1859   ierr = PetscLogFlops(1.333333333333*4*16*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for this fixed-block-size kernel */
1860   /* Flop count above is from inverting the diagonal blocks: roughly (4/3)*bs^3 flops per block */
1861   SSE_SCOPE_END;
1862   PetscFunctionReturn(0);
1863 }
1864 
1865 #endif
1866