1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 4 by 4
12 */
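/*
    Each 4x4 block is stored as 16 contiguous MatScalars in column-major order, i.e.
    entry (r,c) of block k lives at a[16*k + r + 4*c].  The factorization below works
    one block row at a time:
      - scatter the blocks of the (permuted) row into the work array rtmp,
      - for every already-factored row appearing in L(i,:), form the multiplier
        M = A(i,row)*inv(U(row,row)) and subtract M*U(row,:) from rtmp,
      - copy rtmp back into the factor and invert the diagonal block, so that the
        triangular solves can multiply by the inverse instead of solving 4x4 systems.
    isrow supplies the row permutation and isicol the inverse column permutation.
*/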
13 #undef __FUNCT__
14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
16 {
17   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18   IS             isrow = b->row,isicol = b->icol;
19   PetscErrorCode ierr;
20   const PetscInt *r,*ic;
21   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22   PetscInt       *ajtmpold,*ajtmp,nz,row;
23   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
25   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28   MatScalar      m13,m14,m15,m16;
29   MatScalar      *ba = b->a,*aa = a->a;
30   PetscTruth     pivotinblocks = b->pivotinblocks;
31   PetscReal      shift = info->shiftamount;
32 
33   PetscFunctionBegin;
34   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
37 
38   for (i=0; i<n; i++) {
39     nz    = bi[i+1] - bi[i];
40     ajtmp = bj + bi[i];
41     for  (j=0; j<nz; j++) {
42       x = rtmp+16*ajtmp[j];
43       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
44       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45     }
46     /* load in initial (unfactored row) */
47     idx      = r[i];
48     nz       = ai[idx+1] - ai[idx];
49     ajtmpold = aj + ai[idx];
50     v        = aa + 16*ai[idx];
51     for (j=0; j<nz; j++) {
52       x    = rtmp+16*ic[ajtmpold[j]];
53       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
54       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
55       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56       x[14] = v[14]; x[15] = v[15];
57       v    += 16;
58     }
59     row = *ajtmp++;
60     while (row < i) {
61       pc  = rtmp + 16*row;
62       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
63       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
64       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65       p15 = pc[14]; p16 = pc[15];
66       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69           || p16 != 0.0) {
70         pv = ba + 16*diag_offset[row];
71         pj = bj + diag_offset[row] + 1;
72         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
73         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
74         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75         x15 = pv[14]; x16 = pv[15];
76         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
77         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
78         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
79         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
80 
81         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
82         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
83         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
84         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
85 
86         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
87         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
88         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
89         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
90 
91         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
92         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
93         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
94         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
95 
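        /* pc now holds the multiplier M = A(i,row)*inv(U(row,row)) in column-major order
           (m1..m16); the loop below subtracts M*U(row,pj[j]) from the matching block of
           rtmp for every block to the right of the diagonal in the already-factored row. */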
96         nz = bi[row+1] - diag_offset[row] - 1;
97         pv += 16;
98         for (j=0; j<nz; j++) {
99           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103           x    = rtmp + 16*pj[j];
104           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
108 
109           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
113 
114           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118 
119           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
123 
124           pv   += 16;
125         }
126         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
127       }
128       row = *ajtmp++;
129     }
130     /* finished row so stick it into b->a */
131     pv = ba + 16*bi[i];
132     pj = bj + bi[i];
133     nz = bi[i+1] - bi[i];
134     for (j=0; j<nz; j++) {
135       x      = rtmp+16*pj[j];
136       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140       pv   += 16;
141     }
142     /* invert diagonal block */
143     w    = ba + 16*diag_offset[i];
144     if (pivotinblocks) {
145       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146     } else {
147       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156   C->assembled = PETSC_TRUE;
157   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
158   PetscFunctionReturn(0);
159 }
160 
161 /* MatLUFactorNumeric_SeqBAIJ_4 -
162      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
163        Kernel_A_gets_A_times_B()
164        Kernel_A_gets_A_minus_B_times_C()
165        Kernel_A_gets_inverse_A()
166 */
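/*
   Unlike the *_inplace version above, this routine stores L and U separately:
     bj[bi[i]] .. bj[bi[i+1]-1]     column indices (and blocks) of L(i,:),
     bdiag[i]                       the diagonal block of row i (stored inverted),
     bdiag[i+1]+1 .. bdiag[i]-1     column indices (and blocks) of U(i,:) excluding the diagonal
   (bdiag[] decreases with i, so bdiag[i]-bdiag[i+1]-1 is the number of strictly upper blocks in row i).
   The block kernels Kernel_A_gets_A_times_B_4(), Kernel_A_gets_A_minus_B_times_C_4() and
   Kernel_A_gets_inverse_A_4() all operate on the same column-major 4x4 block layout.
*/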
167 
168 #undef __FUNCT__
169 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171 {
172   Mat            C=B;
173   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
174   IS             isrow = b->row,isicol = b->icol;
175   PetscErrorCode ierr;
176   const PetscInt *r,*ic,*ics;
177   PetscInt       i,j,k,nz,nzL,row;
178   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181   PetscInt       flg;
182   PetscReal      shift;
183 
184   PetscFunctionBegin;
185   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
186   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
187 
188   if (info->shifttype == MAT_SHIFT_NONE){
189     shift = 0;
190   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
191     shift = info->shiftamount;
192   }
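  /* The shift is forwarded to Kernel_A_gets_inverse_A_4(), which uses it to perturb
     zero (or unacceptably small) pivots encountered while inverting a 4x4 diagonal block. */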
193 
194   /* generate work space needed by the factorization */
195   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
196   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
197   ics  = ic;
198 
199   for (i=0; i<n; i++){
200     /* zero rtmp */
201     /* L part */
202     nz    = bi[i+1] - bi[i];
203     bjtmp = bj + bi[i];
204     for  (j=0; j<nz; j++){
205       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
206     }
207 
208     /* U part */
209     nz = bdiag[i]-bdiag[i+1];
210     bjtmp = bj + bdiag[i+1]+1;
211     for  (j=0; j<nz; j++){
212       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
213     }
214 
215     /* load in initial (unfactored row) */
216     nz    = ai[r[i]+1] - ai[r[i]];
217     ajtmp = aj + ai[r[i]];
218     v     = aa + bs2*ai[r[i]];
219     for (j=0; j<nz; j++) {
220       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
221     }
222 
223     /* elimination */
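    /* For each block of L(i,:): replace it by the multiplier pc := pc*inv(U(row,row))
       (the diagonal block of the already-factored row is stored inverted), then update
       this row's work array with rtmp(:,pj[j]) -= pc*U(row,pj[j]) for every block of
       U(row,:) to the right of the diagonal. */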
224     bjtmp = bj + bi[i];
225     nzL   = bi[i+1] - bi[i];
226     for(k=0;k < nzL;k++) {
227       row = bjtmp[k];
228       pc = rtmp + bs2*row;
229       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
230       if (flg) {
231         pv = b->a + bs2*bdiag[row];
232         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
233         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
234 
235         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
236         pv = b->a + bs2*(bdiag[row+1]+1);
237         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
238         for (j=0; j<nz; j++) {
239           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
240           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
241           v    = rtmp + bs2*pj[j];
242           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
243           pv  += bs2;
244         }
245         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
246       }
247     }
248 
249     /* finished row so stick it into b->a */
250     /* L part */
251     pv   = b->a + bs2*bi[i] ;
252     pj   = b->j + bi[i] ;
253     nz   = bi[i+1] - bi[i];
254     for (j=0; j<nz; j++) {
255       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
256     }
257 
258     /* Mark diagonal and invert diagonal for simpler triangular solves */
259     pv   = b->a + bs2*bdiag[i];
260     pj   = b->j + bdiag[i];
261     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
262     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
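    /* Storing inv(U(i,i)) rather than U(i,i) lets the triangular solves replace each
       4x4 block solve by a 4x4 matrix-vector product. */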
263 
264     /* U part */
265     pv = b->a + bs2*(bdiag[i+1]+1);
266     pj = b->j + bdiag[i+1]+1;
267     nz = bdiag[i] - bdiag[i+1] - 1;
268     for (j=0; j<nz; j++){
269       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
270     }
271   }
272 
273   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
274   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
275   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
276   C->ops->solve          = MatSolve_SeqBAIJ_4;
277   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
278   C->assembled = PETSC_TRUE;
279   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
280   PetscFunctionReturn(0);
281 }
282 
283 #undef __FUNCT__
284 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
285 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
286 {
287 /*
288     Default Version for when blocks are 4 by 4 Using natural ordering
289 */
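  /* Same algorithm as MatLUFactorNumeric_SeqBAIJ_4_inplace() above, but with the identity
     ordering: the r[] and ic[] index translations disappear and row i of A is loaded
     directly into rtmp. */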
290   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
291   PetscErrorCode ierr;
292   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
293   PetscInt       *ajtmpold,*ajtmp,nz,row;
294   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
295   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
296   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
297   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
298   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
299   MatScalar      m13,m14,m15,m16;
300   MatScalar      *ba = b->a,*aa = a->a;
301   PetscTruth     pivotinblocks = b->pivotinblocks;
302   PetscReal      shift = info->shiftamount;
303 
304   PetscFunctionBegin;
305   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
306 
307   for (i=0; i<n; i++) {
308     nz    = bi[i+1] - bi[i];
309     ajtmp = bj + bi[i];
310     for  (j=0; j<nz; j++) {
311       x = rtmp+16*ajtmp[j];
312       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
313       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
314     }
315     /* load in initial (unfactored row) */
316     nz       = ai[i+1] - ai[i];
317     ajtmpold = aj + ai[i];
318     v        = aa + 16*ai[i];
319     for (j=0; j<nz; j++) {
320       x    = rtmp+16*ajtmpold[j];
321       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
322       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
323       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
324       x[14] = v[14]; x[15] = v[15];
325       v    += 16;
326     }
327     row = *ajtmp++;
328     while (row < i) {
329       pc  = rtmp + 16*row;
330       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
331       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
332       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
333       p15 = pc[14]; p16 = pc[15];
334       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
335           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
336           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
337           || p16 != 0.0) {
338         pv = ba + 16*diag_offset[row];
339         pj = bj + diag_offset[row] + 1;
340         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
341         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
342         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
343         x15 = pv[14]; x16 = pv[15];
344         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
345         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
346         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
347         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
348 
349         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
350         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
351         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
352         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
353 
354         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
355         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
356         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
357         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
358 
359         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
360         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
361         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
362         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
363         nz = bi[row+1] - diag_offset[row] - 1;
364         pv += 16;
365         for (j=0; j<nz; j++) {
366           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
367           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
368           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
369           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
370           x    = rtmp + 16*pj[j];
371           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
372           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
373           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
374           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
375 
376           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
377           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
378           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
379           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
380 
381           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
382           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
383           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
384           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
385 
386           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
387           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
388           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
389           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
390 
391           pv   += 16;
392         }
393         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
394       }
395       row = *ajtmp++;
396     }
397     /* finished row so stick it into b->a */
398     pv = ba + 16*bi[i];
399     pj = bj + bi[i];
400     nz = bi[i+1] - bi[i];
401     for (j=0; j<nz; j++) {
402       x      = rtmp+16*pj[j];
403       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
404       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
405       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
406       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
407       pv   += 16;
408     }
409     /* invert diagonal block */
410     w = ba + 16*diag_offset[i];
411     if (pivotinblocks) {
412       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
413     } else {
414       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
415     }
416   }
417 
418   ierr = PetscFree(rtmp);CHKERRQ(ierr);
419   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
420   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
421   C->assembled = PETSC_TRUE;
422   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
423   PetscFunctionReturn(0);
424 }
425 
426 /*
427   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
428     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
429 */
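/*
   Natural-ordering variant of MatLUFactorNumeric_SeqBAIJ_4(): same L/U storage layout and
   block kernels, but without the isrow/isicol permutation indirection when loading rows.
*/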
430 #undef __FUNCT__
431 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
432 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
433 {
434   Mat            C=B;
435   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
436   PetscErrorCode ierr;
437   PetscInt       i,j,k,nz,nzL,row;
438   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
439   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
440   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
441   PetscInt       flg;
442   PetscReal      shift;
443 
444   PetscFunctionBegin;
445   /* generate work space needed by the factorization */
446   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
447   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
448 
449   if (info->shifttype == MAT_SHIFT_NONE){
450     shift = 0;
451   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
452     shift = info->shiftamount;
453   }
454 
455   for (i=0; i<n; i++){
456     /* zero rtmp */
457     /* L part */
458     nz    = bi[i+1] - bi[i];
459     bjtmp = bj + bi[i];
460     for  (j=0; j<nz; j++){
461       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
462     }
463 
464     /* U part */
465     nz = bdiag[i] - bdiag[i+1];
466     bjtmp = bj + bdiag[i+1]+1;
467     for  (j=0; j<nz; j++){
468       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
469     }
470 
471     /* load in initial (unfactored row) */
472     nz    = ai[i+1] - ai[i];
473     ajtmp = aj + ai[i];
474     v     = aa + bs2*ai[i];
475     for (j=0; j<nz; j++) {
476       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
477     }
478 
479     /* elimination */
480     bjtmp = bj + bi[i];
481     nzL   = bi[i+1] - bi[i];
482     for(k=0;k < nzL;k++) {
483       row = bjtmp[k];
484       pc = rtmp + bs2*row;
485       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
486       if (flg) {
487         pv = b->a + bs2*bdiag[row];
488         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
489         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
490 
491         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
492         pv = b->a + bs2*(bdiag[row+1]+1);
493         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
494         for (j=0; j<nz; j++) {
495           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
496           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
497           v    = rtmp + bs2*pj[j];
498           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
499           pv  += bs2;
500         }
501         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
502       }
503     }
504 
505     /* finished row so stick it into b->a */
506     /* L part */
507     pv   = b->a + bs2*bi[i] ;
508     pj   = b->j + bi[i] ;
509     nz   = bi[i+1] - bi[i];
510     for (j=0; j<nz; j++) {
511       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
512     }
513 
514     /* Mark diagonal and invert diagonal for simpler triangular solves */
515     pv   = b->a + bs2*bdiag[i];
516     pj   = b->j + bdiag[i];
517     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
518     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
519 
520     /* U part */
521     pv = b->a + bs2*(bdiag[i+1]+1);
522     pj = b->j + bdiag[i+1]+1;
523     nz = bdiag[i] - bdiag[i+1] - 1;
524     for (j=0; j<nz; j++){
525       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
526     }
527   }
528   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
529   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
530   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
531   C->assembled = PETSC_TRUE;
532   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
533   PetscFunctionReturn(0);
534 }
535 
536 #if defined(PETSC_HAVE_SSE)
537 
538 #include PETSC_HAVE_SSE
539 
540 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
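/*
   These kernels assume MatScalar is a 4-byte float: each 4x4 block (16 floats, 64 bytes)
   is processed as four packed-single columns, each column loaded/stored as two 8-byte
   halves (LOADL/LOADH and STOREL/STOREH).  aa, ba and rtmp must be 16-byte aligned,
   which is checked below.
*/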
541 #undef __FUNCT__
542 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
543 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
544 {
545   Mat            C = B;
      Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
546   PetscErrorCode ierr;
547   int i,j,n = a->mbs;
548   int         *bj = b->j,*bjtmp,*pj;
549   int         row;
550   int         *ajtmpold,nz,*bi=b->i;
551   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
552   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
553   MatScalar   *ba = b->a,*aa = a->a;
554   int         nonzero=0;
555 /*    int            nonzero=0,colscale = 16; */
556   PetscTruth  pivotinblocks = b->pivotinblocks;
557   PetscReal      shift = info->shiftamount;
558 
559   PetscFunctionBegin;
560   SSE_SCOPE_BEGIN;
561 
562   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
563   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
564   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
565   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
566 /*    if ((unsigned long)bj==(unsigned long)aj) { */
567 /*      colscale = 4; */
568 /*    } */
569   for (i=0; i<n; i++) {
570     nz    = bi[i+1] - bi[i];
571     bjtmp = bj + bi[i];
572     /* zero out the 4x4 block accumulators */
573     /* zero out one register */
574     XOR_PS(XMM7,XMM7);
575     for  (j=0; j<nz; j++) {
576       x = rtmp+16*bjtmp[j];
577 /*        x = rtmp+4*bjtmp[j]; */
578       SSE_INLINE_BEGIN_1(x)
579         /* Copy zero register to memory locations */
580         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
581         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
582         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
583         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
584         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
585         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
586         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
587         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
588         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
589       SSE_INLINE_END_1;
590     }
591     /* load in initial (unfactored row) */
592     nz       = ai[i+1] - ai[i];
593     ajtmpold = aj + ai[i];
594     v        = aa + 16*ai[i];
595     for (j=0; j<nz; j++) {
596       x = rtmp+16*ajtmpold[j];
597 /*        x = rtmp+colscale*ajtmpold[j]; */
598       /* Copy v block into x block */
599       SSE_INLINE_BEGIN_2(v,x)
600         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
601         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
602         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
603 
604         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
605         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
606 
607         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
608         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
609 
610         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
611         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
612 
613         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
614         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
615 
616         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
617         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
618 
619         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
620         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
621 
622         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
623         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
624       SSE_INLINE_END_2;
625 
626       v += 16;
627     }
628 /*      row = (*bjtmp++)/4; */
629     row = *bjtmp++;
630     while (row < i) {
631       pc  = rtmp + 16*row;
632       SSE_INLINE_BEGIN_1(pc)
633         /* Load block from lower triangle */
634         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
635         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
636         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
637 
638         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
639         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
640 
641         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
642         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
643 
644         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
645         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
646 
647         /* Compare block to zero block */
648 
649         SSE_COPY_PS(XMM4,XMM7)
650         SSE_CMPNEQ_PS(XMM4,XMM0)
651 
652         SSE_COPY_PS(XMM5,XMM7)
653         SSE_CMPNEQ_PS(XMM5,XMM1)
654 
655         SSE_COPY_PS(XMM6,XMM7)
656         SSE_CMPNEQ_PS(XMM6,XMM2)
657 
658         SSE_CMPNEQ_PS(XMM7,XMM3)
659 
660         /* Reduce the comparisons to one SSE register */
661         SSE_OR_PS(XMM6,XMM7)
662         SSE_OR_PS(XMM5,XMM4)
663         SSE_OR_PS(XMM5,XMM6)
664       SSE_INLINE_END_1;
665 
666       /* Reduce the one SSE register to an integer register for branching */
667       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
668       MOVEMASK(nonzero,XMM5);
669 
670       /* If block is nonzero ... */
671       if (nonzero) {
672         pv = ba + 16*diag_offset[row];
673         PREFETCH_L1(&pv[16]);
674         pj = bj + diag_offset[row] + 1;
675 
676         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
677         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
678         /* but the diagonal was inverted already */
679         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
680 
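        /* Column c of the multiplier is accumulated as sum_k (column k of L_ij, held in
           XMM0-XMM3) times the scalar inv(U_jj)(k,c), which is loaded from pv and broadcast. */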
681         SSE_INLINE_BEGIN_2(pv,pc)
682           /* Column 0, product is accumulated in XMM4 */
683           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
684           SSE_SHUFFLE(XMM4,XMM4,0x00)
685           SSE_MULT_PS(XMM4,XMM0)
686 
687           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
688           SSE_SHUFFLE(XMM5,XMM5,0x00)
689           SSE_MULT_PS(XMM5,XMM1)
690           SSE_ADD_PS(XMM4,XMM5)
691 
692           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
693           SSE_SHUFFLE(XMM6,XMM6,0x00)
694           SSE_MULT_PS(XMM6,XMM2)
695           SSE_ADD_PS(XMM4,XMM6)
696 
697           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
698           SSE_SHUFFLE(XMM7,XMM7,0x00)
699           SSE_MULT_PS(XMM7,XMM3)
700           SSE_ADD_PS(XMM4,XMM7)
701 
702           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
703           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
704 
705           /* Column 1, product is accumulated in XMM5 */
706           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
707           SSE_SHUFFLE(XMM5,XMM5,0x00)
708           SSE_MULT_PS(XMM5,XMM0)
709 
710           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
711           SSE_SHUFFLE(XMM6,XMM6,0x00)
712           SSE_MULT_PS(XMM6,XMM1)
713           SSE_ADD_PS(XMM5,XMM6)
714 
715           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
716           SSE_SHUFFLE(XMM7,XMM7,0x00)
717           SSE_MULT_PS(XMM7,XMM2)
718           SSE_ADD_PS(XMM5,XMM7)
719 
720           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
721           SSE_SHUFFLE(XMM6,XMM6,0x00)
722           SSE_MULT_PS(XMM6,XMM3)
723           SSE_ADD_PS(XMM5,XMM6)
724 
725           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
726           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
727 
728           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
729 
730           /* Column 2, product is accumulated in XMM6 */
731           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
732           SSE_SHUFFLE(XMM6,XMM6,0x00)
733           SSE_MULT_PS(XMM6,XMM0)
734 
735           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
736           SSE_SHUFFLE(XMM7,XMM7,0x00)
737           SSE_MULT_PS(XMM7,XMM1)
738           SSE_ADD_PS(XMM6,XMM7)
739 
740           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
741           SSE_SHUFFLE(XMM7,XMM7,0x00)
742           SSE_MULT_PS(XMM7,XMM2)
743           SSE_ADD_PS(XMM6,XMM7)
744 
745           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
746           SSE_SHUFFLE(XMM7,XMM7,0x00)
747           SSE_MULT_PS(XMM7,XMM3)
748           SSE_ADD_PS(XMM6,XMM7)
749 
750           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
751           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
752 
753           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
754           /* Column 3, product is accumulated in XMM0 */
755           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
756           SSE_SHUFFLE(XMM7,XMM7,0x00)
757           SSE_MULT_PS(XMM0,XMM7)
758 
759           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
760           SSE_SHUFFLE(XMM7,XMM7,0x00)
761           SSE_MULT_PS(XMM1,XMM7)
762           SSE_ADD_PS(XMM0,XMM1)
763 
764           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
765           SSE_SHUFFLE(XMM1,XMM1,0x00)
766           SSE_MULT_PS(XMM1,XMM2)
767           SSE_ADD_PS(XMM0,XMM1)
768 
769           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
770           SSE_SHUFFLE(XMM7,XMM7,0x00)
771           SSE_MULT_PS(XMM3,XMM7)
772           SSE_ADD_PS(XMM0,XMM3)
773 
774           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
775           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
776 
777           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
778           /* This is code to be maintained and read by humans after all. */
779           /* Copy Multiplier Col 3 into XMM3 */
780           SSE_COPY_PS(XMM3,XMM0)
781           /* Copy Multiplier Col 2 into XMM2 */
782           SSE_COPY_PS(XMM2,XMM6)
783           /* Copy Multiplier Col 1 into XMM1 */
784           SSE_COPY_PS(XMM1,XMM5)
785           /* Copy Multiplier Col 0 into XMM0 */
786           SSE_COPY_PS(XMM0,XMM4)
787         SSE_INLINE_END_2;
788 
789         /* Update the row: */
790         nz = bi[row+1] - diag_offset[row] - 1;
791         pv += 16;
792         for (j=0; j<nz; j++) {
793           PREFETCH_L1(&pv[16]);
794           x = rtmp + 16*pj[j];
795 /*            x = rtmp + 4*pj[j]; */
796 
797           /* X:=X-M*PV, One column at a time */
798           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
799           SSE_INLINE_BEGIN_2(x,pv)
800             /* Load First Column of X*/
801             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
802             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
803 
804             /* Matrix-Vector Product: */
805             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
806             SSE_SHUFFLE(XMM5,XMM5,0x00)
807             SSE_MULT_PS(XMM5,XMM0)
808             SSE_SUB_PS(XMM4,XMM5)
809 
810             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
811             SSE_SHUFFLE(XMM6,XMM6,0x00)
812             SSE_MULT_PS(XMM6,XMM1)
813             SSE_SUB_PS(XMM4,XMM6)
814 
815             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
816             SSE_SHUFFLE(XMM7,XMM7,0x00)
817             SSE_MULT_PS(XMM7,XMM2)
818             SSE_SUB_PS(XMM4,XMM7)
819 
820             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
821             SSE_SHUFFLE(XMM5,XMM5,0x00)
822             SSE_MULT_PS(XMM5,XMM3)
823             SSE_SUB_PS(XMM4,XMM5)
824 
825             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
826             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
827 
828             /* Second Column */
829             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
830             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
831 
832             /* Matrix-Vector Product: */
833             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
834             SSE_SHUFFLE(XMM6,XMM6,0x00)
835             SSE_MULT_PS(XMM6,XMM0)
836             SSE_SUB_PS(XMM5,XMM6)
837 
838             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
839             SSE_SHUFFLE(XMM7,XMM7,0x00)
840             SSE_MULT_PS(XMM7,XMM1)
841             SSE_SUB_PS(XMM5,XMM7)
842 
843             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
844             SSE_SHUFFLE(XMM4,XMM4,0x00)
845             SSE_MULT_PS(XMM4,XMM2)
846             SSE_SUB_PS(XMM5,XMM4)
847 
848             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
849             SSE_SHUFFLE(XMM6,XMM6,0x00)
850             SSE_MULT_PS(XMM6,XMM3)
851             SSE_SUB_PS(XMM5,XMM6)
852 
853             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
854             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
855 
856             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
857 
858             /* Third Column */
859             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
860             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
861 
862             /* Matrix-Vector Product: */
863             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
864             SSE_SHUFFLE(XMM7,XMM7,0x00)
865             SSE_MULT_PS(XMM7,XMM0)
866             SSE_SUB_PS(XMM6,XMM7)
867 
868             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
869             SSE_SHUFFLE(XMM4,XMM4,0x00)
870             SSE_MULT_PS(XMM4,XMM1)
871             SSE_SUB_PS(XMM6,XMM4)
872 
873             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
874             SSE_SHUFFLE(XMM5,XMM5,0x00)
875             SSE_MULT_PS(XMM5,XMM2)
876             SSE_SUB_PS(XMM6,XMM5)
877 
878             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
879             SSE_SHUFFLE(XMM7,XMM7,0x00)
880             SSE_MULT_PS(XMM7,XMM3)
881             SSE_SUB_PS(XMM6,XMM7)
882 
883             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
884             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
885 
886             /* Fourth Column */
887             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
888             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
889 
890             /* Matrix-Vector Product: */
891             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
892             SSE_SHUFFLE(XMM5,XMM5,0x00)
893             SSE_MULT_PS(XMM5,XMM0)
894             SSE_SUB_PS(XMM4,XMM5)
895 
896             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
897             SSE_SHUFFLE(XMM6,XMM6,0x00)
898             SSE_MULT_PS(XMM6,XMM1)
899             SSE_SUB_PS(XMM4,XMM6)
900 
901             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
902             SSE_SHUFFLE(XMM7,XMM7,0x00)
903             SSE_MULT_PS(XMM7,XMM2)
904             SSE_SUB_PS(XMM4,XMM7)
905 
906             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
907             SSE_SHUFFLE(XMM5,XMM5,0x00)
908             SSE_MULT_PS(XMM5,XMM3)
909             SSE_SUB_PS(XMM4,XMM5)
910 
911             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
912             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
913           SSE_INLINE_END_2;
914           pv   += 16;
915         }
916         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
917       }
918       row = *bjtmp++;
919 /*        row = (*bjtmp++)/4; */
920     }
921     /* finished row so stick it into b->a */
922     pv = ba + 16*bi[i];
923     pj = bj + bi[i];
924     nz = bi[i+1] - bi[i];
925 
926     /* Copy x block back into pv block */
927     for (j=0; j<nz; j++) {
928       x  = rtmp+16*pj[j];
929 /*        x  = rtmp+4*pj[j]; */
930 
931       SSE_INLINE_BEGIN_2(x,pv)
932         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
933         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
934         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
935 
936         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
937         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
938 
939         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
940         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
941 
942         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
943         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
944 
945         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
946         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
947 
948         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
949         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
950 
951         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
952         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
953 
954         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
955         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
956       SSE_INLINE_END_2;
957       pv += 16;
958     }
959     /* invert diagonal block */
960     w = ba + 16*diag_offset[i];
961     if (pivotinblocks) {
962       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
963     } else {
964       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
965     }
966 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
967     /* Note: since Cramer's rule is used to invert the 4x4 block, the flop count below might be unfairly high or low */
968   }
969 
970   ierr = PetscFree(rtmp);CHKERRQ(ierr);
971   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
972   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
973   C->assembled = PETSC_TRUE;
974   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
975   /* Flop Count from inverting diagonal blocks */
976   SSE_SCOPE_END;
977   PetscFunctionReturn(0);
978 }
979 
980 #undef __FUNCT__
981 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
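/*
   "usj" = unsigned short j: the column indices bj (and aj) are reinterpreted as 16-bit
   integers, halving the index bandwidth of the factorization sweep; this presumes the
   indices were stored that way and that the number of block rows fits in an unsigned short.
   This variant factors in place (the factor shares the index set of A), hence it takes no
   separate A or MatFactorInfo argument.
*/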
982 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
983 {
984   Mat            A=C;
985   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
986   PetscErrorCode ierr;
987   int i,j,n = a->mbs;
988   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
989   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
990   unsigned int   row;
991   int            nz,*bi=b->i;
992   int            *diag_offset = b->diag,*ai=a->i;
993   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
994   MatScalar      *ba = b->a,*aa = a->a;
995   int            nonzero=0;
996 /*    int            nonzero=0,colscale = 16; */
997   PetscTruth     pivotinblocks = b->pivotinblocks;
998   PetscReal      shift = 0.0; /* no MatFactorInfo is passed to this in-place variant, so no shift amount is available; assume none */
999 
1000   PetscFunctionBegin;
1001   SSE_SCOPE_BEGIN;
1002 
1003   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1004   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1005   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1006   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1007 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1008 /*      colscale = 4; */
1009 /*    } */
1010 
1011   for (i=0; i<n; i++) {
1012     nz    = bi[i+1] - bi[i];
1013     bjtmp = bj + bi[i];
1014     /* zero out the 4x4 block accumulators */
1015     /* zero out one register */
1016     XOR_PS(XMM7,XMM7);
1017     for  (j=0; j<nz; j++) {
1018       x = rtmp+16*((unsigned int)bjtmp[j]);
1019 /*        x = rtmp+4*bjtmp[j]; */
1020       SSE_INLINE_BEGIN_1(x)
1021         /* Copy zero register to memory locations */
1022         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1023         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1024         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1025         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1026         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1027         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1028         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1029         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1030         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1031       SSE_INLINE_END_1;
1032     }
1033     /* load in initial (unfactored row) */
1034     nz    = ai[i+1] - ai[i];
1035     ajtmp = aj + ai[i];
1036     v     = aa + 16*ai[i];
1037     for (j=0; j<nz; j++) {
1038       x = rtmp+16*((unsigned int)ajtmp[j]);
1039 /*        x = rtmp+colscale*ajtmp[j]; */
1040       /* Copy v block into x block */
1041       SSE_INLINE_BEGIN_2(v,x)
1042         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1043         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1044         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1045 
1046         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1047         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1048 
1049         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1050         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1051 
1052         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1053         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1054 
1055         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1056         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1057 
1058         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1059         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1060 
1061         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1062         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1063 
1064         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1065         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1066       SSE_INLINE_END_2;
1067 
1068       v += 16;
1069     }
1070 /*      row = (*bjtmp++)/4; */
1071     row = (unsigned int)(*bjtmp++);
1072     while (row < i) {
1073       pc  = rtmp + 16*row;
1074       SSE_INLINE_BEGIN_1(pc)
1075         /* Load block from lower triangle */
1076         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1077         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1078         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1079 
1080         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1081         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1082 
1083         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1084         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1085 
1086         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1087         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1088 
1089         /* Compare block to zero block */
1090 
1091         SSE_COPY_PS(XMM4,XMM7)
1092         SSE_CMPNEQ_PS(XMM4,XMM0)
1093 
1094         SSE_COPY_PS(XMM5,XMM7)
1095         SSE_CMPNEQ_PS(XMM5,XMM1)
1096 
1097         SSE_COPY_PS(XMM6,XMM7)
1098         SSE_CMPNEQ_PS(XMM6,XMM2)
1099 
1100         SSE_CMPNEQ_PS(XMM7,XMM3)
1101 
1102         /* Reduce the comparisons to one SSE register */
1103         SSE_OR_PS(XMM6,XMM7)
1104         SSE_OR_PS(XMM5,XMM4)
1105         SSE_OR_PS(XMM5,XMM6)
1106       SSE_INLINE_END_1;
1107 
1108       /* Reduce the one SSE register to an integer register for branching */
1109       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1110       MOVEMASK(nonzero,XMM5);
1111 
1112       /* If block is nonzero ... */
1113       if (nonzero) {
1114         pv = ba + 16*diag_offset[row];
1115         PREFETCH_L1(&pv[16]);
1116         pj = bj + diag_offset[row] + 1;
1117 
1118         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1119         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1120         /* but the diagonal was inverted already */
1121         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1122 
1123         SSE_INLINE_BEGIN_2(pv,pc)
1124           /* Column 0, product is accumulated in XMM4 */
1125           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1126           SSE_SHUFFLE(XMM4,XMM4,0x00)
1127           SSE_MULT_PS(XMM4,XMM0)
1128 
1129           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1130           SSE_SHUFFLE(XMM5,XMM5,0x00)
1131           SSE_MULT_PS(XMM5,XMM1)
1132           SSE_ADD_PS(XMM4,XMM5)
1133 
1134           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1135           SSE_SHUFFLE(XMM6,XMM6,0x00)
1136           SSE_MULT_PS(XMM6,XMM2)
1137           SSE_ADD_PS(XMM4,XMM6)
1138 
1139           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1140           SSE_SHUFFLE(XMM7,XMM7,0x00)
1141           SSE_MULT_PS(XMM7,XMM3)
1142           SSE_ADD_PS(XMM4,XMM7)
1143 
1144           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1145           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1146 
1147           /* Column 1, product is accumulated in XMM5 */
1148           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1149           SSE_SHUFFLE(XMM5,XMM5,0x00)
1150           SSE_MULT_PS(XMM5,XMM0)
1151 
1152           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1153           SSE_SHUFFLE(XMM6,XMM6,0x00)
1154           SSE_MULT_PS(XMM6,XMM1)
1155           SSE_ADD_PS(XMM5,XMM6)
1156 
1157           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1158           SSE_SHUFFLE(XMM7,XMM7,0x00)
1159           SSE_MULT_PS(XMM7,XMM2)
1160           SSE_ADD_PS(XMM5,XMM7)
1161 
1162           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1163           SSE_SHUFFLE(XMM6,XMM6,0x00)
1164           SSE_MULT_PS(XMM6,XMM3)
1165           SSE_ADD_PS(XMM5,XMM6)
1166 
1167           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1168           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1169 
1170           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1171 
1172           /* Column 2, product is accumulated in XMM6 */
1173           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1174           SSE_SHUFFLE(XMM6,XMM6,0x00)
1175           SSE_MULT_PS(XMM6,XMM0)
1176 
1177           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1178           SSE_SHUFFLE(XMM7,XMM7,0x00)
1179           SSE_MULT_PS(XMM7,XMM1)
1180           SSE_ADD_PS(XMM6,XMM7)
1181 
1182           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1183           SSE_SHUFFLE(XMM7,XMM7,0x00)
1184           SSE_MULT_PS(XMM7,XMM2)
1185           SSE_ADD_PS(XMM6,XMM7)
1186 
1187           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1188           SSE_SHUFFLE(XMM7,XMM7,0x00)
1189           SSE_MULT_PS(XMM7,XMM3)
1190           SSE_ADD_PS(XMM6,XMM7)
1191 
1192           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1193           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1194 
1195           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1196           /* Column 3, product is accumulated in XMM0 */
1197           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1198           SSE_SHUFFLE(XMM7,XMM7,0x00)
1199           SSE_MULT_PS(XMM0,XMM7)
1200 
1201           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1202           SSE_SHUFFLE(XMM7,XMM7,0x00)
1203           SSE_MULT_PS(XMM1,XMM7)
1204           SSE_ADD_PS(XMM0,XMM1)
1205 
1206           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1207           SSE_SHUFFLE(XMM1,XMM1,0x00)
1208           SSE_MULT_PS(XMM1,XMM2)
1209           SSE_ADD_PS(XMM0,XMM1)
1210 
1211           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1212           SSE_SHUFFLE(XMM7,XMM7,0x00)
1213           SSE_MULT_PS(XMM3,XMM7)
1214           SSE_ADD_PS(XMM0,XMM3)
1215 
1216           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1217           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1218 
1219           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1220           /* This is code to be maintained and read by humans after all. */
1221           /* Copy Multiplier Col 3 into XMM3 */
1222           SSE_COPY_PS(XMM3,XMM0)
1223           /* Copy Multiplier Col 2 into XMM2 */
1224           SSE_COPY_PS(XMM2,XMM6)
1225           /* Copy Multiplier Col 1 into XMM1 */
1226           SSE_COPY_PS(XMM1,XMM5)
1227           /* Copy Multiplier Col 0 into XMM0 */
1228           SSE_COPY_PS(XMM0,XMM4)
1229         SSE_INLINE_END_2;
1230 
1231         /* Update the row: */
1232         nz = bi[row+1] - diag_offset[row] - 1;
1233         pv += 16;
1234         for (j=0; j<nz; j++) {
1235           PREFETCH_L1(&pv[16]);
1236           x = rtmp + 16*((unsigned int)pj[j]);
1237 /*            x = rtmp + 4*pj[j]; */
1238 
1239           /* X:=X-M*PV, One column at a time */
1240           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1241           SSE_INLINE_BEGIN_2(x,pv)
1242             /* Load First Column of X*/
1243             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1244             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1245 
1246             /* Matrix-Vector Product: */
1247             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1248             SSE_SHUFFLE(XMM5,XMM5,0x00)
1249             SSE_MULT_PS(XMM5,XMM0)
1250             SSE_SUB_PS(XMM4,XMM5)
1251 
1252             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1253             SSE_SHUFFLE(XMM6,XMM6,0x00)
1254             SSE_MULT_PS(XMM6,XMM1)
1255             SSE_SUB_PS(XMM4,XMM6)
1256 
1257             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1258             SSE_SHUFFLE(XMM7,XMM7,0x00)
1259             SSE_MULT_PS(XMM7,XMM2)
1260             SSE_SUB_PS(XMM4,XMM7)
1261 
1262             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1263             SSE_SHUFFLE(XMM5,XMM5,0x00)
1264             SSE_MULT_PS(XMM5,XMM3)
1265             SSE_SUB_PS(XMM4,XMM5)
1266 
1267             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1268             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1269 
1270             /* Second Column */
1271             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1272             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1273 
1274             /* Matrix-Vector Product: */
1275             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1276             SSE_SHUFFLE(XMM6,XMM6,0x00)
1277             SSE_MULT_PS(XMM6,XMM0)
1278             SSE_SUB_PS(XMM5,XMM6)
1279 
1280             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1281             SSE_SHUFFLE(XMM7,XMM7,0x00)
1282             SSE_MULT_PS(XMM7,XMM1)
1283             SSE_SUB_PS(XMM5,XMM7)
1284 
1285             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1286             SSE_SHUFFLE(XMM4,XMM4,0x00)
1287             SSE_MULT_PS(XMM4,XMM2)
1288             SSE_SUB_PS(XMM5,XMM4)
1289 
1290             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1291             SSE_SHUFFLE(XMM6,XMM6,0x00)
1292             SSE_MULT_PS(XMM6,XMM3)
1293             SSE_SUB_PS(XMM5,XMM6)
1294 
1295             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1296             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1297 
1298             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1299 
1300             /* Third Column */
1301             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1302             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1303 
1304             /* Matrix-Vector Product: */
1305             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1306             SSE_SHUFFLE(XMM7,XMM7,0x00)
1307             SSE_MULT_PS(XMM7,XMM0)
1308             SSE_SUB_PS(XMM6,XMM7)
1309 
1310             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1311             SSE_SHUFFLE(XMM4,XMM4,0x00)
1312             SSE_MULT_PS(XMM4,XMM1)
1313             SSE_SUB_PS(XMM6,XMM4)
1314 
1315             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1316             SSE_SHUFFLE(XMM5,XMM5,0x00)
1317             SSE_MULT_PS(XMM5,XMM2)
1318             SSE_SUB_PS(XMM6,XMM5)
1319 
1320             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1321             SSE_SHUFFLE(XMM7,XMM7,0x00)
1322             SSE_MULT_PS(XMM7,XMM3)
1323             SSE_SUB_PS(XMM6,XMM7)
1324 
1325             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1326             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1327 
1328             /* Fourth Column */
1329             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1330             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1331 
1332             /* Matrix-Vector Product: */
1333             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1334             SSE_SHUFFLE(XMM5,XMM5,0x00)
1335             SSE_MULT_PS(XMM5,XMM0)
1336             SSE_SUB_PS(XMM4,XMM5)
1337 
1338             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1339             SSE_SHUFFLE(XMM6,XMM6,0x00)
1340             SSE_MULT_PS(XMM6,XMM1)
1341             SSE_SUB_PS(XMM4,XMM6)
1342 
1343             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1344             SSE_SHUFFLE(XMM7,XMM7,0x00)
1345             SSE_MULT_PS(XMM7,XMM2)
1346             SSE_SUB_PS(XMM4,XMM7)
1347 
1348             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1349             SSE_SHUFFLE(XMM5,XMM5,0x00)
1350             SSE_MULT_PS(XMM5,XMM3)
1351             SSE_SUB_PS(XMM4,XMM5)
1352 
1353             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1354             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1355           SSE_INLINE_END_2;
1356           pv   += 16;
1357         }
1358         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1359       }
1360       row = (unsigned int)(*bjtmp++);
1361 /*        row = (*bjtmp++)/4; */
1362 /*        bjtmp++; */
1363     }
1364     /* finished row so stick it into b->a */
1365     pv = ba + 16*bi[i];
1366     pj = bj + bi[i];
1367     nz = bi[i+1] - bi[i];
1368 
1369     /* Copy x block back into pv block */
1370     for (j=0; j<nz; j++) {
1371       x  = rtmp+16*((unsigned int)pj[j]);
1372 /*        x  = rtmp+4*pj[j]; */
1373 
1374       SSE_INLINE_BEGIN_2(x,pv)
1375         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1376         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1377         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1378 
1379         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1380         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1381 
1382         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1383         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1384 
1385         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1386         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1387 
1388         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1389         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1390 
1391         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1392         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1393 
1394         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1395         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1396 
1397         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1398         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1399       SSE_INLINE_END_2;
1400       pv += 16;
1401     }
1402     /* invert diagonal block */
1403     w = ba + 16*diag_offset[i];
1404     if (pivotinblocks) {
1405       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1406     } else {
1407       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1408     }
1409 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1410     /* Note: since Cramer's rule is used to invert the 4x4 block, the flop count below might be unfairly high or low */
1411   }
1412 
1413   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1414   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1415   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1416   C->assembled = PETSC_TRUE;
1417   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
1418   /* Flop Count from inverting diagonal blocks */
1419   SSE_SCOPE_END;
1420   PetscFunctionReturn(0);
1421 }
1422 
1423 #undef __FUNCT__
1424 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1425 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1426 {
1427   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1428   PetscErrorCode ierr;
1429   int  i,j,n = a->mbs;
1430   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1431   unsigned int   row;
1432   int            *ajtmpold,nz,*bi=b->i;
1433   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1434   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1435   MatScalar      *ba = b->a,*aa = a->a;
1436   int            nonzero=0;
1437 /*    int            nonzero=0,colscale = 16; */
1438   PetscTruth     pivotinblocks = b->pivotinblocks;
1439   PetscReal      shift = info->shiftamount;
1440 
1441   PetscFunctionBegin;
1442   SSE_SCOPE_BEGIN;
1443 
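  /* These kernels stream 4x4 blocks through the XMM registers and require 16-byte aligned
     block storage for A, the factor, and the work row; fail early if that is not the case. */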
1444   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1445   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1446   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1447   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1448 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1449 /*      colscale = 4; */
1450 /*    } */
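  /* If the factor shares its column-index array with A, the factorization is being done
     in place; delegate to the dedicated in-place variant. */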
1451   if ((unsigned long)bj==(unsigned long)aj) {
1452     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1453   }
1454 
1455   for (i=0; i<n; i++) {
1456     nz    = bi[i+1] - bi[i];
1457     bjtmp = bj + bi[i];
1458     /* zero out the 4x4 block accumulators */
1459     /* zero out one register */
1460     XOR_PS(XMM7,XMM7);
1461     for  (j=0; j<nz; j++) {
1462       x = rtmp+16*((unsigned int)bjtmp[j]);
1463 /*        x = rtmp+4*bjtmp[j]; */
1464       SSE_INLINE_BEGIN_1(x)
1465         /* Copy zero register to memory locations */
1466         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1467         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1468         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1469         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1470         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1471         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1472         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1473         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1474         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1475       SSE_INLINE_END_1;
1476     }
1477     /* load in initial (unfactored row) */
1478     nz       = ai[i+1] - ai[i];
1479     ajtmpold = aj + ai[i];
1480     v        = aa + 16*ai[i];
1481     for (j=0; j<nz; j++) {
1482       x = rtmp+16*ajtmpold[j];
1483 /*        x = rtmp+colscale*ajtmpold[j]; */
1484       /* Copy v block into x block */
1485       SSE_INLINE_BEGIN_2(v,x)
1486         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1487         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1488         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1489 
1490         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1491         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1492 
1493         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1494         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1495 
1496         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1497         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1498 
1499         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1500         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1501 
1502         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1503         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1504 
1505         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1506         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1507 
1508         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1509         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1510       SSE_INLINE_END_2;
1511 
1512       v += 16;
1513     }
1514 /*      row = (*bjtmp++)/4; */
1515     row = (unsigned int)(*bjtmp++);
1516     while (row < i) {
1517       pc  = rtmp + 16*row;
      /* ensure XMM7 holds zero for the block comparison below; it may have been clobbered
         by the update branch of a previous iteration of this loop */
      XOR_PS(XMM7,XMM7);
1518       SSE_INLINE_BEGIN_1(pc)
1519         /* Load block from lower triangle */
1520         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1521         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1522         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1523 
1524         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1525         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1526 
1527         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1528         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1529 
1530         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1531         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1532 
1533         /* Compare block to zero block */
1534 
1535         SSE_COPY_PS(XMM4,XMM7)
1536         SSE_CMPNEQ_PS(XMM4,XMM0)
1537 
1538         SSE_COPY_PS(XMM5,XMM7)
1539         SSE_CMPNEQ_PS(XMM5,XMM1)
1540 
1541         SSE_COPY_PS(XMM6,XMM7)
1542         SSE_CMPNEQ_PS(XMM6,XMM2)
1543 
1544         SSE_CMPNEQ_PS(XMM7,XMM3)
1545 
1546         /* Reduce the comparisons to one SSE register */
1547         SSE_OR_PS(XMM6,XMM7)
1548         SSE_OR_PS(XMM5,XMM4)
1549         SSE_OR_PS(XMM5,XMM6)
1550       SSE_INLINE_END_1;
1551 
1552       /* Reduce the one SSE register to an integer register for branching */
1553       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1554       MOVEMASK(nonzero,XMM5);
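      /* MOVEMASK packs the sign bit of each lane of XMM5 into 'nonzero'; since each CMPNEQ
         lane is either all ones or all zeros, the mask is nonzero exactly when some entry
         of the 4x4 block differs from zero. */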
1555 
1556       /* If block is nonzero ... */
1557       if (nonzero) {
1558         pv = ba + 16*diag_offset[row];
1559         PREFETCH_L1(&pv[16]);
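        /* prefetch the first block past the diagonal of row 'row'; it is consumed by the
           update loop below */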
1560         pj = bj + diag_offset[row] + 1;
1561 
1562         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1563         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1564         /* but the diagonal was inverted already */
1565         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1566 
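        /* Each scalar of the inverted diagonal block is broadcast across a register
           (SHUFFLE with 0x00) and multiplied into a column of the current block, so each
           column of the multiplier is accumulated as a 4-term linear combination. */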
1567         SSE_INLINE_BEGIN_2(pv,pc)
1568           /* Column 0, product is accumulated in XMM4 */
1569           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1570           SSE_SHUFFLE(XMM4,XMM4,0x00)
1571           SSE_MULT_PS(XMM4,XMM0)
1572 
1573           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1574           SSE_SHUFFLE(XMM5,XMM5,0x00)
1575           SSE_MULT_PS(XMM5,XMM1)
1576           SSE_ADD_PS(XMM4,XMM5)
1577 
1578           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1579           SSE_SHUFFLE(XMM6,XMM6,0x00)
1580           SSE_MULT_PS(XMM6,XMM2)
1581           SSE_ADD_PS(XMM4,XMM6)
1582 
1583           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1584           SSE_SHUFFLE(XMM7,XMM7,0x00)
1585           SSE_MULT_PS(XMM7,XMM3)
1586           SSE_ADD_PS(XMM4,XMM7)
1587 
1588           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1589           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1590 
1591           /* Column 1, product is accumulated in XMM5 */
1592           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1593           SSE_SHUFFLE(XMM5,XMM5,0x00)
1594           SSE_MULT_PS(XMM5,XMM0)
1595 
1596           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1597           SSE_SHUFFLE(XMM6,XMM6,0x00)
1598           SSE_MULT_PS(XMM6,XMM1)
1599           SSE_ADD_PS(XMM5,XMM6)
1600 
1601           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1602           SSE_SHUFFLE(XMM7,XMM7,0x00)
1603           SSE_MULT_PS(XMM7,XMM2)
1604           SSE_ADD_PS(XMM5,XMM7)
1605 
1606           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1607           SSE_SHUFFLE(XMM6,XMM6,0x00)
1608           SSE_MULT_PS(XMM6,XMM3)
1609           SSE_ADD_PS(XMM5,XMM6)
1610 
1611           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1612           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1613 
1614           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1615 
1616           /* Column 2, product is accumulated in XMM6 */
1617           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1618           SSE_SHUFFLE(XMM6,XMM6,0x00)
1619           SSE_MULT_PS(XMM6,XMM0)
1620 
1621           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1622           SSE_SHUFFLE(XMM7,XMM7,0x00)
1623           SSE_MULT_PS(XMM7,XMM1)
1624           SSE_ADD_PS(XMM6,XMM7)
1625 
1626           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1627           SSE_SHUFFLE(XMM7,XMM7,0x00)
1628           SSE_MULT_PS(XMM7,XMM2)
1629           SSE_ADD_PS(XMM6,XMM7)
1630 
1631           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1632           SSE_SHUFFLE(XMM7,XMM7,0x00)
1633           SSE_MULT_PS(XMM7,XMM3)
1634           SSE_ADD_PS(XMM6,XMM7)
1635 
1636           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1637           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1638 
1639           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1640           /* Column 3, product is accumulated in XMM0 */
1641           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1642           SSE_SHUFFLE(XMM7,XMM7,0x00)
1643           SSE_MULT_PS(XMM0,XMM7)
1644 
1645           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1646           SSE_SHUFFLE(XMM7,XMM7,0x00)
1647           SSE_MULT_PS(XMM1,XMM7)
1648           SSE_ADD_PS(XMM0,XMM1)
1649 
1650           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1651           SSE_SHUFFLE(XMM1,XMM1,0x00)
1652           SSE_MULT_PS(XMM1,XMM2)
1653           SSE_ADD_PS(XMM0,XMM1)
1654 
1655           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1656           SSE_SHUFFLE(XMM7,XMM7,0x00)
1657           SSE_MULT_PS(XMM3,XMM7)
1658           SSE_ADD_PS(XMM0,XMM3)
1659 
1660           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1661           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1662 
1663           /* Simplify bookkeeping -- these copies are strictly unnecessary, */
1664           /* but this is code to be maintained and read by humans, after all. */
1665           /* Copy Multiplier Col 3 into XMM3 */
1666           SSE_COPY_PS(XMM3,XMM0)
1667           /* Copy Multiplier Col 2 into XMM2 */
1668           SSE_COPY_PS(XMM2,XMM6)
1669           /* Copy Multiplier Col 1 into XMM1 */
1670           SSE_COPY_PS(XMM1,XMM5)
1671           /* Copy Multiplier Col 0 into XMM0 */
1672           SSE_COPY_PS(XMM0,XMM4)
1673         SSE_INLINE_END_2;
1674 
1675         /* Update the row: */
1676         nz = bi[row+1] - diag_offset[row] - 1;
1677         pv += 16;
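        /* nz counts the blocks to the right of the diagonal in row 'row'; pv has been
           advanced past the (already inverted) diagonal block, so the loop below subtracts
           multiplier*U_block from the corresponding blocks of the current row. */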
1678         for (j=0; j<nz; j++) {
1679           PREFETCH_L1(&pv[16]);
1680           x = rtmp + 16*((unsigned int)pj[j]);
1681 /*            x = rtmp + 4*pj[j]; */
1682 
1683           /* X:=X-M*PV, One column at a time */
1684           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1685           SSE_INLINE_BEGIN_2(x,pv)
1686             /* Load First Column of X */
1687             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1688             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1689 
1690             /* Matrix-Vector Product: */
1691             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1692             SSE_SHUFFLE(XMM5,XMM5,0x00)
1693             SSE_MULT_PS(XMM5,XMM0)
1694             SSE_SUB_PS(XMM4,XMM5)
1695 
1696             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1697             SSE_SHUFFLE(XMM6,XMM6,0x00)
1698             SSE_MULT_PS(XMM6,XMM1)
1699             SSE_SUB_PS(XMM4,XMM6)
1700 
1701             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1702             SSE_SHUFFLE(XMM7,XMM7,0x00)
1703             SSE_MULT_PS(XMM7,XMM2)
1704             SSE_SUB_PS(XMM4,XMM7)
1705 
1706             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1707             SSE_SHUFFLE(XMM5,XMM5,0x00)
1708             SSE_MULT_PS(XMM5,XMM3)
1709             SSE_SUB_PS(XMM4,XMM5)
1710 
1711             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1712             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1713 
1714             /* Second Column */
1715             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1716             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1717 
1718             /* Matrix-Vector Product: */
1719             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1720             SSE_SHUFFLE(XMM6,XMM6,0x00)
1721             SSE_MULT_PS(XMM6,XMM0)
1722             SSE_SUB_PS(XMM5,XMM6)
1723 
1724             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1725             SSE_SHUFFLE(XMM7,XMM7,0x00)
1726             SSE_MULT_PS(XMM7,XMM1)
1727             SSE_SUB_PS(XMM5,XMM7)
1728 
1729             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1730             SSE_SHUFFLE(XMM4,XMM4,0x00)
1731             SSE_MULT_PS(XMM4,XMM2)
1732             SSE_SUB_PS(XMM5,XMM4)
1733 
1734             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1735             SSE_SHUFFLE(XMM6,XMM6,0x00)
1736             SSE_MULT_PS(XMM6,XMM3)
1737             SSE_SUB_PS(XMM5,XMM6)
1738 
1739             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1740             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1741 
1742             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1743 
1744             /* Third Column */
1745             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1746             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1747 
1748             /* Matrix-Vector Product: */
1749             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1750             SSE_SHUFFLE(XMM7,XMM7,0x00)
1751             SSE_MULT_PS(XMM7,XMM0)
1752             SSE_SUB_PS(XMM6,XMM7)
1753 
1754             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1755             SSE_SHUFFLE(XMM4,XMM4,0x00)
1756             SSE_MULT_PS(XMM4,XMM1)
1757             SSE_SUB_PS(XMM6,XMM4)
1758 
1759             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1760             SSE_SHUFFLE(XMM5,XMM5,0x00)
1761             SSE_MULT_PS(XMM5,XMM2)
1762             SSE_SUB_PS(XMM6,XMM5)
1763 
1764             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1765             SSE_SHUFFLE(XMM7,XMM7,0x00)
1766             SSE_MULT_PS(XMM7,XMM3)
1767             SSE_SUB_PS(XMM6,XMM7)
1768 
1769             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1770             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1771 
1772             /* Fourth Column */
1773             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1774             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1775 
1776             /* Matrix-Vector Product: */
1777             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1778             SSE_SHUFFLE(XMM5,XMM5,0x00)
1779             SSE_MULT_PS(XMM5,XMM0)
1780             SSE_SUB_PS(XMM4,XMM5)
1781 
1782             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1783             SSE_SHUFFLE(XMM6,XMM6,0x00)
1784             SSE_MULT_PS(XMM6,XMM1)
1785             SSE_SUB_PS(XMM4,XMM6)
1786 
1787             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1788             SSE_SHUFFLE(XMM7,XMM7,0x00)
1789             SSE_MULT_PS(XMM7,XMM2)
1790             SSE_SUB_PS(XMM4,XMM7)
1791 
1792             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1793             SSE_SHUFFLE(XMM5,XMM5,0x00)
1794             SSE_MULT_PS(XMM5,XMM3)
1795             SSE_SUB_PS(XMM4,XMM5)
1796 
1797             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1798             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1799           SSE_INLINE_END_2;
1800           pv   += 16;
1801         }
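        /* Flop accounting for this nonzero block: forming the 4x4 multiplier above costs
           64 multiplies + 48 adds = 112 flops, and each of the nz trailing blocks is updated
           with a 4x4 product and subtraction, 64 multiplies + 64 subtractions = 128 flops. */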
1802         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1803       }
1804       row = (unsigned int)(*bjtmp++);
1805 /*        row = (*bjtmp++)/4; */
1806 /*        bjtmp++; */
1807     }
1808     /* finished row so stick it into b->a */
1809     pv = ba + 16*bi[i];
1810     pj = bj + bi[i];
1811     nz = bi[i+1] - bi[i];
1812 
1813     /* Copy x block back into pv block */
1814     for (j=0; j<nz; j++) {
1815       x  = rtmp+16*((unsigned int)pj[j]);
1816 /*        x  = rtmp+4*pj[j]; */
1817 
1818       SSE_INLINE_BEGIN_2(x,pv)
1819         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1820         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1821         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1822 
1823         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1824         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1825 
1826         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1827         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1828 
1829         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1830         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1831 
1832         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1833         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1834 
1835         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1836         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1837 
1838         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1839         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1840 
1841         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1842         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1843       SSE_INLINE_END_2;
1844       pv += 16;
1845     }
1846     /* invert diagonal block */
1847     w = ba + 16*diag_offset[i];
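    /* As above: Kernel_A_gets_inverse_A_4 inverts the 4x4 diagonal block in place, with
       'shift' (info->shiftamount) intended to protect against zero pivots. */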
1848     if (pivotinblocks) {
1849       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1850     } else {
1851       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1852     }
1853 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1854     /* Note: the inverse is computed with Cramer's rule, so the flop count below may be unfairly high or low. */
1855   }
1856 
1857   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1858   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1859   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1860   C->assembled = PETSC_TRUE;
1861   /* flop count from inverting the diagonal blocks; bs = 4 and bs2 = 16 for this 4x4 kernel */
1862   ierr = PetscLogFlops(1.333333333333*4*16*b->mbs);CHKERRQ(ierr);
1863   SSE_SCOPE_END;
1864   PetscFunctionReturn(0);
1865 }
1866 
1867 #endif
1868