/* /petsc/src/mat/impls/baij/seq/baijfact11.c  (revision e6580ceefd67cfee5d890f7b8ecb709dd93cb596) */
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 4 by 4
12 */
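/*
    Outline of the routines below: for each block row i of the factored matrix C,
    the row's 4x4 blocks are scattered into the dense work array rtmp (16 MatScalars
    per block, entry (r,c) of a block at offset 4*c+r, i.e. column-major), the row is
    eliminated against every previously factored row appearing in its pattern, the
    result is gathered back into b->a, and the diagonal block is inverted in place.
    Because diagonal blocks are stored already inverted, forming a multiplier block
    is an ordinary 4x4 matrix-matrix product rather than a triangular solve.
*/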
13 #undef __FUNCT__
14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
16 {
17   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18   IS             isrow = b->row,isicol = b->icol;
19   PetscErrorCode ierr;
20   const PetscInt *r,*ic;
21   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22   PetscInt       *ajtmpold,*ajtmp,nz,row;
23   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
25   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28   MatScalar      m13,m14,m15,m16;
29   MatScalar      *ba = b->a,*aa = a->a;
30   PetscTruth     pivotinblocks = b->pivotinblocks;
31   PetscReal      shift = info->shiftinblocks;
32 
33   PetscFunctionBegin;
34   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
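  /* r[] is the permuted (pivot) row ordering; ic[] is the inverse column permutation,
     used below to map column indices of A into the ordering of the factor */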
36   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
37 
38   for (i=0; i<n; i++) {
39     nz    = bi[i+1] - bi[i];
40     ajtmp = bj + bi[i];
41     for  (j=0; j<nz; j++) {
42       x = rtmp+16*ajtmp[j];
43       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
44       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45     }
46     /* load in initial (unfactored) row */
47     idx      = r[i];
48     nz       = ai[idx+1] - ai[idx];
49     ajtmpold = aj + ai[idx];
50     v        = aa + 16*ai[idx];
51     for (j=0; j<nz; j++) {
52       x    = rtmp+16*ic[ajtmpold[j]];
53       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
54       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
55       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56       x[14] = v[14]; x[15] = v[15];
57       v    += 16;
58     }
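    /* eliminate this row against every previously factored block row that appears
       to the left of the diagonal in the factor's pattern; ajtmp walks the block
       column indices of row i of the factor */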
59     row = *ajtmp++;
60     while (row < i) {
61       pc  = rtmp + 16*row;
62       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
63       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
64       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65       p15 = pc[14]; p16 = pc[15];
66       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69           || p16 != 0.0) {
70         pv = ba + 16*diag_offset[row];
71         pj = bj + diag_offset[row] + 1;
72         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
73         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
74         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75         x15 = pv[14]; x16 = pv[15];
76         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
77         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
78         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
79         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
80 
81         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
82         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
83         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
84         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
85 
86         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
87         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
88         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
89         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
90 
91         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
92         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
93         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
94         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
95 
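        /* The sixteen assignments above are the 4x4 product  M = P*inv(U(row,row)),
           with P the old contents of pc (loaded into p1..p16) and pv holding the
           already-inverted diagonal block; in column-major index form, roughly

             for (c=0; c<4; c++)
               for (r=0; r<4; r++)
                 pc[4*c+r] = p[4*0+r]*pv[4*c+0] + p[4*1+r]*pv[4*c+1]
                           + p[4*2+r]*pv[4*c+2] + p[4*3+r]*pv[4*c+3];

           M is stored back into pc and also kept in m1..m16 for the updates below. */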
96         nz = bi[row+1] - diag_offset[row] - 1;
97         pv += 16;
98         for (j=0; j<nz; j++) {
99           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103           x    = rtmp + 16*pj[j];
104           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
108 
109           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
113 
114           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118 
119           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
123 
124           pv   += 16;
125         }
126         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
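        /* 112 flops to form the multiplier (16 entries x (4 mults + 3 adds)) plus
           128 flops per updated block (16 entries x (4 mults + 3 adds + 1 subtract)) */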
127       }
128       row = *ajtmp++;
129     }
130     /* finished row so stick it into b->a */
131     pv = ba + 16*bi[i];
132     pj = bj + bi[i];
133     nz = bi[i+1] - bi[i];
134     for (j=0; j<nz; j++) {
135       x      = rtmp+16*pj[j];
136       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140       pv   += 16;
141     }
142     /* invert diagonal block */
143     w    = ba + 16*diag_offset[i];
144     if (pivotinblocks) {
145       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146     } else {
147       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
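    /* the factored diagonal block is stored as its inverse, which both the solve
       routines and the multiplier computation above rely on; the shift argument
       (info->shiftinblocks) is presumably there to perturb (nearly) zero pivots
       encountered inside the block */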
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154   C->ops->solve          = MatSolve_SeqBAIJ_4;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
156   C->assembled = PETSC_TRUE;
157   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
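  /* 1.3333*64 is approximately (4/3)*4^3, the per-block estimate used here for one 4x4 inversion */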
158   PetscFunctionReturn(0);
159 }
160 
161 #undef __FUNCT__
162 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
163 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
164 {
165 /*
166     Default Version for when blocks are 4 by 4, using natural ordering
167 */
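/*
    The body below is identical to MatLUFactorNumeric_SeqBAIJ_4 above except that the
    row and column permutations are the identity, so the r[]/ic[] indirections disappear.
*/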
168   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
169   PetscErrorCode ierr;
170   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
171   PetscInt       *ajtmpold,*ajtmp,nz,row;
172   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
173   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
174   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
175   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
176   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
177   MatScalar      m13,m14,m15,m16;
178   MatScalar      *ba = b->a,*aa = a->a;
179   PetscTruth     pivotinblocks = b->pivotinblocks;
180   PetscReal      shift = info->shiftinblocks;
181 
182   PetscFunctionBegin;
183   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
184 
185   for (i=0; i<n; i++) {
186     nz    = bi[i+1] - bi[i];
187     ajtmp = bj + bi[i];
188     for  (j=0; j<nz; j++) {
189       x = rtmp+16*ajtmp[j];
190       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
191       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
192     }
193     /* load in initial (unfactored) row */
194     nz       = ai[i+1] - ai[i];
195     ajtmpold = aj + ai[i];
196     v        = aa + 16*ai[i];
197     for (j=0; j<nz; j++) {
198       x    = rtmp+16*ajtmpold[j];
199       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
200       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
201       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
202       x[14] = v[14]; x[15] = v[15];
203       v    += 16;
204     }
205     row = *ajtmp++;
206     while (row < i) {
207       pc  = rtmp + 16*row;
208       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
209       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
210       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
211       p15 = pc[14]; p16 = pc[15];
212       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
213           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
214           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
215           || p16 != 0.0) {
216         pv = ba + 16*diag_offset[row];
217         pj = bj + diag_offset[row] + 1;
218         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
219         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
220         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
221         x15 = pv[14]; x16 = pv[15];
222         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
223         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
224         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
225         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
226 
227         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
228         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
229         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
230         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
231 
232         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
233         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
234         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
235         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
236 
237         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
238         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
239         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
240         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
241         nz = bi[row+1] - diag_offset[row] - 1;
242         pv += 16;
243         for (j=0; j<nz; j++) {
244           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
245           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
246           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
247           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
248           x    = rtmp + 16*pj[j];
249           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
250           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
251           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
252           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
253 
254           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
255           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
256           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
257           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
258 
259           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
260           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
261           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
262           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
263 
264           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
265           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
266           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
267           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
268 
269           pv   += 16;
270         }
271         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
272       }
273       row = *ajtmp++;
274     }
275     /* finished row so stick it into b->a */
276     pv = ba + 16*bi[i];
277     pj = bj + bi[i];
278     nz = bi[i+1] - bi[i];
279     for (j=0; j<nz; j++) {
280       x      = rtmp+16*pj[j];
281       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
282       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
283       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
284       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
285       pv   += 16;
286     }
287     /* invert diagonal block */
288     w = ba + 16*diag_offset[i];
289     if (pivotinblocks) {
290       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
291     } else {
292       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
293     }
294   }
295 
296   ierr = PetscFree(rtmp);CHKERRQ(ierr);
297   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
298   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
299   C->assembled = PETSC_TRUE;
300   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
301   PetscFunctionReturn(0);
302 }
303 
304 
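/*
    The kernels below use packed single-precision SSE (_PS) operations, four floats at a
    time, so they are only meaningful when MatScalar is a 4-byte float; the required
    16-byte alignment of the block arrays is checked explicitly at the top of each routine.
*/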
305 #if defined(PETSC_HAVE_SSE)
306 
307 #include PETSC_HAVE_SSE
308 
309 /* SSE Version for when blocks are 4 by 4, using natural ordering */
310 #undef __FUNCT__
311 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
312 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
313 {
314   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
315   PetscErrorCode ierr;
316   int i,j,n = a->mbs;
317   int         *bj = b->j,*bjtmp,*pj;
318   int         row;
319   int         *ajtmpold,nz,*bi=b->i;
320   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
321   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
322   MatScalar   *ba = b->a,*aa = a->a;
323   int         nonzero=0;
324 /*    int            nonzero=0,colscale = 16; */
325   PetscTruth  pivotinblocks = b->pivotinblocks;
326   PetscReal      shift = info->shiftinblocks;
327 
328   PetscFunctionBegin;
329   SSE_SCOPE_BEGIN;
330 
331   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
332   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
333   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
334   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
335 /*    if ((unsigned long)bj==(unsigned long)aj) { */
336 /*      colscale = 4; */
337 /*    } */
338   for (i=0; i<n; i++) {
339     nz    = bi[i+1] - bi[i];
340     bjtmp = bj + bi[i];
341     /* zero out the 4x4 block accumulators */
342     /* zero out one register */
343     XOR_PS(XMM7,XMM7);
344     for  (j=0; j<nz; j++) {
345       x = rtmp+16*bjtmp[j];
346 /*        x = rtmp+4*bjtmp[j]; */
347       SSE_INLINE_BEGIN_1(x)
348         /* Copy zero register to memory locations */
349         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
350         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
351         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
352         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
353         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
354         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
355         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
356         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
357         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
358       SSE_INLINE_END_1;
359     }
360     /* load in initial (unfactored) row */
361     nz       = ai[i+1] - ai[i];
362     ajtmpold = aj + ai[i];
363     v        = aa + 16*ai[i];
364     for (j=0; j<nz; j++) {
365       x = rtmp+16*ajtmpold[j];
366 /*        x = rtmp+colscale*ajtmpold[j]; */
367       /* Copy v block into x block */
368       SSE_INLINE_BEGIN_2(v,x)
369         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
370         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
371         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
372 
373         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
374         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
375 
376         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
377         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
378 
379         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
380         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
381 
382         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
383         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
384 
385         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
386         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
387 
388         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
389         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
390 
391         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
392         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
393       SSE_INLINE_END_2;
394 
395       v += 16;
396     }
397 /*      row = (*bjtmp++)/4; */
398     row = *bjtmp++;
399     while (row < i) {
400       pc  = rtmp + 16*row;
401       SSE_INLINE_BEGIN_1(pc)
402         /* Load block from lower triangle */
403         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
404         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
405         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
406 
407         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
408         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
409 
410         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
411         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
412 
413         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
414         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
415 
416         /* Compare block to zero block */
417 
418         SSE_COPY_PS(XMM4,XMM7)
419         SSE_CMPNEQ_PS(XMM4,XMM0)
420 
421         SSE_COPY_PS(XMM5,XMM7)
422         SSE_CMPNEQ_PS(XMM5,XMM1)
423 
424         SSE_COPY_PS(XMM6,XMM7)
425         SSE_CMPNEQ_PS(XMM6,XMM2)
426 
427         SSE_CMPNEQ_PS(XMM7,XMM3)
428 
429         /* Reduce the comparisons to one SSE register */
430         SSE_OR_PS(XMM6,XMM7)
431         SSE_OR_PS(XMM5,XMM4)
432         SSE_OR_PS(XMM5,XMM6)
433       SSE_INLINE_END_1;
434 
435       /* Reduce the one SSE register to an integer register for branching */
436       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
437       MOVEMASK(nonzero,XMM5);
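      /* MOVEMASK packs the sign bit of each of the four lanes of XMM5 into the low bits
         of nonzero, so nonzero is set exactly when some lane of the accumulated CMPNEQ
         mask above is set; this is how an all-zero block gets skipped */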
438 
439       /* If block is nonzero ... */
440       if (nonzero) {
441         pv = ba + 16*diag_offset[row];
442         PREFETCH_L1(&pv[16]);
443         pj = bj + diag_offset[row] + 1;
444 
445         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
446         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
447         /* but the diagonal was inverted already */
448         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
449 
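        /* XMM0-XMM3 hold columns 0-3 of the block pc; each LOAD_SS/SHUFFLE pair below
           broadcasts one scalar of pv (= inv(U(row,row))) across a register, so each
           column of the product is accumulated as a sum of four scaled columns -- the
           same M = pc*inv(U(row,row)) the scalar routines above compute term by term */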
450         SSE_INLINE_BEGIN_2(pv,pc)
451           /* Column 0, product is accumulated in XMM4 */
452           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
453           SSE_SHUFFLE(XMM4,XMM4,0x00)
454           SSE_MULT_PS(XMM4,XMM0)
455 
456           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
457           SSE_SHUFFLE(XMM5,XMM5,0x00)
458           SSE_MULT_PS(XMM5,XMM1)
459           SSE_ADD_PS(XMM4,XMM5)
460 
461           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
462           SSE_SHUFFLE(XMM6,XMM6,0x00)
463           SSE_MULT_PS(XMM6,XMM2)
464           SSE_ADD_PS(XMM4,XMM6)
465 
466           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
467           SSE_SHUFFLE(XMM7,XMM7,0x00)
468           SSE_MULT_PS(XMM7,XMM3)
469           SSE_ADD_PS(XMM4,XMM7)
470 
471           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
472           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
473 
474           /* Column 1, product is accumulated in XMM5 */
475           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
476           SSE_SHUFFLE(XMM5,XMM5,0x00)
477           SSE_MULT_PS(XMM5,XMM0)
478 
479           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
480           SSE_SHUFFLE(XMM6,XMM6,0x00)
481           SSE_MULT_PS(XMM6,XMM1)
482           SSE_ADD_PS(XMM5,XMM6)
483 
484           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
485           SSE_SHUFFLE(XMM7,XMM7,0x00)
486           SSE_MULT_PS(XMM7,XMM2)
487           SSE_ADD_PS(XMM5,XMM7)
488 
489           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
490           SSE_SHUFFLE(XMM6,XMM6,0x00)
491           SSE_MULT_PS(XMM6,XMM3)
492           SSE_ADD_PS(XMM5,XMM6)
493 
494           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
495           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
496 
497           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
498 
499           /* Column 2, product is accumulated in XMM6 */
500           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
501           SSE_SHUFFLE(XMM6,XMM6,0x00)
502           SSE_MULT_PS(XMM6,XMM0)
503 
504           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
505           SSE_SHUFFLE(XMM7,XMM7,0x00)
506           SSE_MULT_PS(XMM7,XMM1)
507           SSE_ADD_PS(XMM6,XMM7)
508 
509           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
510           SSE_SHUFFLE(XMM7,XMM7,0x00)
511           SSE_MULT_PS(XMM7,XMM2)
512           SSE_ADD_PS(XMM6,XMM7)
513 
514           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
515           SSE_SHUFFLE(XMM7,XMM7,0x00)
516           SSE_MULT_PS(XMM7,XMM3)
517           SSE_ADD_PS(XMM6,XMM7)
518 
519           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
520           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
521 
522           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
523           /* Column 3, product is accumulated in XMM0 */
524           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
525           SSE_SHUFFLE(XMM7,XMM7,0x00)
526           SSE_MULT_PS(XMM0,XMM7)
527 
528           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
529           SSE_SHUFFLE(XMM7,XMM7,0x00)
530           SSE_MULT_PS(XMM1,XMM7)
531           SSE_ADD_PS(XMM0,XMM1)
532 
533           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
534           SSE_SHUFFLE(XMM1,XMM1,0x00)
535           SSE_MULT_PS(XMM1,XMM2)
536           SSE_ADD_PS(XMM0,XMM1)
537 
538           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
539           SSE_SHUFFLE(XMM7,XMM7,0x00)
540           SSE_MULT_PS(XMM3,XMM7)
541           SSE_ADD_PS(XMM0,XMM3)
542 
543           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
544           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
545 
546           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
547           /* This is code to be maintained and read by humans after all. */
548           /* Copy Multiplier Col 3 into XMM3 */
549           SSE_COPY_PS(XMM3,XMM0)
550           /* Copy Multiplier Col 2 into XMM2 */
551           SSE_COPY_PS(XMM2,XMM6)
552           /* Copy Multiplier Col 1 into XMM1 */
553           SSE_COPY_PS(XMM1,XMM5)
554           /* Copy Multiplier Col 0 into XMM0 */
555           SSE_COPY_PS(XMM0,XMM4)
556         SSE_INLINE_END_2;
557 
558         /* Update the row: */
559         nz = bi[row+1] - diag_offset[row] - 1;
560         pv += 16;
561         for (j=0; j<nz; j++) {
562           PREFETCH_L1(&pv[16]);
563           x = rtmp + 16*pj[j];
564 /*            x = rtmp + 4*pj[j]; */
565 
566           /* X:=X-M*PV, One column at a time */
567           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
568           SSE_INLINE_BEGIN_2(x,pv)
569             /* Load First Column of X*/
570             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
571             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
572 
573             /* Matrix-Vector Product: */
574             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
575             SSE_SHUFFLE(XMM5,XMM5,0x00)
576             SSE_MULT_PS(XMM5,XMM0)
577             SSE_SUB_PS(XMM4,XMM5)
578 
579             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
580             SSE_SHUFFLE(XMM6,XMM6,0x00)
581             SSE_MULT_PS(XMM6,XMM1)
582             SSE_SUB_PS(XMM4,XMM6)
583 
584             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
585             SSE_SHUFFLE(XMM7,XMM7,0x00)
586             SSE_MULT_PS(XMM7,XMM2)
587             SSE_SUB_PS(XMM4,XMM7)
588 
589             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
590             SSE_SHUFFLE(XMM5,XMM5,0x00)
591             SSE_MULT_PS(XMM5,XMM3)
592             SSE_SUB_PS(XMM4,XMM5)
593 
594             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
595             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
596 
597             /* Second Column */
598             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
599             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
600 
601             /* Matrix-Vector Product: */
602             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
603             SSE_SHUFFLE(XMM6,XMM6,0x00)
604             SSE_MULT_PS(XMM6,XMM0)
605             SSE_SUB_PS(XMM5,XMM6)
606 
607             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
608             SSE_SHUFFLE(XMM7,XMM7,0x00)
609             SSE_MULT_PS(XMM7,XMM1)
610             SSE_SUB_PS(XMM5,XMM7)
611 
612             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
613             SSE_SHUFFLE(XMM4,XMM4,0x00)
614             SSE_MULT_PS(XMM4,XMM2)
615             SSE_SUB_PS(XMM5,XMM4)
616 
617             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
618             SSE_SHUFFLE(XMM6,XMM6,0x00)
619             SSE_MULT_PS(XMM6,XMM3)
620             SSE_SUB_PS(XMM5,XMM6)
621 
622             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
623             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
624 
625             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
626 
627             /* Third Column */
628             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
629             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
630 
631             /* Matrix-Vector Product: */
632             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
633             SSE_SHUFFLE(XMM7,XMM7,0x00)
634             SSE_MULT_PS(XMM7,XMM0)
635             SSE_SUB_PS(XMM6,XMM7)
636 
637             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
638             SSE_SHUFFLE(XMM4,XMM4,0x00)
639             SSE_MULT_PS(XMM4,XMM1)
640             SSE_SUB_PS(XMM6,XMM4)
641 
642             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
643             SSE_SHUFFLE(XMM5,XMM5,0x00)
644             SSE_MULT_PS(XMM5,XMM2)
645             SSE_SUB_PS(XMM6,XMM5)
646 
647             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
648             SSE_SHUFFLE(XMM7,XMM7,0x00)
649             SSE_MULT_PS(XMM7,XMM3)
650             SSE_SUB_PS(XMM6,XMM7)
651 
652             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
653             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
654 
655             /* Fourth Column */
656             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
657             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
658 
659             /* Matrix-Vector Product: */
660             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
661             SSE_SHUFFLE(XMM5,XMM5,0x00)
662             SSE_MULT_PS(XMM5,XMM0)
663             SSE_SUB_PS(XMM4,XMM5)
664 
665             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
666             SSE_SHUFFLE(XMM6,XMM6,0x00)
667             SSE_MULT_PS(XMM6,XMM1)
668             SSE_SUB_PS(XMM4,XMM6)
669 
670             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
671             SSE_SHUFFLE(XMM7,XMM7,0x00)
672             SSE_MULT_PS(XMM7,XMM2)
673             SSE_SUB_PS(XMM4,XMM7)
674 
675             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
676             SSE_SHUFFLE(XMM5,XMM5,0x00)
677             SSE_MULT_PS(XMM5,XMM3)
678             SSE_SUB_PS(XMM4,XMM5)
679 
680             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
681             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
682           SSE_INLINE_END_2;
683           pv   += 16;
684         }
685         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
686       }
687       row = *bjtmp++;
688 /*        row = (*bjtmp++)/4; */
689     }
690     /* finished row so stick it into b->a */
691     pv = ba + 16*bi[i];
692     pj = bj + bi[i];
693     nz = bi[i+1] - bi[i];
694 
695     /* Copy x block back into pv block */
696     for (j=0; j<nz; j++) {
697       x  = rtmp+16*pj[j];
698 /*        x  = rtmp+4*pj[j]; */
699 
700       SSE_INLINE_BEGIN_2(x,pv)
701         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
702         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
703         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
704 
705         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
706         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
707 
708         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
709         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
710 
711         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
712         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
713 
714         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
715         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
716 
717         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
718         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
719 
720         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
721         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
722 
723         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
724         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
725       SSE_INLINE_END_2;
726       pv += 16;
727     }
728     /* invert diagonal block */
729     w = ba + 16*diag_offset[i];
730     if (pivotinblocks) {
731       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
732     } else {
733       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
734     }
735 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
736     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
737   }
738 
739   ierr = PetscFree(rtmp);CHKERRQ(ierr);
740   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
741   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
742   C->assembled = PETSC_TRUE;
743   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
744   /* Flop Count from inverting diagonal blocks */
745   SSE_SCOPE_END;
746   PetscFunctionReturn(0);
747 }
748 
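/*
    The _usj variants below keep the block column indices of the factor as unsigned short
    (note the casts of b->j), presumably to halve the index traffic; this limits the matrix
    to fewer than 65536 block columns.  The _Inplace variant handles the case where the
    factor shares its index arrays with A (bj == aj), as dispatched at the start of
    MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj further below.
*/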
749 #undef __FUNCT__
750 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
751 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
752 {
753   Mat            A=C;
754   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
755   PetscErrorCode ierr;
756   int i,j,n = a->mbs;
757   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
758   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
759   unsigned int   row;
760   int            nz,*bi=b->i;
761   int            *diag_offset = b->diag,*ai=a->i;
762   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
763   MatScalar      *ba = b->a,*aa = a->a;
764   int            nonzero=0;
765 /*    int            nonzero=0,colscale = 16; */
766   PetscTruth     pivotinblocks = b->pivotinblocks;
767   PetscReal      shift = 0.0; /* this in-place variant receives no MatFactorInfo, so no shift in blocks can be applied */
768 
769   PetscFunctionBegin;
770   SSE_SCOPE_BEGIN;
771 
772   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
773   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
774   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
775   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
776 /*    if ((unsigned long)bj==(unsigned long)aj) { */
777 /*      colscale = 4; */
778 /*    } */
779 
780   for (i=0; i<n; i++) {
781     nz    = bi[i+1] - bi[i];
782     bjtmp = bj + bi[i];
783     /* zero out the 4x4 block accumulators */
784     /* zero out one register */
785     XOR_PS(XMM7,XMM7);
786     for  (j=0; j<nz; j++) {
787       x = rtmp+16*((unsigned int)bjtmp[j]);
788 /*        x = rtmp+4*bjtmp[j]; */
789       SSE_INLINE_BEGIN_1(x)
790         /* Copy zero register to memory locations */
791         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
792         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
793         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
794         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
795         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
796         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
797         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
798         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
799         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
800       SSE_INLINE_END_1;
801     }
802     /* load in initial (unfactored) row */
803     nz    = ai[i+1] - ai[i];
804     ajtmp = aj + ai[i];
805     v     = aa + 16*ai[i];
806     for (j=0; j<nz; j++) {
807       x = rtmp+16*((unsigned int)ajtmp[j]);
808 /*        x = rtmp+colscale*ajtmp[j]; */
809       /* Copy v block into x block */
810       SSE_INLINE_BEGIN_2(v,x)
811         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
812         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
813         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
814 
815         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
816         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
817 
818         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
819         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
820 
821         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
822         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
823 
824         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
825         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
826 
827         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
828         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
829 
830         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
831         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
832 
833         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
834         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
835       SSE_INLINE_END_2;
836 
837       v += 16;
838     }
839 /*      row = (*bjtmp++)/4; */
840     row = (unsigned int)(*bjtmp++);
841     while (row < i) {
842       pc  = rtmp + 16*row;
843       SSE_INLINE_BEGIN_1(pc)
844         /* Load block from lower triangle */
845         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
846         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
847         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
848 
849         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
850         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
851 
852         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
853         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
854 
855         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
856         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
857 
858         /* Compare block to zero block */
859 
860         SSE_COPY_PS(XMM4,XMM7)
861         SSE_CMPNEQ_PS(XMM4,XMM0)
862 
863         SSE_COPY_PS(XMM5,XMM7)
864         SSE_CMPNEQ_PS(XMM5,XMM1)
865 
866         SSE_COPY_PS(XMM6,XMM7)
867         SSE_CMPNEQ_PS(XMM6,XMM2)
868 
869         SSE_CMPNEQ_PS(XMM7,XMM3)
870 
871         /* Reduce the comparisons to one SSE register */
872         SSE_OR_PS(XMM6,XMM7)
873         SSE_OR_PS(XMM5,XMM4)
874         SSE_OR_PS(XMM5,XMM6)
875       SSE_INLINE_END_1;
876 
877       /* Reduce the one SSE register to an integer register for branching */
878       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
879       MOVEMASK(nonzero,XMM5);
880 
881       /* If block is nonzero ... */
882       if (nonzero) {
883         pv = ba + 16*diag_offset[row];
884         PREFETCH_L1(&pv[16]);
885         pj = bj + diag_offset[row] + 1;
886 
887         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
888         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
889         /* but the diagonal was inverted already */
890         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
891 
892         SSE_INLINE_BEGIN_2(pv,pc)
893           /* Column 0, product is accumulated in XMM4 */
894           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
895           SSE_SHUFFLE(XMM4,XMM4,0x00)
896           SSE_MULT_PS(XMM4,XMM0)
897 
898           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
899           SSE_SHUFFLE(XMM5,XMM5,0x00)
900           SSE_MULT_PS(XMM5,XMM1)
901           SSE_ADD_PS(XMM4,XMM5)
902 
903           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
904           SSE_SHUFFLE(XMM6,XMM6,0x00)
905           SSE_MULT_PS(XMM6,XMM2)
906           SSE_ADD_PS(XMM4,XMM6)
907 
908           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
909           SSE_SHUFFLE(XMM7,XMM7,0x00)
910           SSE_MULT_PS(XMM7,XMM3)
911           SSE_ADD_PS(XMM4,XMM7)
912 
913           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
914           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
915 
916           /* Column 1, product is accumulated in XMM5 */
917           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
918           SSE_SHUFFLE(XMM5,XMM5,0x00)
919           SSE_MULT_PS(XMM5,XMM0)
920 
921           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
922           SSE_SHUFFLE(XMM6,XMM6,0x00)
923           SSE_MULT_PS(XMM6,XMM1)
924           SSE_ADD_PS(XMM5,XMM6)
925 
926           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
927           SSE_SHUFFLE(XMM7,XMM7,0x00)
928           SSE_MULT_PS(XMM7,XMM2)
929           SSE_ADD_PS(XMM5,XMM7)
930 
931           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
932           SSE_SHUFFLE(XMM6,XMM6,0x00)
933           SSE_MULT_PS(XMM6,XMM3)
934           SSE_ADD_PS(XMM5,XMM6)
935 
936           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
937           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
938 
939           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
940 
941           /* Column 2, product is accumulated in XMM6 */
942           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
943           SSE_SHUFFLE(XMM6,XMM6,0x00)
944           SSE_MULT_PS(XMM6,XMM0)
945 
946           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
947           SSE_SHUFFLE(XMM7,XMM7,0x00)
948           SSE_MULT_PS(XMM7,XMM1)
949           SSE_ADD_PS(XMM6,XMM7)
950 
951           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
952           SSE_SHUFFLE(XMM7,XMM7,0x00)
953           SSE_MULT_PS(XMM7,XMM2)
954           SSE_ADD_PS(XMM6,XMM7)
955 
956           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
957           SSE_SHUFFLE(XMM7,XMM7,0x00)
958           SSE_MULT_PS(XMM7,XMM3)
959           SSE_ADD_PS(XMM6,XMM7)
960 
961           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
962           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
963 
964           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
965           /* Column 3, product is accumulated in XMM0 */
966           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
967           SSE_SHUFFLE(XMM7,XMM7,0x00)
968           SSE_MULT_PS(XMM0,XMM7)
969 
970           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
971           SSE_SHUFFLE(XMM7,XMM7,0x00)
972           SSE_MULT_PS(XMM1,XMM7)
973           SSE_ADD_PS(XMM0,XMM1)
974 
975           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
976           SSE_SHUFFLE(XMM1,XMM1,0x00)
977           SSE_MULT_PS(XMM1,XMM2)
978           SSE_ADD_PS(XMM0,XMM1)
979 
980           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
981           SSE_SHUFFLE(XMM7,XMM7,0x00)
982           SSE_MULT_PS(XMM3,XMM7)
983           SSE_ADD_PS(XMM0,XMM3)
984 
985           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
986           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
987 
988           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
989           /* This is code to be maintained and read by humans after all. */
990           /* Copy Multiplier Col 3 into XMM3 */
991           SSE_COPY_PS(XMM3,XMM0)
992           /* Copy Multiplier Col 2 into XMM2 */
993           SSE_COPY_PS(XMM2,XMM6)
994           /* Copy Multiplier Col 1 into XMM1 */
995           SSE_COPY_PS(XMM1,XMM5)
996           /* Copy Multiplier Col 0 into XMM0 */
997           SSE_COPY_PS(XMM0,XMM4)
998         SSE_INLINE_END_2;
999 
1000         /* Update the row: */
1001         nz = bi[row+1] - diag_offset[row] - 1;
1002         pv += 16;
1003         for (j=0; j<nz; j++) {
1004           PREFETCH_L1(&pv[16]);
1005           x = rtmp + 16*((unsigned int)pj[j]);
1006 /*            x = rtmp + 4*pj[j]; */
1007 
1008           /* X:=X-M*PV, One column at a time */
1009           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1010           SSE_INLINE_BEGIN_2(x,pv)
1011             /* Load First Column of X*/
1012             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1013             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1014 
1015             /* Matrix-Vector Product: */
1016             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1017             SSE_SHUFFLE(XMM5,XMM5,0x00)
1018             SSE_MULT_PS(XMM5,XMM0)
1019             SSE_SUB_PS(XMM4,XMM5)
1020 
1021             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1022             SSE_SHUFFLE(XMM6,XMM6,0x00)
1023             SSE_MULT_PS(XMM6,XMM1)
1024             SSE_SUB_PS(XMM4,XMM6)
1025 
1026             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1027             SSE_SHUFFLE(XMM7,XMM7,0x00)
1028             SSE_MULT_PS(XMM7,XMM2)
1029             SSE_SUB_PS(XMM4,XMM7)
1030 
1031             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1032             SSE_SHUFFLE(XMM5,XMM5,0x00)
1033             SSE_MULT_PS(XMM5,XMM3)
1034             SSE_SUB_PS(XMM4,XMM5)
1035 
1036             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1037             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1038 
1039             /* Second Column */
1040             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1041             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1042 
1043             /* Matrix-Vector Product: */
1044             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1045             SSE_SHUFFLE(XMM6,XMM6,0x00)
1046             SSE_MULT_PS(XMM6,XMM0)
1047             SSE_SUB_PS(XMM5,XMM6)
1048 
1049             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1050             SSE_SHUFFLE(XMM7,XMM7,0x00)
1051             SSE_MULT_PS(XMM7,XMM1)
1052             SSE_SUB_PS(XMM5,XMM7)
1053 
1054             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1055             SSE_SHUFFLE(XMM4,XMM4,0x00)
1056             SSE_MULT_PS(XMM4,XMM2)
1057             SSE_SUB_PS(XMM5,XMM4)
1058 
1059             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1060             SSE_SHUFFLE(XMM6,XMM6,0x00)
1061             SSE_MULT_PS(XMM6,XMM3)
1062             SSE_SUB_PS(XMM5,XMM6)
1063 
1064             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1065             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1066 
1067             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1068 
1069             /* Third Column */
1070             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1071             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1072 
1073             /* Matrix-Vector Product: */
1074             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1075             SSE_SHUFFLE(XMM7,XMM7,0x00)
1076             SSE_MULT_PS(XMM7,XMM0)
1077             SSE_SUB_PS(XMM6,XMM7)
1078 
1079             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1080             SSE_SHUFFLE(XMM4,XMM4,0x00)
1081             SSE_MULT_PS(XMM4,XMM1)
1082             SSE_SUB_PS(XMM6,XMM4)
1083 
1084             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1085             SSE_SHUFFLE(XMM5,XMM5,0x00)
1086             SSE_MULT_PS(XMM5,XMM2)
1087             SSE_SUB_PS(XMM6,XMM5)
1088 
1089             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1090             SSE_SHUFFLE(XMM7,XMM7,0x00)
1091             SSE_MULT_PS(XMM7,XMM3)
1092             SSE_SUB_PS(XMM6,XMM7)
1093 
1094             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1095             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1096 
1097             /* Fourth Column */
1098             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1099             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1100 
1101             /* Matrix-Vector Product: */
1102             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1103             SSE_SHUFFLE(XMM5,XMM5,0x00)
1104             SSE_MULT_PS(XMM5,XMM0)
1105             SSE_SUB_PS(XMM4,XMM5)
1106 
1107             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1108             SSE_SHUFFLE(XMM6,XMM6,0x00)
1109             SSE_MULT_PS(XMM6,XMM1)
1110             SSE_SUB_PS(XMM4,XMM6)
1111 
1112             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1113             SSE_SHUFFLE(XMM7,XMM7,0x00)
1114             SSE_MULT_PS(XMM7,XMM2)
1115             SSE_SUB_PS(XMM4,XMM7)
1116 
1117             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1118             SSE_SHUFFLE(XMM5,XMM5,0x00)
1119             SSE_MULT_PS(XMM5,XMM3)
1120             SSE_SUB_PS(XMM4,XMM5)
1121 
1122             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1123             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1124           SSE_INLINE_END_2;
1125           pv   += 16;
1126         }
1127         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1128       }
1129       row = (unsigned int)(*bjtmp++);
1130 /*        row = (*bjtmp++)/4; */
1131 /*        bjtmp++; */
1132     }
1133     /* finished row so stick it into b->a */
1134     pv = ba + 16*bi[i];
1135     pj = bj + bi[i];
1136     nz = bi[i+1] - bi[i];
1137 
1138     /* Copy x block back into pv block */
1139     for (j=0; j<nz; j++) {
1140       x  = rtmp+16*((unsigned int)pj[j]);
1141 /*        x  = rtmp+4*pj[j]; */
1142 
1143       SSE_INLINE_BEGIN_2(x,pv)
1144         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1145         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1146         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1147 
1148         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1149         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1150 
1151         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1152         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1153 
1154         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1155         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1156 
1157         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1158         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1159 
1160         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1161         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1162 
1163         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1164         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1165 
1166         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1167         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1168       SSE_INLINE_END_2;
1169       pv += 16;
1170     }
1171     /* invert diagonal block */
1172     w = ba + 16*diag_offset[i];
1173     if (pivotinblocks) {
1174       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1175     } else {
1176       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1177     }
1178 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1179     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1180   }
1181 
1182   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1183   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1184   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1185   C->assembled = PETSC_TRUE;
1186   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1187   /* Flop Count from inverting diagonal blocks */
1188   SSE_SCOPE_END;
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 #undef __FUNCT__
1193 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1194 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1195 {
1196   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1197   PetscErrorCode ierr;
1198   int  i,j,n = a->mbs;
1199   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1200   unsigned int   row;
1201   int            *ajtmpold,nz,*bi=b->i;
1202   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1203   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1204   MatScalar      *ba = b->a,*aa = a->a;
1205   int            nonzero=0;
1206 /*    int            nonzero=0,colscale = 16; */
1207   PetscTruth     pivotinblocks = b->pivotinblocks;
1208   PetscReal      shift = info->shiftinblocks;
1209 
1210   PetscFunctionBegin;
1211   SSE_SCOPE_BEGIN;
1212 
1213   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1214   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1215   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1216   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1217 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1218 /*      colscale = 4; */
1219 /*    } */
1220   if ((unsigned long)bj==(unsigned long)aj) {
1221     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1222   }
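  /* when bj aliases aj the factorization is done in place on A's index set, which in this
     path is stored as unsigned short, so the _Inplace variant (which casts a->j the same
     way) is used instead */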
1223 
1224   for (i=0; i<n; i++) {
1225     nz    = bi[i+1] - bi[i];
1226     bjtmp = bj + bi[i];
1227     /* zero out the 4x4 block accumulators */
1228     /* zero out one register */
1229     XOR_PS(XMM7,XMM7);
1230     for  (j=0; j<nz; j++) {
1231       x = rtmp+16*((unsigned int)bjtmp[j]);
1232 /*        x = rtmp+4*bjtmp[j]; */
1233       SSE_INLINE_BEGIN_1(x)
1234         /* Copy zero register to memory locations */
1235         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1236         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1237         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1238         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1239         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1240         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1241         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1242         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1243         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1244       SSE_INLINE_END_1;
1245     }
1246     /* load in initial (unfactored) row */
1247     nz       = ai[i+1] - ai[i];
1248     ajtmpold = aj + ai[i];
1249     v        = aa + 16*ai[i];
1250     for (j=0; j<nz; j++) {
1251       x = rtmp+16*ajtmpold[j];
1252 /*        x = rtmp+colscale*ajtmpold[j]; */
1253       /* Copy v block into x block */
1254       SSE_INLINE_BEGIN_2(v,x)
1255         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1256         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1257         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1258 
1259         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1260         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1261 
1262         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1263         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1264 
1265         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1266         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1267 
1268         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1269         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1270 
1271         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1272         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1273 
1274         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1275         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1276 
1277         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1278         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1279       SSE_INLINE_END_2;
1280 
1281       v += 16;
1282     }
1283 /*      row = (*bjtmp++)/4; */
1284     row = (unsigned int)(*bjtmp++);
1285     while (row < i) {
1286       pc  = rtmp + 16*row;
1287       SSE_INLINE_BEGIN_1(pc)
1288         /* Load block from lower triangle */
1289         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1290         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1291         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1292 
1293         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1294         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1295 
1296         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1297         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1298 
1299         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1300         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1301 
1302         /* Compare block to zero block */
1303 
1304         SSE_COPY_PS(XMM4,XMM7)
1305         SSE_CMPNEQ_PS(XMM4,XMM0)
1306 
1307         SSE_COPY_PS(XMM5,XMM7)
1308         SSE_CMPNEQ_PS(XMM5,XMM1)
1309 
1310         SSE_COPY_PS(XMM6,XMM7)
1311         SSE_CMPNEQ_PS(XMM6,XMM2)
1312 
1313         SSE_CMPNEQ_PS(XMM7,XMM3)
1314 
1315         /* Reduce the comparisons to one SSE register */
1316         SSE_OR_PS(XMM6,XMM7)
1317         SSE_OR_PS(XMM5,XMM4)
1318         SSE_OR_PS(XMM5,XMM6)
1319       SSE_INLINE_END_1;
1320 
1321       /* Reduce the one SSE register to an integer register for branching */
1322       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1323       MOVEMASK(nonzero,XMM5);
1324 
1325       /* If block is nonzero ... */
1326       if (nonzero) {
1327         pv = ba + 16*diag_offset[row];
1328         PREFETCH_L1(&pv[16]);
1329         pj = bj + diag_offset[row] + 1;
1330 
1331         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1332         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1333         /* but the diagonal was inverted already */
1334         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1335 
1336         SSE_INLINE_BEGIN_2(pv,pc)
1337           /* Column 0, product is accumulated in XMM4 */
1338           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1339           SSE_SHUFFLE(XMM4,XMM4,0x00)
1340           SSE_MULT_PS(XMM4,XMM0)
1341 
1342           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1343           SSE_SHUFFLE(XMM5,XMM5,0x00)
1344           SSE_MULT_PS(XMM5,XMM1)
1345           SSE_ADD_PS(XMM4,XMM5)
1346 
1347           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1348           SSE_SHUFFLE(XMM6,XMM6,0x00)
1349           SSE_MULT_PS(XMM6,XMM2)
1350           SSE_ADD_PS(XMM4,XMM6)
1351 
1352           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1353           SSE_SHUFFLE(XMM7,XMM7,0x00)
1354           SSE_MULT_PS(XMM7,XMM3)
1355           SSE_ADD_PS(XMM4,XMM7)
1356 
1357           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1358           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1359 
1360           /* Column 1, product is accumulated in XMM5 */
1361           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1362           SSE_SHUFFLE(XMM5,XMM5,0x00)
1363           SSE_MULT_PS(XMM5,XMM0)
1364 
1365           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1366           SSE_SHUFFLE(XMM6,XMM6,0x00)
1367           SSE_MULT_PS(XMM6,XMM1)
1368           SSE_ADD_PS(XMM5,XMM6)
1369 
1370           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1371           SSE_SHUFFLE(XMM7,XMM7,0x00)
1372           SSE_MULT_PS(XMM7,XMM2)
1373           SSE_ADD_PS(XMM5,XMM7)
1374 
1375           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1376           SSE_SHUFFLE(XMM6,XMM6,0x00)
1377           SSE_MULT_PS(XMM6,XMM3)
1378           SSE_ADD_PS(XMM5,XMM6)
1379 
1380           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1381           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1382 
1383           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1384 
1385           /* Column 2, product is accumulated in XMM6 */
1386           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1387           SSE_SHUFFLE(XMM6,XMM6,0x00)
1388           SSE_MULT_PS(XMM6,XMM0)
1389 
1390           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1391           SSE_SHUFFLE(XMM7,XMM7,0x00)
1392           SSE_MULT_PS(XMM7,XMM1)
1393           SSE_ADD_PS(XMM6,XMM7)
1394 
1395           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1396           SSE_SHUFFLE(XMM7,XMM7,0x00)
1397           SSE_MULT_PS(XMM7,XMM2)
1398           SSE_ADD_PS(XMM6,XMM7)
1399 
1400           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1401           SSE_SHUFFLE(XMM7,XMM7,0x00)
1402           SSE_MULT_PS(XMM7,XMM3)
1403           SSE_ADD_PS(XMM6,XMM7)
1404 
1405           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1406           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1407 
1408           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1409           /* Column 3, product is accumulated in XMM0 */
1410           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1411           SSE_SHUFFLE(XMM7,XMM7,0x00)
1412           SSE_MULT_PS(XMM0,XMM7)
1413 
1414           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1415           SSE_SHUFFLE(XMM7,XMM7,0x00)
1416           SSE_MULT_PS(XMM1,XMM7)
1417           SSE_ADD_PS(XMM0,XMM1)
1418 
1419           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1420           SSE_SHUFFLE(XMM1,XMM1,0x00)
1421           SSE_MULT_PS(XMM1,XMM2)
1422           SSE_ADD_PS(XMM0,XMM1)
1423 
1424           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1425           SSE_SHUFFLE(XMM7,XMM7,0x00)
1426           SSE_MULT_PS(XMM3,XMM7)
1427           SSE_ADD_PS(XMM0,XMM3)
1428 
1429           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1430           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1431 
1432           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1433           /* This is code to be maintained and read by humans after all. */
1434           /* Copy Multiplier Col 3 into XMM3 */
1435           SSE_COPY_PS(XMM3,XMM0)
1436           /* Copy Multiplier Col 2 into XMM2 */
1437           SSE_COPY_PS(XMM2,XMM6)
1438           /* Copy Multiplier Col 1 into XMM1 */
1439           SSE_COPY_PS(XMM1,XMM5)
1440           /* Copy Multiplier Col 0 into XMM0 */
1441           SSE_COPY_PS(XMM0,XMM4)
1442         SSE_INLINE_END_2;
1443 
1444         /* Update the row: */
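             /* For each of the nz blocks to the right of the diagonal in the pivot row, */
             /* subtract M times that block from the matching block of the current row:  */
             /* the block at rtmp + 16*pj[j] gets M*pv subtracted from it.               */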
1445         nz = bi[row+1] - diag_offset[row] - 1;
1446         pv += 16;
1447         for (j=0; j<nz; j++) {
1448           PREFETCH_L1(&pv[16]);
1449           x = rtmp + 16*((unsigned int)pj[j]);
1450 /*            x = rtmp + 4*pj[j]; */
1451 
1452           /* X:=X-M*PV, One column at a time */
1453           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1454           SSE_INLINE_BEGIN_2(x,pv)
1455             /* Load First Column of X */
1456             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1457             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1458 
1459             /* Matrix-Vector Product: */
1460             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1461             SSE_SHUFFLE(XMM5,XMM5,0x00)
1462             SSE_MULT_PS(XMM5,XMM0)
1463             SSE_SUB_PS(XMM4,XMM5)
1464 
1465             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1466             SSE_SHUFFLE(XMM6,XMM6,0x00)
1467             SSE_MULT_PS(XMM6,XMM1)
1468             SSE_SUB_PS(XMM4,XMM6)
1469 
1470             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1471             SSE_SHUFFLE(XMM7,XMM7,0x00)
1472             SSE_MULT_PS(XMM7,XMM2)
1473             SSE_SUB_PS(XMM4,XMM7)
1474 
1475             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1476             SSE_SHUFFLE(XMM5,XMM5,0x00)
1477             SSE_MULT_PS(XMM5,XMM3)
1478             SSE_SUB_PS(XMM4,XMM5)
1479 
1480             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1481             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1482 
1483             /* Second Column */
1484             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1485             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1486 
1487             /* Matrix-Vector Product: */
1488             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1489             SSE_SHUFFLE(XMM6,XMM6,0x00)
1490             SSE_MULT_PS(XMM6,XMM0)
1491             SSE_SUB_PS(XMM5,XMM6)
1492 
1493             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1494             SSE_SHUFFLE(XMM7,XMM7,0x00)
1495             SSE_MULT_PS(XMM7,XMM1)
1496             SSE_SUB_PS(XMM5,XMM7)
1497 
1498             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1499             SSE_SHUFFLE(XMM4,XMM4,0x00)
1500             SSE_MULT_PS(XMM4,XMM2)
1501             SSE_SUB_PS(XMM5,XMM4)
1502 
1503             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1504             SSE_SHUFFLE(XMM6,XMM6,0x00)
1505             SSE_MULT_PS(XMM6,XMM3)
1506             SSE_SUB_PS(XMM5,XMM6)
1507 
1508             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1509             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1510 
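                 /* Prefetch 24 floats past pv (SSE_ARG_2), i.e. into the next pivot-row block, */
                 /* so the following loop iteration finds it in L1.                             */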
1511             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1512 
1513             /* Third Column */
1514             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1515             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1516 
1517             /* Matrix-Vector Product: */
1518             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1519             SSE_SHUFFLE(XMM7,XMM7,0x00)
1520             SSE_MULT_PS(XMM7,XMM0)
1521             SSE_SUB_PS(XMM6,XMM7)
1522 
1523             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1524             SSE_SHUFFLE(XMM4,XMM4,0x00)
1525             SSE_MULT_PS(XMM4,XMM1)
1526             SSE_SUB_PS(XMM6,XMM4)
1527 
1528             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1529             SSE_SHUFFLE(XMM5,XMM5,0x00)
1530             SSE_MULT_PS(XMM5,XMM2)
1531             SSE_SUB_PS(XMM6,XMM5)
1532 
1533             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1534             SSE_SHUFFLE(XMM7,XMM7,0x00)
1535             SSE_MULT_PS(XMM7,XMM3)
1536             SSE_SUB_PS(XMM6,XMM7)
1537 
1538             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1539             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1540 
1541             /* Fourth Column */
1542             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1543             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1544 
1545             /* Matrix-Vector Product: */
1546             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1547             SSE_SHUFFLE(XMM5,XMM5,0x00)
1548             SSE_MULT_PS(XMM5,XMM0)
1549             SSE_SUB_PS(XMM4,XMM5)
1550 
1551             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1552             SSE_SHUFFLE(XMM6,XMM6,0x00)
1553             SSE_MULT_PS(XMM6,XMM1)
1554             SSE_SUB_PS(XMM4,XMM6)
1555 
1556             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1557             SSE_SHUFFLE(XMM7,XMM7,0x00)
1558             SSE_MULT_PS(XMM7,XMM2)
1559             SSE_SUB_PS(XMM4,XMM7)
1560 
1561             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1562             SSE_SHUFFLE(XMM5,XMM5,0x00)
1563             SSE_MULT_PS(XMM5,XMM3)
1564             SSE_SUB_PS(XMM4,XMM5)
1565 
1566             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1567             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1568           SSE_INLINE_END_2;
1569           pv   += 16;
1570         }
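             /* 112 flops (64 mult + 48 add) form the 4x4 multiplier block; each of the nz */
             /* updated blocks costs another 128 flops (64 mult + 64 sub).                 */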
1571         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1572       }
1573       row = (unsigned int)(*bjtmp++);
1574 /*        row = (*bjtmp++)/4; */
1575 /*        bjtmp++; */
1576     }
1577     /* finished row so stick it into b->a */
1578     pv = ba + 16*bi[i];
1579     pj = bj + bi[i];
1580     nz = bi[i+1] - bi[i];
1581 
1582     /* Copy x block back into pv block */
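         /* Each 4x4 block (16 values) is moved with four LOADL/STOREL and four LOADH/STOREH pairs, two values at a time. */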
1583     for (j=0; j<nz; j++) {
1584       x  = rtmp+16*((unsigned int)pj[j]);
1585 /*        x  = rtmp+4*pj[j]; */
1586 
1587       SSE_INLINE_BEGIN_2(x,pv)
1588         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1589         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1590         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1591 
1592         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1593         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1594 
1595         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1596         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1597 
1598         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1599         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1600 
1601         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1602         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1603 
1604         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1605         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1606 
1607         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1608         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1609 
1610         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1611         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1612       SSE_INLINE_END_2;
1613       pv += 16;
1614     }
1615     /* invert diagonal block */
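         /* The diagonal block is replaced by its inverse in place, so later elimination */
         /* steps and the triangular solves multiply by it rather than dividing.         */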
1616     w = ba + 16*diag_offset[i];
1617     if (pivotinblocks) {
1618       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1619     } else {
1620       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1621     }
1622 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1623     /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
1624   }
1625 
1626   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1627   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1628   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1629   C->assembled = PETSC_TRUE;
1630   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1631   /* The flop count above is from inverting the diagonal blocks */
1632   SSE_SCOPE_END;
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 #endif
1637