xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision c05d87d6c2fa1d0952fe676de20339d09daa79bc)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 4 by 4
12 */
13 #undef __FUNCT__
14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
16 {
17   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18   IS             isrow = b->row,isicol = b->icol;
19   PetscErrorCode ierr;
20   const PetscInt *r,*ic;
21   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22   PetscInt       *ajtmpold,*ajtmp,nz,row;
23   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
25   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28   MatScalar      m13,m14,m15,m16;
29   MatScalar      *ba = b->a,*aa = a->a;
30   PetscTruth     pivotinblocks = b->pivotinblocks;
31   PetscReal      shift = info->shiftinblocks;
32 
33   PetscFunctionBegin;
34   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
37 
38   for (i=0; i<n; i++) {
39     nz    = bi[i+1] - bi[i];
40     ajtmp = bj + bi[i];
41     for  (j=0; j<nz; j++) {
42       x = rtmp+16*ajtmp[j];
43       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
44       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45     }
46     /* load in initial (unfactored row) */
47     idx      = r[i];
48     nz       = ai[idx+1] - ai[idx];
49     ajtmpold = aj + ai[idx];
50     v        = aa + 16*ai[idx];
51     for (j=0; j<nz; j++) {
52       x    = rtmp+16*ic[ajtmpold[j]];
53       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
54       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
55       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56       x[14] = v[14]; x[15] = v[15];
57       v    += 16;
58     }
59     row = *ajtmp++;
60     while (row < i) {
61       pc  = rtmp + 16*row;
62       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
63       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
64       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65       p15 = pc[14]; p16 = pc[15];
66       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69           || p16 != 0.0) {
70         pv = ba + 16*diag_offset[row];
71         pj = bj + diag_offset[row] + 1;
72         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
73         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
74         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75         x15 = pv[14]; x16 = pv[15];
76         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
77         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
78         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
79         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
80 
81         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
82         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
83         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
84         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
85 
86         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
87         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
88         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
89         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
90 
91         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
92         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
93         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
94         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
95 
96         nz = bi[row+1] - diag_offset[row] - 1;
97         pv += 16;
98         for (j=0; j<nz; j++) {
99           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103           x    = rtmp + 16*pj[j];
104           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
108 
109           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
113 
114           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118 
119           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
123 
124           pv   += 16;
125         }
126         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
127       }
128       row = *ajtmp++;
129     }
130     /* finished row so stick it into b->a */
131     pv = ba + 16*bi[i];
132     pj = bj + bi[i];
133     nz = bi[i+1] - bi[i];
134     for (j=0; j<nz; j++) {
135       x      = rtmp+16*pj[j];
136       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140       pv   += 16;
141     }
142     /* invert diagonal block */
143     w    = ba + 16*diag_offset[i];
144     if (pivotinblocks) {
145       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146     } else {
147       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154   C->ops->solve          = MatSolve_SeqBAIJ_4;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
156   C->assembled = PETSC_TRUE;
157   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
158   PetscFunctionReturn(0);
159 }
160 
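/*
   A sketch of the flop accounting used above for bs = 4:
     - forming the multiplier  pc = pc * inv(U(row,row))  is one 4x4 matrix-matrix
       product: 16 entries * (4 multiplies + 3 adds) = 112 flops;
     - each of the nz trailing updates  x -= m * pv  costs 16 entries *
       (4 multiplies + 4 adds/subtracts) = 128 flops;
   hence the PetscLogFlops(128.0*nz+112.0) per eliminated block, while the final
   PetscLogFlops(1.3333*64*b->mbs) approximates the (4/3)*bs^3 flops needed to
   invert each 4x4 diagonal block.
*/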
161 /* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
162      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(), with the following kernels manually re-implemented for the 4x4 block size:
163        Kernel_A_gets_A_times_B()
164        Kernel_A_gets_A_minus_B_times_C()
165        Kernel_A_gets_inverse_A()
166 */
167 
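/*
   For reference, a minimal scalar sketch of what the two 4x4 update kernels named
   above compute, using the same column-major layout of each 16-entry block as the
   unrolled code in this file.  The Sketch_* names are illustrative only and the
   block is kept out of the build:
*/
#if 0
static void Sketch_A_gets_A_times_B_4(MatScalar *a,const MatScalar *b,MatScalar *w)
{
  PetscInt i,j,k;
  for (i=0; i<16; i++) w[i] = a[i];                 /* w plays the workspace role of mwork */
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += w[i+4*k]*b[k+4*j];   /* A(i,k)*B(k,j), column-major blocks */
      a[i+4*j] = s;
    }
  }
}
static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *a,const MatScalar *b,const MatScalar *c)
{
  PetscInt i,j,k;
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      for (k=0; k<4; k++) a[i+4*j] -= b[i+4*k]*c[k+4*j];  /* A(i,j) -= B(i,k)*C(k,j) */
    }
  }
}
#endif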
168 #undef __FUNCT__
169 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
171 {
172   Mat            C=B;
173   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
174   IS             isrow = b->row,isicol = b->icol;
175   PetscErrorCode ierr;
176   const PetscInt *r,*ic,*ics;
177   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
178   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
179   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
180   PetscInt       bs2 = a->bs2,flg;
181   PetscReal      shift = info->shiftinblocks;
182 
183   PetscFunctionBegin;
184   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
185   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
186 
187   /* generate work space needed by the factorization */
188   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
189   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
190   ics  = ic;
191 
192   for (i=0; i<n; i++){
193     /* zero rtmp */
194     /* L part */
195     nz    = bi[i+1] - bi[i];
196     bjtmp = bj + bi[i];
197     for  (j=0; j<nz; j++){
198       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
199     }
200 
201     /* U part */
202     nz = bdiag[i]-bdiag[i+1];
203     bjtmp = bj + bdiag[i+1]+1;
204     for  (j=0; j<nz; j++){
205       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
206     }
207 
208     /* load in initial (unfactored row) */
209     nz    = ai[r[i]+1] - ai[r[i]];
210     ajtmp = aj + ai[r[i]];
211     v     = aa + bs2*ai[r[i]];
212     for (j=0; j<nz; j++) {
213       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
214     }
215 
216     /* elimination */
217     bjtmp = bj + bi[i];
218     nzL   = bi[i+1] - bi[i];
219     for(k=0;k < nzL;k++) {
220       row = bjtmp[k];
221       pc = rtmp + bs2*row;
222       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
223       if (flg) {
224         pv = b->a + bs2*bdiag[row];
225         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
226         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
227 
228         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
229         pv = b->a + bs2*(bdiag[row+1]+1);
230         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
231         for (j=0; j<nz; j++) {
232           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
233           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
234           v    = rtmp + bs2*pj[j];
235           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
236           pv  += bs2;
237         }
238         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
239       }
240     }
241 
242     /* finished row so stick it into b->a */
243     /* L part */
244     pv   = b->a + bs2*bi[i] ;
245     pj   = b->j + bi[i] ;
246     nz   = bi[i+1] - bi[i];
247     for (j=0; j<nz; j++) {
248       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
249     }
250 
251     /* Mark diagonal and invert diagonal for simpler triangular solves */
252     pv   = b->a + bs2*bdiag[i];
253     pj   = b->j + bdiag[i];
254     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
255     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
256     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
257 
258     /* U part */
259     pv = b->a + bs2*(bdiag[i+1]+1);
260     pj = b->j + bdiag[i+1]+1;
261     nz = bdiag[i] - bdiag[i+1] - 1;
262     for (j=0; j<nz; j++){
263       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
264     }
265   }
266 
267   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
268   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
269   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
270 
271   C->assembled = PETSC_TRUE;
272   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
273   PetscFunctionReturn(0);
274 }
275 
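/*
   A sketch (illustrative only, not part of the factorization path) of how the
   "newdatastruct" factor is addressed above: row i of L lives at bj[bi[i]..bi[i+1])
   in increasing order, while row i of U is reached through b->diag, which decreases
   with i: bdiag[i] points at the (already inverted) diagonal block and the strictly
   upper part occupies positions bdiag[i+1]+1 .. bdiag[i]-1.
*/
#if 0
static PetscErrorCode Sketch_TraverseFactorRow(Mat_SeqBAIJ *b,PetscInt i)
{
  const PetscInt *bi = b->i,*bj = b->j,*bdiag = b->diag;
  PetscInt       j;
  PetscFunctionBegin;
  for (j=bi[i]; j<bi[i+1]; j++) {
    /* L(i,bj[j]) block, values at b->a + b->bs2*j */
  }
  /* inv(U(i,i)) block, values at b->a + b->bs2*bdiag[i] */
  for (j=bdiag[i+1]+1; j<bdiag[i]; j++) {
    /* U(i,bj[j]) block, values at b->a + b->bs2*j */
  }
  PetscFunctionReturn(0);
}
#endif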
276 #undef __FUNCT__
277 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
278 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
279 {
280 /*
281     Default Version for when blocks are 4 by 4 Using natural ordering
282 */
283   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
284   PetscErrorCode ierr;
285   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
286   PetscInt       *ajtmpold,*ajtmp,nz,row;
287   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
288   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
289   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
290   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
291   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
292   MatScalar      m13,m14,m15,m16;
293   MatScalar      *ba = b->a,*aa = a->a;
294   PetscTruth     pivotinblocks = b->pivotinblocks;
295   PetscReal      shift = info->shiftinblocks;
296 
297   PetscFunctionBegin;
298   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
299 
300   for (i=0; i<n; i++) {
301     nz    = bi[i+1] - bi[i];
302     ajtmp = bj + bi[i];
303     for  (j=0; j<nz; j++) {
304       x = rtmp+16*ajtmp[j];
305       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
306       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
307     }
308     /* load in initial (unfactored row) */
309     nz       = ai[i+1] - ai[i];
310     ajtmpold = aj + ai[i];
311     v        = aa + 16*ai[i];
312     for (j=0; j<nz; j++) {
313       x    = rtmp+16*ajtmpold[j];
314       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
315       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
316       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
317       x[14] = v[14]; x[15] = v[15];
318       v    += 16;
319     }
320     row = *ajtmp++;
321     while (row < i) {
322       pc  = rtmp + 16*row;
323       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
324       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
325       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
326       p15 = pc[14]; p16 = pc[15];
327       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
328           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
329           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
330           || p16 != 0.0) {
331         pv = ba + 16*diag_offset[row];
332         pj = bj + diag_offset[row] + 1;
333         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
334         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
335         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
336         x15 = pv[14]; x16 = pv[15];
337         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
338         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
339         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
340         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
341 
342         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
343         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
344         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
345         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
346 
347         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
348         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
349         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
350         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
351 
352         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
353         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
354         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
355         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
356         nz = bi[row+1] - diag_offset[row] - 1;
357         pv += 16;
358         for (j=0; j<nz; j++) {
359           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
360           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
361           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
362           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
363           x    = rtmp + 16*pj[j];
364           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
365           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
366           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
367           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
368 
369           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
370           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
371           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
372           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
373 
374           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
375           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
376           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
377           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
378 
379           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
380           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
381           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
382           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
383 
384           pv   += 16;
385         }
386         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
387       }
388       row = *ajtmp++;
389     }
390     /* finished row so stick it into b->a */
391     pv = ba + 16*bi[i];
392     pj = bj + bi[i];
393     nz = bi[i+1] - bi[i];
394     for (j=0; j<nz; j++) {
395       x      = rtmp+16*pj[j];
396       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
397       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
398       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
399       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
400       pv   += 16;
401     }
402     /* invert diagonal block */
403     w = ba + 16*diag_offset[i];
404     if (pivotinblocks) {
405       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
406     } else {
407       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
408     }
409   }
410 
411   ierr = PetscFree(rtmp);CHKERRQ(ierr);
412   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
413   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
414   C->assembled = PETSC_TRUE;
415   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
416   PetscFunctionReturn(0);
417 }
418 
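/*
   In the _NaturalOrdering variants below the row permutation is the identity, so the
   index-set indirection used above collapses; e.g. the row load in the permuted version,
       nz = ai[r[i]+1] - ai[r[i]];  v = aa + bs2*ai[r[i]];
   becomes simply
       nz = ai[i+1] - ai[i];        v = aa + bs2*ai[i];
   which is why these versions never call ISGetIndices().
*/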
419 /*
420   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
421     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
422 */
423 #undef __FUNCT__
424 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
425 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
426 {
427   Mat            C=B;
428   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
429   PetscErrorCode ierr;
430   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
431   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
432   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
433   PetscInt       bs2 = a->bs2,flg;
434   PetscReal      shift = info->shiftinblocks;
435 
436   PetscFunctionBegin;
437   /* generate work space needed by the factorization */
438   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
439   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
440 
441   for (i=0; i<n; i++){
442     /* zero rtmp */
443     /* L part */
444     nz    = bi[i+1] - bi[i];
445     bjtmp = bj + bi[i];
446     for  (j=0; j<nz; j++){
447       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
448     }
449 
450     /* U part */
451     nz = bdiag[i] - bdiag[i+1];
452     bjtmp = bj + bdiag[i+1]+1;
453     for  (j=0; j<nz; j++){
454       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
455     }
456 
457     /* load in initial (unfactored row) */
458     nz    = ai[i+1] - ai[i];
459     ajtmp = aj + ai[i];
460     v     = aa + bs2*ai[i];
461     for (j=0; j<nz; j++) {
462       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
463     }
464 
465     /* elimination */
466     bjtmp = bj + bi[i];
467     nzL   = bi[i+1] - bi[i];
468     for(k=0;k < nzL;k++) {
469       row = bjtmp[k];
470       pc = rtmp + bs2*row;
471       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
472       if (flg) {
473         pv = b->a + bs2*bdiag[row];
474         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
475         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
476 
477         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
478         pv = b->a + bs2*(bdiag[row+1]+1);
479         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
480         for (j=0; j<nz; j++) {
481           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
482           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
483           v    = rtmp + bs2*pj[j];
484           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
485           pv  += bs2;
486         }
487         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
488       }
489     }
490 
491     /* finished row so stick it into b->a */
492     /* L part */
493     pv   = b->a + bs2*bi[i] ;
494     pj   = b->j + bi[i] ;
495     nz   = bi[i+1] - bi[i];
496     for (j=0; j<nz; j++) {
497       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
498     }
499 
500     /* Mark diagonal and invert diagonal for simpler triangular solves */
501     pv   = b->a + bs2*bdiag[i];
502     pj   = b->j + bdiag[i];
503     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
504     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
505     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
506 
507     /* U part */
508     pv = b->a + bs2*(bdiag[i+1]+1);
509     pj = b->j + bdiag[i+1]+1;
510     nz = bdiag[i] - bdiag[i+1] - 1;
511     for (j=0; j<nz; j++){
512       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
513     }
514   }
515 
516   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
517   C->assembled = PETSC_TRUE;
518   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
519   PetscFunctionReturn(0);
520 }
521 
522 #if defined(PETSC_HAVE_SSE)
523 
524 #include PETSC_HAVE_SSE
525 
526 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
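/*
   The SSE code below keeps each 4x4 block as four packed single-precision columns in
   XMM0-XMM3 (this path assumes a 4-byte MatScalar, which the _PS packed-single
   intrinsics require).  LOAD_SS + SHUFFLE broadcasts one scalar of the right-hand
   block across a register, MULT_PS scales a whole column, and ADD_PS/SUB_PS
   accumulates.  A scalar sketch of one such column update (illustrative only,
   kept out of the build):
*/
#if 0
static void Sketch_SSEColumnUpdate_4(float xcol[4],const float m0[4],const float m1[4],
                                     const float m2[4],const float m3[4],const float pvcol[4])
{
  int i;
  for (i=0; i<4; i++) {
    /* x(:,j) -= M(:,0)*pv(0,j) + M(:,1)*pv(1,j) + M(:,2)*pv(2,j) + M(:,3)*pv(3,j) */
    xcol[i] -= m0[i]*pvcol[0] + m1[i]*pvcol[1] + m2[i]*pvcol[2] + m3[i]*pvcol[3];
  }
}
#endif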
527 #undef __FUNCT__
528 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
529 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
530 {
531   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
532   PetscErrorCode ierr;
533   int i,j,n = a->mbs;
534   int         *bj = b->j,*bjtmp,*pj;
535   int         row;
536   int         *ajtmpold,nz,*bi=b->i;
537   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
538   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
539   MatScalar   *ba = b->a,*aa = a->a;
540   int         nonzero=0;
541 /*    int            nonzero=0,colscale = 16; */
542   PetscTruth  pivotinblocks = b->pivotinblocks;
543   PetscReal      shift = info->shiftinblocks;
544 
545   PetscFunctionBegin;
546   SSE_SCOPE_BEGIN;
547 
548   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
549   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
550   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
551   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
552 /*    if ((unsigned long)bj==(unsigned long)aj) { */
553 /*      colscale = 4; */
554 /*    } */
555   for (i=0; i<n; i++) {
556     nz    = bi[i+1] - bi[i];
557     bjtmp = bj + bi[i];
558     /* zero out the 4x4 block accumulators */
559     /* zero out one register */
560     XOR_PS(XMM7,XMM7);
561     for  (j=0; j<nz; j++) {
562       x = rtmp+16*bjtmp[j];
563 /*        x = rtmp+4*bjtmp[j]; */
564       SSE_INLINE_BEGIN_1(x)
565         /* Copy zero register to memory locations */
566         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
567         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
568         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
569         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
570         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
571         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
572         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
573         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
574         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
575       SSE_INLINE_END_1;
576     }
577     /* load in initial (unfactored row) */
578     nz       = ai[i+1] - ai[i];
579     ajtmpold = aj + ai[i];
580     v        = aa + 16*ai[i];
581     for (j=0; j<nz; j++) {
582       x = rtmp+16*ajtmpold[j];
583 /*        x = rtmp+colscale*ajtmpold[j]; */
584       /* Copy v block into x block */
585       SSE_INLINE_BEGIN_2(v,x)
586         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
587         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
588         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
589 
590         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
591         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
592 
593         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
594         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
595 
596         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
597         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
598 
599         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
600         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
601 
602         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
603         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
604 
605         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
606         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
607 
608         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
609         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
610       SSE_INLINE_END_2;
611 
612       v += 16;
613     }
614 /*      row = (*bjtmp++)/4; */
615     row = *bjtmp++;
616     while (row < i) {
617       pc  = rtmp + 16*row;
618       SSE_INLINE_BEGIN_1(pc)
619         /* Load block from lower triangle */
620         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
621         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
622         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
623 
624         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
625         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
626 
627         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
628         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
629 
630         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
631         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
632 
633         /* Compare block to zero block */
634 
635         SSE_COPY_PS(XMM4,XMM7)
636         SSE_CMPNEQ_PS(XMM4,XMM0)
637 
638         SSE_COPY_PS(XMM5,XMM7)
639         SSE_CMPNEQ_PS(XMM5,XMM1)
640 
641         SSE_COPY_PS(XMM6,XMM7)
642         SSE_CMPNEQ_PS(XMM6,XMM2)
643 
644         SSE_CMPNEQ_PS(XMM7,XMM3)
645 
646         /* Reduce the comparisons to one SSE register */
647         SSE_OR_PS(XMM6,XMM7)
648         SSE_OR_PS(XMM5,XMM4)
649         SSE_OR_PS(XMM5,XMM6)
650       SSE_INLINE_END_1;
651 
652       /* Reduce the one SSE register to an integer register for branching */
653       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
654       MOVEMASK(nonzero,XMM5);
655 
656       /* If block is nonzero ... */
657       if (nonzero) {
658         pv = ba + 16*diag_offset[row];
659         PREFETCH_L1(&pv[16]);
660         pj = bj + diag_offset[row] + 1;
661 
662         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
663         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
664         /* but the diagonal was inverted already */
665         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
666 
667         SSE_INLINE_BEGIN_2(pv,pc)
668           /* Column 0, product is accumulated in XMM4 */
669           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
670           SSE_SHUFFLE(XMM4,XMM4,0x00)
671           SSE_MULT_PS(XMM4,XMM0)
672 
673           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
674           SSE_SHUFFLE(XMM5,XMM5,0x00)
675           SSE_MULT_PS(XMM5,XMM1)
676           SSE_ADD_PS(XMM4,XMM5)
677 
678           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
679           SSE_SHUFFLE(XMM6,XMM6,0x00)
680           SSE_MULT_PS(XMM6,XMM2)
681           SSE_ADD_PS(XMM4,XMM6)
682 
683           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
684           SSE_SHUFFLE(XMM7,XMM7,0x00)
685           SSE_MULT_PS(XMM7,XMM3)
686           SSE_ADD_PS(XMM4,XMM7)
687 
688           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
689           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
690 
691           /* Column 1, product is accumulated in XMM5 */
692           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
693           SSE_SHUFFLE(XMM5,XMM5,0x00)
694           SSE_MULT_PS(XMM5,XMM0)
695 
696           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
697           SSE_SHUFFLE(XMM6,XMM6,0x00)
698           SSE_MULT_PS(XMM6,XMM1)
699           SSE_ADD_PS(XMM5,XMM6)
700 
701           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
702           SSE_SHUFFLE(XMM7,XMM7,0x00)
703           SSE_MULT_PS(XMM7,XMM2)
704           SSE_ADD_PS(XMM5,XMM7)
705 
706           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
707           SSE_SHUFFLE(XMM6,XMM6,0x00)
708           SSE_MULT_PS(XMM6,XMM3)
709           SSE_ADD_PS(XMM5,XMM6)
710 
711           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
712           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
713 
714           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
715 
716           /* Column 2, product is accumulated in XMM6 */
717           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
718           SSE_SHUFFLE(XMM6,XMM6,0x00)
719           SSE_MULT_PS(XMM6,XMM0)
720 
721           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
722           SSE_SHUFFLE(XMM7,XMM7,0x00)
723           SSE_MULT_PS(XMM7,XMM1)
724           SSE_ADD_PS(XMM6,XMM7)
725 
726           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
727           SSE_SHUFFLE(XMM7,XMM7,0x00)
728           SSE_MULT_PS(XMM7,XMM2)
729           SSE_ADD_PS(XMM6,XMM7)
730 
731           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
732           SSE_SHUFFLE(XMM7,XMM7,0x00)
733           SSE_MULT_PS(XMM7,XMM3)
734           SSE_ADD_PS(XMM6,XMM7)
735 
736           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
737           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
738 
739           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
740           /* Column 3, product is accumulated in XMM0 */
741           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
742           SSE_SHUFFLE(XMM7,XMM7,0x00)
743           SSE_MULT_PS(XMM0,XMM7)
744 
745           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
746           SSE_SHUFFLE(XMM7,XMM7,0x00)
747           SSE_MULT_PS(XMM1,XMM7)
748           SSE_ADD_PS(XMM0,XMM1)
749 
750           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
751           SSE_SHUFFLE(XMM1,XMM1,0x00)
752           SSE_MULT_PS(XMM1,XMM2)
753           SSE_ADD_PS(XMM0,XMM1)
754 
755           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
756           SSE_SHUFFLE(XMM7,XMM7,0x00)
757           SSE_MULT_PS(XMM3,XMM7)
758           SSE_ADD_PS(XMM0,XMM3)
759 
760           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
761           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
762 
763           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
764           /* This is code to be maintained and read by humans after all. */
765           /* Copy Multiplier Col 3 into XMM3 */
766           SSE_COPY_PS(XMM3,XMM0)
767           /* Copy Multiplier Col 2 into XMM2 */
768           SSE_COPY_PS(XMM2,XMM6)
769           /* Copy Multiplier Col 1 into XMM1 */
770           SSE_COPY_PS(XMM1,XMM5)
771           /* Copy Multiplier Col 0 into XMM0 */
772           SSE_COPY_PS(XMM0,XMM4)
773         SSE_INLINE_END_2;
774 
775         /* Update the row: */
776         nz = bi[row+1] - diag_offset[row] - 1;
777         pv += 16;
778         for (j=0; j<nz; j++) {
779           PREFETCH_L1(&pv[16]);
780           x = rtmp + 16*pj[j];
781 /*            x = rtmp + 4*pj[j]; */
782 
783           /* X:=X-M*PV, One column at a time */
784           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
785           SSE_INLINE_BEGIN_2(x,pv)
786             /* Load First Column of X*/
787             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
788             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
789 
790             /* Matrix-Vector Product: */
791             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
792             SSE_SHUFFLE(XMM5,XMM5,0x00)
793             SSE_MULT_PS(XMM5,XMM0)
794             SSE_SUB_PS(XMM4,XMM5)
795 
796             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
797             SSE_SHUFFLE(XMM6,XMM6,0x00)
798             SSE_MULT_PS(XMM6,XMM1)
799             SSE_SUB_PS(XMM4,XMM6)
800 
801             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
802             SSE_SHUFFLE(XMM7,XMM7,0x00)
803             SSE_MULT_PS(XMM7,XMM2)
804             SSE_SUB_PS(XMM4,XMM7)
805 
806             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
807             SSE_SHUFFLE(XMM5,XMM5,0x00)
808             SSE_MULT_PS(XMM5,XMM3)
809             SSE_SUB_PS(XMM4,XMM5)
810 
811             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
812             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
813 
814             /* Second Column */
815             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
816             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
817 
818             /* Matrix-Vector Product: */
819             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
820             SSE_SHUFFLE(XMM6,XMM6,0x00)
821             SSE_MULT_PS(XMM6,XMM0)
822             SSE_SUB_PS(XMM5,XMM6)
823 
824             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
825             SSE_SHUFFLE(XMM7,XMM7,0x00)
826             SSE_MULT_PS(XMM7,XMM1)
827             SSE_SUB_PS(XMM5,XMM7)
828 
829             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
830             SSE_SHUFFLE(XMM4,XMM4,0x00)
831             SSE_MULT_PS(XMM4,XMM2)
832             SSE_SUB_PS(XMM5,XMM4)
833 
834             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
835             SSE_SHUFFLE(XMM6,XMM6,0x00)
836             SSE_MULT_PS(XMM6,XMM3)
837             SSE_SUB_PS(XMM5,XMM6)
838 
839             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
840             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
841 
842             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
843 
844             /* Third Column */
845             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
846             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
847 
848             /* Matrix-Vector Product: */
849             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
850             SSE_SHUFFLE(XMM7,XMM7,0x00)
851             SSE_MULT_PS(XMM7,XMM0)
852             SSE_SUB_PS(XMM6,XMM7)
853 
854             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
855             SSE_SHUFFLE(XMM4,XMM4,0x00)
856             SSE_MULT_PS(XMM4,XMM1)
857             SSE_SUB_PS(XMM6,XMM4)
858 
859             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
860             SSE_SHUFFLE(XMM5,XMM5,0x00)
861             SSE_MULT_PS(XMM5,XMM2)
862             SSE_SUB_PS(XMM6,XMM5)
863 
864             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
865             SSE_SHUFFLE(XMM7,XMM7,0x00)
866             SSE_MULT_PS(XMM7,XMM3)
867             SSE_SUB_PS(XMM6,XMM7)
868 
869             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
870             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
871 
872             /* Fourth Column */
873             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
874             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
875 
876             /* Matrix-Vector Product: */
877             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
878             SSE_SHUFFLE(XMM5,XMM5,0x00)
879             SSE_MULT_PS(XMM5,XMM0)
880             SSE_SUB_PS(XMM4,XMM5)
881 
882             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
883             SSE_SHUFFLE(XMM6,XMM6,0x00)
884             SSE_MULT_PS(XMM6,XMM1)
885             SSE_SUB_PS(XMM4,XMM6)
886 
887             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
888             SSE_SHUFFLE(XMM7,XMM7,0x00)
889             SSE_MULT_PS(XMM7,XMM2)
890             SSE_SUB_PS(XMM4,XMM7)
891 
892             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
893             SSE_SHUFFLE(XMM5,XMM5,0x00)
894             SSE_MULT_PS(XMM5,XMM3)
895             SSE_SUB_PS(XMM4,XMM5)
896 
897             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
898             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
899           SSE_INLINE_END_2;
900           pv   += 16;
901         }
902         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
903       }
904       row = *bjtmp++;
905 /*        row = (*bjtmp++)/4; */
906     }
907     /* finished row so stick it into b->a */
908     pv = ba + 16*bi[i];
909     pj = bj + bi[i];
910     nz = bi[i+1] - bi[i];
911 
912     /* Copy x block back into pv block */
913     for (j=0; j<nz; j++) {
914       x  = rtmp+16*pj[j];
915 /*        x  = rtmp+4*pj[j]; */
916 
917       SSE_INLINE_BEGIN_2(x,pv)
918         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
919         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
920         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
921 
922         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
923         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
924 
925         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
926         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
927 
928         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
929         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
930 
931         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
932         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
933 
934         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
935         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
936 
937         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
938         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
939 
940         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
941         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
942       SSE_INLINE_END_2;
943       pv += 16;
944     }
945     /* invert diagonal block */
946     w = ba + 16*diag_offset[i];
947     if (pivotinblocks) {
948       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
949     } else {
950       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
951     }
952 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
953     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
954   }
955 
956   ierr = PetscFree(rtmp);CHKERRQ(ierr);
957   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
958   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
959   C->assembled = PETSC_TRUE;
960   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
961   /* Flop Count from inverting diagonal blocks */
962   SSE_SCOPE_END;
963   PetscFunctionReturn(0);
964 }
965 
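/*
   The _usj variants below read the factor's block-column indices through an unsigned
   short pointer; presumably the symbolic factorization packed b->j as 16-bit values,
   so this path is only usable while every block index fits in 16 bits.  A hypothetical
   guard (illustrative only, kept out of the build):
*/
#if 0
static PetscErrorCode Sketch_CheckUsjIndexRange(Mat A)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
  PetscFunctionBegin;
  if (a->mbs > 65536) SETERRQ(PETSC_ERR_SUP,"Too many block rows for unsigned short column indices");
  PetscFunctionReturn(0);
}
#endif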
966 #undef __FUNCT__
967 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
968 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
969 {
970   Mat            A=C;
971   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
972   PetscErrorCode ierr;
973   int i,j,n = a->mbs;
974   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
975   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
976   unsigned int   row;
977   int            nz,*bi=b->i;
978   int            *diag_offset = b->diag,*ai=a->i;
979   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
980   MatScalar      *ba = b->a,*aa = a->a;
981   int            nonzero=0;
982 /*    int            nonzero=0,colscale = 16; */
983   PetscTruth     pivotinblocks = b->pivotinblocks;
984   PetscReal      shift = info->shiftinblocks;
985 
986   PetscFunctionBegin;
987   SSE_SCOPE_BEGIN;
988 
989   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
990   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
991   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
992   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
993 /*    if ((unsigned long)bj==(unsigned long)aj) { */
994 /*      colscale = 4; */
995 /*    } */
996 
997   for (i=0; i<n; i++) {
998     nz    = bi[i+1] - bi[i];
999     bjtmp = bj + bi[i];
1000     /* zero out the 4x4 block accumulators */
1001     /* zero out one register */
1002     XOR_PS(XMM7,XMM7);
1003     for  (j=0; j<nz; j++) {
1004       x = rtmp+16*((unsigned int)bjtmp[j]);
1005 /*        x = rtmp+4*bjtmp[j]; */
1006       SSE_INLINE_BEGIN_1(x)
1007         /* Copy zero register to memory locations */
1008         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1009         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1010         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1011         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1012         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1013         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1014         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1015         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1016         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1017       SSE_INLINE_END_1;
1018     }
1019     /* load in initial (unfactored row) */
1020     nz    = ai[i+1] - ai[i];
1021     ajtmp = aj + ai[i];
1022     v     = aa + 16*ai[i];
1023     for (j=0; j<nz; j++) {
1024       x = rtmp+16*((unsigned int)ajtmp[j]);
1025 /*        x = rtmp+colscale*ajtmp[j]; */
1026       /* Copy v block into x block */
1027       SSE_INLINE_BEGIN_2(v,x)
1028         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1029         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1030         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1031 
1032         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1033         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1034 
1035         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1036         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1037 
1038         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1039         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1040 
1041         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1042         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1043 
1044         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1045         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1046 
1047         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1048         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1049 
1050         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1051         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1052       SSE_INLINE_END_2;
1053 
1054       v += 16;
1055     }
1056 /*      row = (*bjtmp++)/4; */
1057     row = (unsigned int)(*bjtmp++);
1058     while (row < i) {
1059       pc  = rtmp + 16*row;
1060       SSE_INLINE_BEGIN_1(pc)
1061         /* Load block from lower triangle */
1062         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1063         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1064         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1065 
1066         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1067         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1068 
1069         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1070         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1071 
1072         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1073         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1074 
1075         /* Compare block to zero block */
1076 
1077         SSE_COPY_PS(XMM4,XMM7)
1078         SSE_CMPNEQ_PS(XMM4,XMM0)
1079 
1080         SSE_COPY_PS(XMM5,XMM7)
1081         SSE_CMPNEQ_PS(XMM5,XMM1)
1082 
1083         SSE_COPY_PS(XMM6,XMM7)
1084         SSE_CMPNEQ_PS(XMM6,XMM2)
1085 
1086         SSE_CMPNEQ_PS(XMM7,XMM3)
1087 
1088         /* Reduce the comparisons to one SSE register */
1089         SSE_OR_PS(XMM6,XMM7)
1090         SSE_OR_PS(XMM5,XMM4)
1091         SSE_OR_PS(XMM5,XMM6)
1092       SSE_INLINE_END_1;
1093 
1094       /* Reduce the one SSE register to an integer register for branching */
1095       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1096       MOVEMASK(nonzero,XMM5);
1097 
1098       /* If block is nonzero ... */
1099       if (nonzero) {
1100         pv = ba + 16*diag_offset[row];
1101         PREFETCH_L1(&pv[16]);
1102         pj = bj + diag_offset[row] + 1;
1103 
1104         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1105         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1106         /* but the diagonal was inverted already */
1107         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1108 
1109         SSE_INLINE_BEGIN_2(pv,pc)
1110           /* Column 0, product is accumulated in XMM4 */
1111           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1112           SSE_SHUFFLE(XMM4,XMM4,0x00)
1113           SSE_MULT_PS(XMM4,XMM0)
1114 
1115           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1116           SSE_SHUFFLE(XMM5,XMM5,0x00)
1117           SSE_MULT_PS(XMM5,XMM1)
1118           SSE_ADD_PS(XMM4,XMM5)
1119 
1120           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1121           SSE_SHUFFLE(XMM6,XMM6,0x00)
1122           SSE_MULT_PS(XMM6,XMM2)
1123           SSE_ADD_PS(XMM4,XMM6)
1124 
1125           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1126           SSE_SHUFFLE(XMM7,XMM7,0x00)
1127           SSE_MULT_PS(XMM7,XMM3)
1128           SSE_ADD_PS(XMM4,XMM7)
1129 
1130           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1131           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1132 
1133           /* Column 1, product is accumulated in XMM5 */
1134           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1135           SSE_SHUFFLE(XMM5,XMM5,0x00)
1136           SSE_MULT_PS(XMM5,XMM0)
1137 
1138           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1139           SSE_SHUFFLE(XMM6,XMM6,0x00)
1140           SSE_MULT_PS(XMM6,XMM1)
1141           SSE_ADD_PS(XMM5,XMM6)
1142 
1143           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1144           SSE_SHUFFLE(XMM7,XMM7,0x00)
1145           SSE_MULT_PS(XMM7,XMM2)
1146           SSE_ADD_PS(XMM5,XMM7)
1147 
1148           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1149           SSE_SHUFFLE(XMM6,XMM6,0x00)
1150           SSE_MULT_PS(XMM6,XMM3)
1151           SSE_ADD_PS(XMM5,XMM6)
1152 
1153           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1154           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1155 
1156           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1157 
1158           /* Column 2, product is accumulated in XMM6 */
1159           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1160           SSE_SHUFFLE(XMM6,XMM6,0x00)
1161           SSE_MULT_PS(XMM6,XMM0)
1162 
1163           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1164           SSE_SHUFFLE(XMM7,XMM7,0x00)
1165           SSE_MULT_PS(XMM7,XMM1)
1166           SSE_ADD_PS(XMM6,XMM7)
1167 
1168           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1169           SSE_SHUFFLE(XMM7,XMM7,0x00)
1170           SSE_MULT_PS(XMM7,XMM2)
1171           SSE_ADD_PS(XMM6,XMM7)
1172 
1173           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1174           SSE_SHUFFLE(XMM7,XMM7,0x00)
1175           SSE_MULT_PS(XMM7,XMM3)
1176           SSE_ADD_PS(XMM6,XMM7)
1177 
1178           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1179           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1180 
1181           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1182           /* Column 3, product is accumulated in XMM0 */
1183           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1184           SSE_SHUFFLE(XMM7,XMM7,0x00)
1185           SSE_MULT_PS(XMM0,XMM7)
1186 
1187           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1188           SSE_SHUFFLE(XMM7,XMM7,0x00)
1189           SSE_MULT_PS(XMM1,XMM7)
1190           SSE_ADD_PS(XMM0,XMM1)
1191 
1192           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1193           SSE_SHUFFLE(XMM1,XMM1,0x00)
1194           SSE_MULT_PS(XMM1,XMM2)
1195           SSE_ADD_PS(XMM0,XMM1)
1196 
1197           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1198           SSE_SHUFFLE(XMM7,XMM7,0x00)
1199           SSE_MULT_PS(XMM3,XMM7)
1200           SSE_ADD_PS(XMM0,XMM3)
1201 
1202           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1203           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1204 
1205           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1206           /* This is code to be maintained and read by humans after all. */
1207           /* Copy Multiplier Col 3 into XMM3 */
1208           SSE_COPY_PS(XMM3,XMM0)
1209           /* Copy Multiplier Col 2 into XMM2 */
1210           SSE_COPY_PS(XMM2,XMM6)
1211           /* Copy Multiplier Col 1 into XMM1 */
1212           SSE_COPY_PS(XMM1,XMM5)
1213           /* Copy Multiplier Col 0 into XMM0 */
1214           SSE_COPY_PS(XMM0,XMM4)
1215         SSE_INLINE_END_2;
1216 
1217         /* Update the row: */
1218         nz = bi[row+1] - diag_offset[row] - 1;
1219         pv += 16;
1220         for (j=0; j<nz; j++) {
1221           PREFETCH_L1(&pv[16]);
1222           x = rtmp + 16*((unsigned int)pj[j]);
1223 /*            x = rtmp + 4*pj[j]; */
1224 
1225           /* X:=X-M*PV, One column at a time */
1226           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1227           SSE_INLINE_BEGIN_2(x,pv)
1228             /* Load First Column of X*/
1229             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1230             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1231 
1232             /* Matrix-Vector Product: */
1233             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1234             SSE_SHUFFLE(XMM5,XMM5,0x00)
1235             SSE_MULT_PS(XMM5,XMM0)
1236             SSE_SUB_PS(XMM4,XMM5)
1237 
1238             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1239             SSE_SHUFFLE(XMM6,XMM6,0x00)
1240             SSE_MULT_PS(XMM6,XMM1)
1241             SSE_SUB_PS(XMM4,XMM6)
1242 
1243             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1244             SSE_SHUFFLE(XMM7,XMM7,0x00)
1245             SSE_MULT_PS(XMM7,XMM2)
1246             SSE_SUB_PS(XMM4,XMM7)
1247 
1248             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1249             SSE_SHUFFLE(XMM5,XMM5,0x00)
1250             SSE_MULT_PS(XMM5,XMM3)
1251             SSE_SUB_PS(XMM4,XMM5)
1252 
1253             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1254             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1255 
1256             /* Second Column */
1257             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1258             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1259 
1260             /* Matrix-Vector Product: */
1261             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1262             SSE_SHUFFLE(XMM6,XMM6,0x00)
1263             SSE_MULT_PS(XMM6,XMM0)
1264             SSE_SUB_PS(XMM5,XMM6)
1265 
1266             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1267             SSE_SHUFFLE(XMM7,XMM7,0x00)
1268             SSE_MULT_PS(XMM7,XMM1)
1269             SSE_SUB_PS(XMM5,XMM7)
1270 
1271             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1272             SSE_SHUFFLE(XMM4,XMM4,0x00)
1273             SSE_MULT_PS(XMM4,XMM2)
1274             SSE_SUB_PS(XMM5,XMM4)
1275 
1276             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1277             SSE_SHUFFLE(XMM6,XMM6,0x00)
1278             SSE_MULT_PS(XMM6,XMM3)
1279             SSE_SUB_PS(XMM5,XMM6)
1280 
1281             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1282             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1283 
1284             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1285 
1286             /* Third Column */
1287             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1288             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1289 
1290             /* Matrix-Vector Product: */
1291             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1292             SSE_SHUFFLE(XMM7,XMM7,0x00)
1293             SSE_MULT_PS(XMM7,XMM0)
1294             SSE_SUB_PS(XMM6,XMM7)
1295 
1296             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1297             SSE_SHUFFLE(XMM4,XMM4,0x00)
1298             SSE_MULT_PS(XMM4,XMM1)
1299             SSE_SUB_PS(XMM6,XMM4)
1300 
1301             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1302             SSE_SHUFFLE(XMM5,XMM5,0x00)
1303             SSE_MULT_PS(XMM5,XMM2)
1304             SSE_SUB_PS(XMM6,XMM5)
1305 
1306             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1307             SSE_SHUFFLE(XMM7,XMM7,0x00)
1308             SSE_MULT_PS(XMM7,XMM3)
1309             SSE_SUB_PS(XMM6,XMM7)
1310 
1311             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1312             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1313 
1314             /* Fourth Column */
1315             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1316             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1317 
1318             /* Matrix-Vector Product: */
1319             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1320             SSE_SHUFFLE(XMM5,XMM5,0x00)
1321             SSE_MULT_PS(XMM5,XMM0)
1322             SSE_SUB_PS(XMM4,XMM5)
1323 
1324             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1325             SSE_SHUFFLE(XMM6,XMM6,0x00)
1326             SSE_MULT_PS(XMM6,XMM1)
1327             SSE_SUB_PS(XMM4,XMM6)
1328 
1329             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1330             SSE_SHUFFLE(XMM7,XMM7,0x00)
1331             SSE_MULT_PS(XMM7,XMM2)
1332             SSE_SUB_PS(XMM4,XMM7)
1333 
1334             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1335             SSE_SHUFFLE(XMM5,XMM5,0x00)
1336             SSE_MULT_PS(XMM5,XMM3)
1337             SSE_SUB_PS(XMM4,XMM5)
1338 
1339             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1340             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1341           SSE_INLINE_END_2;
1342           pv   += 16;
1343         }
1344         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1345       }
1346       row = (unsigned int)(*bjtmp++);
1347 /*        row = (*bjtmp++)/4; */
1348 /*        bjtmp++; */
1349     }
1350     /* finished row so stick it into b->a */
1351     pv = ba + 16*bi[i];
1352     pj = bj + bi[i];
1353     nz = bi[i+1] - bi[i];
1354 
1355     /* Copy x block back into pv block */
1356     for (j=0; j<nz; j++) {
1357       x  = rtmp+16*((unsigned int)pj[j]);
1358 /*        x  = rtmp+4*pj[j]; */
1359 
1360       SSE_INLINE_BEGIN_2(x,pv)
1361         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1362         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1363         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1364 
1365         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1366         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1367 
1368         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1369         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1370 
1371         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1372         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1373 
1374         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1375         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1376 
1377         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1378         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1379 
1380         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1381         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1382 
1383         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1384         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1385       SSE_INLINE_END_2;
1386       pv += 16;
1387     }
1388     /* invert diagonal block */
1389     w = ba + 16*diag_offset[i];
1390     if (pivotinblocks) {
1391       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1392     } else {
1393       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1394     }
1395 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1396     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1397   }
1398 
1399   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1400   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1401   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1402   C->assembled = PETSC_TRUE;
1403   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1404   /* Flop Count from inverting diagonal blocks */
1405   SSE_SCOPE_END;
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 #undef __FUNCT__
1410 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1411 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1412 {
1413   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1414   PetscErrorCode ierr;
1415   int  i,j,n = a->mbs;
1416   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1417   unsigned int   row;
1418   int            *ajtmpold,nz,*bi=b->i;
1419   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1420   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1421   MatScalar      *ba = b->a,*aa = a->a;
1422   int            nonzero=0;
1423 /*    int            nonzero=0,colscale = 16; */
1424   PetscTruth     pivotinblocks = b->pivotinblocks;
1425   PetscReal      shift = info->shiftinblocks;
1426 
1427   PetscFunctionBegin;
1428   SSE_SCOPE_BEGIN;
1429 
1430   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1431   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1432   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1433   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1434 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1435 /*      colscale = 4; */
1436 /*    } */
1437   if ((unsigned long)bj==(unsigned long)aj) {
1438     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1439   }
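  /* When b->j aliases a->j the factorization is being done in place, so the dedicated _Inplace
     variant takes over; otherwise the factor's column indices are read through bj as unsigned
     shorts (the "usj" layout). */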
1440 
1441   for (i=0; i<n; i++) {
1442     nz    = bi[i+1] - bi[i];
1443     bjtmp = bj + bi[i];
1444     /* zero out the 4x4 block accumulators */
1445     /* zero out one register */
1446     XOR_PS(XMM7,XMM7);
1447     for  (j=0; j<nz; j++) {
1448       x = rtmp+16*((unsigned int)bjtmp[j]);
1449 /*        x = rtmp+4*bjtmp[j]; */
1450       SSE_INLINE_BEGIN_1(x)
1451         /* Copy zero register to memory locations */
1452         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1453         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1454         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1455         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1456         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1457         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1458         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1459         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1460         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1461       SSE_INLINE_END_1;
1462     }
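    /* The SSE stores above are the vectorized form of the scalar zeroing
       for (k=0; k<16; k++) x[k] = 0.0;  applied to each block of row i */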
1463     /* load in initial (unfactored row) */
1464     nz       = ai[i+1] - ai[i];
1465     ajtmpold = aj + ai[i];
1466     v        = aa + 16*ai[i];
1467     for (j=0; j<nz; j++) {
1468       x = rtmp+16*ajtmpold[j];
1469 /*        x = rtmp+colscale*ajtmpold[j]; */
1470       /* Copy v block into x block */
1471       SSE_INLINE_BEGIN_2(v,x)
1472         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1473         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1474         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1475 
1476         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1477         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1478 
1479         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1480         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1481 
1482         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1483         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1484 
1485         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1486         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1487 
1488         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1489         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1490 
1491         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1492         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1493 
1494         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1495         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1496       SSE_INLINE_END_2;
1497 
1498       v += 16;
1499     }
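    /* The loads/stores above are the vectorized form of the scalar block copy
       for (k=0; k<16; k++) x[k] = v[k]; */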
1500 /*      row = (*bjtmp++)/4; */
1501     row = (unsigned int)(*bjtmp++);
1502     while (row < i) {
1503       pc  = rtmp + 16*row;
1504       SSE_INLINE_BEGIN_1(pc)
1505         /* Load block from lower triangle */
1506         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1507         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1508         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1509 
1510         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1511         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1512 
1513         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1514         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1515 
1516         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1517         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1518 
1519         /* Compare block to zero block */
1520 
1521         SSE_COPY_PS(XMM4,XMM7)
1522         SSE_CMPNEQ_PS(XMM4,XMM0)
1523 
1524         SSE_COPY_PS(XMM5,XMM7)
1525         SSE_CMPNEQ_PS(XMM5,XMM1)
1526 
1527         SSE_COPY_PS(XMM6,XMM7)
1528         SSE_CMPNEQ_PS(XMM6,XMM2)
1529 
1530         SSE_CMPNEQ_PS(XMM7,XMM3)
1531 
1532         /* Reduce the comparisons to one SSE register */
1533         SSE_OR_PS(XMM6,XMM7)
1534         SSE_OR_PS(XMM5,XMM4)
1535         SSE_OR_PS(XMM5,XMM6)
1536       SSE_INLINE_END_1;
1537 
1538       /* Reduce the one SSE register to an integer register for branching */
1539       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1540       MOVEMASK(nonzero,XMM5);
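      /* nonzero receives the sign-bit mask of XMM5, the ORed lane comparisons: it is non-zero
         exactly when some entry of the 4x4 block at pc differs from zero, i.e. the scalar test
           for (k=0; k<16; k++) if (pc[k] != 0.0) { nonzero = 1; break; }  */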
1541 
1542       /* If block is nonzero ... */
1543       if (nonzero) {
1544         pv = ba + 16*diag_offset[row];
1545         PREFETCH_L1(&pv[16]);
1546         pj = bj + diag_offset[row] + 1;
1547 
1548         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1549         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1550         /* but the diagonal was inverted already */
1551         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
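        /* (the compiled-out scalar sketch above this function spells out the same product in plain C) */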
1552 
1553         SSE_INLINE_BEGIN_2(pv,pc)
1554           /* Column 0, product is accumulated in XMM4 */
1555           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1556           SSE_SHUFFLE(XMM4,XMM4,0x00)
1557           SSE_MULT_PS(XMM4,XMM0)
1558 
1559           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1560           SSE_SHUFFLE(XMM5,XMM5,0x00)
1561           SSE_MULT_PS(XMM5,XMM1)
1562           SSE_ADD_PS(XMM4,XMM5)
1563 
1564           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1565           SSE_SHUFFLE(XMM6,XMM6,0x00)
1566           SSE_MULT_PS(XMM6,XMM2)
1567           SSE_ADD_PS(XMM4,XMM6)
1568 
1569           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1570           SSE_SHUFFLE(XMM7,XMM7,0x00)
1571           SSE_MULT_PS(XMM7,XMM3)
1572           SSE_ADD_PS(XMM4,XMM7)
1573 
1574           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1575           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1576 
1577           /* Column 1, product is accumulated in XMM5 */
1578           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1579           SSE_SHUFFLE(XMM5,XMM5,0x00)
1580           SSE_MULT_PS(XMM5,XMM0)
1581 
1582           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1583           SSE_SHUFFLE(XMM6,XMM6,0x00)
1584           SSE_MULT_PS(XMM6,XMM1)
1585           SSE_ADD_PS(XMM5,XMM6)
1586 
1587           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1588           SSE_SHUFFLE(XMM7,XMM7,0x00)
1589           SSE_MULT_PS(XMM7,XMM2)
1590           SSE_ADD_PS(XMM5,XMM7)
1591 
1592           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1593           SSE_SHUFFLE(XMM6,XMM6,0x00)
1594           SSE_MULT_PS(XMM6,XMM3)
1595           SSE_ADD_PS(XMM5,XMM6)
1596 
1597           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1598           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1599 
1600           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1601 
1602           /* Column 2, product is accumulated in XMM6 */
1603           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1604           SSE_SHUFFLE(XMM6,XMM6,0x00)
1605           SSE_MULT_PS(XMM6,XMM0)
1606 
1607           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1608           SSE_SHUFFLE(XMM7,XMM7,0x00)
1609           SSE_MULT_PS(XMM7,XMM1)
1610           SSE_ADD_PS(XMM6,XMM7)
1611 
1612           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1613           SSE_SHUFFLE(XMM7,XMM7,0x00)
1614           SSE_MULT_PS(XMM7,XMM2)
1615           SSE_ADD_PS(XMM6,XMM7)
1616 
1617           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1618           SSE_SHUFFLE(XMM7,XMM7,0x00)
1619           SSE_MULT_PS(XMM7,XMM3)
1620           SSE_ADD_PS(XMM6,XMM7)
1621 
1622           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1623           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1624 
1625           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1626           /* Column 3, product is accumulated in XMM0 */
1627           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1628           SSE_SHUFFLE(XMM7,XMM7,0x00)
1629           SSE_MULT_PS(XMM0,XMM7)
1630 
1631           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1632           SSE_SHUFFLE(XMM7,XMM7,0x00)
1633           SSE_MULT_PS(XMM1,XMM7)
1634           SSE_ADD_PS(XMM0,XMM1)
1635 
1636           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1637           SSE_SHUFFLE(XMM1,XMM1,0x00)
1638           SSE_MULT_PS(XMM1,XMM2)
1639           SSE_ADD_PS(XMM0,XMM1)
1640 
1641           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1642           SSE_SHUFFLE(XMM7,XMM7,0x00)
1643           SSE_MULT_PS(XMM3,XMM7)
1644           SSE_ADD_PS(XMM0,XMM3)
1645 
1646           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1647           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1648 
1649           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1650           /* This is code to be maintained and read by humans after all. */
1651           /* Copy Multiplier Col 3 into XMM3 */
1652           SSE_COPY_PS(XMM3,XMM0)
1653           /* Copy Multiplier Col 2 into XMM2 */
1654           SSE_COPY_PS(XMM2,XMM6)
1655           /* Copy Multiplier Col 1 into XMM1 */
1656           SSE_COPY_PS(XMM1,XMM5)
1657           /* Copy Multiplier Col 0 into XMM0 */
1658           SSE_COPY_PS(XMM0,XMM4)
1659         SSE_INLINE_END_2;
1660 
1661         /* Update the row: */
1662         nz = bi[row+1] - diag_offset[row] - 1;
1663         pv += 16;
1664         for (j=0; j<nz; j++) {
1665           PREFETCH_L1(&pv[16]);
1666           x = rtmp + 16*((unsigned int)pj[j]);
1667 /*            x = rtmp + 4*pj[j]; */
1668 
1669           /* X:=X-M*PV, One column at a time */
1670           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1671           SSE_INLINE_BEGIN_2(x,pv)
1672             /* Load First Column of X */
1673             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1674             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1675 
1676             /* Matrix-Vector Product: */
1677             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1678             SSE_SHUFFLE(XMM5,XMM5,0x00)
1679             SSE_MULT_PS(XMM5,XMM0)
1680             SSE_SUB_PS(XMM4,XMM5)
1681 
1682             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1683             SSE_SHUFFLE(XMM6,XMM6,0x00)
1684             SSE_MULT_PS(XMM6,XMM1)
1685             SSE_SUB_PS(XMM4,XMM6)
1686 
1687             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1688             SSE_SHUFFLE(XMM7,XMM7,0x00)
1689             SSE_MULT_PS(XMM7,XMM2)
1690             SSE_SUB_PS(XMM4,XMM7)
1691 
1692             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1693             SSE_SHUFFLE(XMM5,XMM5,0x00)
1694             SSE_MULT_PS(XMM5,XMM3)
1695             SSE_SUB_PS(XMM4,XMM5)
1696 
1697             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1698             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1699 
1700             /* Second Column */
1701             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1702             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1703 
1704             /* Matrix-Vector Product: */
1705             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1706             SSE_SHUFFLE(XMM6,XMM6,0x00)
1707             SSE_MULT_PS(XMM6,XMM0)
1708             SSE_SUB_PS(XMM5,XMM6)
1709 
1710             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1711             SSE_SHUFFLE(XMM7,XMM7,0x00)
1712             SSE_MULT_PS(XMM7,XMM1)
1713             SSE_SUB_PS(XMM5,XMM7)
1714 
1715             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1716             SSE_SHUFFLE(XMM4,XMM4,0x00)
1717             SSE_MULT_PS(XMM4,XMM2)
1718             SSE_SUB_PS(XMM5,XMM4)
1719 
1720             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1721             SSE_SHUFFLE(XMM6,XMM6,0x00)
1722             SSE_MULT_PS(XMM6,XMM3)
1723             SSE_SUB_PS(XMM5,XMM6)
1724 
1725             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1726             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1727 
1728             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1729 
1730             /* Third Column */
1731             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1732             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1733 
1734             /* Matrix-Vector Product: */
1735             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1736             SSE_SHUFFLE(XMM7,XMM7,0x00)
1737             SSE_MULT_PS(XMM7,XMM0)
1738             SSE_SUB_PS(XMM6,XMM7)
1739 
1740             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1741             SSE_SHUFFLE(XMM4,XMM4,0x00)
1742             SSE_MULT_PS(XMM4,XMM1)
1743             SSE_SUB_PS(XMM6,XMM4)
1744 
1745             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1746             SSE_SHUFFLE(XMM5,XMM5,0x00)
1747             SSE_MULT_PS(XMM5,XMM2)
1748             SSE_SUB_PS(XMM6,XMM5)
1749 
1750             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1751             SSE_SHUFFLE(XMM7,XMM7,0x00)
1752             SSE_MULT_PS(XMM7,XMM3)
1753             SSE_SUB_PS(XMM6,XMM7)
1754 
1755             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1756             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1757 
1758             /* Fourth Column */
1759             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1760             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1761 
1762             /* Matrix-Vector Product: */
1763             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1764             SSE_SHUFFLE(XMM5,XMM5,0x00)
1765             SSE_MULT_PS(XMM5,XMM0)
1766             SSE_SUB_PS(XMM4,XMM5)
1767 
1768             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1769             SSE_SHUFFLE(XMM6,XMM6,0x00)
1770             SSE_MULT_PS(XMM6,XMM1)
1771             SSE_SUB_PS(XMM4,XMM6)
1772 
1773             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1774             SSE_SHUFFLE(XMM7,XMM7,0x00)
1775             SSE_MULT_PS(XMM7,XMM2)
1776             SSE_SUB_PS(XMM4,XMM7)
1777 
1778             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1779             SSE_SHUFFLE(XMM5,XMM5,0x00)
1780             SSE_MULT_PS(XMM5,XMM3)
1781             SSE_SUB_PS(XMM4,XMM5)
1782 
1783             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1784             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1785           SSE_INLINE_END_2;
1786           pv   += 16;
1787         }
1788         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1789       }
1790       row = (unsigned int)(*bjtmp++);
1791 /*        row = (*bjtmp++)/4; */
1792 /*        bjtmp++; */
1793     }
1794     /* finished row so stick it into b->a */
1795     pv = ba + 16*bi[i];
1796     pj = bj + bi[i];
1797     nz = bi[i+1] - bi[i];
1798 
1799     /* Copy x block back into pv block */
1800     for (j=0; j<nz; j++) {
1801       x  = rtmp+16*((unsigned int)pj[j]);
1802 /*        x  = rtmp+4*pj[j]; */
1803 
1804       SSE_INLINE_BEGIN_2(x,pv)
1805         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1806         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1807         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1808 
1809         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1810         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1811 
1812         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1813         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1814 
1815         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1816         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1817 
1818         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1819         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1820 
1821         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1822         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1823 
1824         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1825         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1826 
1827         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1828         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1829       SSE_INLINE_END_2;
1830       pv += 16;
1831     }
1832     /* invert diagonal block */
1833     w = ba + 16*diag_offset[i];
1834     if (pivotinblocks) {
1835       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1836     } else {
1837       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1838     }
1839 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1840     /* Note: using Cramer's rule, the flop count below may be unfairly high or low */
1841   }
1842 
1843   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1844   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1845   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1846   C->assembled = PETSC_TRUE;
1847   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1848   /* Flop Count from inverting diagonal blocks */
1849   SSE_SCOPE_END;
1850   PetscFunctionReturn(0);
1851 }
1852 
1853 #endif
1854