xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision 753ebd1d069d7a4e86ed47ccb83df941bd1fb5db)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 4 by 4
12 */
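/*
      Note: the unrolled 4x4 arithmetic below is written as if each block is stored
      contiguously in column-major order, i.e. entry (r,c) of a block sits at offset
      r + 4*c of its 16-scalar slice.  Under that convention the multiplier update
      pc := pc * pv expands term by term as

          pc_new[r + 4*c] = sum_{k=0,..,3} pc_old[r + 4*k] * pv[k + 4*c]

      so, for example, pc[0] = p1*x1 + p5*x2 + p9*x3 + p13*x4 is entry (0,0) of the product.
*/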
13 #undef __FUNCT__
14 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat C,Mat A,const MatFactorInfo *info)
16 {
17   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18   IS             isrow = b->row,isicol = b->icol;
19   PetscErrorCode ierr;
20   const PetscInt *r,*ic;
21   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22   PetscInt       *ajtmpold,*ajtmp,nz,row;
23   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
25   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28   MatScalar      m13,m14,m15,m16;
29   MatScalar      *ba = b->a,*aa = a->a;
30   PetscTruth     pivotinblocks = b->pivotinblocks;
31   PetscReal      shift = info->shiftinblocks;
32 
33   PetscFunctionBegin;
34   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
36   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
37 
38   for (i=0; i<n; i++) {
39     nz    = bi[i+1] - bi[i];
40     ajtmp = bj + bi[i];
41     for  (j=0; j<nz; j++) {
42       x = rtmp+16*ajtmp[j];
43       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
44       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45     }
46     /* load in initial (unfactored row) */
47     idx      = r[i];
48     nz       = ai[idx+1] - ai[idx];
49     ajtmpold = aj + ai[idx];
50     v        = aa + 16*ai[idx];
51     for (j=0; j<nz; j++) {
52       x    = rtmp+16*ic[ajtmpold[j]];
53       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
54       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
55       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56       x[14] = v[14]; x[15] = v[15];
57       v    += 16;
58     }
59     row = *ajtmp++;
60     while (row < i) {
61       pc  = rtmp + 16*row;
62       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
63       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
64       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65       p15 = pc[14]; p16 = pc[15];
66       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69           || p16 != 0.0) {
70         pv = ba + 16*diag_offset[row];
71         pj = bj + diag_offset[row] + 1;
72         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
73         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
74         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75         x15 = pv[14]; x16 = pv[15];
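        /* Form the multiplier block for this pivot row: pc := pc*pv, where pv points at the
           diagonal block U(row,row), which was already inverted when row 'row' was finished. */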
76         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
77         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
78         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
79         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
80 
81         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
82         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
83         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
84         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
85 
86         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
87         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
88         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
89         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
90 
91         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
92         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
93         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
94         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
95 
96         nz = bi[row+1] - diag_offset[row] - 1;
97         pv += 16;
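        /* Update the rest of row i: for each block U(row,pj[j]) right of the diagonal,
           subtract multiplier*U(row,pj[j]) from the corresponding work block in rtmp. */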
98         for (j=0; j<nz; j++) {
99           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103           x    = rtmp + 16*pj[j];
104           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
108 
109           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
113 
114           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
118 
119           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
123 
124           pv   += 16;
125         }
126         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
127       }
128       row = *ajtmp++;
129     }
130     /* finished row so stick it into b->a */
131     pv = ba + 16*bi[i];
132     pj = bj + bi[i];
133     nz = bi[i+1] - bi[i];
134     for (j=0; j<nz; j++) {
135       x      = rtmp+16*pj[j];
136       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140       pv   += 16;
141     }
142     /* invert diagonal block */
143     w    = ba + 16*diag_offset[i];
144     if (pivotinblocks) {
145       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
146     } else {
147       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154   C->ops->solve          = MatSolve_SeqBAIJ_4;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
156   C->assembled = PETSC_TRUE;
157   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
158   PetscFunctionReturn(0);
159 }
160 
161 /* MatLUFactorNumeric_SeqBAIJ_4_newdatastruct -
162      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
163        Kernel_A_gets_A_times_B()
164        Kernel_A_gets_A_minus_B_times_C()
165        Kernel_A_gets_inverse_A()
166 */
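/*
   The plain-loop sketch below (compiled out, for exposition only) shows what the 4x4 block
   kernels used in this routine are assumed to compute on column-major 4x4 blocks:
   Kernel_A_gets_A_times_B_4(pc,pv,mwork) as pc := pc*pv with mwork as scratch, and
   Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv) as v := v - pc*pv.  The production kernels are
   unrolled versions of the same operation.
*/
#if 0
static void SketchBlock4_A_gets_A_times_B(MatScalar *a,const MatScalar *b,MatScalar *w)
{
  PetscInt r,c,k;
  /* w = a*b with 4x4 blocks stored column-major: entry (r,c) at offset r+4*c */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      w[r+4*c] = 0.0;
      for (k=0; k<4; k++) w[r+4*c] += a[r+4*k]*b[k+4*c];
    }
  }
  /* copy the product back into a */
  for (k=0; k<16; k++) a[k] = w[k];
}
#endif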
167 
168 #undef __FUNCT__
169 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_newdatastruct"
170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
171 {
172   Mat            C=B;
173   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
174   IS             isrow = b->row,isicol = b->icol;
175   PetscErrorCode ierr;
176   const PetscInt *r,*ic,*ics;
177   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
178   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
179   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
180   PetscInt       bs2 = a->bs2,flg;
181   PetscReal      shift = info->shiftinblocks;
182 
183   PetscFunctionBegin;
184   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
185   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
186 
187   /* generate work space needed by the factorization */
188   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
189   mwork = rtmp + bs2*n;
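  /* rtmp holds a full block row of n work blocks (bs2 = 16 scalars each); the extra bs2
     scalars at the end serve as the mwork scratch block for the block-multiply kernel. */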
190   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
191   ics  = ic;
192 
193   for (i=0; i<n; i++){
194     /* zero rtmp */
195     /* L part */
196     nz    = bi[i+1] - bi[i];
197     bjtmp = bj + bi[i];
198     for  (j=0; j<nz; j++){
199       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
200     }
201 
202     /* U part */
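    /* The index arithmetic below implies bdiag[] decreases with i in this layout: row i of U
       keeps its diagonal block at bdiag[i] and its strictly-upper blocks at
       bj[bdiag[i+1]+1 .. bdiag[i]-1], so bdiag[i]-bdiag[i+1] counts them plus the diagonal. */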
203     nz = bdiag[i]-bdiag[i+1];
204     bjtmp = bj + bdiag[i+1]+1;
205     for  (j=0; j<nz; j++){
206       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
207     }
208 
209     /* load in initial (unfactored row) */
210     nz    = ai[r[i]+1] - ai[r[i]];
211     ajtmp = aj + ai[r[i]];
212     v     = aa + bs2*ai[r[i]];
213     for (j=0; j<nz; j++) {
214       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
215     }
216 
217     /* elimination */
218     bjtmp = bj + bi[i];
219     nzL   = bi[i+1] - bi[i];
220     for(k=0;k < nzL;k++) {
221       row = bjtmp[k];
222       pc = rtmp + bs2*row;
223       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
224       if (flg) {
225         pv = b->a + bs2*bdiag[row];
226         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
227         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
228 
229         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
230         pv = b->a + bs2*(bdiag[row+1]+1);
231         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
232         for (j=0; j<nz; j++) {
233           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
234           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
235           v    = rtmp + bs2*pj[j];
236           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
237           pv  += bs2;
238         }
239         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
240       }
241     }
242 
243     /* finished row so stick it into b->a */
244     /* L part */
245     pv   = b->a + bs2*bi[i] ;
246     pj   = b->j + bi[i] ;
247     nz   = bi[i+1] - bi[i];
248     for (j=0; j<nz; j++) {
249       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
250     }
251 
252     /* Mark diagonal and invert diagonal for simpler triangular solves */
253     pv   = b->a + bs2*bdiag[i];
254     pj   = b->j + bdiag[i];
255     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
256     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
257     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
258 
259     /* U part */
260     pv = b->a + bs2*(bdiag[i+1]+1);
261     pj = b->j + bdiag[i+1]+1;
262     nz = bdiag[i] - bdiag[i+1] - 1;
263     for (j=0; j<nz; j++){
264       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
265     }
266   }
267 
268   ierr = PetscFree(rtmp);CHKERRQ(ierr);
269   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
270   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
271 
272   C->assembled = PETSC_TRUE;
273   ierr = PetscLogFlops(1.3333*64*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
274   PetscFunctionReturn(0);
275 }
276 
277 #undef __FUNCT__
278 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
279 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
280 {
281 /*
282     Default version for when blocks are 4 by 4, using natural ordering
283 */
284   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
285   PetscErrorCode ierr;
286   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
287   PetscInt       *ajtmpold,*ajtmp,nz,row;
288   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
289   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
290   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
291   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
292   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
293   MatScalar      m13,m14,m15,m16;
294   MatScalar      *ba = b->a,*aa = a->a;
295   PetscTruth     pivotinblocks = b->pivotinblocks;
296   PetscReal      shift = info->shiftinblocks;
297 
298   PetscFunctionBegin;
299   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
300 
301   for (i=0; i<n; i++) {
302     nz    = bi[i+1] - bi[i];
303     ajtmp = bj + bi[i];
304     for  (j=0; j<nz; j++) {
305       x = rtmp+16*ajtmp[j];
306       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
307       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
308     }
309     /* load in initial (unfactored row) */
310     nz       = ai[i+1] - ai[i];
311     ajtmpold = aj + ai[i];
312     v        = aa + 16*ai[i];
313     for (j=0; j<nz; j++) {
314       x    = rtmp+16*ajtmpold[j];
315       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
316       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
317       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
318       x[14] = v[14]; x[15] = v[15];
319       v    += 16;
320     }
321     row = *ajtmp++;
322     while (row < i) {
323       pc  = rtmp + 16*row;
324       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
325       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
326       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
327       p15 = pc[14]; p16 = pc[15];
328       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
329           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
330           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
331           || p16 != 0.0) {
332         pv = ba + 16*diag_offset[row];
333         pj = bj + diag_offset[row] + 1;
334         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
335         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
336         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
337         x15 = pv[14]; x16 = pv[15];
338         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
339         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
340         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
341         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
342 
343         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
344         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
345         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
346         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
347 
348         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
349         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
350         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
351         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
352 
353         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
354         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
355         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
356         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
357         nz = bi[row+1] - diag_offset[row] - 1;
358         pv += 16;
359         for (j=0; j<nz; j++) {
360           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
361           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
362           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
363           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
364           x    = rtmp + 16*pj[j];
365           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
366           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
367           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
368           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
369 
370           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
371           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
372           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
373           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
374 
375           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
376           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
377           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
378           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
379 
380           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
381           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
382           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
383           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
384 
385           pv   += 16;
386         }
387         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
388       }
389       row = *ajtmp++;
390     }
391     /* finished row so stick it into b->a */
392     pv = ba + 16*bi[i];
393     pj = bj + bi[i];
394     nz = bi[i+1] - bi[i];
395     for (j=0; j<nz; j++) {
396       x      = rtmp+16*pj[j];
397       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
398       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
399       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
400       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
401       pv   += 16;
402     }
403     /* invert diagonal block */
404     w = ba + 16*diag_offset[i];
405     if (pivotinblocks) {
406       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
407     } else {
408       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
409     }
410   }
411 
412   ierr = PetscFree(rtmp);CHKERRQ(ierr);
413   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
414   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
415   C->assembled = PETSC_TRUE;
416   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
417   PetscFunctionReturn(0);
418 }
419 
420 /*
421   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct -
422     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct()
423 */
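/*
   This variant follows MatLUFactorNumeric_SeqBAIJ_4_newdatastruct() above, except that with the
   natural ordering no row/column permutations (r, ic) are applied when loading each block row.
*/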
424 #undef __FUNCT__
425 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct"
426 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
427 {
428   Mat            C=B;
429   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
430   PetscErrorCode ierr;
431   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
432   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
433   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
434   PetscInt       bs2 = a->bs2,flg;
435   PetscReal      shift = info->shiftinblocks;
436 
437   PetscFunctionBegin;
438   /* generate work space needed by the factorization */
439   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
440   mwork = rtmp + bs2*n;
441   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
442 
443   for (i=0; i<n; i++){
444     /* zero rtmp */
445     /* L part */
446     nz    = bi[i+1] - bi[i];
447     bjtmp = bj + bi[i];
448     for  (j=0; j<nz; j++){
449       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
450     }
451 
452     /* U part */
453     nz = bdiag[i] - bdiag[i+1];
454     bjtmp = bj + bdiag[i+1]+1;
455     for  (j=0; j<nz; j++){
456       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
457     }
458 
459     /* load in initial (unfactored row) */
460     nz    = ai[i+1] - ai[i];
461     ajtmp = aj + ai[i];
462     v     = aa + bs2*ai[i];
463     for (j=0; j<nz; j++) {
464       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
465     }
466 
467     /* elimination */
468     bjtmp = bj + bi[i];
469     nzL   = bi[i+1] - bi[i];
470     for(k=0;k < nzL;k++) {
471       row = bjtmp[k];
472       pc = rtmp + bs2*row;
473       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
474       if (flg) {
475         pv = b->a + bs2*bdiag[row];
476         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
477         ierr = Kernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
478 
479         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
480         pv = b->a + bs2*(bdiag[row+1]+1);
481         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
482         for (j=0; j<nz; j++) {
483           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
484           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
485           v    = rtmp + bs2*pj[j];
486           ierr = Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
487           pv  += bs2;
488         }
489         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
490       }
491     }
492 
493     /* finished row so stick it into b->a */
494     /* L part */
495     pv   = b->a + bs2*bi[i] ;
496     pj   = b->j + bi[i] ;
497     nz   = bi[i+1] - bi[i];
498     for (j=0; j<nz; j++) {
499       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
500     }
501 
502     /* Mark diagonal and invert diagonal for simpler triangular solves */
503     pv   = b->a + bs2*bdiag[i];
504     pj   = b->j + bdiag[i];
505     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
506     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
507     ierr = Kernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
508 
509     /* U part */
510     pv = b->a + bs2*(bdiag[i+1]+1);
511     pj = b->j + bdiag[i+1]+1;
512     nz = bdiag[i] - bdiag[i+1] - 1;
513     for (j=0; j<nz; j++){
514       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
515     }
516   }
517 
518   ierr = PetscFree(rtmp);CHKERRQ(ierr);
519   C->assembled = PETSC_TRUE;
520   ierr = PetscLogFlops(1.3333*64*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
521   PetscFunctionReturn(0);
522 }
523 
524 #if defined(PETSC_HAVE_SSE)
525 
526 #include PETSC_HAVE_SSE
527 
528 /* SSE version for when blocks are 4 by 4, using natural ordering */
529 #undef __FUNCT__
530 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
531 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
532 {
533   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
534   PetscErrorCode ierr;
535   int i,j,n = a->mbs;
536   int         *bj = b->j,*bjtmp,*pj;
537   int         row;
538   int         *ajtmpold,nz,*bi=b->i;
539   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
540   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
541   MatScalar   *ba = b->a,*aa = a->a;
542   int         nonzero=0;
543 /*    int            nonzero=0,colscale = 16; */
544   PetscTruth  pivotinblocks = b->pivotinblocks;
545   PetscReal      shift = info->shiftinblocks;
546 
547   PetscFunctionBegin;
548   SSE_SCOPE_BEGIN;
549 
550   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
551   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
552   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
553   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
554 /*    if ((unsigned long)bj==(unsigned long)aj) { */
555 /*      colscale = 4; */
556 /*    } */
557   for (i=0; i<n; i++) {
558     nz    = bi[i+1] - bi[i];
559     bjtmp = bj + bi[i];
560     /* zero out the 4x4 block accumulators */
561     /* zero out one register */
562     XOR_PS(XMM7,XMM7);
563     for  (j=0; j<nz; j++) {
564       x = rtmp+16*bjtmp[j];
565 /*        x = rtmp+4*bjtmp[j]; */
566       SSE_INLINE_BEGIN_1(x)
567         /* Copy zero register to memory locations */
568         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
569         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
570         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
571         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
572         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
573         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
574         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
575         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
576         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
577       SSE_INLINE_END_1;
578     }
579     /* load in initial (unfactored row) */
580     nz       = ai[i+1] - ai[i];
581     ajtmpold = aj + ai[i];
582     v        = aa + 16*ai[i];
583     for (j=0; j<nz; j++) {
584       x = rtmp+16*ajtmpold[j];
585 /*        x = rtmp+colscale*ajtmpold[j]; */
586       /* Copy v block into x block */
587       SSE_INLINE_BEGIN_2(v,x)
588         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
589         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
590         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
591 
592         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
593         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
594 
595         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
596         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
597 
598         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
599         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
600 
601         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
602         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
603 
604         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
605         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
606 
607         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
608         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
609 
610         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
611         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
612       SSE_INLINE_END_2;
613 
614       v += 16;
615     }
616 /*      row = (*bjtmp++)/4; */
617     row = *bjtmp++;
618     while (row < i) {
619       pc  = rtmp + 16*row;
620       SSE_INLINE_BEGIN_1(pc)
621         /* Load block from lower triangle */
622         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
623         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
624         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
625 
626         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
627         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
628 
629         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
630         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
631 
632         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
633         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
634 
635         /* Compare block to zero block */
636 
637         SSE_COPY_PS(XMM4,XMM7)
638         SSE_CMPNEQ_PS(XMM4,XMM0)
639 
640         SSE_COPY_PS(XMM5,XMM7)
641         SSE_CMPNEQ_PS(XMM5,XMM1)
642 
643         SSE_COPY_PS(XMM6,XMM7)
644         SSE_CMPNEQ_PS(XMM6,XMM2)
645 
646         SSE_CMPNEQ_PS(XMM7,XMM3)
647 
648         /* Reduce the comparisons to one SSE register */
649         SSE_OR_PS(XMM6,XMM7)
650         SSE_OR_PS(XMM5,XMM4)
651         SSE_OR_PS(XMM5,XMM6)
652       SSE_INLINE_END_1;
653 
654       /* Reduce the one SSE register to an integer register for branching */
655       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
656       MOVEMASK(nonzero,XMM5);
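      /* The CMPNEQ comparisons against the zero register XMM7 set a lane to all ones wherever a
         block entry is nonzero; after the OR reduction, MOVEMASK extracts the sign bits, so
         nonzero != 0 exactly when the 4x4 block has any nonzero entry -- the vector analogue of
         the scalar p1 != 0.0 || ... test in the non-SSE routines above. */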
657 
658       /* If block is nonzero ... */
659       if (nonzero) {
660         pv = ba + 16*diag_offset[row];
661         PREFETCH_L1(&pv[16]);
662         pj = bj + diag_offset[row] + 1;
663 
664         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
665         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
666         /* but the diagonal was inverted already */
667         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
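        /* Each column c of the product is formed by broadcasting pv[4*c+k] (LOAD_SS + SHUFFLE)
           and accumulating pv[4*c+k]*column_k(pc), matching the scalar m1..m16 expressions in
           MatLUFactorNumeric_SeqBAIJ_4() above. */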
668 
669         SSE_INLINE_BEGIN_2(pv,pc)
670           /* Column 0, product is accumulated in XMM4 */
671           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
672           SSE_SHUFFLE(XMM4,XMM4,0x00)
673           SSE_MULT_PS(XMM4,XMM0)
674 
675           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
676           SSE_SHUFFLE(XMM5,XMM5,0x00)
677           SSE_MULT_PS(XMM5,XMM1)
678           SSE_ADD_PS(XMM4,XMM5)
679 
680           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
681           SSE_SHUFFLE(XMM6,XMM6,0x00)
682           SSE_MULT_PS(XMM6,XMM2)
683           SSE_ADD_PS(XMM4,XMM6)
684 
685           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
686           SSE_SHUFFLE(XMM7,XMM7,0x00)
687           SSE_MULT_PS(XMM7,XMM3)
688           SSE_ADD_PS(XMM4,XMM7)
689 
690           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
691           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
692 
693           /* Column 1, product is accumulated in XMM5 */
694           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
695           SSE_SHUFFLE(XMM5,XMM5,0x00)
696           SSE_MULT_PS(XMM5,XMM0)
697 
698           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
699           SSE_SHUFFLE(XMM6,XMM6,0x00)
700           SSE_MULT_PS(XMM6,XMM1)
701           SSE_ADD_PS(XMM5,XMM6)
702 
703           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
704           SSE_SHUFFLE(XMM7,XMM7,0x00)
705           SSE_MULT_PS(XMM7,XMM2)
706           SSE_ADD_PS(XMM5,XMM7)
707 
708           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
709           SSE_SHUFFLE(XMM6,XMM6,0x00)
710           SSE_MULT_PS(XMM6,XMM3)
711           SSE_ADD_PS(XMM5,XMM6)
712 
713           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
714           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
715 
716           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
717 
718           /* Column 2, product is accumulated in XMM6 */
719           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
720           SSE_SHUFFLE(XMM6,XMM6,0x00)
721           SSE_MULT_PS(XMM6,XMM0)
722 
723           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
724           SSE_SHUFFLE(XMM7,XMM7,0x00)
725           SSE_MULT_PS(XMM7,XMM1)
726           SSE_ADD_PS(XMM6,XMM7)
727 
728           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
729           SSE_SHUFFLE(XMM7,XMM7,0x00)
730           SSE_MULT_PS(XMM7,XMM2)
731           SSE_ADD_PS(XMM6,XMM7)
732 
733           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
734           SSE_SHUFFLE(XMM7,XMM7,0x00)
735           SSE_MULT_PS(XMM7,XMM3)
736           SSE_ADD_PS(XMM6,XMM7)
737 
738           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
739           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
740 
741           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
742           /* Column 3, product is accumulated in XMM0 */
743           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
744           SSE_SHUFFLE(XMM7,XMM7,0x00)
745           SSE_MULT_PS(XMM0,XMM7)
746 
747           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
748           SSE_SHUFFLE(XMM7,XMM7,0x00)
749           SSE_MULT_PS(XMM1,XMM7)
750           SSE_ADD_PS(XMM0,XMM1)
751 
752           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
753           SSE_SHUFFLE(XMM1,XMM1,0x00)
754           SSE_MULT_PS(XMM1,XMM2)
755           SSE_ADD_PS(XMM0,XMM1)
756 
757           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
758           SSE_SHUFFLE(XMM7,XMM7,0x00)
759           SSE_MULT_PS(XMM3,XMM7)
760           SSE_ADD_PS(XMM0,XMM3)
761 
762           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
763           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
764 
765           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
766           /* This is code to be maintained and read by humans after all. */
767           /* Copy Multiplier Col 3 into XMM3 */
768           SSE_COPY_PS(XMM3,XMM0)
769           /* Copy Multiplier Col 2 into XMM2 */
770           SSE_COPY_PS(XMM2,XMM6)
771           /* Copy Multiplier Col 1 into XMM1 */
772           SSE_COPY_PS(XMM1,XMM5)
773           /* Copy Multiplier Col 0 into XMM0 */
774           SSE_COPY_PS(XMM0,XMM4)
775         SSE_INLINE_END_2;
776 
777         /* Update the row: */
778         nz = bi[row+1] - diag_offset[row] - 1;
779         pv += 16;
780         for (j=0; j<nz; j++) {
781           PREFETCH_L1(&pv[16]);
782           x = rtmp + 16*pj[j];
783 /*            x = rtmp + 4*pj[j]; */
784 
785           /* X:=X-M*PV, One column at a time */
786           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
787           SSE_INLINE_BEGIN_2(x,pv)
788             /* Load First Column of X*/
789             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
790             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
791 
792             /* Matrix-Vector Product: */
793             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
794             SSE_SHUFFLE(XMM5,XMM5,0x00)
795             SSE_MULT_PS(XMM5,XMM0)
796             SSE_SUB_PS(XMM4,XMM5)
797 
798             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
799             SSE_SHUFFLE(XMM6,XMM6,0x00)
800             SSE_MULT_PS(XMM6,XMM1)
801             SSE_SUB_PS(XMM4,XMM6)
802 
803             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
804             SSE_SHUFFLE(XMM7,XMM7,0x00)
805             SSE_MULT_PS(XMM7,XMM2)
806             SSE_SUB_PS(XMM4,XMM7)
807 
808             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
809             SSE_SHUFFLE(XMM5,XMM5,0x00)
810             SSE_MULT_PS(XMM5,XMM3)
811             SSE_SUB_PS(XMM4,XMM5)
812 
813             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
814             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
815 
816             /* Second Column */
817             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
818             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
819 
820             /* Matrix-Vector Product: */
821             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
822             SSE_SHUFFLE(XMM6,XMM6,0x00)
823             SSE_MULT_PS(XMM6,XMM0)
824             SSE_SUB_PS(XMM5,XMM6)
825 
826             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
827             SSE_SHUFFLE(XMM7,XMM7,0x00)
828             SSE_MULT_PS(XMM7,XMM1)
829             SSE_SUB_PS(XMM5,XMM7)
830 
831             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
832             SSE_SHUFFLE(XMM4,XMM4,0x00)
833             SSE_MULT_PS(XMM4,XMM2)
834             SSE_SUB_PS(XMM5,XMM4)
835 
836             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
837             SSE_SHUFFLE(XMM6,XMM6,0x00)
838             SSE_MULT_PS(XMM6,XMM3)
839             SSE_SUB_PS(XMM5,XMM6)
840 
841             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
842             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
843 
844             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
845 
846             /* Third Column */
847             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
848             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
849 
850             /* Matrix-Vector Product: */
851             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
852             SSE_SHUFFLE(XMM7,XMM7,0x00)
853             SSE_MULT_PS(XMM7,XMM0)
854             SSE_SUB_PS(XMM6,XMM7)
855 
856             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
857             SSE_SHUFFLE(XMM4,XMM4,0x00)
858             SSE_MULT_PS(XMM4,XMM1)
859             SSE_SUB_PS(XMM6,XMM4)
860 
861             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
862             SSE_SHUFFLE(XMM5,XMM5,0x00)
863             SSE_MULT_PS(XMM5,XMM2)
864             SSE_SUB_PS(XMM6,XMM5)
865 
866             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
867             SSE_SHUFFLE(XMM7,XMM7,0x00)
868             SSE_MULT_PS(XMM7,XMM3)
869             SSE_SUB_PS(XMM6,XMM7)
870 
871             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
872             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
873 
874             /* Fourth Column */
875             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
876             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
877 
878             /* Matrix-Vector Product: */
879             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
880             SSE_SHUFFLE(XMM5,XMM5,0x00)
881             SSE_MULT_PS(XMM5,XMM0)
882             SSE_SUB_PS(XMM4,XMM5)
883 
884             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
885             SSE_SHUFFLE(XMM6,XMM6,0x00)
886             SSE_MULT_PS(XMM6,XMM1)
887             SSE_SUB_PS(XMM4,XMM6)
888 
889             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
890             SSE_SHUFFLE(XMM7,XMM7,0x00)
891             SSE_MULT_PS(XMM7,XMM2)
892             SSE_SUB_PS(XMM4,XMM7)
893 
894             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
895             SSE_SHUFFLE(XMM5,XMM5,0x00)
896             SSE_MULT_PS(XMM5,XMM3)
897             SSE_SUB_PS(XMM4,XMM5)
898 
899             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
900             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
901           SSE_INLINE_END_2;
902           pv   += 16;
903         }
904         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
905       }
906       row = *bjtmp++;
907 /*        row = (*bjtmp++)/4; */
908     }
909     /* finished row so stick it into b->a */
910     pv = ba + 16*bi[i];
911     pj = bj + bi[i];
912     nz = bi[i+1] - bi[i];
913 
914     /* Copy x block back into pv block */
915     for (j=0; j<nz; j++) {
916       x  = rtmp+16*pj[j];
917 /*        x  = rtmp+4*pj[j]; */
918 
919       SSE_INLINE_BEGIN_2(x,pv)
920         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
921         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
922         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
923 
924         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
925         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
926 
927         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
928         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
929 
930         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
931         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
932 
933         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
934         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
935 
936         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
937         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
938 
939         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
940         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
941 
942         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
943         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
944       SSE_INLINE_END_2;
945       pv += 16;
946     }
947     /* invert diagonal block */
948     w = ba + 16*diag_offset[i];
949     if (pivotinblocks) {
950       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
951     } else {
952       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
953     }
954 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
955     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
956   }
957 
958   ierr = PetscFree(rtmp);CHKERRQ(ierr);
959   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
960   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
961   C->assembled = PETSC_TRUE;
962   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
963   /* Flop Count from inverting diagonal blocks */
964   SSE_SCOPE_END;
965   PetscFunctionReturn(0);
966 }
967 
968 #undef __FUNCT__
969 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
970 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
971 {
972   Mat            A=C;
973   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
974   PetscErrorCode ierr;
975   int i,j,n = a->mbs;
976   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
977   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
978   unsigned int   row;
979   int            nz,*bi=b->i;
980   int            *diag_offset = b->diag,*ai=a->i;
981   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
982   MatScalar      *ba = b->a,*aa = a->a;
983   int            nonzero=0;
984 /*    int            nonzero=0,colscale = 16; */
985   PetscTruth     pivotinblocks = b->pivotinblocks;
986   PetscReal      shift = info->shiftinblocks;
987 
988   PetscFunctionBegin;
989   SSE_SCOPE_BEGIN;
990 
991   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
992   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
993   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
994   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
995 /*    if ((unsigned long)bj==(unsigned long)aj) { */
996 /*      colscale = 4; */
997 /*    } */
998 
999   for (i=0; i<n; i++) {
1000     nz    = bi[i+1] - bi[i];
1001     bjtmp = bj + bi[i];
1002     /* zero out the 4x4 block accumulators */
1003     /* zero out one register */
1004     XOR_PS(XMM7,XMM7);
1005     for  (j=0; j<nz; j++) {
1006       x = rtmp+16*((unsigned int)bjtmp[j]);
1007 /*        x = rtmp+4*bjtmp[j]; */
1008       SSE_INLINE_BEGIN_1(x)
1009         /* Copy zero register to memory locations */
1010         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1011         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1012         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1013         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1014         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1015         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1016         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1017         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1018         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1019       SSE_INLINE_END_1;
1020     }
1021     /* load in initial (unfactored row) */
1022     nz    = ai[i+1] - ai[i];
1023     ajtmp = aj + ai[i];
1024     v     = aa + 16*ai[i];
1025     for (j=0; j<nz; j++) {
1026       x = rtmp+16*((unsigned int)ajtmp[j]);
1027 /*        x = rtmp+colscale*ajtmp[j]; */
1028       /* Copy v block into x block */
1029       SSE_INLINE_BEGIN_2(v,x)
1030         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1031         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1032         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1033 
1034         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1035         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1036 
1037         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1038         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1039 
1040         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1041         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1042 
1043         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1044         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1045 
1046         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1047         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1048 
1049         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1050         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1051 
1052         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1053         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1054       SSE_INLINE_END_2;
1055 
1056       v += 16;
1057     }
1058 /*      row = (*bjtmp++)/4; */
1059     row = (unsigned int)(*bjtmp++);
1060     while (row < i) {
1061       pc  = rtmp + 16*row;
1062       SSE_INLINE_BEGIN_1(pc)
1063         /* Load block from lower triangle */
1064         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1065         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1066         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1067 
1068         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1069         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1070 
1071         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1072         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1073 
1074         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1075         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1076 
1077         /* Compare block to zero block */
1078 
1079         SSE_COPY_PS(XMM4,XMM7)
1080         SSE_CMPNEQ_PS(XMM4,XMM0)
1081 
1082         SSE_COPY_PS(XMM5,XMM7)
1083         SSE_CMPNEQ_PS(XMM5,XMM1)
1084 
1085         SSE_COPY_PS(XMM6,XMM7)
1086         SSE_CMPNEQ_PS(XMM6,XMM2)
1087 
1088         SSE_CMPNEQ_PS(XMM7,XMM3)
1089 
1090         /* Reduce the comparisons to one SSE register */
1091         SSE_OR_PS(XMM6,XMM7)
1092         SSE_OR_PS(XMM5,XMM4)
1093         SSE_OR_PS(XMM5,XMM6)
1094       SSE_INLINE_END_1;
1095 
1096       /* Reduce the one SSE register to an integer register for branching */
1097       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1098       MOVEMASK(nonzero,XMM5);
1099 
1100       /* If block is nonzero ... */
1101       if (nonzero) {
1102         pv = ba + 16*diag_offset[row];
1103         PREFETCH_L1(&pv[16]);
1104         pj = bj + diag_offset[row] + 1;
1105 
1106         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1107         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1108         /* but the diagonal was inverted already */
1109         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1110 
1111         SSE_INLINE_BEGIN_2(pv,pc)
1112           /* Column 0, product is accumulated in XMM4 */
1113           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1114           SSE_SHUFFLE(XMM4,XMM4,0x00)
1115           SSE_MULT_PS(XMM4,XMM0)
1116 
1117           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1118           SSE_SHUFFLE(XMM5,XMM5,0x00)
1119           SSE_MULT_PS(XMM5,XMM1)
1120           SSE_ADD_PS(XMM4,XMM5)
1121 
1122           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1123           SSE_SHUFFLE(XMM6,XMM6,0x00)
1124           SSE_MULT_PS(XMM6,XMM2)
1125           SSE_ADD_PS(XMM4,XMM6)
1126 
1127           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1128           SSE_SHUFFLE(XMM7,XMM7,0x00)
1129           SSE_MULT_PS(XMM7,XMM3)
1130           SSE_ADD_PS(XMM4,XMM7)
1131 
1132           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1133           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1134 
1135           /* Column 1, product is accumulated in XMM5 */
1136           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1137           SSE_SHUFFLE(XMM5,XMM5,0x00)
1138           SSE_MULT_PS(XMM5,XMM0)
1139 
1140           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1141           SSE_SHUFFLE(XMM6,XMM6,0x00)
1142           SSE_MULT_PS(XMM6,XMM1)
1143           SSE_ADD_PS(XMM5,XMM6)
1144 
1145           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1146           SSE_SHUFFLE(XMM7,XMM7,0x00)
1147           SSE_MULT_PS(XMM7,XMM2)
1148           SSE_ADD_PS(XMM5,XMM7)
1149 
1150           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1151           SSE_SHUFFLE(XMM6,XMM6,0x00)
1152           SSE_MULT_PS(XMM6,XMM3)
1153           SSE_ADD_PS(XMM5,XMM6)
1154 
1155           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1156           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1157 
1158           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1159 
1160           /* Column 2, product is accumulated in XMM6 */
1161           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1162           SSE_SHUFFLE(XMM6,XMM6,0x00)
1163           SSE_MULT_PS(XMM6,XMM0)
1164 
1165           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1166           SSE_SHUFFLE(XMM7,XMM7,0x00)
1167           SSE_MULT_PS(XMM7,XMM1)
1168           SSE_ADD_PS(XMM6,XMM7)
1169 
1170           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1171           SSE_SHUFFLE(XMM7,XMM7,0x00)
1172           SSE_MULT_PS(XMM7,XMM2)
1173           SSE_ADD_PS(XMM6,XMM7)
1174 
1175           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1176           SSE_SHUFFLE(XMM7,XMM7,0x00)
1177           SSE_MULT_PS(XMM7,XMM3)
1178           SSE_ADD_PS(XMM6,XMM7)
1179 
1180           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1181           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1182 
1183           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1184           /* Column 3, product is accumulated in XMM0 */
1185           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1186           SSE_SHUFFLE(XMM7,XMM7,0x00)
1187           SSE_MULT_PS(XMM0,XMM7)
1188 
1189           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1190           SSE_SHUFFLE(XMM7,XMM7,0x00)
1191           SSE_MULT_PS(XMM1,XMM7)
1192           SSE_ADD_PS(XMM0,XMM1)
1193 
1194           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1195           SSE_SHUFFLE(XMM1,XMM1,0x00)
1196           SSE_MULT_PS(XMM1,XMM2)
1197           SSE_ADD_PS(XMM0,XMM1)
1198 
1199           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1200           SSE_SHUFFLE(XMM7,XMM7,0x00)
1201           SSE_MULT_PS(XMM3,XMM7)
1202           SSE_ADD_PS(XMM0,XMM3)
1203 
1204           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1205           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1206 
1207           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1208           /* This is code to be maintained and read by humans after all. */
1209           /* Copy Multiplier Col 3 into XMM3 */
1210           SSE_COPY_PS(XMM3,XMM0)
1211           /* Copy Multiplier Col 2 into XMM2 */
1212           SSE_COPY_PS(XMM2,XMM6)
1213           /* Copy Multiplier Col 1 into XMM1 */
1214           SSE_COPY_PS(XMM1,XMM5)
1215           /* Copy Multiplier Col 0 into XMM0 */
1216           SSE_COPY_PS(XMM0,XMM4)
1217         SSE_INLINE_END_2;
1218 
1219         /* Update the row: */
1220         nz = bi[row+1] - diag_offset[row] - 1;
1221         pv += 16;
1222         for (j=0; j<nz; j++) {
1223           PREFETCH_L1(&pv[16]);
1224           x = rtmp + 16*((unsigned int)pj[j]);
1225 /*            x = rtmp + 4*pj[j]; */
1226 
1227           /* X:=X-M*PV, One column at a time */
1228           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1229           SSE_INLINE_BEGIN_2(x,pv)
1230             /* Load First Column of X*/
1231             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1232             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1233 
1234             /* Matrix-Vector Product: */
1235             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1236             SSE_SHUFFLE(XMM5,XMM5,0x00)
1237             SSE_MULT_PS(XMM5,XMM0)
1238             SSE_SUB_PS(XMM4,XMM5)
1239 
1240             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1241             SSE_SHUFFLE(XMM6,XMM6,0x00)
1242             SSE_MULT_PS(XMM6,XMM1)
1243             SSE_SUB_PS(XMM4,XMM6)
1244 
1245             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1246             SSE_SHUFFLE(XMM7,XMM7,0x00)
1247             SSE_MULT_PS(XMM7,XMM2)
1248             SSE_SUB_PS(XMM4,XMM7)
1249 
1250             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1251             SSE_SHUFFLE(XMM5,XMM5,0x00)
1252             SSE_MULT_PS(XMM5,XMM3)
1253             SSE_SUB_PS(XMM4,XMM5)
1254 
1255             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1256             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1257 
1258             /* Second Column */
1259             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1260             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1261 
1262             /* Matrix-Vector Product: */
1263             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1264             SSE_SHUFFLE(XMM6,XMM6,0x00)
1265             SSE_MULT_PS(XMM6,XMM0)
1266             SSE_SUB_PS(XMM5,XMM6)
1267 
1268             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1269             SSE_SHUFFLE(XMM7,XMM7,0x00)
1270             SSE_MULT_PS(XMM7,XMM1)
1271             SSE_SUB_PS(XMM5,XMM7)
1272 
1273             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1274             SSE_SHUFFLE(XMM4,XMM4,0x00)
1275             SSE_MULT_PS(XMM4,XMM2)
1276             SSE_SUB_PS(XMM5,XMM4)
1277 
1278             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1279             SSE_SHUFFLE(XMM6,XMM6,0x00)
1280             SSE_MULT_PS(XMM6,XMM3)
1281             SSE_SUB_PS(XMM5,XMM6)
1282 
1283             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1284             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1285 
1286             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1287 
1288             /* Third Column */
1289             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1290             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1291 
1292             /* Matrix-Vector Product: */
1293             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1294             SSE_SHUFFLE(XMM7,XMM7,0x00)
1295             SSE_MULT_PS(XMM7,XMM0)
1296             SSE_SUB_PS(XMM6,XMM7)
1297 
1298             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1299             SSE_SHUFFLE(XMM4,XMM4,0x00)
1300             SSE_MULT_PS(XMM4,XMM1)
1301             SSE_SUB_PS(XMM6,XMM4)
1302 
1303             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1304             SSE_SHUFFLE(XMM5,XMM5,0x00)
1305             SSE_MULT_PS(XMM5,XMM2)
1306             SSE_SUB_PS(XMM6,XMM5)
1307 
1308             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1309             SSE_SHUFFLE(XMM7,XMM7,0x00)
1310             SSE_MULT_PS(XMM7,XMM3)
1311             SSE_SUB_PS(XMM6,XMM7)
1312 
1313             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1314             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1315 
1316             /* Fourth Column */
1317             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1318             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1319 
1320             /* Matrix-Vector Product: */
1321             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1322             SSE_SHUFFLE(XMM5,XMM5,0x00)
1323             SSE_MULT_PS(XMM5,XMM0)
1324             SSE_SUB_PS(XMM4,XMM5)
1325 
1326             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1327             SSE_SHUFFLE(XMM6,XMM6,0x00)
1328             SSE_MULT_PS(XMM6,XMM1)
1329             SSE_SUB_PS(XMM4,XMM6)
1330 
1331             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1332             SSE_SHUFFLE(XMM7,XMM7,0x00)
1333             SSE_MULT_PS(XMM7,XMM2)
1334             SSE_SUB_PS(XMM4,XMM7)
1335 
1336             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1337             SSE_SHUFFLE(XMM5,XMM5,0x00)
1338             SSE_MULT_PS(XMM5,XMM3)
1339             SSE_SUB_PS(XMM4,XMM5)
1340 
1341             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1342             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1343           SSE_INLINE_END_2;
1344           pv   += 16;
1345         }
1346         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1347       }
1348       row = (unsigned int)(*bjtmp++);
1349 /*        row = (*bjtmp++)/4; */
1350 /*        bjtmp++; */
1351     }
1352     /* finished row so stick it into b->a */
1353     pv = ba + 16*bi[i];
1354     pj = bj + bi[i];
1355     nz = bi[i+1] - bi[i];
1356 
1357     /* Copy x block back into pv block */
1358     for (j=0; j<nz; j++) {
1359       x  = rtmp+16*((unsigned int)pj[j]);
1360 /*        x  = rtmp+4*pj[j]; */
1361 
1362       SSE_INLINE_BEGIN_2(x,pv)
1363         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1364         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1365         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1366 
1367         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1368         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1369 
1370         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1371         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1372 
1373         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1374         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1375 
1376         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1377         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1378 
1379         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1380         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1381 
1382         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1383         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1384 
1385         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1386         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1387       SSE_INLINE_END_2;
1388       pv += 16;
1389     }
1390     /* invert diagonal block */
1391     w = ba + 16*diag_offset[i];
1392     if (pivotinblocks) {
1393       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1394     } else {
1395       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1396     }
1397 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1398     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1399   }
1400 
1401   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1402   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1403   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1404   C->assembled = PETSC_TRUE;
1405   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1406   /* Flop Count from inverting diagonal blocks */
1407   SSE_SCOPE_END;
1408   PetscFunctionReturn(0);
1409 }
1410 
1411 #undef __FUNCT__
1412 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1413 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1414 {
1415   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1416   PetscErrorCode ierr;
1417   int  i,j,n = a->mbs;
1418   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1419   unsigned int   row;
1420   int            *ajtmpold,nz,*bi=b->i;
1421   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1422   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1423   MatScalar      *ba = b->a,*aa = a->a;
1424   int            nonzero=0;
1425 /*    int            nonzero=0,colscale = 16; */
1426   PetscTruth     pivotinblocks = b->pivotinblocks;
1427   PetscReal      shift = info->shiftinblocks;
1428 
1429   PetscFunctionBegin;
1430   SSE_SCOPE_BEGIN;
1431 
1432   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16-byte aligned. SSE will not work.");
1433   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16-byte aligned. SSE will not work.");
1434   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1435   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16-byte aligned. SSE will not work.");
1436 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1437 /*      colscale = 4; */
1438 /*    } */
1439   if ((unsigned long)bj==(unsigned long)aj) {
1440     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1441   }
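  /* Editor's note: the branch above defers to the specialized in-place routine when
     b->j aliases a->j, i.e. when the factor reuses the original matrix's column
     index array instead of a separate copy. */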
1442 
1443   for (i=0; i<n; i++) {
1444     nz    = bi[i+1] - bi[i];
1445     bjtmp = bj + bi[i];
1446     /* zero out the 4x4 block accumulators */
1447     /* zero out one register */
1448     XOR_PS(XMM7,XMM7);
1449     for  (j=0; j<nz; j++) {
1450       x = rtmp+16*((unsigned int)bjtmp[j]);
1451 /*        x = rtmp+4*bjtmp[j]; */
1452       SSE_INLINE_BEGIN_1(x)
1453         /* Copy zero register to memory locations */
1454         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1455         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1456         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1457         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1458         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1459         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1460         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1461         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1462         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1463       SSE_INLINE_END_1;
1464     }
1465     /* load in initial (unfactored row) */
1466     nz       = ai[i+1] - ai[i];
1467     ajtmpold = aj + ai[i];
1468     v        = aa + 16*ai[i];
1469     for (j=0; j<nz; j++) {
1470       x = rtmp+16*ajtmpold[j];
1471 /*        x = rtmp+colscale*ajtmpold[j]; */
1472       /* Copy v block into x block */
1473       SSE_INLINE_BEGIN_2(v,x)
1474         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1475         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1476         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1477 
1478         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1479         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1480 
1481         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1482         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1483 
1484         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1485         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1486 
1487         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1488         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1489 
1490         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1491         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1492 
1493         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1494         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1495 
1496         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1497         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1498       SSE_INLINE_END_2;
1499 
1500       v += 16;
1501     }
1502 /*      row = (*bjtmp++)/4; */
1503     row = (unsigned int)(*bjtmp++);
1504     while (row < i) {
1505       pc  = rtmp + 16*row;
1506       SSE_INLINE_BEGIN_1(pc)
1507         /* Load block from lower triangle */
1508         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1509         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1510         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1511 
1512         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1513         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1514 
1515         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1516         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1517 
1518         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1519         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1520 
1521         /* Compare block to zero block */
1522 
1523         SSE_COPY_PS(XMM4,XMM7)
1524         SSE_CMPNEQ_PS(XMM4,XMM0)
1525 
1526         SSE_COPY_PS(XMM5,XMM7)
1527         SSE_CMPNEQ_PS(XMM5,XMM1)
1528 
1529         SSE_COPY_PS(XMM6,XMM7)
1530         SSE_CMPNEQ_PS(XMM6,XMM2)
1531 
1532         SSE_CMPNEQ_PS(XMM7,XMM3)
1533 
1534         /* Reduce the comparisons to one SSE register */
1535         SSE_OR_PS(XMM6,XMM7)
1536         SSE_OR_PS(XMM5,XMM4)
1537         SSE_OR_PS(XMM5,XMM6)
1538       SSE_INLINE_END_1;
1539 
1540       /* Reduce the one SSE register to an integer register for branching */
1541       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1542       MOVEMASK(nonzero,XMM5);
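      /* Editor's note: MOVEMASK gathers the sign bits of the four lanes of XMM5, so
         after the CMPNEQ/OR reduction above, nonzero is set whenever any of the 16
         entries of the block at pc differs from zero; blocks detected as all-zero
         can then be skipped below. */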
1543 
1544       /* If block is nonzero ... */
1545       if (nonzero) {
1546         pv = ba + 16*diag_offset[row];
1547         PREFETCH_L1(&pv[16]);
1548         pj = bj + diag_offset[row] + 1;
1549 
1550         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1551         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1552         /* but the diagonal was inverted already */
1553         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
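        /* Editor's note (assuming the column-major block layout of the scalar BAIJ
           kernels): pv points at the already-inverted diagonal block, each column of
           the multiplier is formed as a linear combination of the columns of L_ij in
           XMM0-XMM3, and the result overwrites pc in place; compare the scalar
           Sketch_BlockMultiplier_4() sketch added after the previous function. */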
1554 
1555         SSE_INLINE_BEGIN_2(pv,pc)
1556           /* Column 0, product is accumulated in XMM4 */
1557           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1558           SSE_SHUFFLE(XMM4,XMM4,0x00)
1559           SSE_MULT_PS(XMM4,XMM0)
1560 
1561           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1562           SSE_SHUFFLE(XMM5,XMM5,0x00)
1563           SSE_MULT_PS(XMM5,XMM1)
1564           SSE_ADD_PS(XMM4,XMM5)
1565 
1566           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1567           SSE_SHUFFLE(XMM6,XMM6,0x00)
1568           SSE_MULT_PS(XMM6,XMM2)
1569           SSE_ADD_PS(XMM4,XMM6)
1570 
1571           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1572           SSE_SHUFFLE(XMM7,XMM7,0x00)
1573           SSE_MULT_PS(XMM7,XMM3)
1574           SSE_ADD_PS(XMM4,XMM7)
1575 
1576           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1577           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1578 
1579           /* Column 1, product is accumulated in XMM5 */
1580           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1581           SSE_SHUFFLE(XMM5,XMM5,0x00)
1582           SSE_MULT_PS(XMM5,XMM0)
1583 
1584           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1585           SSE_SHUFFLE(XMM6,XMM6,0x00)
1586           SSE_MULT_PS(XMM6,XMM1)
1587           SSE_ADD_PS(XMM5,XMM6)
1588 
1589           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1590           SSE_SHUFFLE(XMM7,XMM7,0x00)
1591           SSE_MULT_PS(XMM7,XMM2)
1592           SSE_ADD_PS(XMM5,XMM7)
1593 
1594           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1595           SSE_SHUFFLE(XMM6,XMM6,0x00)
1596           SSE_MULT_PS(XMM6,XMM3)
1597           SSE_ADD_PS(XMM5,XMM6)
1598 
1599           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1600           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1601 
1602           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1603 
1604           /* Column 2, product is accumulated in XMM6 */
1605           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1606           SSE_SHUFFLE(XMM6,XMM6,0x00)
1607           SSE_MULT_PS(XMM6,XMM0)
1608 
1609           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1610           SSE_SHUFFLE(XMM7,XMM7,0x00)
1611           SSE_MULT_PS(XMM7,XMM1)
1612           SSE_ADD_PS(XMM6,XMM7)
1613 
1614           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1615           SSE_SHUFFLE(XMM7,XMM7,0x00)
1616           SSE_MULT_PS(XMM7,XMM2)
1617           SSE_ADD_PS(XMM6,XMM7)
1618 
1619           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1620           SSE_SHUFFLE(XMM7,XMM7,0x00)
1621           SSE_MULT_PS(XMM7,XMM3)
1622           SSE_ADD_PS(XMM6,XMM7)
1623 
1624           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1625           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1626 
1627           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1628           /* Column 3, product is accumulated in XMM0 */
1629           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1630           SSE_SHUFFLE(XMM7,XMM7,0x00)
1631           SSE_MULT_PS(XMM0,XMM7)
1632 
1633           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1634           SSE_SHUFFLE(XMM7,XMM7,0x00)
1635           SSE_MULT_PS(XMM1,XMM7)
1636           SSE_ADD_PS(XMM0,XMM1)
1637 
1638           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1639           SSE_SHUFFLE(XMM1,XMM1,0x00)
1640           SSE_MULT_PS(XMM1,XMM2)
1641           SSE_ADD_PS(XMM0,XMM1)
1642 
1643           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1644           SSE_SHUFFLE(XMM7,XMM7,0x00)
1645           SSE_MULT_PS(XMM3,XMM7)
1646           SSE_ADD_PS(XMM0,XMM3)
1647 
1648           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1649           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1650 
1651           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1652           /* This is code to be maintained and read by humans after all. */
1653           /* Copy Multiplier Col 3 into XMM3 */
1654           SSE_COPY_PS(XMM3,XMM0)
1655           /* Copy Multiplier Col 2 into XMM2 */
1656           SSE_COPY_PS(XMM2,XMM6)
1657           /* Copy Multiplier Col 1 into XMM1 */
1658           SSE_COPY_PS(XMM1,XMM5)
1659           /* Copy Multiplier Col 0 into XMM0 */
1660           SSE_COPY_PS(XMM0,XMM4)
1661         SSE_INLINE_END_2;
1662 
1663         /* Update the row: */
1664         nz = bi[row+1] - diag_offset[row] - 1;
1665         pv += 16;
1666         for (j=0; j<nz; j++) {
1667           PREFETCH_L1(&pv[16]);
1668           x = rtmp + 16*((unsigned int)pj[j]);
1669 /*            x = rtmp + 4*pj[j]; */
1670 
1671           /* X:=X-M*PV, One column at a time */
1672           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
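          /* Editor's note: pv now walks the U-factor blocks to the right of the
             diagonal of row 'row' (pj holds their block-column indices) and x is the
             matching block of the active row i in rtmp; each pass subtracts M times
             one 4x4 block, as in the scalar Sketch_BlockUpdate_4() sketch above. */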
1673           SSE_INLINE_BEGIN_2(x,pv)
1674             /* Load First Column of X*/
1675             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1676             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1677 
1678             /* Matrix-Vector Product: */
1679             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1680             SSE_SHUFFLE(XMM5,XMM5,0x00)
1681             SSE_MULT_PS(XMM5,XMM0)
1682             SSE_SUB_PS(XMM4,XMM5)
1683 
1684             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1685             SSE_SHUFFLE(XMM6,XMM6,0x00)
1686             SSE_MULT_PS(XMM6,XMM1)
1687             SSE_SUB_PS(XMM4,XMM6)
1688 
1689             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1690             SSE_SHUFFLE(XMM7,XMM7,0x00)
1691             SSE_MULT_PS(XMM7,XMM2)
1692             SSE_SUB_PS(XMM4,XMM7)
1693 
1694             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1695             SSE_SHUFFLE(XMM5,XMM5,0x00)
1696             SSE_MULT_PS(XMM5,XMM3)
1697             SSE_SUB_PS(XMM4,XMM5)
1698 
1699             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1700             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1701 
1702             /* Second Column */
1703             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1704             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1705 
1706             /* Matrix-Vector Product: */
1707             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1708             SSE_SHUFFLE(XMM6,XMM6,0x00)
1709             SSE_MULT_PS(XMM6,XMM0)
1710             SSE_SUB_PS(XMM5,XMM6)
1711 
1712             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1713             SSE_SHUFFLE(XMM7,XMM7,0x00)
1714             SSE_MULT_PS(XMM7,XMM1)
1715             SSE_SUB_PS(XMM5,XMM7)
1716 
1717             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1718             SSE_SHUFFLE(XMM4,XMM4,0x00)
1719             SSE_MULT_PS(XMM4,XMM2)
1720             SSE_SUB_PS(XMM5,XMM4)
1721 
1722             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1723             SSE_SHUFFLE(XMM6,XMM6,0x00)
1724             SSE_MULT_PS(XMM6,XMM3)
1725             SSE_SUB_PS(XMM5,XMM6)
1726 
1727             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1728             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1729 
1730             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1731 
1732             /* Third Column */
1733             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1734             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1735 
1736             /* Matrix-Vector Product: */
1737             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1738             SSE_SHUFFLE(XMM7,XMM7,0x00)
1739             SSE_MULT_PS(XMM7,XMM0)
1740             SSE_SUB_PS(XMM6,XMM7)
1741 
1742             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1743             SSE_SHUFFLE(XMM4,XMM4,0x00)
1744             SSE_MULT_PS(XMM4,XMM1)
1745             SSE_SUB_PS(XMM6,XMM4)
1746 
1747             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1748             SSE_SHUFFLE(XMM5,XMM5,0x00)
1749             SSE_MULT_PS(XMM5,XMM2)
1750             SSE_SUB_PS(XMM6,XMM5)
1751 
1752             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1753             SSE_SHUFFLE(XMM7,XMM7,0x00)
1754             SSE_MULT_PS(XMM7,XMM3)
1755             SSE_SUB_PS(XMM6,XMM7)
1756 
1757             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1758             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1759 
1760             /* Fourth Column */
1761             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1762             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1763 
1764             /* Matrix-Vector Product: */
1765             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1766             SSE_SHUFFLE(XMM5,XMM5,0x00)
1767             SSE_MULT_PS(XMM5,XMM0)
1768             SSE_SUB_PS(XMM4,XMM5)
1769 
1770             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1771             SSE_SHUFFLE(XMM6,XMM6,0x00)
1772             SSE_MULT_PS(XMM6,XMM1)
1773             SSE_SUB_PS(XMM4,XMM6)
1774 
1775             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1776             SSE_SHUFFLE(XMM7,XMM7,0x00)
1777             SSE_MULT_PS(XMM7,XMM2)
1778             SSE_SUB_PS(XMM4,XMM7)
1779 
1780             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1781             SSE_SHUFFLE(XMM5,XMM5,0x00)
1782             SSE_MULT_PS(XMM5,XMM3)
1783             SSE_SUB_PS(XMM4,XMM5)
1784 
1785             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1786             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1787           SSE_INLINE_END_2;
1788           pv   += 16;
1789         }
1790         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1791       }
1792       row = (unsigned int)(*bjtmp++);
1793 /*        row = (*bjtmp++)/4; */
1794 /*        bjtmp++; */
1795     }
1796     /* finished row so stick it into b->a */
1797     pv = ba + 16*bi[i];
1798     pj = bj + bi[i];
1799     nz = bi[i+1] - bi[i];
1800 
1801     /* Copy x block back into pv block */
1802     for (j=0; j<nz; j++) {
1803       x  = rtmp+16*((unsigned int)pj[j]);
1804 /*        x  = rtmp+4*pj[j]; */
1805 
1806       SSE_INLINE_BEGIN_2(x,pv)
1807         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1808         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1809         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1810 
1811         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1812         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1813 
1814         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1815         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1816 
1817         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1818         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1819 
1820         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1821         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1822 
1823         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1824         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1825 
1826         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1827         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1828 
1829         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1830         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1831       SSE_INLINE_END_2;
1832       pv += 16;
1833     }
1834     /* invert diagonal block */
1835     w = ba + 16*diag_offset[i];
1836     if (pivotinblocks) {
1837       ierr = Kernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1838     } else {
1839       ierr = Kernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1840     }
1841 /*      ierr = Kernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1842     /* Note: Using Cramer's rule, the flop count below may be unfairly high or low. */
1843   }
1844 
1845   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1846   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1847   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1848   C->assembled = PETSC_TRUE;
1849   /* Flop count from inverting the diagonal blocks */
1850   ierr = PetscLogFlops(1.3333*64*b->mbs);CHKERRQ(ierr);
1851   SSE_SCOPE_END;
1852   PetscFunctionReturn(0);
1853 }
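/*
   Editorial sketch (not part of the original source): the _usj variants above read
   b->j through an unsigned short pointer, which presumes the block column indices
   have been repacked into 16-bit storage (so mbs must stay below 65536).  A
   hypothetical repacking helper, shown only to illustrate the idea:
*/
static PetscErrorCode Sketch_PackColumnIndices_usj(int nz,const int *j32,unsigned short *j16)
{
  int k;
  PetscFunctionBegin;
  for (k=0; k<nz; k++) {
    if (j32[k] < 0 || j32[k] > 65535) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Block column index does not fit in unsigned short");
    j16[k] = (unsigned short)j32[k];
  }
  PetscFunctionReturn(0);
}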
1854 
1855 #endif
1856