xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision f7cf7585ec4ab5bbe01f2b7df7e992ca325859cc)
1 /* Using Modified Sparse Row (MSR) storage.
2 See page 85, "Iterative Methods ..." by Saad. */
3 
4 /*$Id: sbaijfact.c,v 1.2 2000/06/23 22:00:05 buschelm Exp balay $*/
5 /*
6     Factorization code for SBAIJ format.
7 */
8 #include "sbaij.h"
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/ilu.h"
12 
13 #undef __FUNC__
14 #define __FUNC__ "MatLUFactorSymbolic_SeqSBAIJ"
15 int MatLUFactorSymbolic_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatLUInfo *info,Mat *B)
16 {
17   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
18   IS          isicol;
19   int         *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
20   int         *jutmp,bs = a->bs,bs2=a->bs2;
21   int         m,nzi,realloc = 0;
22   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
23   PetscReal   f = 1.0;
24 
25   PetscFunctionBegin;
26   PetscValidHeaderSpecific(isrow,IS_COOKIE);
27   PetscValidHeaderSpecific(iscol,IS_COOKIE);
28   /* if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square");*/
29   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
30   ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr);
31   ierr = ISGetIndices(isicol,&riip);CHKERRQ(ierr);
32   for (k=0; k<mbs; k++) {
33     if ( rip[k] - riip[k] != 0 ) {
34       printf("Non-symm. permutation, use symm. permutation or general matrix format\n");
35       break;
36     }
37   }
38 
39   /* initialization */
40   /* Don't know how many column pointers are needed so estimate.
41      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
42   if (info) f = info->fill;
43   umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
44   ju = iu = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju);
45   iu[0] = mbs+1;
46   juptr = mbs;
47   jl =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
48   q  =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q);
49   for (i=0; i<mbs; i++){
50     jl[i] = mbs; q[i] = 0;
51   }
52 
53   /* for each row k */
54   for (k=0; k<mbs; k++){
55     nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
56     q[k] = mbs;
57     /* initialize nonzero structure of k-th row to row rip[k] of A */
58     jmin = ai[rip[k]];
59     jmax = ai[rip[k]+1];
60     for (j=jmin; j<jmax; j++){
61       vj = riip[aj[j]]; /* col. value */
62       if(vj > k){
63         qm = k;
64         do {
65           m  = qm; qm = q[m];
66         } while(qm < vj);
67         if (qm == vj) {
68           printf(" error: duplicate entry in A\n"); break;
69         }
70         nzk++;
71         q[m] = vj;
72         q[vj] = qm;
73       } /* if(vj > k) */
74     } /* for (j=jmin; j<jmax; j++) */
75 
76     /* modify nonzero structure of k-th row by computing fill-in
77        for each row i to be merged in */
78     i = k;
79     i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
80     /* printf(" next pivot row i=%d\n",i); */
81     while (i < mbs){
82       /* merge row i into k-th row */
83       nzi = iu[i+1] - (iu[i]+1);
84       jmin = iu[i] + 1; jmax = iu[i] + nzi;
85       qm = k;
86       for (j=jmin; j<jmax+1; j++){
87         vj = ju[j];
88         do {
89           m = qm; qm = q[m];
90         } while (qm < vj);
91         if (qm != vj){
92          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
93         }
94       }
95       i = jl[i]; /* next pivot row */
96     }
97 
98     /* add k to row list for first nonzero element in k-th row */
99     if (nzk > 0){
100       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
101       jl[k] = jl[i]; jl[i] = k;
102     }
103     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
104 
105     /* allocate more space to ju if needed */
106     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
107       /* estimate how much additional space we will need */
108       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
109       /* just double the memory each time */
110       maxadd = umax;
111       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
112       umax += maxadd;
113 
114       /* allocate a longer ju (NOTE: iu poits to the beginning of ju) */
115       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
116       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
117       ierr = PetscFree(ju);CHKERRQ(ierr);
118       ju = iu = jutmp;
119       realloc++; /* count how many times we realloc */
120     }
121 
122     /* save nonzero structure of k-th row in ju */
123     i=k;
124     jumin = juptr + 1; juptr += nzk;
125     for (j=jumin; j<juptr+1; j++){
126       i=q[i];
127       ju[j]=i;
128       /* printf(" k=%d, ju[%d]=%d\n",k,j,ju[j]);*/
129     }
130     /* printf("\n");  */
131   } /* for (k=0; k<mbs; k++) */
132 
133   if (ai[mbs] != 0) {
134     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
135     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
136     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
137     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
138     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:for best performance.\n");
139   } else {
140      PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
141   }
142 
143   ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr);
144   ierr = ISRestoreIndices(isicol,&riip);CHKERRQ(ierr);
145 
146   ierr = PetscFree(q);CHKERRQ(ierr);
147   ierr = PetscFree(jl);CHKERRQ(ierr);
148 
149   /* put together the new matrix */
150   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
151   PLogObjectParent(*B,isicol);
152   b = (Mat_SeqSBAIJ*)(*B)->data;
153   ierr = PetscFree(b->imax);CHKERRQ(ierr);
154   b->singlemalloc = PETSC_FALSE;
155   /* the next line frees the default space generated by the Create() */
156   ierr = PetscFree(b->a);CHKERRQ(ierr);
157   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
158   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a);
159   b->j          = ju;
160   b->i          = iu;
161   b->diag       = 0;
162   b->ilen       = 0;
163   b->imax       = 0;
164   b->row        = isrow;
165   b->col        = iscol;
166   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
167   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
168   b->icol       = isicol;
169   b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
170   /* In b structure:  Free imax, ilen, old a, old j.
171      Allocate idnew, solve_work, new a, new j */
172   PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
173   b->s_maxnz = b->s_nz = iu[mbs];
174 
175   (*B)->factor                 = FACTOR_LU;
176   (*B)->info.factor_mallocs    = realloc;
177   (*B)->info.fill_ratio_given  = f;
178   if (ai[mbs] != 0) {
179     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
180   } else {
181     (*B)->info.fill_ratio_needed = 0.0;
182   }
183 
184 
185   PetscFunctionReturn(0);
186 }
187 
188 /* ----------------------------------------------------------- */
189 #undef __FUNC__
190 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_N"
191 int MatLUFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
192 {
193   Mat                C = *B;
194   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
195   IS                 isrow = b->row,isicol = b->icol;
196   int                *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
197   int                *ajtmpold,*ajtmp,nz,row,bslog,*ai=a->i,*aj=a->j,k,flg;
198   int                *diag_offset=b->diag,diag,bs=a->bs,bs2 = a->bs2,*v_pivots,*pj;
199   MatScalar          *ba = b->a,*aa = a->a,*pv,*v,*rtmp,*multiplier,*v_work,*pc,*w;
200 
201   PetscFunctionBegin;
202   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
203   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
204   rtmp = (MatScalar*)PetscMalloc(bs2*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
205   ierr = PetscMemzero(rtmp,bs2*(n+1)*sizeof(MatScalar));CHKERRQ(ierr);
206   /* generate work space needed by dense LU factorization */
207   v_work     = (MatScalar*)PetscMalloc(bs*sizeof(int) + (bs+bs2)*sizeof(MatScalar));CHKPTRQ(v_work);
208   multiplier = v_work + bs;
209   v_pivots   = (int*)(multiplier + bs2);
210 
211   /* flops in while loop */
212   bslog = 2*bs*bs2;
213 
214   for (i=0; i<n; i++) {
215     nz    = bi[i+1] - bi[i];
216     ajtmp = bj + bi[i];
217     for  (j=0; j<nz; j++) {
218       ierr = PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
219     }
220     /* load in initial (unfactored row) */
221     nz       = ai[r[i]+1] - ai[r[i]];
222     ajtmpold = aj + ai[r[i]];
223     v        = aa + bs2*ai[r[i]];
224     for (j=0; j<nz; j++) {
225       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
226     }
227     row = *ajtmp++;
228     while (row < i) {
229       pc = rtmp + bs2*row;
230 /*      if (*pc) { */
231       for (flg=0,k=0; k<bs2; k++) { if (pc[k]!=0.0) { flg =1; break; }}
232       if (flg) {
233         pv = ba + bs2*diag_offset[row];
234         pj = bj + diag_offset[row] + 1;
235         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier);
236         nz = bi[row+1] - diag_offset[row] - 1;
237         pv += bs2;
238         for (j=0; j<nz; j++) {
239           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
240         }
241         PLogFlops(bslog*(nz+1)-bs);
242       }
243         row = *ajtmp++;
244     }
245     /* finished row so stick it into b->a */
246     pv = ba + bs2*bi[i];
247     pj = bj + bi[i];
248     nz = bi[i+1] - bi[i];
249     for (j=0; j<nz; j++) {
250       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
251     }
252     diag = diag_offset[i] - bi[i];
253     /* invert diagonal block */
254     w = pv + bs2*diag;
255     Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);
256   }
257 
258   ierr = PetscFree(rtmp);CHKERRQ(ierr);
259   ierr = PetscFree(v_work);CHKERRQ(ierr);
260   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
261   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
262   C->factor = FACTOR_LU;
263   C->assembled = PETSC_TRUE;
264   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
265   PetscFunctionReturn(0);
266 }
267 /* ------------------------------------------------------------*/
268 /*
269       Version for when blocks are 7 by 7
270 */
271 #undef __FUNC__
272 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_7"
273 int MatLUFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B)
274 {
275   Mat         C = *B;
276   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
277   IS          isrow = b->row,isicol = b->icol;
278   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
279   int         *ajtmpold,*ajtmp,nz,row;
280   int         *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
281   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
282   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
283   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
284   MatScalar   x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14;
285   MatScalar   p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12;
286   MatScalar   m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
287   MatScalar   p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
288   MatScalar   p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49;
289   MatScalar   x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
290   MatScalar   x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49;
291   MatScalar   m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
292   MatScalar   m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49;
293   MatScalar   *ba = b->a,*aa = a->a;
294 
295   PetscFunctionBegin;
296   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
297   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
298   rtmp  = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
299 
300   for (i=0; i<n; i++) {
301     nz    = bi[i+1] - bi[i];
302     ajtmp = bj + bi[i];
303     for  (j=0; j<nz; j++) {
304       x = rtmp+49*ajtmp[j];
305       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
306       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
307       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
308       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
309       x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ;
310       x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ;
311     }
312     /* load in initial (unfactored row) */
313     idx      = r[i];
314     nz       = ai[idx+1] - ai[idx];
315     ajtmpold = aj + ai[idx];
316     v        = aa + 49*ai[idx];
317     for (j=0; j<nz; j++) {
318       x    = rtmp+49*ic[ajtmpold[j]];
319       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
320       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
321       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
322       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
323       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
324       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
325       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
326       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
327       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
328       x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39];
329       x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43];
330       x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47];
331       x[48] = v[48];
332       v    += 49;
333     }
334     row = *ajtmp++;
335     while (row < i) {
336       pc  =  rtmp + 49*row;
337       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
338       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
339       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
340       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
341       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
342       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
343       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
344       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
345       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
346       p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39];
347       p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43];
348       p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47];
349       p49 = pc[48];
350       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
351           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
352           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
353           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
354           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
355           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
356           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
357           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
358           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 ||
359           p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 ||
360           p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 ||
361           p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 ||
362           p49 != 0.0) {
363         pv = ba + 49*diag_offset[row];
364         pj = bj + diag_offset[row] + 1;
365 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
366 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
367 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
368 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
369 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
370 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
371 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
372 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
373 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
374 	x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
375 	x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
376 	x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
377 	x49 = pv[48];
378         pc[0]  = m1  = p1*x1  + p8*x2   + p15*x3  + p22*x4  + p29*x5  + p36*x6 + p43*x7;
379         pc[1]  = m2  = p2*x1  + p9*x2   + p16*x3  + p23*x4  + p30*x5  + p37*x6 + p44*x7;
380         pc[2]  = m3  = p3*x1  + p10*x2  + p17*x3  + p24*x4  + p31*x5  + p38*x6 + p45*x7;
381         pc[3]  = m4  = p4*x1  + p11*x2  + p18*x3  + p25*x4  + p32*x5  + p39*x6 + p46*x7;
382         pc[4]  = m5  = p5*x1  + p12*x2  + p19*x3  + p26*x4  + p33*x5  + p40*x6 + p47*x7;
383         pc[5]  = m6  = p6*x1  + p13*x2  + p20*x3  + p27*x4  + p34*x5  + p41*x6 + p48*x7;
384         pc[6]  = m7  = p7*x1  + p14*x2  + p21*x3  + p28*x4  + p35*x5  + p42*x6 + p49*x7;
385 
386         pc[7]  = m8  = p1*x8  + p8*x9   + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14;
387         pc[8]  = m9  = p2*x8  + p9*x9   + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14;
388         pc[9]  = m10 = p3*x8  + p10*x9  + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14;
389         pc[10] = m11 = p4*x8  + p11*x9  + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14;
390         pc[11] = m12 = p5*x8  + p12*x9  + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14;
391         pc[12] = m13 = p6*x8  + p13*x9  + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14;
392         pc[13] = m14 = p7*x8  + p14*x9  + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14;
393 
394         pc[14] = m15 = p1*x15 + p8*x16  + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21;
395         pc[15] = m16 = p2*x15 + p9*x16  + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21;
396         pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21;
397         pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21;
398         pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21;
399         pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21;
400         pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21;
401 
402         pc[21] = m22 = p1*x22 + p8*x23  + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28;
403         pc[22] = m23 = p2*x22 + p9*x23  + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28;
404         pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28;
405         pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28;
406         pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28;
407         pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28;
408         pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28;
409 
410         pc[28] = m29 = p1*x29 + p8*x30  + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35;
411         pc[29] = m30 = p2*x29 + p9*x30  + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35;
412         pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35;
413         pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35;
414         pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35;
415         pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35;
416         pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35;
417 
418         pc[35] = m36 = p1*x36 + p8*x37  + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42;
419         pc[36] = m37 = p2*x36 + p9*x37  + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42;
420         pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42;
421         pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42;
422         pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42;
423         pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42;
424         pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42;
425 
426         pc[42] = m43 = p1*x43 + p8*x44  + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49;
427         pc[43] = m44 = p2*x43 + p9*x44  + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49;
428         pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49;
429         pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49;
430         pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49;
431         pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49;
432         pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49;
433 
434         nz = bi[row+1] - diag_offset[row] - 1;
435         pv += 49;
436         for (j=0; j<nz; j++) {
437 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
438 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
439 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
440 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
441 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
442 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
443 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
444 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
445 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
446 	  x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
447 	  x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
448 	  x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
449 	  x49 = pv[48];
450 	  x    = rtmp + 49*pj[j];
451 	  x[0]  -= m1*x1  + m8*x2   + m15*x3  + m22*x4  + m29*x5  + m36*x6 + m43*x7;
452 	  x[1]  -= m2*x1  + m9*x2   + m16*x3  + m23*x4  + m30*x5  + m37*x6 + m44*x7;
453 	  x[2]  -= m3*x1  + m10*x2  + m17*x3  + m24*x4  + m31*x5  + m38*x6 + m45*x7;
454 	  x[3]  -= m4*x1  + m11*x2  + m18*x3  + m25*x4  + m32*x5  + m39*x6 + m46*x7;
455 	  x[4]  -= m5*x1  + m12*x2  + m19*x3  + m26*x4  + m33*x5  + m40*x6 + m47*x7;
456 	  x[5]  -= m6*x1  + m13*x2  + m20*x3  + m27*x4  + m34*x5  + m41*x6 + m48*x7;
457 	  x[6]  -= m7*x1  + m14*x2  + m21*x3  + m28*x4  + m35*x5  + m42*x6 + m49*x7;
458 
459 	  x[7]  -= m1*x8  + m8*x9   + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14;
460 	  x[8]  -= m2*x8  + m9*x9   + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14;
461 	  x[9]  -= m3*x8  + m10*x9  + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14;
462 	  x[10] -= m4*x8  + m11*x9  + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14;
463 	  x[11] -= m5*x8  + m12*x9  + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14;
464 	  x[12] -= m6*x8  + m13*x9  + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14;
465 	  x[13] -= m7*x8  + m14*x9  + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14;
466 
467 	  x[14] -= m1*x15 + m8*x16  + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21;
468 	  x[15] -= m2*x15 + m9*x16  + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21;
469 	  x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21;
470 	  x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21;
471 	  x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21;
472 	  x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21;
473 	  x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21;
474 
475 	  x[21] -= m1*x22 + m8*x23  + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28;
476 	  x[22] -= m2*x22 + m9*x23  + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28;
477 	  x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28;
478 	  x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28;
479 	  x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28;
480 	  x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28;
481 	  x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28;
482 
483 	  x[28] -= m1*x29 + m8*x30  + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35;
484 	  x[29] -= m2*x29 + m9*x30  + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35;
485 	  x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35;
486 	  x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35;
487 	  x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35;
488 	  x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35;
489 	  x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35;
490 
491 	  x[35] -= m1*x36 + m8*x37  + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42;
492 	  x[36] -= m2*x36 + m9*x37  + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42;
493 	  x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42;
494 	  x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42;
495 	  x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42;
496 	  x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42;
497 	  x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42;
498 
499 	  x[42] -= m1*x43 + m8*x44  + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49;
500 	  x[43] -= m2*x43 + m9*x44  + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49;
501 	  x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49;
502 	  x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49;
503 	  x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49;
504 	  x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49;
505 	  x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49;
506           pv   += 49;
507         }
508         PLogFlops(686*nz+637);
509       }
510       row = *ajtmp++;
511     }
512     /* finished row so stick it into b->a */
513     pv = ba + 49*bi[i];
514     pj = bj + bi[i];
515     nz = bi[i+1] - bi[i];
516     for (j=0; j<nz; j++) {
517       x      = rtmp+49*pj[j];
518       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
519       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
520       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
521       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
522       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
523       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
524       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
525       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
526       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
527       pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39];
528       pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43];
529       pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47];
530       pv[48] = x[48];
531       pv   += 49;
532     }
533     /* invert diagonal block */
534     w = ba + 49*diag_offset[i];
535     ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr);
536   }
537 
538   ierr = PetscFree(rtmp);CHKERRQ(ierr);
539   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
540   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
541   C->factor = FACTOR_LU;
542   C->assembled = PETSC_TRUE;
543   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
544   PetscFunctionReturn(0);
545 }
546 
547 /*
548       Version for when blocks are 7 by 7 Using natural ordering
549 */
550 #undef __FUNC__
551 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_7_NaturalOrdering"
552 int MatLUFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B)
553 {
554   Mat          C = *B;
555   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
556   int          ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
557   int          *ajtmpold,*ajtmp,nz,row;
558   int          *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
559   MatScalar    *pv,*v,*rtmp,*pc,*w,*x;
560   MatScalar    x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
561   MatScalar    x16,x17,x18,x19,x20,x21,x22,x23,x24,x25;
562   MatScalar    p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15;
563   MatScalar    p16,p17,p18,p19,p20,p21,p22,p23,p24,p25;
564   MatScalar    m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15;
565   MatScalar    m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
566   MatScalar    p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
567   MatScalar    p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49;
568   MatScalar    x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
569   MatScalar    x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49;
570   MatScalar    m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
571   MatScalar    m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49;
572   MatScalar    *ba = b->a,*aa = a->a;
573 
574   PetscFunctionBegin;
575   rtmp  = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
576   for (i=0; i<n; i++) {
577     nz    = bi[i+1] - bi[i];
578     ajtmp = bj + bi[i];
579     for  (j=0; j<nz; j++) {
580       x = rtmp+49*ajtmp[j];
581       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
582       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
583       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
584       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
585       x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ;
586       x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ;
587     }
588     /* load in initial (unfactored row) */
589     nz       = ai[i+1] - ai[i];
590     ajtmpold = aj + ai[i];
591     v        = aa + 49*ai[i];
592     for (j=0; j<nz; j++) {
593       x    = rtmp+49*ajtmpold[j];
594       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
595       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
596       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
597       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
598       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
599       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
600       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
601       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
602       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
603       x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39];
604       x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43];
605       x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47];
606       x[48] = v[48];
607       v    += 49;
608     }
609     row = *ajtmp++;
610     while (row < i) {
611       pc  = rtmp + 49*row;
612       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
613       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
614       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
615       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
616       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
617       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
618       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
619       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
620       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
621       p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39];
622       p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43];
623       p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47];
624       p49 = pc[48];
625       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
626           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
627           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
628           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
629           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
630           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
631           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
632           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
633           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 ||
634           p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 ||
635           p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 ||
636           p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 ||
637           p49 != 0.0) {
638         pv = ba + 49*diag_offset[row];
639         pj = bj + diag_offset[row] + 1;
640 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
641 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
642 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
643 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
644 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
645 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
646 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
647 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
648 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
649 	x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
650 	x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
651 	x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
652         x49 = pv[48];
653         pc[0]  = m1  = p1*x1  + p8*x2   + p15*x3  + p22*x4  + p29*x5  + p36*x6 + p43*x7;
654         pc[1]  = m2  = p2*x1  + p9*x2   + p16*x3  + p23*x4  + p30*x5  + p37*x6 + p44*x7;
655         pc[2]  = m3  = p3*x1  + p10*x2  + p17*x3  + p24*x4  + p31*x5  + p38*x6 + p45*x7;
656         pc[3]  = m4  = p4*x1  + p11*x2  + p18*x3  + p25*x4  + p32*x5  + p39*x6 + p46*x7;
657         pc[4]  = m5  = p5*x1  + p12*x2  + p19*x3  + p26*x4  + p33*x5  + p40*x6 + p47*x7;
658         pc[5]  = m6  = p6*x1  + p13*x2  + p20*x3  + p27*x4  + p34*x5  + p41*x6 + p48*x7;
659         pc[6]  = m7  = p7*x1  + p14*x2  + p21*x3  + p28*x4  + p35*x5  + p42*x6 + p49*x7;
660 
661         pc[7]  = m8  = p1*x8  + p8*x9   + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14;
662         pc[8]  = m9  = p2*x8  + p9*x9   + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14;
663         pc[9]  = m10 = p3*x8  + p10*x9  + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14;
664         pc[10] = m11 = p4*x8  + p11*x9  + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14;
665         pc[11] = m12 = p5*x8  + p12*x9  + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14;
666         pc[12] = m13 = p6*x8  + p13*x9  + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14;
667         pc[13] = m14 = p7*x8  + p14*x9  + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14;
668 
669         pc[14] = m15 = p1*x15 + p8*x16  + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21;
670         pc[15] = m16 = p2*x15 + p9*x16  + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21;
671         pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21;
672         pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21;
673         pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21;
674         pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21;
675         pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21;
676 
677         pc[21] = m22 = p1*x22 + p8*x23  + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28;
678         pc[22] = m23 = p2*x22 + p9*x23  + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28;
679         pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28;
680         pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28;
681         pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28;
682         pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28;
683         pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28;
684 
685         pc[28] = m29 = p1*x29 + p8*x30  + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35;
686         pc[29] = m30 = p2*x29 + p9*x30  + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35;
687         pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35;
688         pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35;
689         pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35;
690         pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35;
691         pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35;
692 
693         pc[35] = m36 = p1*x36 + p8*x37  + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42;
694         pc[36] = m37 = p2*x36 + p9*x37  + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42;
695         pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42;
696         pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42;
697         pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42;
698         pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42;
699         pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42;
700 
701         pc[42] = m43 = p1*x43 + p8*x44  + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49;
702         pc[43] = m44 = p2*x43 + p9*x44  + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49;
703         pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49;
704         pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49;
705         pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49;
706         pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49;
707         pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49;
708 
709         nz = bi[row+1] - diag_offset[row] - 1;
710         pv += 49;
711         for (j=0; j<nz; j++) {
712 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
713 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
714 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
715 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
716 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
717 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
718 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
719 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
720 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
721 	  x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
722 	  x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
723 	  x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
724 	  x49 = pv[48];
725 	  x    = rtmp + 49*pj[j];
726 	  x[0]  -= m1*x1  + m8*x2   + m15*x3  + m22*x4  + m29*x5  + m36*x6 + m43*x7;
727 	  x[1]  -= m2*x1  + m9*x2   + m16*x3  + m23*x4  + m30*x5  + m37*x6 + m44*x7;
728 	  x[2]  -= m3*x1  + m10*x2  + m17*x3  + m24*x4  + m31*x5  + m38*x6 + m45*x7;
729 	  x[3]  -= m4*x1  + m11*x2  + m18*x3  + m25*x4  + m32*x5  + m39*x6 + m46*x7;
730 	  x[4]  -= m5*x1  + m12*x2  + m19*x3  + m26*x4  + m33*x5  + m40*x6 + m47*x7;
731 	  x[5]  -= m6*x1  + m13*x2  + m20*x3  + m27*x4  + m34*x5  + m41*x6 + m48*x7;
732 	  x[6]  -= m7*x1  + m14*x2  + m21*x3  + m28*x4  + m35*x5  + m42*x6 + m49*x7;
733 
734 	  x[7]  -= m1*x8  + m8*x9   + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14;
735 	  x[8]  -= m2*x8  + m9*x9   + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14;
736 	  x[9]  -= m3*x8  + m10*x9  + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14;
737 	  x[10] -= m4*x8  + m11*x9  + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14;
738 	  x[11] -= m5*x8  + m12*x9  + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14;
739 	  x[12] -= m6*x8  + m13*x9  + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14;
740 	  x[13] -= m7*x8  + m14*x9  + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14;
741 
742 	  x[14] -= m1*x15 + m8*x16  + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21;
743 	  x[15] -= m2*x15 + m9*x16  + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21;
744 	  x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21;
745 	  x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21;
746 	  x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21;
747 	  x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21;
748 	  x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21;
749 
750 	  x[21] -= m1*x22 + m8*x23  + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28;
751 	  x[22] -= m2*x22 + m9*x23  + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28;
752 	  x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28;
753 	  x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28;
754 	  x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28;
755 	  x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28;
756 	  x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28;
757 
758 	  x[28] -= m1*x29 + m8*x30  + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35;
759 	  x[29] -= m2*x29 + m9*x30  + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35;
760 	  x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35;
761 	  x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35;
762 	  x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35;
763 	  x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35;
764 	  x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35;
765 
766 	  x[35] -= m1*x36 + m8*x37  + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42;
767 	  x[36] -= m2*x36 + m9*x37  + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42;
768 	  x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42;
769 	  x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42;
770 	  x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42;
771 	  x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42;
772 	  x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42;
773 
774 	  x[42] -= m1*x43 + m8*x44  + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49;
775 	  x[43] -= m2*x43 + m9*x44  + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49;
776 	  x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49;
777 	  x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49;
778 	  x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49;
779 	  x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49;
780 	  x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49;
781           pv   += 49;
782         }
783         PLogFlops(686*nz+637);
784       }
785       row = *ajtmp++;
786     }
787     /* finished row so stick it into b->a */
788     pv = ba + 49*bi[i];
789     pj = bj + bi[i];
790     nz = bi[i+1] - bi[i];
791     for (j=0; j<nz; j++) {
792       x      = rtmp+49*pj[j];
793       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
794       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
795       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
796       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
797       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
798       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
799       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
800       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
801       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
802       pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39];
803       pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43];
804       pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47];
805       pv[48] = x[48];
806       pv   += 49;
807     }
808     /* invert diagonal block */
809     w = ba + 49*diag_offset[i];
810     ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr);
811   }
812 
813   ierr = PetscFree(rtmp);CHKERRQ(ierr);
814   C->factor    = FACTOR_LU;
815   C->assembled = PETSC_TRUE;
816   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
817   PetscFunctionReturn(0);
818 }
819 
820 /* ------------------------------------------------------------*/
821 /*
822       Version for when blocks are 6 by 6
823 */
824 #undef __FUNC__
825 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_6"
826 int MatLUFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B)
827 {
828   Mat          C = *B;
829   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
830   IS           isrow = b->row,isicol = b->icol;
831   int          *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
832   int          *ajtmpold,*ajtmp,nz,row;
833   int          *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
834   MatScalar    *pv,*v,*rtmp,*pc,*w,*x;
835   MatScalar    p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
836   MatScalar    p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
837   MatScalar    x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14;
838   MatScalar    p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12;
839   MatScalar    m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
840   MatScalar    p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
841   MatScalar    x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
842   MatScalar    m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
843   MatScalar    *ba = b->a,*aa = a->a;
844 
845   PetscFunctionBegin;
846   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
847   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
848   rtmp  = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
849 
850   for (i=0; i<n; i++) {
851     nz    = bi[i+1] - bi[i];
852     ajtmp = bj + bi[i];
853     for  (j=0; j<nz; j++) {
854       x = rtmp+36*ajtmp[j];
855       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
856       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
857       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
858       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
859       x[34] = x[35] = 0.0 ;
860     }
861     /* load in initial (unfactored row) */
862     idx      = r[i];
863     nz       = ai[idx+1] - ai[idx];
864     ajtmpold = aj + ai[idx];
865     v        = aa + 36*ai[idx];
866     for (j=0; j<nz; j++) {
867       x    = rtmp+36*ic[ajtmpold[j]];
868       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
869       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
870       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
871       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
872       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
873       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
874       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
875       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
876       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
877       v    += 36;
878     }
879     row = *ajtmp++;
880     while (row < i) {
881       pc  =  rtmp + 36*row;
882       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
883       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
884       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
885       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
886       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
887       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
888       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
889       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
890       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
891       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
892           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
893           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
894           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
895           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
896           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
897           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
898           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
899           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) {
900         pv = ba + 36*diag_offset[row];
901         pj = bj + diag_offset[row] + 1;
902 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
903 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
904 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
905 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
906 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
907 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
908 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
909 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
910 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
911         pc[0]  = m1  = p1*x1  + p7*x2   + p13*x3  + p19*x4  + p25*x5  + p31*x6;
912         pc[1]  = m2  = p2*x1  + p8*x2   + p14*x3  + p20*x4  + p26*x5  + p32*x6;
913         pc[2]  = m3  = p3*x1  + p9*x2   + p15*x3  + p21*x4  + p27*x5  + p33*x6;
914         pc[3]  = m4  = p4*x1  + p10*x2  + p16*x3  + p22*x4  + p28*x5  + p34*x6;
915         pc[4]  = m5  = p5*x1  + p11*x2  + p17*x3  + p23*x4  + p29*x5  + p35*x6;
916         pc[5]  = m6  = p6*x1  + p12*x2  + p18*x3  + p24*x4  + p30*x5  + p36*x6;
917 
918         pc[6]  = m7  = p1*x7  + p7*x8   + p13*x9  + p19*x10 + p25*x11 + p31*x12;
919         pc[7]  = m8  = p2*x7  + p8*x8   + p14*x9  + p20*x10 + p26*x11 + p32*x12;
920         pc[8]  = m9  = p3*x7  + p9*x8   + p15*x9  + p21*x10 + p27*x11 + p33*x12;
921         pc[9]  = m10 = p4*x7  + p10*x8  + p16*x9  + p22*x10 + p28*x11 + p34*x12;
922         pc[10] = m11 = p5*x7  + p11*x8  + p17*x9  + p23*x10 + p29*x11 + p35*x12;
923         pc[11] = m12 = p6*x7  + p12*x8  + p18*x9  + p24*x10 + p30*x11 + p36*x12;
924 
925         pc[12] = m13 = p1*x13 + p7*x14  + p13*x15 + p19*x16 + p25*x17 + p31*x18;
926         pc[13] = m14 = p2*x13 + p8*x14  + p14*x15 + p20*x16 + p26*x17 + p32*x18;
927         pc[14] = m15 = p3*x13 + p9*x14  + p15*x15 + p21*x16 + p27*x17 + p33*x18;
928         pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18;
929         pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18;
930         pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18;
931 
932         pc[18] = m19 = p1*x19 + p7*x20  + p13*x21 + p19*x22 + p25*x23 + p31*x24;
933         pc[19] = m20 = p2*x19 + p8*x20  + p14*x21 + p20*x22 + p26*x23 + p32*x24;
934         pc[20] = m21 = p3*x19 + p9*x20  + p15*x21 + p21*x22 + p27*x23 + p33*x24;
935         pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24;
936         pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24;
937         pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24;
938 
939         pc[24] = m25 = p1*x25 + p7*x26  + p13*x27 + p19*x28 + p25*x29 + p31*x30;
940         pc[25] = m26 = p2*x25 + p8*x26  + p14*x27 + p20*x28 + p26*x29 + p32*x30;
941         pc[26] = m27 = p3*x25 + p9*x26  + p15*x27 + p21*x28 + p27*x29 + p33*x30;
942         pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30;
943         pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30;
944         pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30;
945 
946         pc[30] = m31 = p1*x31 + p7*x32  + p13*x33 + p19*x34 + p25*x35 + p31*x36;
947         pc[31] = m32 = p2*x31 + p8*x32  + p14*x33 + p20*x34 + p26*x35 + p32*x36;
948         pc[32] = m33 = p3*x31 + p9*x32  + p15*x33 + p21*x34 + p27*x35 + p33*x36;
949         pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36;
950         pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36;
951         pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36;
952 
953         nz = bi[row+1] - diag_offset[row] - 1;
954         pv += 36;
955         for (j=0; j<nz; j++) {
956 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
957 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
958 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
959 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
960 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
961 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
962 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
963 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
964 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
965 	  x    = rtmp + 36*pj[j];
966           x[0]  -= m1*x1  + m7*x2   + m13*x3  + m19*x4  + m25*x5  + m31*x6;
967           x[1]  -= m2*x1  + m8*x2   + m14*x3  + m20*x4  + m26*x5  + m32*x6;
968           x[2]  -= m3*x1  + m9*x2   + m15*x3  + m21*x4  + m27*x5  + m33*x6;
969           x[3]  -= m4*x1  + m10*x2  + m16*x3  + m22*x4  + m28*x5  + m34*x6;
970           x[4]  -= m5*x1  + m11*x2  + m17*x3  + m23*x4  + m29*x5  + m35*x6;
971           x[5]  -= m6*x1  + m12*x2  + m18*x3  + m24*x4  + m30*x5  + m36*x6;
972 
973 	  x[6]  -= m1*x7  + m7*x8   + m13*x9  + m19*x10 + m25*x11 + m31*x12;
974 	  x[7]  -= m2*x7  + m8*x8   + m14*x9  + m20*x10 + m26*x11 + m32*x12;
975 	  x[8]  -= m3*x7  + m9*x8   + m15*x9  + m21*x10 + m27*x11 + m33*x12;
976 	  x[9]  -= m4*x7  + m10*x8  + m16*x9  + m22*x10 + m28*x11 + m34*x12;
977 	  x[10] -= m5*x7  + m11*x8  + m17*x9  + m23*x10 + m29*x11 + m35*x12;
978 	  x[11] -= m6*x7  + m12*x8  + m18*x9  + m24*x10 + m30*x11 + m36*x12;
979 
980 	  x[12] -= m1*x13 + m7*x14  + m13*x15 + m19*x16 + m25*x17 + m31*x18;
981 	  x[13] -= m2*x13 + m8*x14  + m14*x15 + m20*x16 + m26*x17 + m32*x18;
982 	  x[14] -= m3*x13 + m9*x14  + m15*x15 + m21*x16 + m27*x17 + m33*x18;
983 	  x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18;
984 	  x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18;
985 	  x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18;
986 
987 	  x[18] -= m1*x19 + m7*x20  + m13*x21 + m19*x22 + m25*x23 + m31*x24;
988 	  x[19] -= m2*x19 + m8*x20  + m14*x21 + m20*x22 + m26*x23 + m32*x24;
989 	  x[20] -= m3*x19 + m9*x20  + m15*x21 + m21*x22 + m27*x23 + m33*x24;
990 	  x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24;
991 	  x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24;
992 	  x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24;
993 
994 	  x[24] -= m1*x25 + m7*x26  + m13*x27 + m19*x28 + m25*x29 + m31*x30;
995 	  x[25] -= m2*x25 + m8*x26  + m14*x27 + m20*x28 + m26*x29 + m32*x30;
996 	  x[26] -= m3*x25 + m9*x26  + m15*x27 + m21*x28 + m27*x29 + m33*x30;
997 	  x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30;
998 	  x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30;
999 	  x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30;
1000 
1001 	  x[30] -= m1*x31 + m7*x32  + m13*x33 + m19*x34 + m25*x35 + m31*x36;
1002 	  x[31] -= m2*x31 + m8*x32  + m14*x33 + m20*x34 + m26*x35 + m32*x36;
1003 	  x[32] -= m3*x31 + m9*x32  + m15*x33 + m21*x34 + m27*x35 + m33*x36;
1004 	  x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36;
1005 	  x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36;
1006 	  x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36;
1007 
1008           pv   += 36;
1009         }
1010         PLogFlops(432*nz+396);
1011       }
1012       row = *ajtmp++;
1013     }
1014     /* finished row so stick it into b->a */
1015     pv = ba + 36*bi[i];
1016     pj = bj + bi[i];
1017     nz = bi[i+1] - bi[i];
1018     for (j=0; j<nz; j++) {
1019       x      = rtmp+36*pj[j];
1020       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1021       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
1022       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
1023       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1024       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
1025       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
1026       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
1027       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
1028       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
1029       pv   += 36;
1030     }
1031     /* invert diagonal block */
1032     w = ba + 36*diag_offset[i];
1033     ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr);
1034   }
1035 
1036   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1037   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1038   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1039   C->factor = FACTOR_LU;
1040   C->assembled = PETSC_TRUE;
1041   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1042   PetscFunctionReturn(0);
1043 }
1044 /*
1045       Version for when blocks are 6 by 6 Using natural ordering
1046 */
1047 #undef __FUNC__
1048 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_6_NaturalOrdering"
1049 int MatLUFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B)
1050 {
1051   Mat         C = *B;
1052   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1053   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1054   int         *ajtmpold,*ajtmp,nz,row;
1055   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
1056   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1057   MatScalar   x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
1058   MatScalar   x16,x17,x18,x19,x20,x21,x22,x23,x24,x25;
1059   MatScalar   p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15;
1060   MatScalar   p16,p17,p18,p19,p20,p21,p22,p23,p24,p25;
1061   MatScalar   m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15;
1062   MatScalar   m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
1063   MatScalar   p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
1064   MatScalar   x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
1065   MatScalar   m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
1066   MatScalar   *ba = b->a,*aa = a->a;
1067 
1068   PetscFunctionBegin;
1069   rtmp  = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1070   for (i=0; i<n; i++) {
1071     nz    = bi[i+1] - bi[i];
1072     ajtmp = bj + bi[i];
1073     for  (j=0; j<nz; j++) {
1074       x = rtmp+36*ajtmp[j];
1075       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
1076       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
1077       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
1078       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
1079       x[34] = x[35] = 0.0 ;
1080     }
1081     /* load in initial (unfactored row) */
1082     nz       = ai[i+1] - ai[i];
1083     ajtmpold = aj + ai[i];
1084     v        = aa + 36*ai[i];
1085     for (j=0; j<nz; j++) {
1086       x    = rtmp+36*ajtmpold[j];
1087       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
1088       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
1089       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
1090       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
1091       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
1092       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
1093       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
1094       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
1095       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
1096       v    += 36;
1097     }
1098     row = *ajtmp++;
1099     while (row < i) {
1100       pc  = rtmp + 36*row;
1101       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1102       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
1103       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
1104       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
1105       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
1106       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
1107       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
1108       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
1109       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
1110       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
1111           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
1112           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
1113           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
1114           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
1115           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
1116           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
1117           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
1118           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) {
1119         pv = ba + 36*diag_offset[row];
1120         pj = bj + diag_offset[row] + 1;
1121 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1122 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
1123 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
1124 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
1125 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
1126 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
1127 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
1128 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
1129 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
1130         pc[0]  = m1  = p1*x1  + p7*x2   + p13*x3  + p19*x4  + p25*x5  + p31*x6;
1131         pc[1]  = m2  = p2*x1  + p8*x2   + p14*x3  + p20*x4  + p26*x5  + p32*x6;
1132         pc[2]  = m3  = p3*x1  + p9*x2   + p15*x3  + p21*x4  + p27*x5  + p33*x6;
1133         pc[3]  = m4  = p4*x1  + p10*x2  + p16*x3  + p22*x4  + p28*x5  + p34*x6;
1134         pc[4]  = m5  = p5*x1  + p11*x2  + p17*x3  + p23*x4  + p29*x5  + p35*x6;
1135         pc[5]  = m6  = p6*x1  + p12*x2  + p18*x3  + p24*x4  + p30*x5  + p36*x6;
1136 
1137         pc[6]  = m7  = p1*x7  + p7*x8   + p13*x9  + p19*x10 + p25*x11 + p31*x12;
1138         pc[7]  = m8  = p2*x7  + p8*x8   + p14*x9  + p20*x10 + p26*x11 + p32*x12;
1139         pc[8]  = m9  = p3*x7  + p9*x8   + p15*x9  + p21*x10 + p27*x11 + p33*x12;
1140         pc[9]  = m10 = p4*x7  + p10*x8  + p16*x9  + p22*x10 + p28*x11 + p34*x12;
1141         pc[10] = m11 = p5*x7  + p11*x8  + p17*x9  + p23*x10 + p29*x11 + p35*x12;
1142         pc[11] = m12 = p6*x7  + p12*x8  + p18*x9  + p24*x10 + p30*x11 + p36*x12;
1143 
1144         pc[12] = m13 = p1*x13 + p7*x14  + p13*x15 + p19*x16 + p25*x17 + p31*x18;
1145         pc[13] = m14 = p2*x13 + p8*x14  + p14*x15 + p20*x16 + p26*x17 + p32*x18;
1146         pc[14] = m15 = p3*x13 + p9*x14  + p15*x15 + p21*x16 + p27*x17 + p33*x18;
1147         pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18;
1148         pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18;
1149         pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18;
1150 
1151         pc[18] = m19 = p1*x19 + p7*x20  + p13*x21 + p19*x22 + p25*x23 + p31*x24;
1152         pc[19] = m20 = p2*x19 + p8*x20  + p14*x21 + p20*x22 + p26*x23 + p32*x24;
1153         pc[20] = m21 = p3*x19 + p9*x20  + p15*x21 + p21*x22 + p27*x23 + p33*x24;
1154         pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24;
1155         pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24;
1156         pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24;
1157 
1158         pc[24] = m25 = p1*x25 + p7*x26  + p13*x27 + p19*x28 + p25*x29 + p31*x30;
1159         pc[25] = m26 = p2*x25 + p8*x26  + p14*x27 + p20*x28 + p26*x29 + p32*x30;
1160         pc[26] = m27 = p3*x25 + p9*x26  + p15*x27 + p21*x28 + p27*x29 + p33*x30;
1161         pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30;
1162         pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30;
1163         pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30;
1164 
1165         pc[30] = m31 = p1*x31 + p7*x32  + p13*x33 + p19*x34 + p25*x35 + p31*x36;
1166         pc[31] = m32 = p2*x31 + p8*x32  + p14*x33 + p20*x34 + p26*x35 + p32*x36;
1167         pc[32] = m33 = p3*x31 + p9*x32  + p15*x33 + p21*x34 + p27*x35 + p33*x36;
1168         pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36;
1169         pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36;
1170         pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36;
1171 
1172         nz = bi[row+1] - diag_offset[row] - 1;
1173         pv += 36;
1174         for (j=0; j<nz; j++) {
1175 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1176 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
1177 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
1178 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
1179 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
1180 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
1181 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
1182 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
1183 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
1184 	  x    = rtmp + 36*pj[j];
1185           x[0]  -= m1*x1  + m7*x2   + m13*x3  + m19*x4  + m25*x5  + m31*x6;
1186           x[1]  -= m2*x1  + m8*x2   + m14*x3  + m20*x4  + m26*x5  + m32*x6;
1187           x[2]  -= m3*x1  + m9*x2   + m15*x3  + m21*x4  + m27*x5  + m33*x6;
1188           x[3]  -= m4*x1  + m10*x2  + m16*x3  + m22*x4  + m28*x5  + m34*x6;
1189           x[4]  -= m5*x1  + m11*x2  + m17*x3  + m23*x4  + m29*x5  + m35*x6;
1190           x[5]  -= m6*x1  + m12*x2  + m18*x3  + m24*x4  + m30*x5  + m36*x6;
1191 
1192 	  x[6]  -= m1*x7  + m7*x8   + m13*x9  + m19*x10 + m25*x11 + m31*x12;
1193 	  x[7]  -= m2*x7  + m8*x8   + m14*x9  + m20*x10 + m26*x11 + m32*x12;
1194 	  x[8]  -= m3*x7  + m9*x8   + m15*x9  + m21*x10 + m27*x11 + m33*x12;
1195 	  x[9]  -= m4*x7  + m10*x8  + m16*x9  + m22*x10 + m28*x11 + m34*x12;
1196 	  x[10] -= m5*x7  + m11*x8  + m17*x9  + m23*x10 + m29*x11 + m35*x12;
1197 	  x[11] -= m6*x7  + m12*x8  + m18*x9  + m24*x10 + m30*x11 + m36*x12;
1198 
1199 	  x[12] -= m1*x13 + m7*x14  + m13*x15 + m19*x16 + m25*x17 + m31*x18;
1200 	  x[13] -= m2*x13 + m8*x14  + m14*x15 + m20*x16 + m26*x17 + m32*x18;
1201 	  x[14] -= m3*x13 + m9*x14  + m15*x15 + m21*x16 + m27*x17 + m33*x18;
1202 	  x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18;
1203 	  x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18;
1204 	  x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18;
1205 
1206 	  x[18] -= m1*x19 + m7*x20  + m13*x21 + m19*x22 + m25*x23 + m31*x24;
1207 	  x[19] -= m2*x19 + m8*x20  + m14*x21 + m20*x22 + m26*x23 + m32*x24;
1208 	  x[20] -= m3*x19 + m9*x20  + m15*x21 + m21*x22 + m27*x23 + m33*x24;
1209 	  x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24;
1210 	  x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24;
1211 	  x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24;
1212 
1213 	  x[24] -= m1*x25 + m7*x26  + m13*x27 + m19*x28 + m25*x29 + m31*x30;
1214 	  x[25] -= m2*x25 + m8*x26  + m14*x27 + m20*x28 + m26*x29 + m32*x30;
1215 	  x[26] -= m3*x25 + m9*x26  + m15*x27 + m21*x28 + m27*x29 + m33*x30;
1216 	  x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30;
1217 	  x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30;
1218 	  x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30;
1219 
1220 	  x[30] -= m1*x31 + m7*x32  + m13*x33 + m19*x34 + m25*x35 + m31*x36;
1221 	  x[31] -= m2*x31 + m8*x32  + m14*x33 + m20*x34 + m26*x35 + m32*x36;
1222 	  x[32] -= m3*x31 + m9*x32  + m15*x33 + m21*x34 + m27*x35 + m33*x36;
1223 	  x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36;
1224 	  x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36;
1225 	  x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36;
1226 
1227           pv   += 36;
1228         }
1229         PLogFlops(432*nz+396);
1230       }
1231       row = *ajtmp++;
1232     }
1233     /* finished row so stick it into b->a */
1234     pv = ba + 36*bi[i];
1235     pj = bj + bi[i];
1236     nz = bi[i+1] - bi[i];
1237     for (j=0; j<nz; j++) {
1238       x      = rtmp+36*pj[j];
1239       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1240       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
1241       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
1242       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1243       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
1244       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
1245       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
1246       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
1247       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
1248       pv   += 36;
1249     }
1250     /* invert diagonal block */
1251     w = ba + 36*diag_offset[i];
1252     ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr);
1253   }
1254 
1255   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1256   C->factor    = FACTOR_LU;
1257   C->assembled = PETSC_TRUE;
1258   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1259   PetscFunctionReturn(0);
1260 }
1261 
1262 /* ------------------------------------------------------------*/
1263 /*
1264       Version for when blocks are 5 by 5
1265 */
1266 #undef __FUNC__
1267 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_5"
1268 int MatLUFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B)
1269 {
1270   Mat         C = *B;
1271   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1272   IS          isrow = b->row,isicol = b->icol;
1273   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1274   int         *ajtmpold,*ajtmp,nz,row;
1275   int         *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
1276   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1277   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1278   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
1279   MatScalar   x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14;
1280   MatScalar   p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12;
1281   MatScalar   m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
1282   MatScalar   *ba = b->a,*aa = a->a;
1283 
1284   PetscFunctionBegin;
1285   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1286   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1287   rtmp  = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1288 
1289   for (i=0; i<n; i++) {
1290     nz    = bi[i+1] - bi[i];
1291     ajtmp = bj + bi[i];
1292     for  (j=0; j<nz; j++) {
1293       x = rtmp+25*ajtmp[j];
1294       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
1295       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
1296       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
1297     }
1298     /* load in initial (unfactored row) */
1299     idx      = r[i];
1300     nz       = ai[idx+1] - ai[idx];
1301     ajtmpold = aj + ai[idx];
1302     v        = aa + 25*ai[idx];
1303     for (j=0; j<nz; j++) {
1304       x    = rtmp+25*ic[ajtmpold[j]];
1305       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
1306       x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
1307       x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1308       x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17];
1309       x[18] = v[18]; x[19] = v[19]; x[20] = v[20]; x[21] = v[21];
1310       x[22] = v[22]; x[23] = v[23]; x[24] = v[24];
1311       v    += 25;
1312     }
1313     row = *ajtmp++;
1314     while (row < i) {
1315       pc = rtmp + 25*row;
1316       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
1317       p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
1318       p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1319       p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; p19 = pc[18];
1320       p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
1321       p25 = pc[24];
1322       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1323           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1324           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1325           || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 ||
1326           p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 ||
1327           p24 != 0.0 || p25 != 0.0) {
1328         pv = ba + 25*diag_offset[row];
1329         pj = bj + diag_offset[row] + 1;
1330         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
1331         x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
1332         x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1333         x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17];
1334         x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21];
1335         x23 = pv[22]; x24 = pv[23]; x25 = pv[24];
1336         pc[0] = m1 = p1*x1 + p6*x2  + p11*x3 + p16*x4 + p21*x5;
1337         pc[1] = m2 = p2*x1 + p7*x2  + p12*x3 + p17*x4 + p22*x5;
1338         pc[2] = m3 = p3*x1 + p8*x2  + p13*x3 + p18*x4 + p23*x5;
1339         pc[3] = m4 = p4*x1 + p9*x2  + p14*x3 + p19*x4 + p24*x5;
1340         pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5;
1341 
1342         pc[5] = m6 = p1*x6 + p6*x7  + p11*x8 + p16*x9 + p21*x10;
1343         pc[6] = m7 = p2*x6 + p7*x7  + p12*x8 + p17*x9 + p22*x10;
1344         pc[7] = m8 = p3*x6 + p8*x7  + p13*x8 + p18*x9 + p23*x10;
1345         pc[8] = m9 = p4*x6 + p9*x7  + p14*x8 + p19*x9 + p24*x10;
1346         pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10;
1347 
1348         pc[10] = m11 = p1*x11 + p6*x12  + p11*x13 + p16*x14 + p21*x15;
1349         pc[11] = m12 = p2*x11 + p7*x12  + p12*x13 + p17*x14 + p22*x15;
1350         pc[12] = m13 = p3*x11 + p8*x12  + p13*x13 + p18*x14 + p23*x15;
1351         pc[13] = m14 = p4*x11 + p9*x12  + p14*x13 + p19*x14 + p24*x15;
1352         pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15;
1353 
1354         pc[15] = m16 = p1*x16 + p6*x17  + p11*x18 + p16*x19 + p21*x20;
1355         pc[16] = m17 = p2*x16 + p7*x17  + p12*x18 + p17*x19 + p22*x20;
1356         pc[17] = m18 = p3*x16 + p8*x17  + p13*x18 + p18*x19 + p23*x20;
1357         pc[18] = m19 = p4*x16 + p9*x17  + p14*x18 + p19*x19 + p24*x20;
1358         pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20;
1359 
1360         pc[20] = m21 = p1*x21 + p6*x22  + p11*x23 + p16*x24 + p21*x25;
1361         pc[21] = m22 = p2*x21 + p7*x22  + p12*x23 + p17*x24 + p22*x25;
1362         pc[22] = m23 = p3*x21 + p8*x22  + p13*x23 + p18*x24 + p23*x25;
1363         pc[23] = m24 = p4*x21 + p9*x22  + p14*x23 + p19*x24 + p24*x25;
1364         pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25;
1365 
1366         nz = bi[row+1] - diag_offset[row] - 1;
1367         pv += 25;
1368         for (j=0; j<nz; j++) {
1369           x1   = pv[0];  x2 = pv[1];   x3  = pv[2];  x4  = pv[3];
1370           x5   = pv[4];  x6 = pv[5];   x7  = pv[6];  x8  = pv[7]; x9 = pv[8];
1371           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1372           x14  = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16];
1373           x18  = pv[17]; x19 = pv[18]; x20 = pv[19]; x21 = pv[20];
1374           x22  = pv[21]; x23 = pv[22]; x24 = pv[23]; x25 = pv[24];
1375           x    = rtmp + 25*pj[j];
1376           x[0] -= m1*x1 + m6*x2  + m11*x3 + m16*x4 + m21*x5;
1377           x[1] -= m2*x1 + m7*x2  + m12*x3 + m17*x4 + m22*x5;
1378           x[2] -= m3*x1 + m8*x2  + m13*x3 + m18*x4 + m23*x5;
1379           x[3] -= m4*x1 + m9*x2  + m14*x3 + m19*x4 + m24*x5;
1380           x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5;
1381 
1382           x[5] -= m1*x6 + m6*x7  + m11*x8 + m16*x9 + m21*x10;
1383           x[6] -= m2*x6 + m7*x7  + m12*x8 + m17*x9 + m22*x10;
1384           x[7] -= m3*x6 + m8*x7  + m13*x8 + m18*x9 + m23*x10;
1385           x[8] -= m4*x6 + m9*x7  + m14*x8 + m19*x9 + m24*x10;
1386           x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10;
1387 
1388           x[10] -= m1*x11 + m6*x12  + m11*x13 + m16*x14 + m21*x15;
1389           x[11] -= m2*x11 + m7*x12  + m12*x13 + m17*x14 + m22*x15;
1390           x[12] -= m3*x11 + m8*x12  + m13*x13 + m18*x14 + m23*x15;
1391           x[13] -= m4*x11 + m9*x12  + m14*x13 + m19*x14 + m24*x15;
1392           x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15;
1393 
1394           x[15] -= m1*x16 + m6*x17  + m11*x18 + m16*x19 + m21*x20;
1395           x[16] -= m2*x16 + m7*x17  + m12*x18 + m17*x19 + m22*x20;
1396           x[17] -= m3*x16 + m8*x17  + m13*x18 + m18*x19 + m23*x20;
1397           x[18] -= m4*x16 + m9*x17  + m14*x18 + m19*x19 + m24*x20;
1398           x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20;
1399 
1400           x[20] -= m1*x21 + m6*x22  + m11*x23 + m16*x24 + m21*x25;
1401           x[21] -= m2*x21 + m7*x22  + m12*x23 + m17*x24 + m22*x25;
1402           x[22] -= m3*x21 + m8*x22  + m13*x23 + m18*x24 + m23*x25;
1403           x[23] -= m4*x21 + m9*x22  + m14*x23 + m19*x24 + m24*x25;
1404           x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25;
1405 
1406           pv   += 25;
1407         }
1408         PLogFlops(250*nz+225);
1409       }
1410       row = *ajtmp++;
1411     }
1412     /* finished row so stick it into b->a */
1413     pv = ba + 25*bi[i];
1414     pj = bj + bi[i];
1415     nz = bi[i+1] - bi[i];
1416     for (j=0; j<nz; j++) {
1417       x     = rtmp+25*pj[j];
1418       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
1419       pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
1420       pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1421       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16];
1422       pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20];
1423       pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; pv[24] = x[24];
1424       pv   += 25;
1425     }
1426     /* invert diagonal block */
1427     w = ba + 25*diag_offset[i];
1428     ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr);
1429   }
1430 
1431   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1432   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1433   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1434   C->factor = FACTOR_LU;
1435   C->assembled = PETSC_TRUE;
1436   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1437   PetscFunctionReturn(0);
1438 }
1439 /*
1440       Version for when blocks are 5 by 5 Using natural ordering
1441 */
1442 #undef __FUNC__
1443 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_5_NaturalOrdering"
1444 int MatLUFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B)
1445 {
1446   Mat         C = *B;
1447   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1448   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1449   int         *ajtmpold,*ajtmp,nz,row;
1450   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
1451   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1452   MatScalar   x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
1453   MatScalar   x16,x17,x18,x19,x20,x21,x22,x23,x24,x25;
1454   MatScalar   p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15;
1455   MatScalar   p16,p17,p18,p19,p20,p21,p22,p23,p24,p25;
1456   MatScalar   m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15;
1457   MatScalar   m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
1458   MatScalar   *ba = b->a,*aa = a->a;
1459 
1460   PetscFunctionBegin;
1461   rtmp  = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1462   for (i=0; i<n; i++) {
1463     nz    = bi[i+1] - bi[i];
1464     ajtmp = bj + bi[i];
1465     for  (j=0; j<nz; j++) {
1466       x = rtmp+25*ajtmp[j];
1467       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
1468       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
1469       x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
1470     }
1471     /* load in initial (unfactored row) */
1472     nz       = ai[i+1] - ai[i];
1473     ajtmpold = aj + ai[i];
1474     v        = aa + 25*ai[i];
1475     for (j=0; j<nz; j++) {
1476       x    = rtmp+25*ajtmpold[j];
1477       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
1478       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
1479       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1480       x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; x[18] = v[18];
1481       x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
1482       x[24] = v[24];
1483       v    += 25;
1484     }
1485     row = *ajtmp++;
1486     while (row < i) {
1487       pc  = rtmp + 25*row;
1488       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1489       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
1490       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1491       p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17];
1492       p19 = pc[18]; p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22];
1493       p24 = pc[23]; p25 = pc[24];
1494       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1495           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1496           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1497           || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0
1498           || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
1499         pv = ba + 25*diag_offset[row];
1500         pj = bj + diag_offset[row] + 1;
1501         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1502         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
1503         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1504         x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; x19 = pv[18];
1505         x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
1506         x25 = pv[24];
1507         pc[0] = m1 = p1*x1 + p6*x2  + p11*x3 + p16*x4 + p21*x5;
1508         pc[1] = m2 = p2*x1 + p7*x2  + p12*x3 + p17*x4 + p22*x5;
1509         pc[2] = m3 = p3*x1 + p8*x2  + p13*x3 + p18*x4 + p23*x5;
1510         pc[3] = m4 = p4*x1 + p9*x2  + p14*x3 + p19*x4 + p24*x5;
1511         pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5;
1512 
1513         pc[5]  = m6  = p1*x6 + p6*x7  + p11*x8 + p16*x9 + p21*x10;
1514         pc[6]  = m7  = p2*x6 + p7*x7  + p12*x8 + p17*x9 + p22*x10;
1515         pc[7]  = m8  = p3*x6 + p8*x7  + p13*x8 + p18*x9 + p23*x10;
1516         pc[8]  = m9  = p4*x6 + p9*x7  + p14*x8 + p19*x9 + p24*x10;
1517         pc[9]  = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10;
1518 
1519         pc[10] = m11 = p1*x11 + p6*x12  + p11*x13 + p16*x14 + p21*x15;
1520         pc[11] = m12 = p2*x11 + p7*x12  + p12*x13 + p17*x14 + p22*x15;
1521         pc[12] = m13 = p3*x11 + p8*x12  + p13*x13 + p18*x14 + p23*x15;
1522         pc[13] = m14 = p4*x11 + p9*x12  + p14*x13 + p19*x14 + p24*x15;
1523         pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15;
1524 
1525         pc[15] = m16 = p1*x16 + p6*x17  + p11*x18 + p16*x19 + p21*x20;
1526         pc[16] = m17 = p2*x16 + p7*x17  + p12*x18 + p17*x19 + p22*x20;
1527         pc[17] = m18 = p3*x16 + p8*x17  + p13*x18 + p18*x19 + p23*x20;
1528         pc[18] = m19 = p4*x16 + p9*x17  + p14*x18 + p19*x19 + p24*x20;
1529         pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20;
1530 
1531         pc[20] = m21 = p1*x21 + p6*x22  + p11*x23 + p16*x24 + p21*x25;
1532         pc[21] = m22 = p2*x21 + p7*x22  + p12*x23 + p17*x24 + p22*x25;
1533         pc[22] = m23 = p3*x21 + p8*x22  + p13*x23 + p18*x24 + p23*x25;
1534         pc[23] = m24 = p4*x21 + p9*x22  + p14*x23 + p19*x24 + p24*x25;
1535         pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25;
1536 
1537         nz = bi[row+1] - diag_offset[row] - 1;
1538         pv += 25;
1539         for (j=0; j<nz; j++) {
1540           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
1541           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
1542           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1543           x14  = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17];
1544           x19 = pv[18];  x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22];
1545           x24 = pv[23];  x25 = pv[24];
1546           x    = rtmp + 25*pj[j];
1547           x[0] -= m1*x1 + m6*x2   + m11*x3  + m16*x4 + m21*x5;
1548           x[1] -= m2*x1 + m7*x2   + m12*x3  + m17*x4 + m22*x5;
1549           x[2] -= m3*x1 + m8*x2   + m13*x3  + m18*x4 + m23*x5;
1550           x[3] -= m4*x1 + m9*x2   + m14*x3  + m19*x4 + m24*x5;
1551           x[4] -= m5*x1 + m10*x2  + m15*x3  + m20*x4 + m25*x5;
1552 
1553           x[5] -= m1*x6 + m6*x7   + m11*x8  + m16*x9 + m21*x10;
1554           x[6] -= m2*x6 + m7*x7   + m12*x8  + m17*x9 + m22*x10;
1555           x[7] -= m3*x6 + m8*x7   + m13*x8  + m18*x9 + m23*x10;
1556           x[8] -= m4*x6 + m9*x7   + m14*x8  + m19*x9 + m24*x10;
1557           x[9] -= m5*x6 + m10*x7  + m15*x8  + m20*x9 + m25*x10;
1558 
1559           x[10] -= m1*x11 + m6*x12  + m11*x13 + m16*x14 + m21*x15;
1560           x[11] -= m2*x11 + m7*x12  + m12*x13 + m17*x14 + m22*x15;
1561           x[12] -= m3*x11 + m8*x12  + m13*x13 + m18*x14 + m23*x15;
1562           x[13] -= m4*x11 + m9*x12  + m14*x13 + m19*x14 + m24*x15;
1563           x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15;
1564 
1565           x[15] -= m1*x16 + m6*x17  + m11*x18 + m16*x19 + m21*x20;
1566           x[16] -= m2*x16 + m7*x17  + m12*x18 + m17*x19 + m22*x20;
1567           x[17] -= m3*x16 + m8*x17  + m13*x18 + m18*x19 + m23*x20;
1568           x[18] -= m4*x16 + m9*x17  + m14*x18 + m19*x19 + m24*x20;
1569           x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20;
1570 
1571           x[20] -= m1*x21 + m6*x22  + m11*x23 + m16*x24 + m21*x25;
1572           x[21] -= m2*x21 + m7*x22  + m12*x23 + m17*x24 + m22*x25;
1573           x[22] -= m3*x21 + m8*x22  + m13*x23 + m18*x24 + m23*x25;
1574           x[23] -= m4*x21 + m9*x22  + m14*x23 + m19*x24 + m24*x25;
1575           x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25;
1576           pv   += 25;
1577         }
1578         PLogFlops(250*nz+225);
1579       }
1580       row = *ajtmp++;
1581     }
1582     /* finished row so stick it into b->a */
1583     pv = ba + 25*bi[i];
1584     pj = bj + bi[i];
1585     nz = bi[i+1] - bi[i];
1586     for (j=0; j<nz; j++) {
1587       x      = rtmp+25*pj[j];
1588       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1589       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
1590       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1591       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; pv[17] = x[17];
1592       pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22];
1593       pv[23] = x[23]; pv[24] = x[24];
1594       pv   += 25;
1595     }
1596     /* invert diagonal block */
1597     w = ba + 25*diag_offset[i];
1598     ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr);
1599   }
1600 
1601   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1602   C->factor    = FACTOR_LU;
1603   C->assembled = PETSC_TRUE;
1604   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1605   PetscFunctionReturn(0);
1606 }
1607 
1608 /* ------------------------------------------------------------*/
1609 /*
1610       Version for when blocks are 4 by 4
1611 */
1612 #undef __FUNC__
1613 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_4"
1614 int MatLUFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B)
1615 {
1616   Mat         C = *B;
1617   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1618   IS          isrow = b->row,isicol = b->icol;
1619   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1620   int         *ajtmpold,*ajtmp,nz,row;
1621   int         *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
1622   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1623   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1624   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
1625   MatScalar   p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
1626   MatScalar   m13,m14,m15,m16;
1627   MatScalar   *ba = b->a,*aa = a->a;
1628 
1629   PetscFunctionBegin;
1630   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1631   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1632   rtmp  = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1633 
1634   for (i=0; i<n; i++) {
1635     nz    = bi[i+1] - bi[i];
1636     ajtmp = bj + bi[i];
1637     for  (j=0; j<nz; j++) {
1638       x = rtmp+16*ajtmp[j];
1639       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
1640       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
1641     }
1642     /* load in initial (unfactored row) */
1643     idx      = r[i];
1644     nz       = ai[idx+1] - ai[idx];
1645     ajtmpold = aj + ai[idx];
1646     v        = aa + 16*ai[idx];
1647     for (j=0; j<nz; j++) {
1648       x    = rtmp+16*ic[ajtmpold[j]];
1649       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
1650       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
1651       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1652       x[14] = v[14]; x[15] = v[15];
1653       v    += 16;
1654     }
1655     row = *ajtmp++;
1656     while (row < i) {
1657       pc  = rtmp + 16*row;
1658       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1659       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
1660       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1661       p15 = pc[14]; p16 = pc[15];
1662       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1663           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1664           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1665           || p16 != 0.0) {
1666         pv = ba + 16*diag_offset[row];
1667         pj = bj + diag_offset[row] + 1;
1668         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1669         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
1670         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1671         x15 = pv[14]; x16 = pv[15];
1672         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
1673         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
1674         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
1675         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
1676 
1677         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
1678         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
1679         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
1680         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
1681 
1682         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
1683         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
1684         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
1685         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
1686 
1687         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
1688         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
1689         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
1690         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
1691 
1692         nz = bi[row+1] - diag_offset[row] - 1;
1693         pv += 16;
1694         for (j=0; j<nz; j++) {
1695           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
1696           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
1697           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1698           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
1699           x    = rtmp + 16*pj[j];
1700           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
1701           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
1702           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
1703           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
1704 
1705           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
1706           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
1707           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
1708           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
1709 
1710           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
1711           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
1712           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
1713           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
1714 
1715           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
1716           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
1717           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
1718           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
1719 
1720           pv   += 16;
1721         }
1722         PLogFlops(128*nz+112);
1723       }
1724       row = *ajtmp++;
1725     }
1726     /* finished row so stick it into b->a */
1727     pv = ba + 16*bi[i];
1728     pj = bj + bi[i];
1729     nz = bi[i+1] - bi[i];
1730     for (j=0; j<nz; j++) {
1731       x      = rtmp+16*pj[j];
1732       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1733       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
1734       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1735       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1736       pv   += 16;
1737     }
1738     /* invert diagonal block */
1739     w = ba + 16*diag_offset[i];
1740     ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr);
1741   }
1742 
1743   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1744   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1745   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1746   C->factor = FACTOR_LU;
1747   C->assembled = PETSC_TRUE;
1748   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
1749   PetscFunctionReturn(0);
1750 }
1751 /*
1752       Version for when blocks are 4 by 4 Using natural ordering
1753 */
1754 #undef __FUNC__
1755 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_4_NaturalOrdering"
1756 int MatLUFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B)
1757 {
1758   Mat         C = *B;
1759   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1760   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1761   int         *ajtmpold,*ajtmp,nz,row;
1762   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
1763   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1764   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1765   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
1766   MatScalar   p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
1767   MatScalar   m13,m14,m15,m16;
1768   MatScalar   *ba = b->a,*aa = a->a;
1769 
1770   PetscFunctionBegin;
1771   rtmp  = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1772 
1773   for (i=0; i<n; i++) {
1774     nz    = bi[i+1] - bi[i];
1775     ajtmp = bj + bi[i];
1776     for  (j=0; j<nz; j++) {
1777       x = rtmp+16*ajtmp[j];
1778       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
1779       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
1780     }
1781     /* load in initial (unfactored row) */
1782     nz       = ai[i+1] - ai[i];
1783     ajtmpold = aj + ai[i];
1784     v        = aa + 16*ai[i];
1785     for (j=0; j<nz; j++) {
1786       x    = rtmp+16*ajtmpold[j];
1787       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
1788       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
1789       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1790       x[14] = v[14]; x[15] = v[15];
1791       v    += 16;
1792     }
1793     row = *ajtmp++;
1794     while (row < i) {
1795       pc  = rtmp + 16*row;
1796       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1797       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
1798       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1799       p15 = pc[14]; p16 = pc[15];
1800       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1801           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1802           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1803           || p16 != 0.0) {
1804         pv = ba + 16*diag_offset[row];
1805         pj = bj + diag_offset[row] + 1;
1806         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1807         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
1808         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1809         x15 = pv[14]; x16 = pv[15];
1810         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
1811         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
1812         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
1813         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
1814 
1815         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
1816         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
1817         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
1818         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
1819 
1820         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
1821         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
1822         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
1823         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
1824 
1825         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
1826         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
1827         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
1828         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
1829 
1830         nz = bi[row+1] - diag_offset[row] - 1;
1831         pv += 16;
1832         for (j=0; j<nz; j++) {
1833           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
1834           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
1835           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1836           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
1837           x    = rtmp + 16*pj[j];
1838           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
1839           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
1840           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
1841           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
1842 
1843           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
1844           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
1845           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
1846           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
1847 
1848           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
1849           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
1850           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
1851           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
1852 
1853           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
1854           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
1855           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
1856           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
1857 
1858           pv   += 16;
1859         }
1860         PLogFlops(128*nz+112);
1861       }
1862       row = *ajtmp++;
1863     }
1864     /* finished row so stick it into b->a */
1865     pv = ba + 16*bi[i];
1866     pj = bj + bi[i];
1867     nz = bi[i+1] - bi[i];
1868     for (j=0; j<nz; j++) {
1869       x      = rtmp+16*pj[j];
1870       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1871       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
1872       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1873       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1874       pv   += 16;
1875     }
1876     /* invert diagonal block */
1877     w = ba + 16*diag_offset[i];
1878     ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr);
1879   }
1880 
1881   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1882   C->factor    = FACTOR_LU;
1883   C->assembled = PETSC_TRUE;
1884   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
1885   PetscFunctionReturn(0);
1886 }
1887 
1888 
1889 /* ------------------------------------------------------------*/
1890 /*
1891       Version for when blocks are 3 by 3
1892 */
1893 #undef __FUNC__
1894 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_3"
1895 int MatLUFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B)
1896 {
1897   Mat         C = *B;
1898   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1899   IS          isrow = b->row,isicol = b->icol;
1900   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1901   int         *ajtmpold,*ajtmp,nz,row,*ai=a->i,*aj=a->j;
1902   int         *diag_offset = b->diag,idx,*pj;
1903   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1904   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1905   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9;
1906   MatScalar   *ba = b->a,*aa = a->a;
1907 
1908   PetscFunctionBegin;
1909   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1910   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1911   rtmp  = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1912 
1913   for (i=0; i<n; i++) {
1914     nz    = bi[i+1] - bi[i];
1915     ajtmp = bj + bi[i];
1916     for  (j=0; j<nz; j++) {
1917       x = rtmp + 9*ajtmp[j];
1918       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
1919     }
1920     /* load in initial (unfactored row) */
1921     idx      = r[i];
1922     nz       = ai[idx+1] - ai[idx];
1923     ajtmpold = aj + ai[idx];
1924     v        = aa + 9*ai[idx];
1925     for (j=0; j<nz; j++) {
1926       x    = rtmp + 9*ic[ajtmpold[j]];
1927       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
1928       x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
1929       v    += 9;
1930     }
1931     row = *ajtmp++;
1932     while (row < i) {
1933       pc = rtmp + 9*row;
1934       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
1935       p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
1936       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1937           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
1938         pv = ba + 9*diag_offset[row];
1939         pj = bj + diag_offset[row] + 1;
1940         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
1941         x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
1942         pc[0] = m1 = p1*x1 + p4*x2 + p7*x3;
1943         pc[1] = m2 = p2*x1 + p5*x2 + p8*x3;
1944         pc[2] = m3 = p3*x1 + p6*x2 + p9*x3;
1945 
1946         pc[3] = m4 = p1*x4 + p4*x5 + p7*x6;
1947         pc[4] = m5 = p2*x4 + p5*x5 + p8*x6;
1948         pc[5] = m6 = p3*x4 + p6*x5 + p9*x6;
1949 
1950         pc[6] = m7 = p1*x7 + p4*x8 + p7*x9;
1951         pc[7] = m8 = p2*x7 + p5*x8 + p8*x9;
1952         pc[8] = m9 = p3*x7 + p6*x8 + p9*x9;
1953         nz = bi[row+1] - diag_offset[row] - 1;
1954         pv += 9;
1955         for (j=0; j<nz; j++) {
1956           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
1957           x5   = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
1958           x    = rtmp + 9*pj[j];
1959           x[0] -= m1*x1 + m4*x2 + m7*x3;
1960           x[1] -= m2*x1 + m5*x2 + m8*x3;
1961           x[2] -= m3*x1 + m6*x2 + m9*x3;
1962 
1963           x[3] -= m1*x4 + m4*x5 + m7*x6;
1964           x[4] -= m2*x4 + m5*x5 + m8*x6;
1965           x[5] -= m3*x4 + m6*x5 + m9*x6;
1966 
1967           x[6] -= m1*x7 + m4*x8 + m7*x9;
1968           x[7] -= m2*x7 + m5*x8 + m8*x9;
1969           x[8] -= m3*x7 + m6*x8 + m9*x9;
1970           pv   += 9;
1971         }
1972         PLogFlops(54*nz+36);
1973       }
1974       row = *ajtmp++;
1975     }
1976     /* finished row so stick it into b->a */
1977     pv = ba + 9*bi[i];
1978     pj = bj + bi[i];
1979     nz = bi[i+1] - bi[i];
1980     for (j=0; j<nz; j++) {
1981       x     = rtmp + 9*pj[j];
1982       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
1983       pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
1984       pv   += 9;
1985     }
1986     /* invert diagonal block */
1987     w = ba + 9*diag_offset[i];
1988     ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr);
1989   }
1990 
1991   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1992   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1993   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1994   C->factor = FACTOR_LU;
1995   C->assembled = PETSC_TRUE;
1996   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
1997   PetscFunctionReturn(0);
1998 }
1999 /*
2000       Version for when blocks are 3 by 3 Using natural ordering
2001 */
2002 #undef __FUNC__
2003 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_3_NaturalOrdering"
2004 int MatLUFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B)
2005 {
2006   Mat                C = *B;
2007   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
2008   int                ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
2009   int                *ajtmpold,*ajtmp,nz,row;
2010   int                *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
2011   MatScalar          *pv,*v,*rtmp,*pc,*w,*x;
2012   MatScalar          p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
2013   MatScalar          p5,p6,p7,p8,p9,x5,x6,x7,x8,x9;
2014   MatScalar          *ba = b->a,*aa = a->a;
2015 
2016   PetscFunctionBegin;
2017   rtmp  = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
2018 
2019   for (i=0; i<n; i++) {
2020     nz    = bi[i+1] - bi[i];
2021     ajtmp = bj + bi[i];
2022     for  (j=0; j<nz; j++) {
2023       x = rtmp+9*ajtmp[j];
2024       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = 0.0;
2025     }
2026     /* load in initial (unfactored row) */
2027     nz       = ai[i+1] - ai[i];
2028     ajtmpold = aj + ai[i];
2029     v        = aa + 9*ai[i];
2030     for (j=0; j<nz; j++) {
2031       x    = rtmp+9*ajtmpold[j];
2032       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
2033       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
2034       v    += 9;
2035     }
2036     row = *ajtmp++;
2037     while (row < i) {
2038       pc  = rtmp + 9*row;
2039       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
2040       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
2041       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
2042           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
2043         pv = ba + 9*diag_offset[row];
2044         pj = bj + diag_offset[row] + 1;
2045         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
2046         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
2047         pc[0] = m1 = p1*x1 + p4*x2 + p7*x3;
2048         pc[1] = m2 = p2*x1 + p5*x2 + p8*x3;
2049         pc[2] = m3 = p3*x1 + p6*x2 + p9*x3;
2050 
2051         pc[3] = m4 = p1*x4 + p4*x5 + p7*x6;
2052         pc[4] = m5 = p2*x4 + p5*x5 + p8*x6;
2053         pc[5] = m6 = p3*x4 + p6*x5 + p9*x6;
2054 
2055         pc[6] = m7 = p1*x7 + p4*x8 + p7*x9;
2056         pc[7] = m8 = p2*x7 + p5*x8 + p8*x9;
2057         pc[8] = m9 = p3*x7 + p6*x8 + p9*x9;
2058 
2059         nz = bi[row+1] - diag_offset[row] - 1;
2060         pv += 9;
2061         for (j=0; j<nz; j++) {
2062           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
2063           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
2064           x    = rtmp + 9*pj[j];
2065           x[0] -= m1*x1 + m4*x2 + m7*x3;
2066           x[1] -= m2*x1 + m5*x2 + m8*x3;
2067           x[2] -= m3*x1 + m6*x2 + m9*x3;
2068 
2069           x[3] -= m1*x4 + m4*x5 + m7*x6;
2070           x[4] -= m2*x4 + m5*x5 + m8*x6;
2071           x[5] -= m3*x4 + m6*x5 + m9*x6;
2072 
2073           x[6] -= m1*x7 + m4*x8 + m7*x9;
2074           x[7] -= m2*x7 + m5*x8 + m8*x9;
2075           x[8] -= m3*x7 + m6*x8 + m9*x9;
2076           pv   += 9;
2077         }
2078         PLogFlops(54*nz+36);
2079       }
2080       row = *ajtmp++;
2081     }
2082     /* finished row so stick it into b->a */
2083     pv = ba + 9*bi[i];
2084     pj = bj + bi[i];
2085     nz = bi[i+1] - bi[i];
2086     for (j=0; j<nz; j++) {
2087       x      = rtmp+9*pj[j];
2088       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
2089       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
2090       pv   += 9;
2091     }
2092     /* invert diagonal block */
2093     w = ba + 9*diag_offset[i];
2094     ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr);
2095   }
2096 
2097   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2098   C->factor    = FACTOR_LU;
2099   C->assembled = PETSC_TRUE;
2100   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2101   PetscFunctionReturn(0);
2102 }
2103 
2104 /* ------------------------------------------------------------*/
2105 /*
2106       Version for when blocks are 2 by 2
2107 */
2108 #undef __FUNC__
2109 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_2"
2110 int MatLUFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
2111 {
2112   Mat                C = *B;
2113   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
2114   IS                 isrow = b->row,isicol = b->icol;
2115   int                *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
2116   int                *ajtmpold,*ajtmp,nz,row;
2117   int                *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj;
2118   MatScalar          *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4;
2119   MatScalar          p1,p2,p3,p4;
2120   MatScalar          *ba = b->a,*aa = a->a;
2121 
2122   PetscFunctionBegin;
2123   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2124   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2125   rtmp  = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
2126 
2127   for (i=0; i<n; i++) {
2128     nz    = bi[i+1] - bi[i];
2129     ajtmp = bj + bi[i];
2130     for  (j=0; j<nz; j++) {
2131       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
2132     }
2133     /* load in initial (unfactored row) */
2134     idx      = r[i];
2135     nz       = ai[idx+1] - ai[idx];
2136     ajtmpold = aj + ai[idx];
2137     v        = aa + 4*ai[idx];
2138     for (j=0; j<nz; j++) {
2139       x    = rtmp+4*ic[ajtmpold[j]];
2140       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
2141       v    += 4;
2142     }
2143     row = *ajtmp++;
2144     while (row < i) {
2145       pc = rtmp + 4*row;
2146       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
2147       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
2148         pv = ba + 4*diag_offset[row];
2149         pj = bj + diag_offset[row] + 1;
2150         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
2151         pc[0] = m1 = p1*x1 + p3*x2;
2152         pc[1] = m2 = p2*x1 + p4*x2;
2153         pc[2] = m3 = p1*x3 + p3*x4;
2154         pc[3] = m4 = p2*x3 + p4*x4;
2155         nz = bi[row+1] - diag_offset[row] - 1;
2156         pv += 4;
2157         for (j=0; j<nz; j++) {
2158           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
2159           x    = rtmp + 4*pj[j];
2160           x[0] -= m1*x1 + m3*x2;
2161           x[1] -= m2*x1 + m4*x2;
2162           x[2] -= m1*x3 + m3*x4;
2163           x[3] -= m2*x3 + m4*x4;
2164           pv   += 4;
2165         }
2166         PLogFlops(16*nz+12);
2167       }
2168       row = *ajtmp++;
2169     }
2170     /* finished row so stick it into b->a */
2171     pv = ba + 4*bi[i];
2172     pj = bj + bi[i];
2173     nz = bi[i+1] - bi[i];
2174     for (j=0; j<nz; j++) {
2175       x     = rtmp+4*pj[j];
2176       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
2177       pv   += 4;
2178     }
2179     /* invert diagonal block */
2180     w = ba + 4*diag_offset[i];
2181     ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr);
2182     /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/
2183   }
2184 
2185   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2186   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2187   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2188   C->factor = FACTOR_LU;
2189   C->assembled = PETSC_TRUE;
2190   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2191   PetscFunctionReturn(0);
2192 }
2193 /*
2194       Version for when blocks are 2 by 2 Using natural ordering
2195 */
2196 #undef __FUNC__
2197 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
2198 int MatLUFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
2199 {
2200   Mat                C = *B;
2201   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
2202   int                ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
2203   int                *ajtmpold,*ajtmp,nz,row;
2204   int                *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
2205   MatScalar          *pv,*v,*rtmp,*pc,*w,*x;
2206   MatScalar          p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4;
2207   MatScalar          *ba = b->a,*aa = a->a;
2208 
2209   PetscFunctionBegin;
2210   rtmp  = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
2211 
2212   for (i=0; i<n; i++) {
2213     nz    = bi[i+1] - bi[i];
2214     ajtmp = bj + bi[i];
2215     for  (j=0; j<nz; j++) {
2216       x = rtmp+4*ajtmp[j];
2217       x[0]  = x[1]  = x[2]  = x[3]  = 0.0;
2218     }
2219     /* load in initial (unfactored row) */
2220     nz       = ai[i+1] - ai[i];
2221     ajtmpold = aj + ai[i];
2222     v        = aa + 4*ai[i];
2223     for (j=0; j<nz; j++) {
2224       x    = rtmp+4*ajtmpold[j];
2225       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
2226       v    += 4;
2227     }
2228     row = *ajtmp++;
2229     while (row < i) {
2230       pc  = rtmp + 4*row;
2231       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
2232       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
2233         pv = ba + 4*diag_offset[row];
2234         pj = bj + diag_offset[row] + 1;
2235         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
2236         pc[0] = m1 = p1*x1 + p3*x2;
2237         pc[1] = m2 = p2*x1 + p4*x2;
2238         pc[2] = m3 = p1*x3 + p3*x4;
2239         pc[3] = m4 = p2*x3 + p4*x4;
2240         nz = bi[row+1] - diag_offset[row] - 1;
2241         pv += 4;
2242         for (j=0; j<nz; j++) {
2243           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
2244           x    = rtmp + 4*pj[j];
2245           x[0] -= m1*x1 + m3*x2;
2246           x[1] -= m2*x1 + m4*x2;
2247           x[2] -= m1*x3 + m3*x4;
2248           x[3] -= m2*x3 + m4*x4;
2249           pv   += 4;
2250         }
2251         PLogFlops(16*nz+12);
2252       }
2253       row = *ajtmp++;
2254     }
2255     /* finished row so stick it into b->a */
2256     pv = ba + 4*bi[i];
2257     pj = bj + bi[i];
2258     nz = bi[i+1] - bi[i];
2259     for (j=0; j<nz; j++) {
2260       x      = rtmp+4*pj[j];
2261       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
2262       pv   += 4;
2263     }
2264     /* invert diagonal block */
2265     w = ba + 4*diag_offset[i];
2266     ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr);
2267     /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/
2268   }
2269 
2270   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2271   C->factor    = FACTOR_LU;
2272   C->assembled = PETSC_TRUE;
2273   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2274   PetscFunctionReturn(0);
2275 }
2276 
2277 /* ----------------------------------------------------------- */
2278 /*
2279      Version for when blocks are 1 by 1.
2280 */
2281 #undef __FUNC__
2282 #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_1"
2283 int MatLUFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
2284 {
2285   Mat                C = *B;
2286   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2287   IS                 ip = b->row;
2288   int                *rip,*riip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
2289   int                *ai = a->i,*aj = a->j;
2290   MatScalar          *rtmp;
2291   MatScalar          *ba = b->a,*aa = a->a;
2292   MatScalar          dk,uikdi;
2293   int                k,jmin,jmax,*jl,*il,vj,nexti,juj,ili;
2294 
2295   PetscFunctionBegin;
2296   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
2297   riip = rip;
2298 
2299   /* INITIALIZATION */
2300   /* il and jl record the first nonzero element in each row of the accessing
2301      window U(0:k, k:mbs-1).
2302      jl:    list of rows to be added to uneliminated rows
2303             i>= k: jl(i) is the first row to be added to row i
2304             i<  k: jl(i) is the row following row i in some list of rows
2305             jl(i) = mbs indicates the end of a list
2306      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2307             row i of U */
2308   rtmp  = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2309   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2310   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2311   for (i=0; i<mbs; i++) {
2312     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
2313   }
2314 
2315   /* FOR EACH ROW K */
2316   for (k = 0; k<mbs; k++){
2317 
2318     /* INITIALIZE K-TH ROW WITH ELEMENTS NONZERO IN ROW P(K) OF A */
2319     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
2320     if (jmin < jmax) {
2321       for (j = jmin; j < jmax; j++){
2322         vj = riip[aj[j]];
2323         if (k <= vj) rtmp[vj] = aa[j];
2324       }
2325     }
2326 
2327     /* MODIFY K-TH ROW BY ADDING IN THOSE ROWS I WITH U(I,K) NE 0
2328        FOR EACH ROW I TO BE ADDED IN */
2329     dk = rtmp[k];
2330     i = jl[k]; /* first row to be added to k_th row  */
2331     /* printf(" k=%d, pivot row = %d\n",k,i); */
2332 
2333     while (i < mbs){
2334       nexti = jl[i]; /* next row to be added to k_th row */
2335       /* printf("      pivot row = %d\n", nexti); */
2336 
2337       /* COMPUTE MULTIPLIER AND UPDATE DIAGONAL ELEMENT */
2338       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2339       uikdi = - ba[ili]*ba[i];
2340       dk += uikdi*ba[ili];
2341       ba[ili] = uikdi; /* update U(i,k) */
2342 
2343       /* ADD MULTIPLE OF ROW I TO K-TH ROW ... */
2344       jmin = ili + 1; jmax = bi[i+1];
2345       if (jmin < jmax){
2346         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
2347         /* ... AND ADD I TO ROW LIST FOR NEXT NONZERO ENTRY */
2348          il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2349          j     = bj[jmin];
2350          jl[i] = jl[j]; jl[j] = i; /* update jl */
2351       }
2352       i = nexti;
2353       /* printf("                  pivot row i=%d\n",i);  */
2354     }
2355 
2356     /* CHECK FOR ZERO PIVOT AND SAVE DIAGONAL ELEMENT */
2357     if (dk == 0.0){
2358       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,0,"Zero pivot");
2359     }
2360 
2361     /* SAVE NONZERO ENTRIES IN K-TH ROW OF U ... */
2362     ba[k] = 1.0/dk;
2363     jmin = bi[k]; jmax = bi[k+1];
2364     if (jmin < jmax) {
2365       for (j=jmin; j<jmax; j++){
2366          juj = bj[j]; ba[j] = rtmp[juj]; rtmp[juj] = 0.0;
2367       }
2368 
2369       /* ... AND ADD K TO ROW LIST FOR FIRST NONZERO ENTRY IN K-TH ROW */
2370       il[k] = jmin;
2371       i     = bj[jmin];
2372       jl[k] = jl[i]; jl[i] = k;
2373     }
2374   }
2375 
2376   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2377   ierr = PetscFree(il);CHKERRQ(ierr);
2378   ierr = PetscFree(jl);CHKERRQ(ierr);
2379 
2380   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
2381   C->factor    = FACTOR_LU;
2382   C->assembled = PETSC_TRUE;
2383   PLogFlops(b->mbs);
2384   PetscFunctionReturn(0);
2385 }
2386 
2387 #undef __FUNC__
2388 #define __FUNC__ "MatLUFactor_SeqSBAIJ"
2389 int MatLUFactor_SeqSBAIJ(Mat A,IS row,IS col,MatLUInfo *info)
2390 {
2391   Mat_SeqBAIJ    *mat = (Mat_SeqBAIJ*)A->data;
2392   int            ierr,refct;
2393   Mat            C;
2394   PetscOps *Abops;
2395   MatOps   Aops;
2396 
2397   PetscFunctionBegin;
2398   ierr = MatLUFactorSymbolic(A,row,col,info,&C);CHKERRQ(ierr);
2399   ierr = MatLUFactorNumeric(A,&C);CHKERRQ(ierr);
2400 
2401   /* free all the data structures from mat */
2402   ierr = PetscFree(mat->a);CHKERRQ(ierr);
2403   if (!mat->singlemalloc) {
2404     ierr = PetscFree(mat->i);CHKERRQ(ierr);
2405     ierr = PetscFree(mat->j);CHKERRQ(ierr);
2406   }
2407   if (mat->diag) {ierr = PetscFree(mat->diag);CHKERRQ(ierr);}
2408   if (mat->ilen) {ierr = PetscFree(mat->ilen);CHKERRQ(ierr);}
2409   if (mat->imax) {ierr = PetscFree(mat->imax);CHKERRQ(ierr);}
2410   if (mat->solve_work) {ierr = PetscFree(mat->solve_work);CHKERRQ(ierr);}
2411   if (mat->mult_work) {ierr = PetscFree(mat->mult_work);CHKERRQ(ierr);}
2412   if (mat->icol) {ierr = ISDestroy(mat->icol);CHKERRQ(ierr);}
2413   ierr = PetscFree(mat);CHKERRQ(ierr);
2414 
2415   ierr = MapDestroy(A->rmap);CHKERRQ(ierr);
2416   ierr = MapDestroy(A->cmap);CHKERRQ(ierr);
2417 
2418   /*
2419        This is horrible,horrible code. We need to keep the
2420     A pointers for the bops and ops but copy everything
2421     else from C.
2422   */
2423   Abops = A->bops;
2424   Aops  = A->ops;
2425   refct = A->refct;
2426   ierr  = PetscMemcpy(A,C,sizeof(struct _p_Mat));CHKERRQ(ierr);
2427   mat   = (Mat_SeqBAIJ*)A->data;
2428   PLogObjectParent(A,mat->icol);
2429 
2430   A->bops  = Abops;
2431   A->ops   = Aops;
2432   A->qlist = 0;
2433   A->refct = refct;
2434   /* copy over the type_name and name */
2435   ierr     = PetscStrallocpy(C->type_name,&A->type_name);CHKERRQ(ierr);
2436   ierr     = PetscStrallocpy(C->name,&A->name);CHKERRQ(ierr);
2437 
2438   PetscHeaderDestroy(C);
2439   PetscFunctionReturn(0);
2440 }
2441 
2442 
2443