xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 895147d2f268e2f35cdbefae1953318f3b79da40)
1 /* Using Modified Sparse Row (MSR) storage.
2 See page 85, "Iterative Methods ..." by Saad. */
3 
4 /*$Id: sbaijfact.c,v 1.49 2000/11/07 20:31:04 hzhang Exp hzhang $*/
5 /*
6     Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
7 */
8 #include "sbaij.h"
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/ilu.h"
12 #include "include/petscis.h"
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
16 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B)
17 {
18   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
19   int         *rip,ierr,i,mbs = a->mbs,*ai,*aj;
20   int         *jutmp,bs = a->bs,bs2=a->bs2;
21   int         m,nzi,realloc = 0;
22   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
23   PetscTruth  perm_identity;
24 
25   PetscFunctionBegin;
26   if (!perm) {
27     ierr = ISCreateStride(PETSC_COMM_SELF,mbs,0,1,&perm);CHKERRQ(ierr);
28   }
29   PetscValidHeaderSpecific(perm,IS_COOKIE);
30 
31   /* check whether perm is the identity mapping */
32   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
33   if (!perm_identity) a->permute = PETSC_TRUE;
34 
35   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
36 
37   if (perm_identity){ /* without permutation */
38     ai = a->i; aj = a->j;
39   } else {            /* non-trivial permutation */
40     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRA(ierr);
41     ai = a->inew; aj = a->jnew;
42   }
43 
44   /* initialization */
45   /* Don't know how many column pointers are needed so estimate.
46      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
47   iu   = (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(iu);
48   umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
49   ju   = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju);
50   iu[0] = mbs+1;
51   juptr = mbs;
52   jl =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
53   q  =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q);
54   for (i=0; i<mbs; i++){
55     jl[i] = mbs; q[i] = 0;
56   }
57 
58   /* for each row k */
59   for (k=0; k<mbs; k++){
60     nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
61     q[k] = mbs;
62     /* initialize nonzero structure of k-th row to row rip[k] of A */
63     jmin = ai[rip[k]];
64     jmax = ai[rip[k]+1];
65     for (j=jmin; j<jmax; j++){
66       vj = rip[aj[j]]; /* col. value */
67       if(vj > k){
68         qm = k;
69         do {
70           m  = qm; qm = q[m];
71         } while(qm < vj);
72         if (qm == vj) {
73           printf(" error: duplicate entry in A\n"); break;
74         }
75         nzk++;
76         q[m] = vj;
77         q[vj] = qm;
78       } /* if(vj > k) */
79     } /* for (j=jmin; j<jmax; j++) */
80 
81     /* modify nonzero structure of k-th row by computing fill-in
82        for each row i to be merged in */
83     i = k;
84     i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
85     /* printf(" next pivot row i=%d\n",i); */
86     while (i < mbs){
87       /* merge row i into k-th row */
88       nzi = iu[i+1] - (iu[i]+1);
89       jmin = iu[i] + 1; jmax = iu[i] + nzi;
90       qm = k;
91       for (j=jmin; j<jmax+1; j++){
92         vj = ju[j];
93         do {
94           m = qm; qm = q[m];
95         } while (qm < vj);
96         if (qm != vj){
97          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
98         }
99       }
100       i = jl[i]; /* next pivot row */
101     }
102 
103     /* add k to row list for first nonzero element in k-th row */
104     if (nzk > 0){
105       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
106       jl[k] = jl[i]; jl[i] = k;
107     }
108     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
109 
110     /* allocate more space to ju if needed */
111     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
112       /* estimate how much additional space we will need */
113       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
114       /* just double the memory each time */
115       maxadd = umax;
116       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
117       umax += maxadd;
118 
119       /* allocate a longer ju */
120       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
121       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
122       ierr  = PetscFree(ju);CHKERRQ(ierr);
123       ju    = jutmp;
124       realloc++; /* count how many times we realloc */
125     }
126 
127     /* save nonzero structure of k-th row in ju */
128     i=k;
129     jumin = juptr + 1; juptr += nzk;
130     for (j=jumin; j<juptr+1; j++){
131       i=q[i];
132       ju[j]=i;
133     }
134   }
135 
136   if (ai[mbs] != 0) {
137     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
138     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
139     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
140     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
141     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
142   } else {
143      PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
144   }
145 
146   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
147   ierr = PetscFree(q);CHKERRQ(ierr);
148   ierr = PetscFree(jl);CHKERRQ(ierr);
149 
150   /* put together the new matrix */
151   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
152   /* PLogObjectParent(*B,iperm); */
153   b = (Mat_SeqSBAIJ*)(*B)->data;
154   ierr = PetscFree(b->imax);CHKERRQ(ierr);
155   b->singlemalloc = PETSC_FALSE;
156   /* the next line frees the default space generated by the Create() */
157   ierr = PetscFree(b->a);CHKERRQ(ierr);
158   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
159   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a);
160   b->j          = ju;
161   b->i          = iu;
162   b->diag       = 0;
163   b->ilen       = 0;
164   b->imax       = 0;
165   b->row        = perm;
166   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
167   b->icol       = perm;
168   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
169   b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
170   /* In b structure:  Free imax, ilen, old a, old j.
171      Allocate idnew, solve_work, new a, new j */
172   PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
173   b->s_maxnz = b->s_nz = iu[mbs];
174 
175   (*B)->factor                 = FACTOR_CHOLESKY;
176   (*B)->info.factor_mallocs    = realloc;
177   (*B)->info.fill_ratio_given  = f;
178   if (ai[mbs] != 0) {
179     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
180   } else {
181     (*B)->info.fill_ratio_needed = 0.0;
182   }
183 
184   if (perm_identity){
185     switch (bs) {
186       case 1:
187         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
188         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
189         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
190         break;
191       case 2:
192         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
193         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
194         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
195         break;
196       case 3:
197         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
198         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
199         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
200         break;
201       case 4:
202         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
203         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
204         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
205         break;
206       case 5:
207         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
208         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
209         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
210         break;
211       case 6:
212         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
213         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
214         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
215         break;
216       case 7:
217         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
218         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
219         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
220       break;
221       default:
222         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
223         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
224         PLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
225       break;
226     }
227   }
228 
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNC__
233 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
234 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
235 {
236   Mat                C = *B;
237   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
238   IS                 perm = b->row;
239   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
240   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
241   int                bs=a->bs,bs2 = a->bs2;
242   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
243   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
244   MatScalar          *work;
245   int                *pivots;
246 
247   PetscFunctionBegin;
248 
249   /* initialization */
250   rtmp  = (MatScalar*)PetscMalloc(bs2*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
251   ierr  = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
252   il    = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
253   jl    = il + mbs;
254   for (i=0; i<mbs; i++) {
255     jl[i] = mbs; il[0] = 0;
256   }
257   dk    = (MatScalar*)PetscMalloc((2*bs2+bs)*sizeof(MatScalar));CHKPTRQ(dk);
258   uik   = dk + bs2;
259   work  = uik + bs2;
260   pivots= (int*)PetscMalloc(bs*sizeof(int));CHKPTRQ(pivots);
261 
262   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
263 
264   /* check permutation */
265   if (!a->permute){
266     ai = a->i; aj = a->j; aa = a->a;
267   } else {
268     ai = a->inew; aj = a->jnew;
269     aa = (MatScalar*)PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
270     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
271     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
272     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
273 
274     for (i=0; i<mbs; i++){
275       jmin = ai[i]; jmax = ai[i+1];
276       for (j=jmin; j<jmax; j++){
277         while (a2anew[j] != j){
278           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
279           for (k1=0; k1<bs2; k1++){
280             dk[k1]       = aa[k*bs2+k1];
281             aa[k*bs2+k1] = aa[j*bs2+k1];
282             aa[j*bs2+k1] = dk[k1];
283           }
284         }
285         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
286         if (i > aj[j]){
287           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
288           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
289           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
290           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
291             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
292           }
293         }
294       }
295     }
296     ierr = PetscFree(a2anew);CHKERRA(ierr);
297   }
298 
299   /* for each row k */
300   for (k = 0; k<mbs; k++){
301 
302     /*initialize k-th row with elements nonzero in row perm(k) of A */
303     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
304     if (jmin < jmax) {
305       ap = aa + jmin*bs2;
306       for (j = jmin; j < jmax; j++){
307         vj = perm_ptr[aj[j]];         /* block col. index */
308         rtmp_ptr = rtmp + vj*bs2;
309         for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
310       }
311     }
312 
313     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
314     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
315     i = jl[k]; /* first row to be added to k_th row  */
316 
317     while (i < mbs){
318       nexti = jl[i]; /* next row to be added to k_th row */
319 
320       /* compute multiplier */
321       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
322 
323       /* uik = -inv(Di)*U_bar(i,k) */
324       diag = ba + i*bs2;
325       u    = ba + ili*bs2;
326       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
327       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
328 
329       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
330       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
331 
332       /* update -U(i,k) */
333       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
334 
335       /* add multiple of row i to k-th row ... */
336       jmin = ili + 1; jmax = bi[i+1];
337       if (jmin < jmax){
338         for (j=jmin; j<jmax; j++) {
339           /* rtmp += -U(i,k)^T * U_bar(i,j) */
340           rtmp_ptr = rtmp + bj[j]*bs2;
341           u = ba + j*bs2;
342           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
343         }
344 
345         /* ... add i to row list for next nonzero entry */
346         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
347         j     = bj[jmin];
348         jl[i] = jl[j]; jl[j] = i; /* update jl */
349       }
350       i = nexti;
351     }
352 
353     /* save nonzero entries in k-th row of U ... */
354 
355     /* invert diagonal block */
356     diag = ba+k*bs2;
357     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
358     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
359 
360     jmin = bi[k]; jmax = bi[k+1];
361     if (jmin < jmax) {
362       for (j=jmin; j<jmax; j++){
363          vj = bj[j];           /* block col. index of U */
364          u   = ba + j*bs2;
365          rtmp_ptr = rtmp + vj*bs2;
366          for (k1=0; k1<bs2; k1++){
367            *u++        = *rtmp_ptr;
368            *rtmp_ptr++ = 0.0;
369          }
370       }
371 
372       /* ... add k to row list for first nonzero entry in k-th row */
373       il[k] = jmin;
374       i     = bj[jmin];
375       jl[k] = jl[i]; jl[i] = k;
376     }
377   }
378 
379   ierr = PetscFree(rtmp);CHKERRQ(ierr);
380   ierr = PetscFree(il);CHKERRQ(ierr);
381   ierr = PetscFree(dk);CHKERRQ(ierr);
382   ierr = PetscFree(pivots);CHKERRQ(ierr);
383   if (a->permute){
384     ierr = PetscFree(aa);CHKERRQ(ierr);
385   }
386 
387   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
388   C->factor    = FACTOR_CHOLESKY;
389   C->assembled = PETSC_TRUE;
390   C->preallocated = PETSC_TRUE;
391   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
392   PetscFunctionReturn(0);
393 }
394 
395 #undef __FUNC__
396 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
397 int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
398 {
399   Mat                C = *B;
400   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
401   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
402   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
403   int                bs=a->bs,bs2 = a->bs2;
404   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
405   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
406   MatScalar          *work;
407   int                *pivots;
408 
409   PetscFunctionBegin;
410 
411   /* initialization */
412 
413   rtmp  = (MatScalar*)PetscMalloc(bs2*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
414   ierr  = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
415   il    = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
416   jl    = il + mbs;
417   for (i=0; i<mbs; i++) {
418     jl[i] = mbs; il[0] = 0;
419   }
420   dk    = (MatScalar*)PetscMalloc((2*bs2+bs)*sizeof(MatScalar));CHKPTRQ(dk);
421   uik   = dk + bs2;
422   work  = uik + bs2;
423   pivots= (int*)PetscMalloc(bs*sizeof(int));CHKPTRQ(pivots);
424 
425   ai = a->i; aj = a->j; aa = a->a;
426 
427   /* for each row k */
428   for (k = 0; k<mbs; k++){
429 
430     /*initialize k-th row with elements nonzero in row k of A */
431     jmin = ai[k]; jmax = ai[k+1];
432     if (jmin < jmax) {
433       ap = aa + jmin*bs2;
434       for (j = jmin; j < jmax; j++){
435         vj = aj[j];         /* block col. index */
436         rtmp_ptr = rtmp + vj*bs2;
437         for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
438       }
439     }
440 
441     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
442     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
443     i = jl[k]; /* first row to be added to k_th row  */
444 
445     while (i < mbs){
446       nexti = jl[i]; /* next row to be added to k_th row */
447 
448       /* compute multiplier */
449       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
450 
451       /* uik = -inv(Di)*U_bar(i,k) */
452       diag = ba + i*bs2;
453       u    = ba + ili*bs2;
454       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
455       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
456 
457       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
458       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
459 
460       /* update -U(i,k) */
461       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
462 
463       /* add multiple of row i to k-th row ... */
464       jmin = ili + 1; jmax = bi[i+1];
465       if (jmin < jmax){
466         for (j=jmin; j<jmax; j++) {
467           /* rtmp += -U(i,k)^T * U_bar(i,j) */
468           rtmp_ptr = rtmp + bj[j]*bs2;
469           u = ba + j*bs2;
470           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
471         }
472 
473         /* ... add i to row list for next nonzero entry */
474         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
475         j     = bj[jmin];
476         jl[i] = jl[j]; jl[j] = i; /* update jl */
477       }
478       i = nexti;
479     }
480 
481     /* save nonzero entries in k-th row of U ... */
482 
483     /* invert diagonal block */
484     diag = ba+k*bs2;
485     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
486     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
487 
488     jmin = bi[k]; jmax = bi[k+1];
489     if (jmin < jmax) {
490       for (j=jmin; j<jmax; j++){
491          vj = bj[j];           /* block col. index of U */
492          u   = ba + j*bs2;
493          rtmp_ptr = rtmp + vj*bs2;
494          for (k1=0; k1<bs2; k1++){
495            *u++        = *rtmp_ptr;
496            *rtmp_ptr++ = 0.0;
497          }
498       }
499 
500       /* ... add k to row list for first nonzero entry in k-th row */
501       il[k] = jmin;
502       i     = bj[jmin];
503       jl[k] = jl[i]; jl[i] = k;
504     }
505   }
506 
507   ierr = PetscFree(rtmp);CHKERRQ(ierr);
508   ierr = PetscFree(il);CHKERRQ(ierr);
509   ierr = PetscFree(dk);CHKERRQ(ierr);
510   ierr = PetscFree(pivots);CHKERRQ(ierr);
511 
512   C->factor    = FACTOR_CHOLESKY;
513   C->assembled = PETSC_TRUE;
514   C->preallocated = PETSC_TRUE;
515   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
516   PetscFunctionReturn(0);
517 }
518 
519 /* Version for when blocks are 7 by 7 */
520 #undef __FUNC__
521 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7"
522 int MatCholeskyFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B)
523 {
524   Mat                C = *B;
525   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
526   IS                 perm = b->row;
527   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
528   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
529   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
530   MatScalar          *u,*d,*w,*wp;
531 
532   PetscFunctionBegin;
533   /* initialization */
534   w  = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w);
535   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
536   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
537   jl = il + mbs;
538   for (i=0; i<mbs; i++) {
539     jl[i] = mbs; il[0] = 0;
540   }
541   dk    = (MatScalar*)PetscMalloc(98*sizeof(MatScalar));CHKPTRQ(dk);
542   uik   = dk + 49;
543   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
544 
545   /* check permutation */
546   if (!a->permute){
547     ai = a->i; aj = a->j; aa = a->a;
548   } else {
549     ai = a->inew; aj = a->jnew;
550     aa = (MatScalar*)PetscMalloc(49*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
551     ierr = PetscMemcpy(aa,a->a,49*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
552     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
553     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
554 
555     for (i=0; i<mbs; i++){
556       jmin = ai[i]; jmax = ai[i+1];
557       for (j=jmin; j<jmax; j++){
558         while (a2anew[j] != j){
559           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
560           for (k1=0; k1<49; k1++){
561             dk[k1]       = aa[k*49+k1];
562             aa[k*49+k1] = aa[j*49+k1];
563             aa[j*49+k1] = dk[k1];
564           }
565         }
566         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
567         if (i > aj[j]){
568           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
569           ap = aa + j*49;                     /* ptr to the beginning of j-th block of aa */
570           for (k=0; k<49; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
571           for (k=0; k<7; k++){               /* j-th block of aa <- dk^T */
572             for (k1=0; k1<7; k1++) *ap++ = dk[k + 7*k1];
573           }
574         }
575       }
576     }
577     ierr = PetscFree(a2anew);CHKERRA(ierr);
578   }
579 
580   /* for each row k */
581   for (k = 0; k<mbs; k++){
582 
583     /*initialize k-th row with elements nonzero in row perm(k) of A */
584     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
585     if (jmin < jmax) {
586       ap = aa + jmin*49;
587       for (j = jmin; j < jmax; j++){
588         vj = perm_ptr[aj[j]];         /* block col. index */
589         wp = w + vj*49;
590         for (i=0; i<49; i++) *wp++ = *ap++;
591       }
592     }
593 
594     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
595     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
596     i = jl[k]; /* first row to be added to k_th row  */
597 
598     while (i < mbs){
599       nexti = jl[i]; /* next row to be added to k_th row */
600 
601       /* compute multiplier */
602       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
603 
604       /* uik = -inv(Di)*U_bar(i,k) */
605       d = ba + i*49;
606       u    = ba + ili*49;
607 
608       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
609       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
610       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
611       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
612       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
613       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
614       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
615 
616       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
617       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
618       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
619       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
620       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
621       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
622       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
623 
624       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
625       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
626       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
627       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
628       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
629       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
630       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
631 
632       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
633       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
634       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
635       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
636       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
637       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
638       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
639 
640       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
641       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
642       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
643       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
644       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
645       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
646       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
647 
648       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
649       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
650       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
651       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
652       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
653       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
654       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
655 
656       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
657       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
658       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
659       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
660       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
661       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
662       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
663 
664       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
665       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
666       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
667       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
668       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
669       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
670       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
671       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
672 
673       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
674       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
675       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
676       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
677       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
678       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
679       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
680 
681       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
682       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
683       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
684       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
685       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
686       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
687       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
688 
689       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
690       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
691       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
692       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
693       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
694       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
695       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
696 
697       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
698       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
699       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
700       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
701       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
702       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
703       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
704 
705       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
706       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
707       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
708       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
709       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
710       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
711       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
712 
713       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
714       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
715       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
716       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
717       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
718       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
719       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
720 
721       /* update -U(i,k) */
722       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
723 
724       /* add multiple of row i to k-th row ... */
725       jmin = ili + 1; jmax = bi[i+1];
726       if (jmin < jmax){
727         for (j=jmin; j<jmax; j++) {
728           /* w += -U(i,k)^T * U_bar(i,j) */
729           wp = w + bj[j]*49;
730           u = ba + j*49;
731 
732           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
733           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
734           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
735           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
736           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
737           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
738           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
739 
740           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
741           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
742           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
743           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
744           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
745           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
746           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
747 
748           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
749           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
750           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
751           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
752           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
753           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
754           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
755 
756           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
757           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
758           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
759           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
760           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
761           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
762           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
763 
764           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
765           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
766           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
767           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
768           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
769           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
770           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
771 
772           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
773           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
774           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
775           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
776           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
777           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
778           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
779 
780           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
781           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
782           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
783           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
784           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
785           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
786           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
787         }
788 
789         /* ... add i to row list for next nonzero entry */
790         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
791         j     = bj[jmin];
792         jl[i] = jl[j]; jl[j] = i; /* update jl */
793       }
794       i = nexti;
795     }
796 
797     /* save nonzero entries in k-th row of U ... */
798 
799     /* invert diagonal block */
800     d = ba+k*49;
801     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
802     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
803 
804     jmin = bi[k]; jmax = bi[k+1];
805     if (jmin < jmax) {
806       for (j=jmin; j<jmax; j++){
807          vj = bj[j];           /* block col. index of U */
808          u   = ba + j*49;
809          wp = w + vj*49;
810          for (k1=0; k1<49; k1++){
811            *u++        = *wp;
812            *wp++ = 0.0;
813          }
814       }
815 
816       /* ... add k to row list for first nonzero entry in k-th row */
817       il[k] = jmin;
818       i     = bj[jmin];
819       jl[k] = jl[i]; jl[i] = k;
820     }
821   }
822 
823   ierr = PetscFree(w);CHKERRQ(ierr);
824   ierr = PetscFree(il);CHKERRQ(ierr);
825   ierr = PetscFree(dk);CHKERRQ(ierr);
826   if (a->permute){
827     ierr = PetscFree(aa);CHKERRQ(ierr);
828   }
829 
830   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
831   C->factor    = FACTOR_CHOLESKY;
832   C->assembled = PETSC_TRUE;
833   C->preallocated = PETSC_TRUE;
834   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
835   PetscFunctionReturn(0);
836 }
837 
838 /*
839       Version for when blocks are 7 by 7 Using natural ordering
840 */
841 #undef __FUNC__
842 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering"
843 int MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B)
844 {
845   Mat                C = *B;
846   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
847   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
848   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
849   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
850   MatScalar          *u,*d,*w,*wp;
851 
852   PetscFunctionBegin;
853 
854   /* initialization */
855   w  = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w);
856   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
857   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
858   jl = il + mbs;
859   for (i=0; i<mbs; i++) {
860     jl[i] = mbs; il[0] = 0;
861   }
862   dk    = (MatScalar*)PetscMalloc(98*sizeof(MatScalar));CHKPTRQ(dk);
863   uik   = dk + 49;
864 
865   ai = a->i; aj = a->j; aa = a->a;
866 
867   /* for each row k */
868   for (k = 0; k<mbs; k++){
869 
870     /*initialize k-th row with elements nonzero in row k of A */
871     jmin = ai[k]; jmax = ai[k+1];
872     if (jmin < jmax) {
873       ap = aa + jmin*49;
874       for (j = jmin; j < jmax; j++){
875         vj = aj[j];         /* block col. index */
876         wp = w + vj*49;
877         for (i=0; i<49; i++) *wp++ = *ap++;
878       }
879     }
880 
881     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
882     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
883     i = jl[k]; /* first row to be added to k_th row  */
884 
885     while (i < mbs){
886       nexti = jl[i]; /* next row to be added to k_th row */
887 
888       /* compute multiplier */
889       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
890 
891       /* uik = -inv(Di)*U_bar(i,k) */
892       d = ba + i*49;
893       u    = ba + ili*49;
894 
895       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
896       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
897       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
898       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
899       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
900       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
901       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
902 
903       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
904       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
905       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
906       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
907       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
908       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
909       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
910 
911       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
912       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
913       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
914       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
915       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
916       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
917       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
918 
919       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
920       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
921       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
922       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
923       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
924       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
925       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
926 
927       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
928       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
929       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
930       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
931       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
932       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
933       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
934 
935       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
936       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
937       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
938       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
939       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
940       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
941       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
942 
943       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
944       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
945       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
946       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
947       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
948       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
949       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
950 
951       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
952       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
953       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
954       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
955       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
956       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
957       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
958       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
959 
960       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
961       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
962       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
963       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
964       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
965       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
966       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
967 
968       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
969       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
970       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
971       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
972       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
973       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
974       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
975 
976       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
977       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
978       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
979       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
980       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
981       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
982       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
983 
984       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
985       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
986       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
987       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
988       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
989       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
990       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
991 
992       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
993       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
994       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
995       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
996       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
997       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
998       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
999 
1000       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
1001       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
1002       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
1003       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
1004       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
1005       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
1006       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
1007 
1008       /* update -U(i,k) */
1009       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
1010 
1011       /* add multiple of row i to k-th row ... */
1012       jmin = ili + 1; jmax = bi[i+1];
1013       if (jmin < jmax){
1014         for (j=jmin; j<jmax; j++) {
1015           /* w += -U(i,k)^T * U_bar(i,j) */
1016           wp = w + bj[j]*49;
1017           u = ba + j*49;
1018 
1019           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
1020           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
1021           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
1022           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
1023           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
1024           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
1025           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
1026 
1027           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
1028           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
1029           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
1030           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
1031           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
1032           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
1033           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
1034 
1035           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
1036           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
1037           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
1038           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
1039           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
1040           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
1041           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
1042 
1043           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
1044           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
1045           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
1046           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
1047           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
1048           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
1049           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
1050 
1051           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
1052           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
1053           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
1054           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
1055           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
1056           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
1057           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
1058 
1059           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
1060           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
1061           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
1062           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
1063           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
1064           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
1065           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
1066 
1067           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
1068           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
1069           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
1070           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
1071           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
1072           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
1073           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
1074         }
1075 
1076         /* ... add i to row list for next nonzero entry */
1077         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1078         j     = bj[jmin];
1079         jl[i] = jl[j]; jl[j] = i; /* update jl */
1080       }
1081       i = nexti;
1082     }
1083 
1084     /* save nonzero entries in k-th row of U ... */
1085 
1086     /* invert diagonal block */
1087     d = ba+k*49;
1088     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
1089     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
1090 
1091     jmin = bi[k]; jmax = bi[k+1];
1092     if (jmin < jmax) {
1093       for (j=jmin; j<jmax; j++){
1094          vj = bj[j];           /* block col. index of U */
1095          u   = ba + j*49;
1096          wp = w + vj*49;
1097          for (k1=0; k1<49; k1++){
1098            *u++        = *wp;
1099            *wp++ = 0.0;
1100          }
1101       }
1102 
1103       /* ... add k to row list for first nonzero entry in k-th row */
1104       il[k] = jmin;
1105       i     = bj[jmin];
1106       jl[k] = jl[i]; jl[i] = k;
1107     }
1108   }
1109 
1110   ierr = PetscFree(w);CHKERRQ(ierr);
1111   ierr = PetscFree(il);CHKERRQ(ierr);
1112   ierr = PetscFree(dk);CHKERRQ(ierr);
1113 
1114   C->factor    = FACTOR_CHOLESKY;
1115   C->assembled = PETSC_TRUE;
1116   C->preallocated = PETSC_TRUE;
1117   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
1118   PetscFunctionReturn(0);
1119 }
1120 
1121 /* Version for when blocks are 6 by 6 */
1122 #undef __FUNC__
1123 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6"
1124 int MatCholeskyFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B)
1125 {
1126   Mat                C = *B;
1127   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1128   IS                 perm = b->row;
1129   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1130   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1131   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1132   MatScalar          *u,*d,*w,*wp;
1133 
1134   PetscFunctionBegin;
1135   /* initialization */
1136   w  = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w);
1137   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1138   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
1139   jl = il + mbs;
1140   for (i=0; i<mbs; i++) {
1141     jl[i] = mbs; il[0] = 0;
1142   }
1143   dk    = (MatScalar*)PetscMalloc(72*sizeof(MatScalar));CHKPTRQ(dk);
1144   uik   = dk + 36;
1145   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
1146 
1147   /* check permutation */
1148   if (!a->permute){
1149     ai = a->i; aj = a->j; aa = a->a;
1150   } else {
1151     ai = a->inew; aj = a->jnew;
1152     aa = (MatScalar*)PetscMalloc(36*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
1153     ierr = PetscMemcpy(aa,a->a,36*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1154     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
1155     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1156 
1157     for (i=0; i<mbs; i++){
1158       jmin = ai[i]; jmax = ai[i+1];
1159       for (j=jmin; j<jmax; j++){
1160         while (a2anew[j] != j){
1161           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1162           for (k1=0; k1<36; k1++){
1163             dk[k1]       = aa[k*36+k1];
1164             aa[k*36+k1] = aa[j*36+k1];
1165             aa[j*36+k1] = dk[k1];
1166           }
1167         }
1168         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1169         if (i > aj[j]){
1170           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1171           ap = aa + j*36;                     /* ptr to the beginning of j-th block of aa */
1172           for (k=0; k<36; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1173           for (k=0; k<6; k++){               /* j-th block of aa <- dk^T */
1174             for (k1=0; k1<6; k1++) *ap++ = dk[k + 6*k1];
1175           }
1176         }
1177       }
1178     }
1179     ierr = PetscFree(a2anew);CHKERRA(ierr);
1180   }
1181 
1182   /* for each row k */
1183   for (k = 0; k<mbs; k++){
1184 
1185     /*initialize k-th row with elements nonzero in row perm(k) of A */
1186     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1187     if (jmin < jmax) {
1188       ap = aa + jmin*36;
1189       for (j = jmin; j < jmax; j++){
1190         vj = perm_ptr[aj[j]];         /* block col. index */
1191         wp = w + vj*36;
1192         for (i=0; i<36; i++) *wp++ = *ap++;
1193       }
1194     }
1195 
1196     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1197     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1198     i = jl[k]; /* first row to be added to k_th row  */
1199 
1200     while (i < mbs){
1201       nexti = jl[i]; /* next row to be added to k_th row */
1202 
1203       /* compute multiplier */
1204       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1205 
1206       /* uik = -inv(Di)*U_bar(i,k) */
1207       d = ba + i*36;
1208       u    = ba + ili*36;
1209 
1210       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1211       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1212       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1213       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1214       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1215       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1216 
1217       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1218       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1219       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1220       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1221       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1222       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1223 
1224       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1225       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1226       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1227       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1228       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1229       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1230 
1231       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1232       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1233       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1234       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1235       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1236       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1237 
1238       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1239       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1240       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1241       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1242       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1243       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1244 
1245       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1246       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1247       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1248       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1249       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1250       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1251 
1252       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1253       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1254       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1255       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1256       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1257       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1258       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1259 
1260       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1261       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1262       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1263       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1264       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1265       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1266 
1267       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1268       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1269       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1270       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1271       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1272       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1273 
1274       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1275       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1276       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1277       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1278       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1279       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1280 
1281       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1282       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1283       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1284       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1285       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1286       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1287 
1288       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1289       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1290       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1291       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1292       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1293       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1294 
1295       /* update -U(i,k) */
1296       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1297 
1298       /* add multiple of row i to k-th row ... */
1299       jmin = ili + 1; jmax = bi[i+1];
1300       if (jmin < jmax){
1301         for (j=jmin; j<jmax; j++) {
1302           /* w += -U(i,k)^T * U_bar(i,j) */
1303           wp = w + bj[j]*36;
1304           u = ba + j*36;
1305           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1306           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1307           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1308           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1309           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1310           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1311 
1312           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1313           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1314           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1315           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1316           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1317           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1318 
1319           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1320           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1321           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1322           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1323           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1324           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1325 
1326           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1327           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1328           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1329           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1330           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1331           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1332 
1333           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1334           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1335           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1336           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1337           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1338           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1339 
1340           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1341           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1342           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1343           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1344           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1345           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1346         }
1347 
1348         /* ... add i to row list for next nonzero entry */
1349         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1350         j     = bj[jmin];
1351         jl[i] = jl[j]; jl[j] = i; /* update jl */
1352       }
1353       i = nexti;
1354     }
1355 
1356     /* save nonzero entries in k-th row of U ... */
1357 
1358     /* invert diagonal block */
1359     d = ba+k*36;
1360     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1361     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1362 
1363     jmin = bi[k]; jmax = bi[k+1];
1364     if (jmin < jmax) {
1365       for (j=jmin; j<jmax; j++){
1366          vj = bj[j];           /* block col. index of U */
1367          u   = ba + j*36;
1368          wp = w + vj*36;
1369          for (k1=0; k1<36; k1++){
1370            *u++        = *wp;
1371            *wp++ = 0.0;
1372          }
1373       }
1374 
1375       /* ... add k to row list for first nonzero entry in k-th row */
1376       il[k] = jmin;
1377       i     = bj[jmin];
1378       jl[k] = jl[i]; jl[i] = k;
1379     }
1380   }
1381 
1382   ierr = PetscFree(w);CHKERRQ(ierr);
1383   ierr = PetscFree(il);CHKERRQ(ierr);
1384   ierr = PetscFree(dk);CHKERRQ(ierr);
1385   if (a->permute){
1386     ierr = PetscFree(aa);CHKERRQ(ierr);
1387   }
1388 
1389   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1390   C->factor    = FACTOR_CHOLESKY;
1391   C->assembled = PETSC_TRUE;
1392   C->preallocated = PETSC_TRUE;
1393   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1394   PetscFunctionReturn(0);
1395 }
1396 
1397 /*
1398       Version for when blocks are 6 by 6 Using natural ordering
1399 */
1400 #undef __FUNC__
1401 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering"
1402 int MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B)
1403 {
1404   Mat                C = *B;
1405   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1406   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1407   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1408   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1409   MatScalar          *u,*d,*w,*wp;
1410 
1411   PetscFunctionBegin;
1412 
1413   /* initialization */
1414   w  = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w);
1415   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1416   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
1417   jl = il + mbs;
1418   for (i=0; i<mbs; i++) {
1419     jl[i] = mbs; il[0] = 0;
1420   }
1421   dk    = (MatScalar*)PetscMalloc(72*sizeof(MatScalar));CHKPTRQ(dk);
1422   uik   = dk + 36;
1423 
1424   ai = a->i; aj = a->j; aa = a->a;
1425 
1426   /* for each row k */
1427   for (k = 0; k<mbs; k++){
1428 
1429     /*initialize k-th row with elements nonzero in row k of A */
1430     jmin = ai[k]; jmax = ai[k+1];
1431     if (jmin < jmax) {
1432       ap = aa + jmin*36;
1433       for (j = jmin; j < jmax; j++){
1434         vj = aj[j];         /* block col. index */
1435         wp = w + vj*36;
1436         for (i=0; i<36; i++) *wp++ = *ap++;
1437       }
1438     }
1439 
1440     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1441     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1442     i = jl[k]; /* first row to be added to k_th row  */
1443 
1444     while (i < mbs){
1445       nexti = jl[i]; /* next row to be added to k_th row */
1446 
1447       /* compute multiplier */
1448       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1449 
1450       /* uik = -inv(Di)*U_bar(i,k) */
1451       d = ba + i*36;
1452       u    = ba + ili*36;
1453 
1454       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1455       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1456       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1457       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1458       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1459       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1460 
1461       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1462       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1463       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1464       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1465       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1466       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1467 
1468       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1469       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1470       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1471       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1472       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1473       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1474 
1475       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1476       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1477       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1478       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1479       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1480       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1481 
1482       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1483       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1484       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1485       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1486       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1487       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1488 
1489       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1490       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1491       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1492       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1493       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1494       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1495 
1496       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1497       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1498       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1499       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1500       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1501       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1502       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1503 
1504       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1505       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1506       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1507       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1508       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1509       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1510 
1511       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1512       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1513       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1514       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1515       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1516       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1517 
1518       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1519       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1520       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1521       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1522       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1523       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1524 
1525       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1526       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1527       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1528       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1529       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1530       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1531 
1532       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1533       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1534       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1535       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1536       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1537       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1538 
1539       /* update -U(i,k) */
1540       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1541 
1542       /* add multiple of row i to k-th row ... */
1543       jmin = ili + 1; jmax = bi[i+1];
1544       if (jmin < jmax){
1545         for (j=jmin; j<jmax; j++) {
1546           /* w += -U(i,k)^T * U_bar(i,j) */
1547           wp = w + bj[j]*36;
1548           u = ba + j*36;
1549           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1550           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1551           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1552           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1553           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1554           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1555 
1556           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1557           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1558           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1559           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1560           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1561           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1562 
1563           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1564           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1565           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1566           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1567           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1568           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1569 
1570           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1571           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1572           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1573           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1574           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1575           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1576 
1577           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1578           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1579           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1580           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1581           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1582           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1583 
1584           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1585           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1586           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1587           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1588           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1589           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1590         }
1591 
1592         /* ... add i to row list for next nonzero entry */
1593         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1594         j     = bj[jmin];
1595         jl[i] = jl[j]; jl[j] = i; /* update jl */
1596       }
1597       i = nexti;
1598     }
1599 
1600     /* save nonzero entries in k-th row of U ... */
1601 
1602     /* invert diagonal block */
1603     d = ba+k*36;
1604     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1605     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1606 
1607     jmin = bi[k]; jmax = bi[k+1];
1608     if (jmin < jmax) {
1609       for (j=jmin; j<jmax; j++){
1610          vj = bj[j];           /* block col. index of U */
1611          u   = ba + j*36;
1612          wp = w + vj*36;
1613          for (k1=0; k1<36; k1++){
1614            *u++        = *wp;
1615            *wp++ = 0.0;
1616          }
1617       }
1618 
1619       /* ... add k to row list for first nonzero entry in k-th row */
1620       il[k] = jmin;
1621       i     = bj[jmin];
1622       jl[k] = jl[i]; jl[i] = k;
1623     }
1624   }
1625 
1626   ierr = PetscFree(w);CHKERRQ(ierr);
1627   ierr = PetscFree(il);CHKERRQ(ierr);
1628   ierr = PetscFree(dk);CHKERRQ(ierr);
1629 
1630   C->factor    = FACTOR_CHOLESKY;
1631   C->assembled = PETSC_TRUE;
1632   C->preallocated = PETSC_TRUE;
1633   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 /* Version for when blocks are 5 by 5  */
1638 #undef __FUNC__
1639 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5"
1640 int MatCholeskyFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B)
1641 {
1642   Mat                C = *B;
1643   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1644   IS                 perm = b->row;
1645   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1646   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1647   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1648   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1649 
1650   PetscFunctionBegin;
1651   /* initialization */
1652   rtmp  = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1653   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1654   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
1655   jl = il + mbs;
1656   for (i=0; i<mbs; i++) {
1657     jl[i] = mbs; il[0] = 0;
1658   }
1659   dk    = (MatScalar*)PetscMalloc(50*sizeof(MatScalar));CHKPTRQ(dk);
1660   uik   = dk + 25;
1661   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
1662 
1663   /* check permutation */
1664   if (!a->permute){
1665     ai = a->i; aj = a->j; aa = a->a;
1666   } else {
1667     ai = a->inew; aj = a->jnew;
1668     aa = (MatScalar*)PetscMalloc(25*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
1669     ierr = PetscMemcpy(aa,a->a,25*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1670     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
1671     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1672 
1673     for (i=0; i<mbs; i++){
1674       jmin = ai[i]; jmax = ai[i+1];
1675       for (j=jmin; j<jmax; j++){
1676         while (a2anew[j] != j){
1677           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1678           for (k1=0; k1<25; k1++){
1679             dk[k1]       = aa[k*25+k1];
1680             aa[k*25+k1] = aa[j*25+k1];
1681             aa[j*25+k1] = dk[k1];
1682           }
1683         }
1684         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1685         if (i > aj[j]){
1686           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1687           ap = aa + j*25;                     /* ptr to the beginning of j-th block of aa */
1688           for (k=0; k<25; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1689           for (k=0; k<5; k++){               /* j-th block of aa <- dk^T */
1690             for (k1=0; k1<5; k1++) *ap++ = dk[k + 5*k1];
1691           }
1692         }
1693       }
1694     }
1695     ierr = PetscFree(a2anew);CHKERRA(ierr);
1696   }
1697 
1698   /* for each row k */
1699   for (k = 0; k<mbs; k++){
1700 
1701     /*initialize k-th row with elements nonzero in row perm(k) of A */
1702     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1703     if (jmin < jmax) {
1704       ap = aa + jmin*25;
1705       for (j = jmin; j < jmax; j++){
1706         vj = perm_ptr[aj[j]];         /* block col. index */
1707         rtmp_ptr = rtmp + vj*25;
1708         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1709       }
1710     }
1711 
1712     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1713     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1714     i = jl[k]; /* first row to be added to k_th row  */
1715 
1716     while (i < mbs){
1717       nexti = jl[i]; /* next row to be added to k_th row */
1718 
1719       /* compute multiplier */
1720       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1721 
1722       /* uik = -inv(Di)*U_bar(i,k) */
1723       d = ba + i*25;
1724       u    = ba + ili*25;
1725 
1726       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1727       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1728       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1729       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1730       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1731 
1732       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1733       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1734       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1735       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1736       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1737 
1738       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1739       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1740       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1741       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1742       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1743 
1744       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1745       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1746       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1747       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1748       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1749 
1750       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1751       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1752       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1753       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1754       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1755 
1756 
1757       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1758       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1759       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1760       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1761       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1762       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1763 
1764       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1765       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1766       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1767       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1768       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1769 
1770       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1771       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1772       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1773       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1774       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1775 
1776       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1777       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1778       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1779       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1780       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1781 
1782       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1783       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1784       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1785       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1786       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1787 
1788       /* update -U(i,k) */
1789       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
1790 
1791       /* add multiple of row i to k-th row ... */
1792       jmin = ili + 1; jmax = bi[i+1];
1793       if (jmin < jmax){
1794         for (j=jmin; j<jmax; j++) {
1795           /* rtmp += -U(i,k)^T * U_bar(i,j) */
1796           rtmp_ptr = rtmp + bj[j]*25;
1797           u = ba + j*25;
1798           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1799           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1800           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1801           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1802           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1803 
1804           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1805           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1806           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1807           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1808           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1809 
1810           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1811           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1812           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1813           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1814           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1815 
1816           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1817           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1818           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1819           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1820           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1821 
1822           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1823           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1824           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1825           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1826           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1827         }
1828 
1829         /* ... add i to row list for next nonzero entry */
1830         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1831         j     = bj[jmin];
1832         jl[i] = jl[j]; jl[j] = i; /* update jl */
1833       }
1834       i = nexti;
1835     }
1836 
1837     /* save nonzero entries in k-th row of U ... */
1838 
1839     /* invert diagonal block */
1840     d = ba+k*25;
1841     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
1842     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
1843 
1844     jmin = bi[k]; jmax = bi[k+1];
1845     if (jmin < jmax) {
1846       for (j=jmin; j<jmax; j++){
1847          vj = bj[j];           /* block col. index of U */
1848          u   = ba + j*25;
1849          rtmp_ptr = rtmp + vj*25;
1850          for (k1=0; k1<25; k1++){
1851            *u++        = *rtmp_ptr;
1852            *rtmp_ptr++ = 0.0;
1853          }
1854       }
1855 
1856       /* ... add k to row list for first nonzero entry in k-th row */
1857       il[k] = jmin;
1858       i     = bj[jmin];
1859       jl[k] = jl[i]; jl[i] = k;
1860     }
1861   }
1862 
1863   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1864   ierr = PetscFree(il);CHKERRQ(ierr);
1865   ierr = PetscFree(dk);CHKERRQ(ierr);
1866   if (a->permute){
1867     ierr = PetscFree(aa);CHKERRQ(ierr);
1868   }
1869 
1870   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1871   C->factor    = FACTOR_CHOLESKY;
1872   C->assembled = PETSC_TRUE;
1873   C->preallocated = PETSC_TRUE;
1874   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1875   PetscFunctionReturn(0);
1876 }
1877 
1878 /*
1879       Version for when blocks are 5 by 5 Using natural ordering
1880 */
1881 #undef __FUNC__
1882 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering"
1883 int MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B)
1884 {
1885   Mat                C = *B;
1886   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1887   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1888   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1889   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1890   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1891 
1892   PetscFunctionBegin;
1893 
1894   /* initialization */
1895   rtmp  = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1896   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1897   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
1898   jl = il + mbs;
1899   for (i=0; i<mbs; i++) {
1900     jl[i] = mbs; il[0] = 0;
1901   }
1902   dk    = (MatScalar*)PetscMalloc(50*sizeof(MatScalar));CHKPTRQ(dk);
1903   uik   = dk + 25;
1904 
1905   ai = a->i; aj = a->j; aa = a->a;
1906 
1907   /* for each row k */
1908   for (k = 0; k<mbs; k++){
1909 
1910     /*initialize k-th row with elements nonzero in row k of A */
1911     jmin = ai[k]; jmax = ai[k+1];
1912     if (jmin < jmax) {
1913       ap = aa + jmin*25;
1914       for (j = jmin; j < jmax; j++){
1915         vj = aj[j];         /* block col. index */
1916         rtmp_ptr = rtmp + vj*25;
1917         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1918       }
1919     }
1920 
1921     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1922     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1923     i = jl[k]; /* first row to be added to k_th row  */
1924 
1925     while (i < mbs){
1926       nexti = jl[i]; /* next row to be added to k_th row */
1927 
1928       /* compute multiplier */
1929       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1930 
1931       /* uik = -inv(Di)*U_bar(i,k) */
1932       d = ba + i*25;
1933       u    = ba + ili*25;
1934 
1935       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1936       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1937       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1938       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1939       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1940 
1941       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1942       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1943       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1944       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1945       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1946 
1947       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1948       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1949       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1950       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1951       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1952 
1953       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1954       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1955       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1956       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1957       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1958 
1959       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1960       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1961       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1962       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1963       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1964 
1965 
1966       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1967       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1968       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1969       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1970       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1971       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1972 
1973       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1974       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1975       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1976       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1977       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1978 
1979       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1980       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1981       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1982       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1983       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1984 
1985       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1986       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1987       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1988       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1989       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1990 
1991       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1992       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1993       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1994       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1995       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1996 
1997       /* update -U(i,k) */
1998       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
1999 
2000       /* add multiple of row i to k-th row ... */
2001       jmin = ili + 1; jmax = bi[i+1];
2002       if (jmin < jmax){
2003         for (j=jmin; j<jmax; j++) {
2004           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2005           rtmp_ptr = rtmp + bj[j]*25;
2006           u = ba + j*25;
2007           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
2008           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
2009           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
2010           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
2011           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
2012 
2013           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
2014           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
2015           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
2016           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
2017           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
2018 
2019           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
2020           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
2021           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
2022           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
2023           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
2024 
2025           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
2026           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
2027           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
2028           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
2029           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
2030 
2031           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
2032           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
2033           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
2034           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
2035           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
2036         }
2037 
2038         /* ... add i to row list for next nonzero entry */
2039         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2040         j     = bj[jmin];
2041         jl[i] = jl[j]; jl[j] = i; /* update jl */
2042       }
2043       i = nexti;
2044     }
2045 
2046     /* save nonzero entries in k-th row of U ... */
2047 
2048     /* invert diagonal block */
2049     d = ba+k*25;
2050     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
2051     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
2052 
2053     jmin = bi[k]; jmax = bi[k+1];
2054     if (jmin < jmax) {
2055       for (j=jmin; j<jmax; j++){
2056          vj = bj[j];           /* block col. index of U */
2057          u   = ba + j*25;
2058          rtmp_ptr = rtmp + vj*25;
2059          for (k1=0; k1<25; k1++){
2060            *u++        = *rtmp_ptr;
2061            *rtmp_ptr++ = 0.0;
2062          }
2063       }
2064 
2065       /* ... add k to row list for first nonzero entry in k-th row */
2066       il[k] = jmin;
2067       i     = bj[jmin];
2068       jl[k] = jl[i]; jl[i] = k;
2069     }
2070   }
2071 
2072   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2073   ierr = PetscFree(il);CHKERRQ(ierr);
2074   ierr = PetscFree(dk);CHKERRQ(ierr);
2075 
2076   C->factor    = FACTOR_CHOLESKY;
2077   C->assembled = PETSC_TRUE;
2078   C->preallocated = PETSC_TRUE;
2079   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
2080   PetscFunctionReturn(0);
2081 }
2082 
2083 /*
2084       Version for when blocks are 4 by 4 Using natural ordering
2085 */
2086 #undef __FUNC__
2087 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering"
2088 int MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B)
2089 {
2090   Mat                C = *B;
2091   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2092   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2093   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2094   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2095   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2096 
2097   PetscFunctionBegin;
2098 
2099   /* initialization */
2100   rtmp  = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2101   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2102   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
2103   jl = il + mbs;
2104   for (i=0; i<mbs; i++) {
2105     jl[i] = mbs; il[0] = 0;
2106   }
2107   dk    = (MatScalar*)PetscMalloc(32*sizeof(MatScalar));CHKPTRQ(dk);
2108   uik   = dk + 16;
2109 
2110   ai = a->i; aj = a->j; aa = a->a;
2111 
2112   /* for each row k */
2113   for (k = 0; k<mbs; k++){
2114 
2115     /*initialize k-th row with elements nonzero in row k of A */
2116     jmin = ai[k]; jmax = ai[k+1];
2117     if (jmin < jmax) {
2118       ap = aa + jmin*16;
2119       for (j = jmin; j < jmax; j++){
2120         vj = aj[j];         /* block col. index */
2121         rtmp_ptr = rtmp + vj*16;
2122         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
2123       }
2124     }
2125 
2126     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2127     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
2128     i = jl[k]; /* first row to be added to k_th row  */
2129 
2130     while (i < mbs){
2131       nexti = jl[i]; /* next row to be added to k_th row */
2132 
2133       /* compute multiplier */
2134       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2135 
2136       /* uik = -inv(Di)*U_bar(i,k) */
2137       diag = ba + i*16;
2138       u    = ba + ili*16;
2139 
2140       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
2141       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
2142       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
2143       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
2144 
2145       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
2146       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
2147       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
2148       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2149 
2150       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2151       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2152       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2153       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2154 
2155       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2156       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2157       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2158       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2159 
2160       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2161       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2162       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2163       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2164       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2165 
2166       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2167       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2168       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2169       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2170 
2171       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2172       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2173       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2174       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2175 
2176       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2177       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2178       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2179       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2180 
2181       /* update -U(i,k) */
2182       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2183 
2184       /* add multiple of row i to k-th row ... */
2185       jmin = ili + 1; jmax = bi[i+1];
2186       if (jmin < jmax){
2187         for (j=jmin; j<jmax; j++) {
2188           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2189           rtmp_ptr = rtmp + bj[j]*16;
2190           u = ba + j*16;
2191           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2192           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2193           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2194           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2195 
2196           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2197           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2198           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2199           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2200 
2201           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2202           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2203           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2204           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2205 
2206           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2207           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2208           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2209           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2210         }
2211 
2212         /* ... add i to row list for next nonzero entry */
2213         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2214         j     = bj[jmin];
2215         jl[i] = jl[j]; jl[j] = i; /* update jl */
2216       }
2217       i = nexti;
2218     }
2219 
2220     /* save nonzero entries in k-th row of U ... */
2221 
2222     /* invert diagonal block */
2223     diag = ba+k*16;
2224     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2225     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2226 
2227     jmin = bi[k]; jmax = bi[k+1];
2228     if (jmin < jmax) {
2229       for (j=jmin; j<jmax; j++){
2230          vj = bj[j];           /* block col. index of U */
2231          u   = ba + j*16;
2232          rtmp_ptr = rtmp + vj*16;
2233          for (k1=0; k1<16; k1++){
2234            *u++        = *rtmp_ptr;
2235            *rtmp_ptr++ = 0.0;
2236          }
2237       }
2238 
2239       /* ... add k to row list for first nonzero entry in k-th row */
2240       il[k] = jmin;
2241       i     = bj[jmin];
2242       jl[k] = jl[i]; jl[i] = k;
2243     }
2244   }
2245 
2246   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2247   ierr = PetscFree(il);CHKERRQ(ierr);
2248   ierr = PetscFree(dk);CHKERRQ(ierr);
2249 
2250   C->factor    = FACTOR_CHOLESKY;
2251   C->assembled = PETSC_TRUE;
2252   C->preallocated = PETSC_TRUE;
2253   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2254   PetscFunctionReturn(0);
2255 }
2256 
2257 /* Version for when blocks are 4 by 4  */
2258 #undef __FUNC__
2259 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4"
2260 int MatCholeskyFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B)
2261 {
2262   Mat                C = *B;
2263   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2264   IS                 perm = b->row;
2265   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2266   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2267   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2268   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2269 
2270   PetscFunctionBegin;
2271   /* initialization */
2272   rtmp  = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2273   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2274   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
2275   jl = il + mbs;
2276   for (i=0; i<mbs; i++) {
2277     jl[i] = mbs; il[0] = 0;
2278   }
2279   dk    = (MatScalar*)PetscMalloc(32*sizeof(MatScalar));CHKPTRQ(dk);
2280   uik   = dk + 16;
2281   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2282 
2283   /* check permutation */
2284   if (!a->permute){
2285     ai = a->i; aj = a->j; aa = a->a;
2286   } else {
2287     ai = a->inew; aj = a->jnew;
2288     aa = (MatScalar*)PetscMalloc(16*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2289     ierr = PetscMemcpy(aa,a->a,16*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2290     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2291     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2292 
2293     for (i=0; i<mbs; i++){
2294       jmin = ai[i]; jmax = ai[i+1];
2295       for (j=jmin; j<jmax; j++){
2296         while (a2anew[j] != j){
2297           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2298           for (k1=0; k1<16; k1++){
2299             dk[k1]       = aa[k*16+k1];
2300             aa[k*16+k1] = aa[j*16+k1];
2301             aa[j*16+k1] = dk[k1];
2302           }
2303         }
2304         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2305         if (i > aj[j]){
2306           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2307           ap = aa + j*16;                     /* ptr to the beginning of j-th block of aa */
2308           for (k=0; k<16; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2309           for (k=0; k<4; k++){               /* j-th block of aa <- dk^T */
2310             for (k1=0; k1<4; k1++) *ap++ = dk[k + 4*k1];
2311           }
2312         }
2313       }
2314     }
2315     ierr = PetscFree(a2anew);CHKERRA(ierr);
2316   }
2317 
2318   /* for each row k */
2319   for (k = 0; k<mbs; k++){
2320 
2321     /*initialize k-th row with elements nonzero in row perm(k) of A */
2322     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2323     if (jmin < jmax) {
2324       ap = aa + jmin*16;
2325       for (j = jmin; j < jmax; j++){
2326         vj = perm_ptr[aj[j]];         /* block col. index */
2327         rtmp_ptr = rtmp + vj*16;
2328         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
2329       }
2330     }
2331 
2332     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2333     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
2334     i = jl[k]; /* first row to be added to k_th row  */
2335 
2336     while (i < mbs){
2337       nexti = jl[i]; /* next row to be added to k_th row */
2338 
2339       /* compute multiplier */
2340       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2341 
2342       /* uik = -inv(Di)*U_bar(i,k) */
2343       diag = ba + i*16;
2344       u    = ba + ili*16;
2345 
2346       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
2347       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
2348       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
2349       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
2350 
2351       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
2352       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
2353       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
2354       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2355 
2356       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2357       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2358       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2359       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2360 
2361       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2362       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2363       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2364       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2365 
2366       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2367       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2368       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2369       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2370       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2371 
2372       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2373       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2374       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2375       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2376 
2377       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2378       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2379       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2380       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2381 
2382       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2383       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2384       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2385       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2386 
2387       /* update -U(i,k) */
2388       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2389 
2390       /* add multiple of row i to k-th row ... */
2391       jmin = ili + 1; jmax = bi[i+1];
2392       if (jmin < jmax){
2393         for (j=jmin; j<jmax; j++) {
2394           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2395           rtmp_ptr = rtmp + bj[j]*16;
2396           u = ba + j*16;
2397           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2398           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2399           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2400           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2401 
2402           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2403           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2404           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2405           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2406 
2407           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2408           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2409           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2410           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2411 
2412           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2413           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2414           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2415           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2416         }
2417 
2418         /* ... add i to row list for next nonzero entry */
2419         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2420         j     = bj[jmin];
2421         jl[i] = jl[j]; jl[j] = i; /* update jl */
2422       }
2423       i = nexti;
2424     }
2425 
2426     /* save nonzero entries in k-th row of U ... */
2427 
2428     /* invert diagonal block */
2429     diag = ba+k*16;
2430     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2431     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2432 
2433     jmin = bi[k]; jmax = bi[k+1];
2434     if (jmin < jmax) {
2435       for (j=jmin; j<jmax; j++){
2436          vj = bj[j];           /* block col. index of U */
2437          u   = ba + j*16;
2438          rtmp_ptr = rtmp + vj*16;
2439          for (k1=0; k1<16; k1++){
2440            *u++        = *rtmp_ptr;
2441            *rtmp_ptr++ = 0.0;
2442          }
2443       }
2444 
2445       /* ... add k to row list for first nonzero entry in k-th row */
2446       il[k] = jmin;
2447       i     = bj[jmin];
2448       jl[k] = jl[i]; jl[i] = k;
2449     }
2450   }
2451 
2452   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2453   ierr = PetscFree(il);CHKERRQ(ierr);
2454   ierr = PetscFree(dk);CHKERRQ(ierr);
2455   if (a->permute){
2456     ierr = PetscFree(aa);CHKERRQ(ierr);
2457   }
2458 
2459   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2460   C->factor    = FACTOR_CHOLESKY;
2461   C->assembled = PETSC_TRUE;
2462   C->preallocated = PETSC_TRUE;
2463   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2464   PetscFunctionReturn(0);
2465 }
2466 
2467 /* Version for when blocks are 3 by 3  */
2468 #undef __FUNC__
2469 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3"
2470 int MatCholeskyFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B)
2471 {
2472   Mat                C = *B;
2473   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2474   IS                 perm = b->row;
2475   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2476   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2477   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2478   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2479 
2480   PetscFunctionBegin;
2481 
2482   /* initialization */
2483   rtmp  = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2484   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2485   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
2486   jl = il + mbs;
2487   for (i=0; i<mbs; i++) {
2488     jl[i] = mbs; il[0] = 0;
2489   }
2490   dk  = (MatScalar*)PetscMalloc(18*sizeof(MatScalar));CHKPTRQ(dk);
2491   uik = dk + 9;
2492   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2493 
2494   /* check permutation */
2495   if (!a->permute){
2496     ai = a->i; aj = a->j; aa = a->a;
2497   } else {
2498     ai = a->inew; aj = a->jnew;
2499     aa = (MatScalar*)PetscMalloc(9*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2500     ierr = PetscMemcpy(aa,a->a,9*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2501     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2502     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2503 
2504     for (i=0; i<mbs; i++){
2505       jmin = ai[i]; jmax = ai[i+1];
2506       for (j=jmin; j<jmax; j++){
2507         while (a2anew[j] != j){
2508           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2509           for (k1=0; k1<9; k1++){
2510             dk[k1]       = aa[k*9+k1];
2511             aa[k*9+k1] = aa[j*9+k1];
2512             aa[j*9+k1] = dk[k1];
2513           }
2514         }
2515         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2516         if (i > aj[j]){
2517           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2518           ap = aa + j*9;                     /* ptr to the beginning of j-th block of aa */
2519           for (k=0; k<9; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2520           for (k=0; k<3; k++){               /* j-th block of aa <- dk^T */
2521             for (k1=0; k1<3; k1++) *ap++ = dk[k + 3*k1];
2522           }
2523         }
2524       }
2525     }
2526     ierr = PetscFree(a2anew);CHKERRA(ierr);
2527   }
2528 
2529   /* for each row k */
2530   for (k = 0; k<mbs; k++){
2531 
2532     /*initialize k-th row with elements nonzero in row perm(k) of A */
2533     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2534     if (jmin < jmax) {
2535       ap = aa + jmin*9;
2536       for (j = jmin; j < jmax; j++){
2537         vj = perm_ptr[aj[j]];         /* block col. index */
2538         rtmp_ptr = rtmp + vj*9;
2539         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2540       }
2541     }
2542 
2543     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2544     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2545     i = jl[k]; /* first row to be added to k_th row  */
2546 
2547     while (i < mbs){
2548       nexti = jl[i]; /* next row to be added to k_th row */
2549 
2550       /* compute multiplier */
2551       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2552 
2553       /* uik = -inv(Di)*U_bar(i,k) */
2554       diag = ba + i*9;
2555       u    = ba + ili*9;
2556 
2557       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2558       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2559       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2560 
2561       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2562       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2563       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2564 
2565       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2566       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2567       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2568 
2569       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2570       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2571       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2572       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2573 
2574       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2575       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2576       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2577 
2578       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2579       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2580       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2581 
2582       /* update -U(i,k) */
2583       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2584 
2585       /* add multiple of row i to k-th row ... */
2586       jmin = ili + 1; jmax = bi[i+1];
2587       if (jmin < jmax){
2588         for (j=jmin; j<jmax; j++) {
2589           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2590           rtmp_ptr = rtmp + bj[j]*9;
2591           u = ba + j*9;
2592           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2593           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2594           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2595 
2596           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2597           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2598           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2599 
2600           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2601           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2602           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2603         }
2604 
2605         /* ... add i to row list for next nonzero entry */
2606         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2607         j     = bj[jmin];
2608         jl[i] = jl[j]; jl[j] = i; /* update jl */
2609       }
2610       i = nexti;
2611     }
2612 
2613     /* save nonzero entries in k-th row of U ... */
2614 
2615     /* invert diagonal block */
2616     diag = ba+k*9;
2617     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2618     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2619 
2620     jmin = bi[k]; jmax = bi[k+1];
2621     if (jmin < jmax) {
2622       for (j=jmin; j<jmax; j++){
2623          vj = bj[j];           /* block col. index of U */
2624          u   = ba + j*9;
2625          rtmp_ptr = rtmp + vj*9;
2626          for (k1=0; k1<9; k1++){
2627            *u++        = *rtmp_ptr;
2628            *rtmp_ptr++ = 0.0;
2629          }
2630       }
2631 
2632       /* ... add k to row list for first nonzero entry in k-th row */
2633       il[k] = jmin;
2634       i     = bj[jmin];
2635       jl[k] = jl[i]; jl[i] = k;
2636     }
2637   }
2638 
2639   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2640   ierr = PetscFree(il);CHKERRQ(ierr);
2641   ierr = PetscFree(dk);CHKERRQ(ierr);
2642   if (a->permute){
2643     ierr = PetscFree(aa);CHKERRQ(ierr);
2644   }
2645 
2646   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2647   C->factor    = FACTOR_CHOLESKY;
2648   C->assembled = PETSC_TRUE;
2649   C->preallocated = PETSC_TRUE;
2650   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2651   PetscFunctionReturn(0);
2652 }
2653 
2654 /*
2655       Version for when blocks are 3 by 3 Using natural ordering
2656 */
2657 #undef __FUNC__
2658 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering"
2659 int MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B)
2660 {
2661   Mat                C = *B;
2662   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2663   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2664   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2665   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2666   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2667 
2668   PetscFunctionBegin;
2669 
2670   /* initialization */
2671   rtmp  = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2672   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2673   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
2674   jl = il + mbs;
2675   for (i=0; i<mbs; i++) {
2676     jl[i] = mbs; il[0] = 0;
2677   }
2678   dk  = (MatScalar*)PetscMalloc(18*sizeof(MatScalar));CHKPTRQ(dk);
2679   uik = dk + 9;
2680 
2681   ai = a->i; aj = a->j; aa = a->a;
2682 
2683   /* for each row k */
2684   for (k = 0; k<mbs; k++){
2685 
2686     /*initialize k-th row with elements nonzero in row k of A */
2687     jmin = ai[k]; jmax = ai[k+1];
2688     if (jmin < jmax) {
2689       ap = aa + jmin*9;
2690       for (j = jmin; j < jmax; j++){
2691         vj = aj[j];         /* block col. index */
2692         rtmp_ptr = rtmp + vj*9;
2693         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2694       }
2695     }
2696 
2697     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2698     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2699     i = jl[k]; /* first row to be added to k_th row  */
2700 
2701     while (i < mbs){
2702       nexti = jl[i]; /* next row to be added to k_th row */
2703 
2704       /* compute multiplier */
2705       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2706 
2707       /* uik = -inv(Di)*U_bar(i,k) */
2708       diag = ba + i*9;
2709       u    = ba + ili*9;
2710 
2711       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2712       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2713       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2714 
2715       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2716       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2717       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2718 
2719       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2720       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2721       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2722 
2723       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2724       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2725       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2726       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2727 
2728       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2729       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2730       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2731 
2732       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2733       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2734       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2735 
2736       /* update -U(i,k) */
2737       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2738 
2739       /* add multiple of row i to k-th row ... */
2740       jmin = ili + 1; jmax = bi[i+1];
2741       if (jmin < jmax){
2742         for (j=jmin; j<jmax; j++) {
2743           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2744           rtmp_ptr = rtmp + bj[j]*9;
2745           u = ba + j*9;
2746           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2747           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2748           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2749 
2750           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2751           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2752           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2753 
2754           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2755           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2756           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2757         }
2758 
2759         /* ... add i to row list for next nonzero entry */
2760         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2761         j     = bj[jmin];
2762         jl[i] = jl[j]; jl[j] = i; /* update jl */
2763       }
2764       i = nexti;
2765     }
2766 
2767     /* save nonzero entries in k-th row of U ... */
2768 
2769     /* invert diagonal block */
2770     diag = ba+k*9;
2771     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2772     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2773 
2774     jmin = bi[k]; jmax = bi[k+1];
2775     if (jmin < jmax) {
2776       for (j=jmin; j<jmax; j++){
2777          vj = bj[j];           /* block col. index of U */
2778          u   = ba + j*9;
2779          rtmp_ptr = rtmp + vj*9;
2780          for (k1=0; k1<9; k1++){
2781            *u++        = *rtmp_ptr;
2782            *rtmp_ptr++ = 0.0;
2783          }
2784       }
2785 
2786       /* ... add k to row list for first nonzero entry in k-th row */
2787       il[k] = jmin;
2788       i     = bj[jmin];
2789       jl[k] = jl[i]; jl[i] = k;
2790     }
2791   }
2792 
2793   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2794   ierr = PetscFree(il);CHKERRQ(ierr);
2795   ierr = PetscFree(dk);CHKERRQ(ierr);
2796 
2797   C->factor    = FACTOR_CHOLESKY;
2798   C->assembled = PETSC_TRUE;
2799   C->preallocated = PETSC_TRUE;
2800   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2801   PetscFunctionReturn(0);
2802 }
2803 
2804 /*
2805     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
2806     Version for blocks 2 by 2.
2807 */
2808 #undef __FUNC__
2809 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
2810 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
2811 {
2812   Mat                C = *B;
2813   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2814   IS                 perm = b->row;
2815   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2816   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2817   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2818   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2819 
2820   PetscFunctionBegin;
2821 
2822   /* initialization */
2823   /* il and jl record the first nonzero element in each row of the accessing
2824      window U(0:k, k:mbs-1).
2825      jl:    list of rows to be added to uneliminated rows
2826             i>= k: jl(i) is the first row to be added to row i
2827             i<  k: jl(i) is the row following row i in some list of rows
2828             jl(i) = mbs indicates the end of a list
2829      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2830             row i of U */
2831   rtmp  = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2832   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2833   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
2834   jl = il + mbs;
2835   for (i=0; i<mbs; i++) {
2836     jl[i] = mbs; il[0] = 0;
2837   }
2838   dk  = (MatScalar*)PetscMalloc(8*sizeof(MatScalar));CHKPTRQ(dk);
2839   uik = dk + 4;
2840   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2841 
2842   /* check permutation */
2843   if (!a->permute){
2844     ai = a->i; aj = a->j; aa = a->a;
2845   } else {
2846     ai = a->inew; aj = a->jnew;
2847     aa = (MatScalar*)PetscMalloc(4*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2848     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2849     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2850     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2851 
2852     for (i=0; i<mbs; i++){
2853       jmin = ai[i]; jmax = ai[i+1];
2854       for (j=jmin; j<jmax; j++){
2855         while (a2anew[j] != j){
2856           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2857           for (k1=0; k1<4; k1++){
2858             dk[k1]       = aa[k*4+k1];
2859             aa[k*4+k1] = aa[j*4+k1];
2860             aa[j*4+k1] = dk[k1];
2861           }
2862         }
2863         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2864         if (i > aj[j]){
2865           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2866           ap = aa + j*4;     /* ptr to the beginning of the block */
2867           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
2868           ap[1] = ap[2];
2869           ap[2] = dk[1];
2870         }
2871       }
2872     }
2873     ierr = PetscFree(a2anew);CHKERRA(ierr);
2874   }
2875 
2876   /* for each row k */
2877   for (k = 0; k<mbs; k++){
2878 
2879     /*initialize k-th row with elements nonzero in row perm(k) of A */
2880     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2881     if (jmin < jmax) {
2882       ap = aa + jmin*4;
2883       for (j = jmin; j < jmax; j++){
2884         vj = perm_ptr[aj[j]];         /* block col. index */
2885         rtmp_ptr = rtmp + vj*4;
2886         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
2887       }
2888     }
2889 
2890     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2891     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
2892     i = jl[k]; /* first row to be added to k_th row  */
2893 
2894     while (i < mbs){
2895       nexti = jl[i]; /* next row to be added to k_th row */
2896 
2897       /* compute multiplier */
2898       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2899 
2900       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
2901       diag = ba + i*4;
2902       u    = ba + ili*4;
2903       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
2904       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
2905       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
2906       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
2907 
2908       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
2909       dk[0] += uik[0]*u[0] + uik[1]*u[1];
2910       dk[1] += uik[2]*u[0] + uik[3]*u[1];
2911       dk[2] += uik[0]*u[2] + uik[1]*u[3];
2912       dk[3] += uik[2]*u[2] + uik[3]*u[3];
2913 
2914       /* update -U(i,k): ba[ili] = uik */
2915       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
2916 
2917       /* add multiple of row i to k-th row ... */
2918       jmin = ili + 1; jmax = bi[i+1];
2919       if (jmin < jmax){
2920         for (j=jmin; j<jmax; j++) {
2921           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
2922           rtmp_ptr = rtmp + bj[j]*4;
2923           u = ba + j*4;
2924           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
2925           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
2926           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
2927           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
2928         }
2929 
2930         /* ... add i to row list for next nonzero entry */
2931         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2932         j     = bj[jmin];
2933         jl[i] = jl[j]; jl[j] = i; /* update jl */
2934       }
2935       i = nexti;
2936     }
2937 
2938     /* save nonzero entries in k-th row of U ... */
2939 
2940     /* invert diagonal block */
2941     diag = ba+k*4;
2942     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
2943     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
2944 
2945     jmin = bi[k]; jmax = bi[k+1];
2946     if (jmin < jmax) {
2947       for (j=jmin; j<jmax; j++){
2948          vj = bj[j];           /* block col. index of U */
2949          u   = ba + j*4;
2950          rtmp_ptr = rtmp + vj*4;
2951          for (k1=0; k1<4; k1++){
2952            *u++        = *rtmp_ptr;
2953            *rtmp_ptr++ = 0.0;
2954          }
2955       }
2956 
2957       /* ... add k to row list for first nonzero entry in k-th row */
2958       il[k] = jmin;
2959       i     = bj[jmin];
2960       jl[k] = jl[i]; jl[i] = k;
2961     }
2962   }
2963 
2964   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2965   ierr = PetscFree(il);CHKERRQ(ierr);
2966   ierr = PetscFree(dk);CHKERRQ(ierr);
2967   if (a->permute) {
2968     ierr = PetscFree(aa);CHKERRQ(ierr);
2969   }
2970   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2971   C->factor    = FACTOR_CHOLESKY;
2972   C->assembled = PETSC_TRUE;
2973   C->preallocated = PETSC_TRUE;
2974   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2975   PetscFunctionReturn(0);
2976 }
2977 
2978 /*
2979       Version for when blocks are 2 by 2 Using natural ordering
2980 */
2981 #undef __FUNC__
2982 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
2983 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
2984 {
2985   Mat                C = *B;
2986   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2987   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2988   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2989   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2990   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2991 
2992   PetscFunctionBegin;
2993 
2994   /* initialization */
2995   /* il and jl record the first nonzero element in each row of the accessing
2996      window U(0:k, k:mbs-1).
2997      jl:    list of rows to be added to uneliminated rows
2998             i>= k: jl(i) is the first row to be added to row i
2999             i<  k: jl(i) is the row following row i in some list of rows
3000             jl(i) = mbs indicates the end of a list
3001      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3002             row i of U */
3003   rtmp  = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
3004   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
3005   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
3006   jl = il + mbs;
3007   for (i=0; i<mbs; i++) {
3008     jl[i] = mbs; il[0] = 0;
3009   }
3010   dk  = (MatScalar*)PetscMalloc(8*sizeof(MatScalar));CHKPTRQ(dk);
3011   uik = dk + 4;
3012 
3013   ai = a->i; aj = a->j; aa = a->a;
3014 
3015   /* for each row k */
3016   for (k = 0; k<mbs; k++){
3017 
3018     /*initialize k-th row with elements nonzero in row k of A */
3019     jmin = ai[k]; jmax = ai[k+1];
3020     if (jmin < jmax) {
3021       ap = aa + jmin*4;
3022       for (j = jmin; j < jmax; j++){
3023         vj = aj[j];         /* block col. index */
3024         rtmp_ptr = rtmp + vj*4;
3025         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
3026       }
3027     }
3028 
3029     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3030     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
3031     i = jl[k]; /* first row to be added to k_th row  */
3032 
3033     while (i < mbs){
3034       nexti = jl[i]; /* next row to be added to k_th row */
3035 
3036       /* compute multiplier */
3037       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3038 
3039       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
3040       diag = ba + i*4;
3041       u    = ba + ili*4;
3042       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
3043       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
3044       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
3045       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
3046 
3047       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
3048       dk[0] += uik[0]*u[0] + uik[1]*u[1];
3049       dk[1] += uik[2]*u[0] + uik[3]*u[1];
3050       dk[2] += uik[0]*u[2] + uik[1]*u[3];
3051       dk[3] += uik[2]*u[2] + uik[3]*u[3];
3052 
3053       /* update -U(i,k): ba[ili] = uik */
3054       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
3055 
3056       /* add multiple of row i to k-th row ... */
3057       jmin = ili + 1; jmax = bi[i+1];
3058       if (jmin < jmax){
3059         for (j=jmin; j<jmax; j++) {
3060           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
3061           rtmp_ptr = rtmp + bj[j]*4;
3062           u = ba + j*4;
3063           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
3064           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
3065           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
3066           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
3067         }
3068 
3069         /* ... add i to row list for next nonzero entry */
3070         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3071         j     = bj[jmin];
3072         jl[i] = jl[j]; jl[j] = i; /* update jl */
3073       }
3074       i = nexti;
3075     }
3076 
3077     /* save nonzero entries in k-th row of U ... */
3078 
3079     /* invert diagonal block */
3080     diag = ba+k*4;
3081     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
3082     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
3083 
3084     jmin = bi[k]; jmax = bi[k+1];
3085     if (jmin < jmax) {
3086       for (j=jmin; j<jmax; j++){
3087          vj = bj[j];           /* block col. index of U */
3088          u   = ba + j*4;
3089          rtmp_ptr = rtmp + vj*4;
3090          for (k1=0; k1<4; k1++){
3091            *u++        = *rtmp_ptr;
3092            *rtmp_ptr++ = 0.0;
3093          }
3094       }
3095 
3096       /* ... add k to row list for first nonzero entry in k-th row */
3097       il[k] = jmin;
3098       i     = bj[jmin];
3099       jl[k] = jl[i]; jl[i] = k;
3100     }
3101   }
3102 
3103   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3104   ierr = PetscFree(il);CHKERRQ(ierr);
3105   ierr = PetscFree(dk);CHKERRQ(ierr);
3106 
3107   C->factor    = FACTOR_CHOLESKY;
3108   C->assembled = PETSC_TRUE;
3109   C->preallocated = PETSC_TRUE;
3110   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
3111   PetscFunctionReturn(0);
3112 }
3113 
3114 /*
3115     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
3116     Version for blocks are 1 by 1.
3117 */
3118 #undef __FUNC__
3119 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
3120 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
3121 {
3122   Mat                C = *B;
3123   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
3124   IS                 ip = b->row;
3125   int                *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
3126   int                *ai,*aj,*r;
3127   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
3128   MatScalar          *rtmp;
3129   MatScalar          *ba = b->a,*aa,ak;
3130   MatScalar          dk,uikdi;
3131 
3132   PetscFunctionBegin;
3133   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
3134   if (!a->permute){
3135     ai = a->i; aj = a->j; aa = a->a;
3136   } else {
3137     ai = a->inew; aj = a->jnew;
3138     aa = (MatScalar*)PetscMalloc(ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
3139     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
3140     r   = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(r);
3141     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
3142 
3143     jmin = ai[0]; jmax = ai[mbs];
3144     for (j=jmin; j<jmax; j++){
3145       while (r[j] != j){
3146         k = r[j]; r[j] = r[k]; r[k] = k;
3147         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
3148       }
3149     }
3150     ierr = PetscFree(r);CHKERRA(ierr);
3151   }
3152 
3153   /* initialization */
3154   /* il and jl record the first nonzero element in each row of the accessing
3155      window U(0:k, k:mbs-1).
3156      jl:    list of rows to be added to uneliminated rows
3157             i>= k: jl(i) is the first row to be added to row i
3158             i<  k: jl(i) is the row following row i in some list of rows
3159             jl(i) = mbs indicates the end of a list
3160      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3161             row i of U */
3162   rtmp  = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
3163   il = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
3164   jl = il + mbs;
3165   for (i=0; i<mbs; i++) {
3166     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
3167   }
3168 
3169   /* for each row k */
3170   for (k = 0; k<mbs; k++){
3171 
3172     /*initialize k-th row with elements nonzero in row perm(k) of A */
3173     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
3174     if (jmin < jmax) {
3175       for (j = jmin; j < jmax; j++){
3176         vj = rip[aj[j]];
3177         rtmp[vj] = aa[j];
3178       }
3179     }
3180 
3181     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3182     dk = rtmp[k];
3183     i = jl[k]; /* first row to be added to k_th row  */
3184 
3185     while (i < mbs){
3186       nexti = jl[i]; /* next row to be added to k_th row */
3187 
3188       /* compute multiplier, update D(k) and U(i,k) */
3189       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3190       uikdi = - ba[ili]*ba[i];
3191       dk += uikdi*ba[ili];
3192       ba[ili] = uikdi; /* -U(i,k) */
3193 
3194       /* add multiple of row i to k-th row ... */
3195       jmin = ili + 1; jmax = bi[i+1];
3196       if (jmin < jmax){
3197         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
3198         /* ... add i to row list for next nonzero entry */
3199         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3200         j     = bj[jmin];
3201         jl[i] = jl[j]; jl[j] = i; /* update jl */
3202       }
3203       i = nexti;
3204     }
3205 
3206     /* check for zero pivot and save diagoanl element */
3207     if (dk == 0.0){
3208       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
3209       /*
3210     }else if (PetscRealPart(dk) < 0){
3211       ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk);
3212       */
3213     }
3214 
3215     /* save nonzero entries in k-th row of U ... */
3216     ba[k] = 1.0/dk;
3217     jmin = bi[k]; jmax = bi[k+1];
3218     if (jmin < jmax) {
3219       for (j=jmin; j<jmax; j++){
3220          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
3221       }
3222       /* ... add k to row list for first nonzero entry in k-th row */
3223       il[k] = jmin;
3224       i     = bj[jmin];
3225       jl[k] = jl[i]; jl[i] = k;
3226     }
3227   }
3228 
3229   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3230   ierr = PetscFree(il);CHKERRQ(ierr);
3231   if (a->permute){
3232     ierr = PetscFree(aa);CHKERRQ(ierr);
3233   }
3234 
3235   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
3236   C->factor    = FACTOR_CHOLESKY;
3237   C->assembled = PETSC_TRUE;
3238   C->preallocated = PETSC_TRUE;
3239   PLogFlops(b->mbs);
3240   PetscFunctionReturn(0);
3241 }
3242 
3243 /*
3244   Version for when blocks are 1 by 1 Using natural ordering
3245 */
3246 #undef __FUNC__
3247 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
3248 int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
3249 {
3250   Mat                C = *B;
3251   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
3252   int                ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
3253   int                *ai,*aj;
3254   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
3255   MatScalar          *rtmp,*ba = b->a,*aa,dk,uikdi;
3256 
3257   PetscFunctionBegin;
3258 
3259   /* initialization */
3260   /* il and jl record the first nonzero element in each row of the accessing
3261      window U(0:k, k:mbs-1).
3262      jl:    list of rows to be added to uneliminated rows
3263             i>= k: jl(i) is the first row to be added to row i
3264             i<  k: jl(i) is the row following row i in some list of rows
3265             jl(i) = mbs indicates the end of a list
3266      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3267             row i of U */
3268   rtmp  = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
3269   il    = (int*)PetscMalloc(2*mbs*sizeof(int));CHKPTRQ(il);
3270   jl    = il + mbs;
3271   for (i=0; i<mbs; i++) {
3272     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
3273   }
3274 
3275   ai = a->i; aj = a->j; aa = a->a;
3276 
3277   /* for each row k */
3278   for (k = 0; k<mbs; k++){
3279 
3280     /*initialize k-th row with elements nonzero in row perm(k) of A */
3281     jmin = ai[k]; jmax = ai[k+1];
3282     if (jmin < jmax) {
3283       for (j = jmin; j < jmax; j++){
3284         vj = aj[j];
3285         rtmp[vj] = aa[j];
3286       }
3287     }
3288 
3289     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3290     dk = rtmp[k];
3291     i = jl[k]; /* first row to be added to k_th row  */
3292 
3293     while (i < mbs){
3294       nexti = jl[i]; /* next row to be added to k_th row */
3295 
3296       /* compute multiplier, update D(k) and U(i,k) */
3297       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3298       uikdi = - ba[ili]*ba[i];
3299       dk += uikdi*ba[ili];
3300       ba[ili] = uikdi; /* -U(i,k) */
3301 
3302       /* add multiple of row i to k-th row ... */
3303       jmin = ili + 1; jmax = bi[i+1];
3304       if (jmin < jmax){
3305         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
3306         /* ... add i to row list for next nonzero entry */
3307         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3308         j     = bj[jmin];
3309         jl[i] = jl[j]; jl[j] = i; /* update jl */
3310       }
3311       i = nexti;
3312     }
3313 
3314     /* check for zero pivot and save diagoanl element */
3315     if (dk == 0.0){
3316       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
3317       /*
3318     }else if (PetscRealPart(dk) < 0){
3319       ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk);
3320       */
3321     }
3322 
3323     /* save nonzero entries in k-th row of U ... */
3324     ba[k] = 1.0/dk;
3325     jmin = bi[k]; jmax = bi[k+1];
3326     if (jmin < jmax) {
3327       for (j=jmin; j<jmax; j++){
3328          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
3329       }
3330       /* ... add k to row list for first nonzero entry in k-th row */
3331       il[k] = jmin;
3332       i     = bj[jmin];
3333       jl[k] = jl[i]; jl[i] = k;
3334     }
3335   }
3336 
3337   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3338   ierr = PetscFree(il);CHKERRQ(ierr);
3339 
3340   C->factor    = FACTOR_CHOLESKY;
3341   C->assembled = PETSC_TRUE;
3342   C->preallocated = PETSC_TRUE;
3343   PLogFlops(b->mbs);
3344   PetscFunctionReturn(0);
3345 }
3346 
3347 #undef __FUNC__
3348 #define __FUNC__ "MatCholeskyFactor_SeqSBAIJ"
3349 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f)
3350 {
3351   int ierr;
3352   Mat C;
3353 
3354   PetscFunctionBegin;
3355   ierr = MatCholeskyFactorSymbolic(A,perm,f,&C);CHKERRQ(ierr);
3356   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
3357   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
3358   PetscFunctionReturn(0);
3359 }
3360 
3361 
3362