xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision b0a32e0c6855ee6a6cd3495fa7da12ea9885bc5d)
1 /* Using Modified Sparse Row (MSR) storage.
2 See page 85, "Iterative Methods ..." by Saad. */
3 
4 /*$Id: sbaijfact.c,v 1.51 2000/11/28 17:29:37 bsmith Exp bsmith $*/
5 /*
6     Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
7 */
8 #include "sbaij.h"
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/ilu.h"
12 #include "include/petscis.h"
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
16 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B)
17 {
18   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
19   int         *rip,ierr,i,mbs = a->mbs,*ai,*aj;
20   int         *jutmp,bs = a->bs,bs2=a->bs2;
21   int         m,nzi,realloc = 0;
22   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
23   PetscTruth  perm_identity;
24 
25   PetscFunctionBegin;
26   if (!perm) {
27     ierr = ISCreateStride(PETSC_COMM_SELF,mbs,0,1,&perm);CHKERRQ(ierr);
28   }
29   PetscValidHeaderSpecific(perm,IS_COOKIE);
30 
31   /* check whether perm is the identity mapping */
32   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
33   if (!perm_identity) a->permute = PETSC_TRUE;
34 
35   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
36 
37   if (perm_identity){ /* without permutation */
38     ai = a->i; aj = a->j;
39   } else {            /* non-trivial permutation */
40     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRA(ierr);
41     ai = a->inew; aj = a->jnew;
42   }
43 
44   /* initialization */
45   /* Don't know how many column pointers are needed so estimate.
46      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
47 ierr = PetscMalloc((mbs+1)*sizeof(int),&  iu   );CHKERRQ(ierr);
48   umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
49 ierr = PetscMalloc(umax*sizeof(int),&(  ju   ));CHKERRQ(ierr);
50   iu[0] = mbs+1;
51   juptr = mbs;
52 ierr = PetscMalloc(mbs*sizeof(int),&(  jl ));CHKERRQ(ierr);
53 ierr = PetscMalloc(mbs*sizeof(int),&(  q  ));CHKERRQ(ierr);
54   for (i=0; i<mbs; i++){
55     jl[i] = mbs; q[i] = 0;
56   }
57 
58   /* for each row k */
59   for (k=0; k<mbs; k++){
60     nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
61     q[k] = mbs;
62     /* initialize nonzero structure of k-th row to row rip[k] of A */
63     jmin = ai[rip[k]];
64     jmax = ai[rip[k]+1];
65     for (j=jmin; j<jmax; j++){
66       vj = rip[aj[j]]; /* col. value */
67       if(vj > k){
68         qm = k;
69         do {
70           m  = qm; qm = q[m];
71         } while(qm < vj);
72         if (qm == vj) {
73           printf(" error: duplicate entry in A\n"); break;
74         }
75         nzk++;
76         q[m] = vj;
77         q[vj] = qm;
78       } /* if(vj > k) */
79     } /* for (j=jmin; j<jmax; j++) */
80 
81     /* modify nonzero structure of k-th row by computing fill-in
82        for each row i to be merged in */
83     i = k;
84     i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
85     /* printf(" next pivot row i=%d\n",i); */
86     while (i < mbs){
87       /* merge row i into k-th row */
88       nzi = iu[i+1] - (iu[i]+1);
89       jmin = iu[i] + 1; jmax = iu[i] + nzi;
90       qm = k;
91       for (j=jmin; j<jmax+1; j++){
92         vj = ju[j];
93         do {
94           m = qm; qm = q[m];
95         } while (qm < vj);
96         if (qm != vj){
97          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
98         }
99       }
100       i = jl[i]; /* next pivot row */
101     }
102 
103     /* add k to row list for first nonzero element in k-th row */
104     if (nzk > 0){
105       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
106       jl[k] = jl[i]; jl[i] = k;
107     }
108     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
109 
110     /* allocate more space to ju if needed */
111     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
112       /* estimate how much additional space we will need */
113       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
114       /* just double the memory each time */
115       maxadd = umax;
116       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
117       umax += maxadd;
118 
119       /* allocate a longer ju */
120 ierr = PetscMalloc(umax*sizeof(int),&(      jutmp ));CHKERRQ(ierr);
121       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
122       ierr  = PetscFree(ju);CHKERRQ(ierr);
123       ju    = jutmp;
124       realloc++; /* count how many times we realloc */
125     }
126 
127     /* save nonzero structure of k-th row in ju */
128     i=k;
129     jumin = juptr + 1; juptr += nzk;
130     for (j=jumin; j<juptr+1; j++){
131       i=q[i];
132       ju[j]=i;
133     }
134   }
135 
136   if (ai[mbs] != 0) {
137     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
138     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
139     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
140     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
141     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
142   } else {
143      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
144   }
145 
146   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
147   ierr = PetscFree(q);CHKERRQ(ierr);
148   ierr = PetscFree(jl);CHKERRQ(ierr);
149 
150   /* put together the new matrix */
151   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
152   /* PetscLogObjectParent(*B,iperm); */
153   b = (Mat_SeqSBAIJ*)(*B)->data;
154   ierr = PetscFree(b->imax);CHKERRQ(ierr);
155   b->singlemalloc = PETSC_FALSE;
156   /* the next line frees the default space generated by the Create() */
157   ierr = PetscFree(b->a);CHKERRQ(ierr);
158   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
159   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKERRQ(ierr);
160   b->j          = ju;
161   b->i          = iu;
162   b->diag       = 0;
163   b->ilen       = 0;
164   b->imax       = 0;
165   b->row        = perm;
166   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
167   b->icol       = perm;
168   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
169 ierr = PetscMalloc((bs*mbs+bs)*sizeof(Scalar),&  b->solve_work );CHKERRQ(ierr);
170   /* In b structure:  Free imax, ilen, old a, old j.
171      Allocate idnew, solve_work, new a, new j */
172   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
173   b->s_maxnz = b->s_nz = iu[mbs];
174 
175   (*B)->factor                 = FACTOR_CHOLESKY;
176   (*B)->info.factor_mallocs    = realloc;
177   (*B)->info.fill_ratio_given  = f;
178   if (ai[mbs] != 0) {
179     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
180   } else {
181     (*B)->info.fill_ratio_needed = 0.0;
182   }
183 
184   if (perm_identity){
185     switch (bs) {
186       case 1:
187         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
188         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
189         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
190         break;
191       case 2:
192         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
193         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
194         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
195         break;
196       case 3:
197         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
198         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
199         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
200         break;
201       case 4:
202         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
203         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
204         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
205         break;
206       case 5:
207         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
208         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
209         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
210         break;
211       case 6:
212         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
213         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
214         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
215         break;
216       case 7:
217         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
218         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
219         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
220       break;
221       default:
222         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
223         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
224         PetscLogInfo(A,"MatIncompleteCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
225       break;
226     }
227   }
228 
229   PetscFunctionReturn(0);
230 }
231 
232 
233 #undef __FUNC__
234 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
235 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
236 {
237   Mat                C = *B;
238   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
239   IS                 perm = b->row;
240   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
241   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
242   int                bs=a->bs,bs2 = a->bs2;
243   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
244   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
245   MatScalar          *work;
246   int                *pivots;
247 
248   PetscFunctionBegin;
249 
250   /* initialization */
251 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
252   ierr  = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
253 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il    ));CHKERRQ(ierr);
254   jl    = il + mbs;
255   for (i=0; i<mbs; i++) {
256     jl[i] = mbs; il[0] = 0;
257   }
258 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&  dk    );CHKERRQ(ierr);
259   uik   = dk + bs2;
260   work  = uik + bs2;
261 ierr = PetscMalloc(bs*sizeof(int),&(  pivots));CHKERRQ(ierr);
262 
263   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
264 
265   /* check permutation */
266   if (!a->permute){
267     ai = a->i; aj = a->j; aa = a->a;
268   } else {
269     ai = a->inew; aj = a->jnew;
270     aa = (MatScalar*)PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
271     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
272     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
273     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
274 
275     for (i=0; i<mbs; i++){
276       jmin = ai[i]; jmax = ai[i+1];
277       for (j=jmin; j<jmax; j++){
278         while (a2anew[j] != j){
279           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
280           for (k1=0; k1<bs2; k1++){
281             dk[k1]       = aa[k*bs2+k1];
282             aa[k*bs2+k1] = aa[j*bs2+k1];
283             aa[j*bs2+k1] = dk[k1];
284           }
285         }
286         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
287         if (i > aj[j]){
288           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
289           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
290           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
291           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
292             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
293           }
294         }
295       }
296     }
297     ierr = PetscFree(a2anew);CHKERRA(ierr);
298   }
299 
300   /* for each row k */
301   for (k = 0; k<mbs; k++){
302 
303     /*initialize k-th row with elements nonzero in row perm(k) of A */
304     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
305     if (jmin < jmax) {
306       ap = aa + jmin*bs2;
307       for (j = jmin; j < jmax; j++){
308         vj = perm_ptr[aj[j]];         /* block col. index */
309         rtmp_ptr = rtmp + vj*bs2;
310         for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
311       }
312     }
313 
314     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
315     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
316     i = jl[k]; /* first row to be added to k_th row  */
317 
318     while (i < mbs){
319       nexti = jl[i]; /* next row to be added to k_th row */
320 
321       /* compute multiplier */
322       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
323 
324       /* uik = -inv(Di)*U_bar(i,k) */
325       diag = ba + i*bs2;
326       u    = ba + ili*bs2;
327       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
328       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
329 
330       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
331       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
332 
333       /* update -U(i,k) */
334       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
335 
336       /* add multiple of row i to k-th row ... */
337       jmin = ili + 1; jmax = bi[i+1];
338       if (jmin < jmax){
339         for (j=jmin; j<jmax; j++) {
340           /* rtmp += -U(i,k)^T * U_bar(i,j) */
341           rtmp_ptr = rtmp + bj[j]*bs2;
342           u = ba + j*bs2;
343           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
344         }
345 
346         /* ... add i to row list for next nonzero entry */
347         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
348         j     = bj[jmin];
349         jl[i] = jl[j]; jl[j] = i; /* update jl */
350       }
351       i = nexti;
352     }
353 
354     /* save nonzero entries in k-th row of U ... */
355 
356     /* invert diagonal block */
357     diag = ba+k*bs2;
358     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
359     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
360 
361     jmin = bi[k]; jmax = bi[k+1];
362     if (jmin < jmax) {
363       for (j=jmin; j<jmax; j++){
364          vj = bj[j];           /* block col. index of U */
365          u   = ba + j*bs2;
366          rtmp_ptr = rtmp + vj*bs2;
367          for (k1=0; k1<bs2; k1++){
368            *u++        = *rtmp_ptr;
369            *rtmp_ptr++ = 0.0;
370          }
371       }
372 
373       /* ... add k to row list for first nonzero entry in k-th row */
374       il[k] = jmin;
375       i     = bj[jmin];
376       jl[k] = jl[i]; jl[i] = k;
377     }
378   }
379 
380   ierr = PetscFree(rtmp);CHKERRQ(ierr);
381   ierr = PetscFree(il);CHKERRQ(ierr);
382   ierr = PetscFree(dk);CHKERRQ(ierr);
383   ierr = PetscFree(pivots);CHKERRQ(ierr);
384   if (a->permute){
385     ierr = PetscFree(aa);CHKERRQ(ierr);
386   }
387 
388   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
389   C->factor    = FACTOR_CHOLESKY;
390   C->assembled = PETSC_TRUE;
391   C->preallocated = PETSC_TRUE;
392   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
393   PetscFunctionReturn(0);
394 }
395 
396 #undef __FUNC__
397 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
398 int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
399 {
400   Mat                C = *B;
401   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
402   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
403   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
404   int                bs=a->bs,bs2 = a->bs2;
405   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
406   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
407   MatScalar          *work;
408   int                *pivots;
409 
410   PetscFunctionBegin;
411 
412   /* initialization */
413 
414 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
415   ierr  = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
416 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il    ));CHKERRQ(ierr);
417   jl    = il + mbs;
418   for (i=0; i<mbs; i++) {
419     jl[i] = mbs; il[0] = 0;
420   }
421 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&  dk    );CHKERRQ(ierr);
422   uik   = dk + bs2;
423   work  = uik + bs2;
424 ierr = PetscMalloc(bs*sizeof(int),&(  pivots));CHKERRQ(ierr);
425 
426   ai = a->i; aj = a->j; aa = a->a;
427 
428   /* for each row k */
429   for (k = 0; k<mbs; k++){
430 
431     /*initialize k-th row with elements nonzero in row k of A */
432     jmin = ai[k]; jmax = ai[k+1];
433     if (jmin < jmax) {
434       ap = aa + jmin*bs2;
435       for (j = jmin; j < jmax; j++){
436         vj = aj[j];         /* block col. index */
437         rtmp_ptr = rtmp + vj*bs2;
438         for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
439       }
440     }
441 
442     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
443     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
444     i = jl[k]; /* first row to be added to k_th row  */
445 
446     while (i < mbs){
447       nexti = jl[i]; /* next row to be added to k_th row */
448 
449       /* compute multiplier */
450       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
451 
452       /* uik = -inv(Di)*U_bar(i,k) */
453       diag = ba + i*bs2;
454       u    = ba + ili*bs2;
455       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
456       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
457 
458       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
459       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
460 
461       /* update -U(i,k) */
462       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
463 
464       /* add multiple of row i to k-th row ... */
465       jmin = ili + 1; jmax = bi[i+1];
466       if (jmin < jmax){
467         for (j=jmin; j<jmax; j++) {
468           /* rtmp += -U(i,k)^T * U_bar(i,j) */
469           rtmp_ptr = rtmp + bj[j]*bs2;
470           u = ba + j*bs2;
471           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
472         }
473 
474         /* ... add i to row list for next nonzero entry */
475         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
476         j     = bj[jmin];
477         jl[i] = jl[j]; jl[j] = i; /* update jl */
478       }
479       i = nexti;
480     }
481 
482     /* save nonzero entries in k-th row of U ... */
483 
484     /* invert diagonal block */
485     diag = ba+k*bs2;
486     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
487     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
488 
489     jmin = bi[k]; jmax = bi[k+1];
490     if (jmin < jmax) {
491       for (j=jmin; j<jmax; j++){
492          vj = bj[j];           /* block col. index of U */
493          u   = ba + j*bs2;
494          rtmp_ptr = rtmp + vj*bs2;
495          for (k1=0; k1<bs2; k1++){
496            *u++        = *rtmp_ptr;
497            *rtmp_ptr++ = 0.0;
498          }
499       }
500 
501       /* ... add k to row list for first nonzero entry in k-th row */
502       il[k] = jmin;
503       i     = bj[jmin];
504       jl[k] = jl[i]; jl[i] = k;
505     }
506   }
507 
508   ierr = PetscFree(rtmp);CHKERRQ(ierr);
509   ierr = PetscFree(il);CHKERRQ(ierr);
510   ierr = PetscFree(dk);CHKERRQ(ierr);
511   ierr = PetscFree(pivots);CHKERRQ(ierr);
512 
513   C->factor    = FACTOR_CHOLESKY;
514   C->assembled = PETSC_TRUE;
515   C->preallocated = PETSC_TRUE;
516   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
517   PetscFunctionReturn(0);
518 }
519 
520 /* Version for when blocks are 7 by 7 */
521 #undef __FUNC__
522 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7"
523 int MatCholeskyFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B)
524 {
525   Mat                C = *B;
526   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
527   IS                 perm = b->row;
528   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
529   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
530   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
531   MatScalar          *u,*d,*w,*wp;
532 
533   PetscFunctionBegin;
534   /* initialization */
535 ierr = PetscMalloc(49*mbs*sizeof(MatScalar),&(  w  ));CHKERRQ(ierr);
536   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
537 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
538   jl = il + mbs;
539   for (i=0; i<mbs; i++) {
540     jl[i] = mbs; il[0] = 0;
541   }
542 ierr = PetscMalloc(98*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
543   uik   = dk + 49;
544   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
545 
546   /* check permutation */
547   if (!a->permute){
548     ai = a->i; aj = a->j; aa = a->a;
549   } else {
550     ai = a->inew; aj = a->jnew;
551     aa = (MatScalar*)PetscMalloc(49*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
552     ierr = PetscMemcpy(aa,a->a,49*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
553     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
554     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
555 
556     for (i=0; i<mbs; i++){
557       jmin = ai[i]; jmax = ai[i+1];
558       for (j=jmin; j<jmax; j++){
559         while (a2anew[j] != j){
560           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
561           for (k1=0; k1<49; k1++){
562             dk[k1]       = aa[k*49+k1];
563             aa[k*49+k1] = aa[j*49+k1];
564             aa[j*49+k1] = dk[k1];
565           }
566         }
567         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
568         if (i > aj[j]){
569           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
570           ap = aa + j*49;                     /* ptr to the beginning of j-th block of aa */
571           for (k=0; k<49; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
572           for (k=0; k<7; k++){               /* j-th block of aa <- dk^T */
573             for (k1=0; k1<7; k1++) *ap++ = dk[k + 7*k1];
574           }
575         }
576       }
577     }
578     ierr = PetscFree(a2anew);CHKERRA(ierr);
579   }
580 
581   /* for each row k */
582   for (k = 0; k<mbs; k++){
583 
584     /*initialize k-th row with elements nonzero in row perm(k) of A */
585     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
586     if (jmin < jmax) {
587       ap = aa + jmin*49;
588       for (j = jmin; j < jmax; j++){
589         vj = perm_ptr[aj[j]];         /* block col. index */
590         wp = w + vj*49;
591         for (i=0; i<49; i++) *wp++ = *ap++;
592       }
593     }
594 
595     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
596     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
597     i = jl[k]; /* first row to be added to k_th row  */
598 
599     while (i < mbs){
600       nexti = jl[i]; /* next row to be added to k_th row */
601 
602       /* compute multiplier */
603       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
604 
605       /* uik = -inv(Di)*U_bar(i,k) */
606       d = ba + i*49;
607       u    = ba + ili*49;
608 
609       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
610       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
611       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
612       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
613       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
614       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
615       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
616 
617       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
618       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
619       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
620       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
621       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
622       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
623       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
624 
625       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
626       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
627       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
628       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
629       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
630       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
631       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
632 
633       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
634       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
635       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
636       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
637       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
638       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
639       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
640 
641       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
642       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
643       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
644       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
645       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
646       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
647       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
648 
649       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
650       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
651       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
652       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
653       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
654       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
655       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
656 
657       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
658       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
659       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
660       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
661       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
662       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
663       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
664 
665       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
666       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
667       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
668       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
669       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
670       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
671       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
672       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
673 
674       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
675       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
676       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
677       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
678       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
679       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
680       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
681 
682       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
683       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
684       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
685       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
686       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
687       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
688       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
689 
690       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
691       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
692       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
693       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
694       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
695       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
696       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
697 
698       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
699       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
700       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
701       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
702       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
703       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
704       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
705 
706       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
707       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
708       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
709       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
710       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
711       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
712       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
713 
714       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
715       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
716       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
717       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
718       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
719       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
720       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
721 
722       /* update -U(i,k) */
723       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
724 
725       /* add multiple of row i to k-th row ... */
726       jmin = ili + 1; jmax = bi[i+1];
727       if (jmin < jmax){
728         for (j=jmin; j<jmax; j++) {
729           /* w += -U(i,k)^T * U_bar(i,j) */
730           wp = w + bj[j]*49;
731           u = ba + j*49;
732 
733           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
734           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
735           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
736           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
737           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
738           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
739           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
740 
741           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
742           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
743           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
744           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
745           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
746           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
747           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
748 
749           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
750           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
751           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
752           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
753           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
754           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
755           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
756 
757           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
758           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
759           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
760           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
761           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
762           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
763           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
764 
765           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
766           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
767           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
768           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
769           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
770           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
771           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
772 
773           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
774           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
775           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
776           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
777           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
778           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
779           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
780 
781           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
782           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
783           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
784           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
785           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
786           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
787           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
788         }
789 
790         /* ... add i to row list for next nonzero entry */
791         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
792         j     = bj[jmin];
793         jl[i] = jl[j]; jl[j] = i; /* update jl */
794       }
795       i = nexti;
796     }
797 
798     /* save nonzero entries in k-th row of U ... */
799 
800     /* invert diagonal block */
801     d = ba+k*49;
802     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
803     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
804 
805     jmin = bi[k]; jmax = bi[k+1];
806     if (jmin < jmax) {
807       for (j=jmin; j<jmax; j++){
808          vj = bj[j];           /* block col. index of U */
809          u   = ba + j*49;
810          wp = w + vj*49;
811          for (k1=0; k1<49; k1++){
812            *u++        = *wp;
813            *wp++ = 0.0;
814          }
815       }
816 
817       /* ... add k to row list for first nonzero entry in k-th row */
818       il[k] = jmin;
819       i     = bj[jmin];
820       jl[k] = jl[i]; jl[i] = k;
821     }
822   }
823 
824   ierr = PetscFree(w);CHKERRQ(ierr);
825   ierr = PetscFree(il);CHKERRQ(ierr);
826   ierr = PetscFree(dk);CHKERRQ(ierr);
827   if (a->permute){
828     ierr = PetscFree(aa);CHKERRQ(ierr);
829   }
830 
831   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
832   C->factor    = FACTOR_CHOLESKY;
833   C->assembled = PETSC_TRUE;
834   C->preallocated = PETSC_TRUE;
835   PetscLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
836   PetscFunctionReturn(0);
837 }
838 
839 /*
840       Version for when blocks are 7 by 7 Using natural ordering
841 */
842 #undef __FUNC__
843 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering"
844 int MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B)
845 {
846   Mat                C = *B;
847   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
848   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
849   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
850   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
851   MatScalar          *u,*d,*w,*wp;
852 
853   PetscFunctionBegin;
854 
855   /* initialization */
856 ierr = PetscMalloc(49*mbs*sizeof(MatScalar),&(  w  ));CHKERRQ(ierr);
857   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
858 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
859   jl = il + mbs;
860   for (i=0; i<mbs; i++) {
861     jl[i] = mbs; il[0] = 0;
862   }
863 ierr = PetscMalloc(98*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
864   uik   = dk + 49;
865 
866   ai = a->i; aj = a->j; aa = a->a;
867 
868   /* for each row k */
869   for (k = 0; k<mbs; k++){
870 
871     /*initialize k-th row with elements nonzero in row k of A */
872     jmin = ai[k]; jmax = ai[k+1];
873     if (jmin < jmax) {
874       ap = aa + jmin*49;
875       for (j = jmin; j < jmax; j++){
876         vj = aj[j];         /* block col. index */
877         wp = w + vj*49;
878         for (i=0; i<49; i++) *wp++ = *ap++;
879       }
880     }
881 
882     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
883     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
884     i = jl[k]; /* first row to be added to k_th row  */
885 
886     while (i < mbs){
887       nexti = jl[i]; /* next row to be added to k_th row */
888 
889       /* compute multiplier */
890       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
891 
892       /* uik = -inv(Di)*U_bar(i,k) */
893       d = ba + i*49;
894       u    = ba + ili*49;
895 
896       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
897       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
898       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
899       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
900       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
901       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
902       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
903 
904       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
905       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
906       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
907       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
908       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
909       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
910       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
911 
912       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
913       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
914       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
915       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
916       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
917       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
918       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
919 
920       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
921       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
922       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
923       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
924       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
925       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
926       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
927 
928       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
929       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
930       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
931       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
932       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
933       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
934       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
935 
936       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
937       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
938       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
939       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
940       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
941       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
942       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
943 
944       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
945       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
946       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
947       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
948       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
949       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
950       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
951 
952       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
953       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
954       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
955       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
956       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
957       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
958       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
959       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
960 
961       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
962       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
963       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
964       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
965       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
966       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
967       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
968 
969       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
970       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
971       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
972       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
973       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
974       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
975       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
976 
977       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
978       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
979       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
980       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
981       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
982       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
983       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
984 
985       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
986       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
987       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
988       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
989       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
990       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
991       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
992 
993       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
994       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
995       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
996       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
997       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
998       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
999       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
1000 
1001       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
1002       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
1003       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
1004       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
1005       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
1006       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
1007       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
1008 
1009       /* update -U(i,k) */
1010       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
1011 
1012       /* add multiple of row i to k-th row ... */
1013       jmin = ili + 1; jmax = bi[i+1];
1014       if (jmin < jmax){
1015         for (j=jmin; j<jmax; j++) {
1016           /* w += -U(i,k)^T * U_bar(i,j) */
1017           wp = w + bj[j]*49;
1018           u = ba + j*49;
1019 
1020           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
1021           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
1022           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
1023           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
1024           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
1025           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
1026           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
1027 
1028           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
1029           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
1030           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
1031           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
1032           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
1033           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
1034           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
1035 
1036           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
1037           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
1038           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
1039           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
1040           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
1041           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
1042           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
1043 
1044           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
1045           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
1046           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
1047           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
1048           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
1049           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
1050           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
1051 
1052           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
1053           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
1054           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
1055           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
1056           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
1057           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
1058           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
1059 
1060           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
1061           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
1062           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
1063           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
1064           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
1065           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
1066           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
1067 
1068           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
1069           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
1070           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
1071           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
1072           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
1073           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
1074           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
1075         }
1076 
1077         /* ... add i to row list for next nonzero entry */
1078         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1079         j     = bj[jmin];
1080         jl[i] = jl[j]; jl[j] = i; /* update jl */
1081       }
1082       i = nexti;
1083     }
1084 
1085     /* save nonzero entries in k-th row of U ... */
1086 
1087     /* invert diagonal block */
1088     d = ba+k*49;
1089     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
1090     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
1091 
1092     jmin = bi[k]; jmax = bi[k+1];
1093     if (jmin < jmax) {
1094       for (j=jmin; j<jmax; j++){
1095          vj = bj[j];           /* block col. index of U */
1096          u   = ba + j*49;
1097          wp = w + vj*49;
1098          for (k1=0; k1<49; k1++){
1099            *u++        = *wp;
1100            *wp++ = 0.0;
1101          }
1102       }
1103 
1104       /* ... add k to row list for first nonzero entry in k-th row */
1105       il[k] = jmin;
1106       i     = bj[jmin];
1107       jl[k] = jl[i]; jl[i] = k;
1108     }
1109   }
1110 
1111   ierr = PetscFree(w);CHKERRQ(ierr);
1112   ierr = PetscFree(il);CHKERRQ(ierr);
1113   ierr = PetscFree(dk);CHKERRQ(ierr);
1114 
1115   C->factor    = FACTOR_CHOLESKY;
1116   C->assembled = PETSC_TRUE;
1117   C->preallocated = PETSC_TRUE;
1118   PetscLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
1119   PetscFunctionReturn(0);
1120 }
1121 
1122 /* Version for when blocks are 6 by 6 */
1123 #undef __FUNC__
1124 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6"
1125 int MatCholeskyFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B)
1126 {
1127   Mat                C = *B;
1128   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1129   IS                 perm = b->row;
1130   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1131   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1132   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1133   MatScalar          *u,*d,*w,*wp;
1134 
1135   PetscFunctionBegin;
1136   /* initialization */
1137 ierr = PetscMalloc(36*mbs*sizeof(MatScalar),&(  w  ));CHKERRQ(ierr);
1138   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1139 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
1140   jl = il + mbs;
1141   for (i=0; i<mbs; i++) {
1142     jl[i] = mbs; il[0] = 0;
1143   }
1144 ierr = PetscMalloc(72*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
1145   uik   = dk + 36;
1146   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
1147 
1148   /* check permutation */
1149   if (!a->permute){
1150     ai = a->i; aj = a->j; aa = a->a;
1151   } else {
1152     ai = a->inew; aj = a->jnew;
1153     aa = (MatScalar*)PetscMalloc(36*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1154     ierr = PetscMemcpy(aa,a->a,36*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1155     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
1156     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1157 
1158     for (i=0; i<mbs; i++){
1159       jmin = ai[i]; jmax = ai[i+1];
1160       for (j=jmin; j<jmax; j++){
1161         while (a2anew[j] != j){
1162           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1163           for (k1=0; k1<36; k1++){
1164             dk[k1]       = aa[k*36+k1];
1165             aa[k*36+k1] = aa[j*36+k1];
1166             aa[j*36+k1] = dk[k1];
1167           }
1168         }
1169         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1170         if (i > aj[j]){
1171           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1172           ap = aa + j*36;                     /* ptr to the beginning of j-th block of aa */
1173           for (k=0; k<36; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1174           for (k=0; k<6; k++){               /* j-th block of aa <- dk^T */
1175             for (k1=0; k1<6; k1++) *ap++ = dk[k + 6*k1];
1176           }
1177         }
1178       }
1179     }
1180     ierr = PetscFree(a2anew);CHKERRA(ierr);
1181   }
1182 
1183   /* for each row k */
1184   for (k = 0; k<mbs; k++){
1185 
1186     /*initialize k-th row with elements nonzero in row perm(k) of A */
1187     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1188     if (jmin < jmax) {
1189       ap = aa + jmin*36;
1190       for (j = jmin; j < jmax; j++){
1191         vj = perm_ptr[aj[j]];         /* block col. index */
1192         wp = w + vj*36;
1193         for (i=0; i<36; i++) *wp++ = *ap++;
1194       }
1195     }
1196 
1197     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1198     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1199     i = jl[k]; /* first row to be added to k_th row  */
1200 
1201     while (i < mbs){
1202       nexti = jl[i]; /* next row to be added to k_th row */
1203 
1204       /* compute multiplier */
1205       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1206 
1207       /* uik = -inv(Di)*U_bar(i,k) */
1208       d = ba + i*36;
1209       u    = ba + ili*36;
1210 
1211       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1212       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1213       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1214       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1215       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1216       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1217 
1218       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1219       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1220       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1221       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1222       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1223       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1224 
1225       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1226       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1227       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1228       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1229       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1230       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1231 
1232       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1233       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1234       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1235       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1236       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1237       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1238 
1239       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1240       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1241       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1242       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1243       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1244       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1245 
1246       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1247       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1248       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1249       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1250       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1251       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1252 
1253       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1254       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1255       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1256       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1257       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1258       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1259       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1260 
1261       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1262       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1263       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1264       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1265       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1266       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1267 
1268       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1269       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1270       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1271       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1272       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1273       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1274 
1275       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1276       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1277       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1278       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1279       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1280       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1281 
1282       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1283       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1284       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1285       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1286       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1287       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1288 
1289       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1290       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1291       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1292       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1293       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1294       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1295 
1296       /* update -U(i,k) */
1297       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1298 
1299       /* add multiple of row i to k-th row ... */
1300       jmin = ili + 1; jmax = bi[i+1];
1301       if (jmin < jmax){
1302         for (j=jmin; j<jmax; j++) {
1303           /* w += -U(i,k)^T * U_bar(i,j) */
1304           wp = w + bj[j]*36;
1305           u = ba + j*36;
1306           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1307           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1308           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1309           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1310           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1311           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1312 
1313           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1314           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1315           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1316           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1317           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1318           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1319 
1320           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1321           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1322           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1323           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1324           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1325           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1326 
1327           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1328           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1329           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1330           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1331           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1332           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1333 
1334           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1335           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1336           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1337           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1338           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1339           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1340 
1341           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1342           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1343           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1344           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1345           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1346           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1347         }
1348 
1349         /* ... add i to row list for next nonzero entry */
1350         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1351         j     = bj[jmin];
1352         jl[i] = jl[j]; jl[j] = i; /* update jl */
1353       }
1354       i = nexti;
1355     }
1356 
1357     /* save nonzero entries in k-th row of U ... */
1358 
1359     /* invert diagonal block */
1360     d = ba+k*36;
1361     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1362     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1363 
1364     jmin = bi[k]; jmax = bi[k+1];
1365     if (jmin < jmax) {
1366       for (j=jmin; j<jmax; j++){
1367          vj = bj[j];           /* block col. index of U */
1368          u   = ba + j*36;
1369          wp = w + vj*36;
1370          for (k1=0; k1<36; k1++){
1371            *u++        = *wp;
1372            *wp++ = 0.0;
1373          }
1374       }
1375 
1376       /* ... add k to row list for first nonzero entry in k-th row */
1377       il[k] = jmin;
1378       i     = bj[jmin];
1379       jl[k] = jl[i]; jl[i] = k;
1380     }
1381   }
1382 
1383   ierr = PetscFree(w);CHKERRQ(ierr);
1384   ierr = PetscFree(il);CHKERRQ(ierr);
1385   ierr = PetscFree(dk);CHKERRQ(ierr);
1386   if (a->permute){
1387     ierr = PetscFree(aa);CHKERRQ(ierr);
1388   }
1389 
1390   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1391   C->factor    = FACTOR_CHOLESKY;
1392   C->assembled = PETSC_TRUE;
1393   C->preallocated = PETSC_TRUE;
1394   PetscLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1395   PetscFunctionReturn(0);
1396 }
1397 
1398 /*
1399       Version for when blocks are 6 by 6 Using natural ordering
1400 */
1401 #undef __FUNC__
1402 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering"
1403 int MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B)
1404 {
1405   Mat                C = *B;
1406   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1407   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1408   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1409   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1410   MatScalar          *u,*d,*w,*wp;
1411 
1412   PetscFunctionBegin;
1413 
1414   /* initialization */
1415 ierr = PetscMalloc(36*mbs*sizeof(MatScalar),&(  w  ));CHKERRQ(ierr);
1416   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1417 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
1418   jl = il + mbs;
1419   for (i=0; i<mbs; i++) {
1420     jl[i] = mbs; il[0] = 0;
1421   }
1422 ierr = PetscMalloc(72*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
1423   uik   = dk + 36;
1424 
1425   ai = a->i; aj = a->j; aa = a->a;
1426 
1427   /* for each row k */
1428   for (k = 0; k<mbs; k++){
1429 
1430     /*initialize k-th row with elements nonzero in row k of A */
1431     jmin = ai[k]; jmax = ai[k+1];
1432     if (jmin < jmax) {
1433       ap = aa + jmin*36;
1434       for (j = jmin; j < jmax; j++){
1435         vj = aj[j];         /* block col. index */
1436         wp = w + vj*36;
1437         for (i=0; i<36; i++) *wp++ = *ap++;
1438       }
1439     }
1440 
1441     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1442     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1443     i = jl[k]; /* first row to be added to k_th row  */
1444 
1445     while (i < mbs){
1446       nexti = jl[i]; /* next row to be added to k_th row */
1447 
1448       /* compute multiplier */
1449       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1450 
1451       /* uik = -inv(Di)*U_bar(i,k) */
1452       d = ba + i*36;
1453       u    = ba + ili*36;
1454 
1455       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1456       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1457       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1458       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1459       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1460       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1461 
1462       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1463       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1464       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1465       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1466       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1467       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1468 
1469       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1470       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1471       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1472       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1473       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1474       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1475 
1476       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1477       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1478       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1479       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1480       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1481       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1482 
1483       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1484       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1485       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1486       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1487       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1488       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1489 
1490       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1491       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1492       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1493       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1494       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1495       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1496 
1497       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1498       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1499       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1500       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1501       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1502       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1503       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1504 
1505       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1506       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1507       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1508       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1509       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1510       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1511 
1512       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1513       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1514       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1515       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1516       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1517       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1518 
1519       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1520       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1521       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1522       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1523       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1524       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1525 
1526       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1527       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1528       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1529       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1530       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1531       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1532 
1533       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1534       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1535       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1536       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1537       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1538       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1539 
1540       /* update -U(i,k) */
1541       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1542 
1543       /* add multiple of row i to k-th row ... */
1544       jmin = ili + 1; jmax = bi[i+1];
1545       if (jmin < jmax){
1546         for (j=jmin; j<jmax; j++) {
1547           /* w += -U(i,k)^T * U_bar(i,j) */
1548           wp = w + bj[j]*36;
1549           u = ba + j*36;
1550           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1551           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1552           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1553           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1554           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1555           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1556 
1557           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1558           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1559           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1560           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1561           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1562           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1563 
1564           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1565           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1566           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1567           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1568           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1569           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1570 
1571           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1572           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1573           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1574           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1575           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1576           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1577 
1578           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1579           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1580           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1581           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1582           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1583           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1584 
1585           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1586           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1587           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1588           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1589           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1590           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1591         }
1592 
1593         /* ... add i to row list for next nonzero entry */
1594         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1595         j     = bj[jmin];
1596         jl[i] = jl[j]; jl[j] = i; /* update jl */
1597       }
1598       i = nexti;
1599     }
1600 
1601     /* save nonzero entries in k-th row of U ... */
1602 
1603     /* invert diagonal block */
1604     d = ba+k*36;
1605     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1606     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1607 
1608     jmin = bi[k]; jmax = bi[k+1];
1609     if (jmin < jmax) {
1610       for (j=jmin; j<jmax; j++){
1611          vj = bj[j];           /* block col. index of U */
1612          u   = ba + j*36;
1613          wp = w + vj*36;
1614          for (k1=0; k1<36; k1++){
1615            *u++        = *wp;
1616            *wp++ = 0.0;
1617          }
1618       }
1619 
1620       /* ... add k to row list for first nonzero entry in k-th row */
1621       il[k] = jmin;
1622       i     = bj[jmin];
1623       jl[k] = jl[i]; jl[i] = k;
1624     }
1625   }
1626 
1627   ierr = PetscFree(w);CHKERRQ(ierr);
1628   ierr = PetscFree(il);CHKERRQ(ierr);
1629   ierr = PetscFree(dk);CHKERRQ(ierr);
1630 
1631   C->factor    = FACTOR_CHOLESKY;
1632   C->assembled = PETSC_TRUE;
1633   C->preallocated = PETSC_TRUE;
1634   PetscLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 /* Version for when blocks are 5 by 5  */
1639 #undef __FUNC__
1640 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5"
1641 int MatCholeskyFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B)
1642 {
1643   Mat                C = *B;
1644   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1645   IS                 perm = b->row;
1646   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1647   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1648   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1649   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1650 
1651   PetscFunctionBegin;
1652   /* initialization */
1653 ierr = PetscMalloc(25*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
1654   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1655 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
1656   jl = il + mbs;
1657   for (i=0; i<mbs; i++) {
1658     jl[i] = mbs; il[0] = 0;
1659   }
1660 ierr = PetscMalloc(50*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
1661   uik   = dk + 25;
1662   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
1663 
1664   /* check permutation */
1665   if (!a->permute){
1666     ai = a->i; aj = a->j; aa = a->a;
1667   } else {
1668     ai = a->inew; aj = a->jnew;
1669     aa = (MatScalar*)PetscMalloc(25*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1670     ierr = PetscMemcpy(aa,a->a,25*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1671     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
1672     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1673 
1674     for (i=0; i<mbs; i++){
1675       jmin = ai[i]; jmax = ai[i+1];
1676       for (j=jmin; j<jmax; j++){
1677         while (a2anew[j] != j){
1678           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1679           for (k1=0; k1<25; k1++){
1680             dk[k1]       = aa[k*25+k1];
1681             aa[k*25+k1] = aa[j*25+k1];
1682             aa[j*25+k1] = dk[k1];
1683           }
1684         }
1685         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1686         if (i > aj[j]){
1687           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1688           ap = aa + j*25;                     /* ptr to the beginning of j-th block of aa */
1689           for (k=0; k<25; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1690           for (k=0; k<5; k++){               /* j-th block of aa <- dk^T */
1691             for (k1=0; k1<5; k1++) *ap++ = dk[k + 5*k1];
1692           }
1693         }
1694       }
1695     }
1696     ierr = PetscFree(a2anew);CHKERRA(ierr);
1697   }
1698 
1699   /* for each row k */
1700   for (k = 0; k<mbs; k++){
1701 
1702     /*initialize k-th row with elements nonzero in row perm(k) of A */
1703     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1704     if (jmin < jmax) {
1705       ap = aa + jmin*25;
1706       for (j = jmin; j < jmax; j++){
1707         vj = perm_ptr[aj[j]];         /* block col. index */
1708         rtmp_ptr = rtmp + vj*25;
1709         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1710       }
1711     }
1712 
1713     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1714     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1715     i = jl[k]; /* first row to be added to k_th row  */
1716 
1717     while (i < mbs){
1718       nexti = jl[i]; /* next row to be added to k_th row */
1719 
1720       /* compute multiplier */
1721       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1722 
1723       /* uik = -inv(Di)*U_bar(i,k) */
1724       d = ba + i*25;
1725       u    = ba + ili*25;
1726 
1727       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1728       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1729       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1730       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1731       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1732 
1733       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1734       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1735       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1736       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1737       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1738 
1739       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1740       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1741       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1742       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1743       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1744 
1745       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1746       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1747       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1748       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1749       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1750 
1751       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1752       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1753       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1754       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1755       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1756 
1757 
1758       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1759       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1760       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1761       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1762       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1763       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1764 
1765       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1766       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1767       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1768       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1769       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1770 
1771       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1772       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1773       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1774       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1775       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1776 
1777       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1778       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1779       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1780       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1781       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1782 
1783       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1784       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1785       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1786       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1787       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1788 
1789       /* update -U(i,k) */
1790       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
1791 
1792       /* add multiple of row i to k-th row ... */
1793       jmin = ili + 1; jmax = bi[i+1];
1794       if (jmin < jmax){
1795         for (j=jmin; j<jmax; j++) {
1796           /* rtmp += -U(i,k)^T * U_bar(i,j) */
1797           rtmp_ptr = rtmp + bj[j]*25;
1798           u = ba + j*25;
1799           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1800           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1801           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1802           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1803           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1804 
1805           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1806           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1807           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1808           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1809           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1810 
1811           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1812           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1813           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1814           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1815           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1816 
1817           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1818           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1819           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1820           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1821           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1822 
1823           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1824           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1825           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1826           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1827           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1828         }
1829 
1830         /* ... add i to row list for next nonzero entry */
1831         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1832         j     = bj[jmin];
1833         jl[i] = jl[j]; jl[j] = i; /* update jl */
1834       }
1835       i = nexti;
1836     }
1837 
1838     /* save nonzero entries in k-th row of U ... */
1839 
1840     /* invert diagonal block */
1841     d = ba+k*25;
1842     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
1843     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
1844 
1845     jmin = bi[k]; jmax = bi[k+1];
1846     if (jmin < jmax) {
1847       for (j=jmin; j<jmax; j++){
1848          vj = bj[j];           /* block col. index of U */
1849          u   = ba + j*25;
1850          rtmp_ptr = rtmp + vj*25;
1851          for (k1=0; k1<25; k1++){
1852            *u++        = *rtmp_ptr;
1853            *rtmp_ptr++ = 0.0;
1854          }
1855       }
1856 
1857       /* ... add k to row list for first nonzero entry in k-th row */
1858       il[k] = jmin;
1859       i     = bj[jmin];
1860       jl[k] = jl[i]; jl[i] = k;
1861     }
1862   }
1863 
1864   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1865   ierr = PetscFree(il);CHKERRQ(ierr);
1866   ierr = PetscFree(dk);CHKERRQ(ierr);
1867   if (a->permute){
1868     ierr = PetscFree(aa);CHKERRQ(ierr);
1869   }
1870 
1871   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1872   C->factor    = FACTOR_CHOLESKY;
1873   C->assembled = PETSC_TRUE;
1874   C->preallocated = PETSC_TRUE;
1875   PetscLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1876   PetscFunctionReturn(0);
1877 }
1878 
1879 /*
1880       Version for when blocks are 5 by 5 Using natural ordering
1881 */
1882 #undef __FUNC__
1883 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering"
1884 int MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B)
1885 {
1886   Mat                C = *B;
1887   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1888   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1889   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1890   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1891   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1892 
1893   PetscFunctionBegin;
1894 
1895   /* initialization */
1896 ierr = PetscMalloc(25*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
1897   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1898 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
1899   jl = il + mbs;
1900   for (i=0; i<mbs; i++) {
1901     jl[i] = mbs; il[0] = 0;
1902   }
1903 ierr = PetscMalloc(50*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
1904   uik   = dk + 25;
1905 
1906   ai = a->i; aj = a->j; aa = a->a;
1907 
1908   /* for each row k */
1909   for (k = 0; k<mbs; k++){
1910 
1911     /*initialize k-th row with elements nonzero in row k of A */
1912     jmin = ai[k]; jmax = ai[k+1];
1913     if (jmin < jmax) {
1914       ap = aa + jmin*25;
1915       for (j = jmin; j < jmax; j++){
1916         vj = aj[j];         /* block col. index */
1917         rtmp_ptr = rtmp + vj*25;
1918         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1919       }
1920     }
1921 
1922     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1923     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1924     i = jl[k]; /* first row to be added to k_th row  */
1925 
1926     while (i < mbs){
1927       nexti = jl[i]; /* next row to be added to k_th row */
1928 
1929       /* compute multiplier */
1930       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1931 
1932       /* uik = -inv(Di)*U_bar(i,k) */
1933       d = ba + i*25;
1934       u    = ba + ili*25;
1935 
1936       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1937       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1938       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1939       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1940       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1941 
1942       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1943       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1944       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1945       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1946       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1947 
1948       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1949       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1950       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1951       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1952       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1953 
1954       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1955       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1956       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1957       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1958       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1959 
1960       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1961       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1962       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1963       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1964       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1965 
1966 
1967       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1968       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1969       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1970       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1971       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1972       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1973 
1974       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1975       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1976       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1977       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1978       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1979 
1980       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1981       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1982       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1983       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1984       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1985 
1986       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1987       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1988       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1989       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1990       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1991 
1992       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1993       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1994       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1995       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1996       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1997 
1998       /* update -U(i,k) */
1999       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
2000 
2001       /* add multiple of row i to k-th row ... */
2002       jmin = ili + 1; jmax = bi[i+1];
2003       if (jmin < jmax){
2004         for (j=jmin; j<jmax; j++) {
2005           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2006           rtmp_ptr = rtmp + bj[j]*25;
2007           u = ba + j*25;
2008           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
2009           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
2010           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
2011           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
2012           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
2013 
2014           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
2015           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
2016           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
2017           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
2018           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
2019 
2020           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
2021           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
2022           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
2023           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
2024           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
2025 
2026           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
2027           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
2028           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
2029           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
2030           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
2031 
2032           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
2033           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
2034           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
2035           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
2036           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
2037         }
2038 
2039         /* ... add i to row list for next nonzero entry */
2040         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2041         j     = bj[jmin];
2042         jl[i] = jl[j]; jl[j] = i; /* update jl */
2043       }
2044       i = nexti;
2045     }
2046 
2047     /* save nonzero entries in k-th row of U ... */
2048 
2049     /* invert diagonal block */
2050     d = ba+k*25;
2051     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
2052     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
2053 
2054     jmin = bi[k]; jmax = bi[k+1];
2055     if (jmin < jmax) {
2056       for (j=jmin; j<jmax; j++){
2057          vj = bj[j];           /* block col. index of U */
2058          u   = ba + j*25;
2059          rtmp_ptr = rtmp + vj*25;
2060          for (k1=0; k1<25; k1++){
2061            *u++        = *rtmp_ptr;
2062            *rtmp_ptr++ = 0.0;
2063          }
2064       }
2065 
2066       /* ... add k to row list for first nonzero entry in k-th row */
2067       il[k] = jmin;
2068       i     = bj[jmin];
2069       jl[k] = jl[i]; jl[i] = k;
2070     }
2071   }
2072 
2073   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2074   ierr = PetscFree(il);CHKERRQ(ierr);
2075   ierr = PetscFree(dk);CHKERRQ(ierr);
2076 
2077   C->factor    = FACTOR_CHOLESKY;
2078   C->assembled = PETSC_TRUE;
2079   C->preallocated = PETSC_TRUE;
2080   PetscLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
2081   PetscFunctionReturn(0);
2082 }
2083 
2084 /*
2085       Version for when blocks are 4 by 4 Using natural ordering
2086 */
2087 #undef __FUNC__
2088 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering"
2089 int MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B)
2090 {
2091   Mat                C = *B;
2092   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2093   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2094   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2095   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2096   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2097 
2098   PetscFunctionBegin;
2099 
2100   /* initialization */
2101 ierr = PetscMalloc(16*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
2102   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2103 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
2104   jl = il + mbs;
2105   for (i=0; i<mbs; i++) {
2106     jl[i] = mbs; il[0] = 0;
2107   }
2108 ierr = PetscMalloc(32*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
2109   uik   = dk + 16;
2110 
2111   ai = a->i; aj = a->j; aa = a->a;
2112 
2113   /* for each row k */
2114   for (k = 0; k<mbs; k++){
2115 
2116     /*initialize k-th row with elements nonzero in row k of A */
2117     jmin = ai[k]; jmax = ai[k+1];
2118     if (jmin < jmax) {
2119       ap = aa + jmin*16;
2120       for (j = jmin; j < jmax; j++){
2121         vj = aj[j];         /* block col. index */
2122         rtmp_ptr = rtmp + vj*16;
2123         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
2124       }
2125     }
2126 
2127     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2128     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
2129     i = jl[k]; /* first row to be added to k_th row  */
2130 
2131     while (i < mbs){
2132       nexti = jl[i]; /* next row to be added to k_th row */
2133 
2134       /* compute multiplier */
2135       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2136 
2137       /* uik = -inv(Di)*U_bar(i,k) */
2138       diag = ba + i*16;
2139       u    = ba + ili*16;
2140 
2141       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
2142       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
2143       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
2144       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
2145 
2146       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
2147       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
2148       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
2149       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2150 
2151       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2152       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2153       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2154       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2155 
2156       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2157       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2158       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2159       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2160 
2161       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2162       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2163       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2164       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2165       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2166 
2167       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2168       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2169       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2170       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2171 
2172       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2173       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2174       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2175       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2176 
2177       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2178       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2179       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2180       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2181 
2182       /* update -U(i,k) */
2183       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2184 
2185       /* add multiple of row i to k-th row ... */
2186       jmin = ili + 1; jmax = bi[i+1];
2187       if (jmin < jmax){
2188         for (j=jmin; j<jmax; j++) {
2189           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2190           rtmp_ptr = rtmp + bj[j]*16;
2191           u = ba + j*16;
2192           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2193           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2194           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2195           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2196 
2197           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2198           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2199           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2200           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2201 
2202           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2203           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2204           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2205           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2206 
2207           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2208           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2209           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2210           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2211         }
2212 
2213         /* ... add i to row list for next nonzero entry */
2214         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2215         j     = bj[jmin];
2216         jl[i] = jl[j]; jl[j] = i; /* update jl */
2217       }
2218       i = nexti;
2219     }
2220 
2221     /* save nonzero entries in k-th row of U ... */
2222 
2223     /* invert diagonal block */
2224     diag = ba+k*16;
2225     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2226     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2227 
2228     jmin = bi[k]; jmax = bi[k+1];
2229     if (jmin < jmax) {
2230       for (j=jmin; j<jmax; j++){
2231          vj = bj[j];           /* block col. index of U */
2232          u   = ba + j*16;
2233          rtmp_ptr = rtmp + vj*16;
2234          for (k1=0; k1<16; k1++){
2235            *u++        = *rtmp_ptr;
2236            *rtmp_ptr++ = 0.0;
2237          }
2238       }
2239 
2240       /* ... add k to row list for first nonzero entry in k-th row */
2241       il[k] = jmin;
2242       i     = bj[jmin];
2243       jl[k] = jl[i]; jl[i] = k;
2244     }
2245   }
2246 
2247   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2248   ierr = PetscFree(il);CHKERRQ(ierr);
2249   ierr = PetscFree(dk);CHKERRQ(ierr);
2250 
2251   C->factor    = FACTOR_CHOLESKY;
2252   C->assembled = PETSC_TRUE;
2253   C->preallocated = PETSC_TRUE;
2254   PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2255   PetscFunctionReturn(0);
2256 }
2257 
2258 /* Version for when blocks are 4 by 4  */
2259 #undef __FUNC__
2260 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4"
2261 int MatCholeskyFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B)
2262 {
2263   Mat                C = *B;
2264   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2265   IS                 perm = b->row;
2266   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2267   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2268   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2269   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2270 
2271   PetscFunctionBegin;
2272   /* initialization */
2273 ierr = PetscMalloc(16*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
2274   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2275 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
2276   jl = il + mbs;
2277   for (i=0; i<mbs; i++) {
2278     jl[i] = mbs; il[0] = 0;
2279   }
2280 ierr = PetscMalloc(32*sizeof(MatScalar),&(  dk    ));CHKERRQ(ierr);
2281   uik   = dk + 16;
2282   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2283 
2284   /* check permutation */
2285   if (!a->permute){
2286     ai = a->i; aj = a->j; aa = a->a;
2287   } else {
2288     ai = a->inew; aj = a->jnew;
2289     aa = (MatScalar*)PetscMalloc(16*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2290     ierr = PetscMemcpy(aa,a->a,16*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2291     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
2292     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2293 
2294     for (i=0; i<mbs; i++){
2295       jmin = ai[i]; jmax = ai[i+1];
2296       for (j=jmin; j<jmax; j++){
2297         while (a2anew[j] != j){
2298           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2299           for (k1=0; k1<16; k1++){
2300             dk[k1]       = aa[k*16+k1];
2301             aa[k*16+k1] = aa[j*16+k1];
2302             aa[j*16+k1] = dk[k1];
2303           }
2304         }
2305         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2306         if (i > aj[j]){
2307           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2308           ap = aa + j*16;                     /* ptr to the beginning of j-th block of aa */
2309           for (k=0; k<16; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2310           for (k=0; k<4; k++){               /* j-th block of aa <- dk^T */
2311             for (k1=0; k1<4; k1++) *ap++ = dk[k + 4*k1];
2312           }
2313         }
2314       }
2315     }
2316     ierr = PetscFree(a2anew);CHKERRA(ierr);
2317   }
2318 
2319   /* for each row k */
2320   for (k = 0; k<mbs; k++){
2321 
2322     /*initialize k-th row with elements nonzero in row perm(k) of A */
2323     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2324     if (jmin < jmax) {
2325       ap = aa + jmin*16;
2326       for (j = jmin; j < jmax; j++){
2327         vj = perm_ptr[aj[j]];         /* block col. index */
2328         rtmp_ptr = rtmp + vj*16;
2329         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
2330       }
2331     }
2332 
2333     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2334     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
2335     i = jl[k]; /* first row to be added to k_th row  */
2336 
2337     while (i < mbs){
2338       nexti = jl[i]; /* next row to be added to k_th row */
2339 
2340       /* compute multiplier */
2341       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2342 
2343       /* uik = -inv(Di)*U_bar(i,k) */
2344       diag = ba + i*16;
2345       u    = ba + ili*16;
2346 
2347       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
2348       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
2349       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
2350       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
2351 
2352       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
2353       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
2354       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
2355       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2356 
2357       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2358       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2359       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2360       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2361 
2362       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2363       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2364       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2365       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2366 
2367       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2368       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2369       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2370       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2371       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2372 
2373       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2374       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2375       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2376       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2377 
2378       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2379       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2380       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2381       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2382 
2383       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2384       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2385       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2386       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2387 
2388       /* update -U(i,k) */
2389       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2390 
2391       /* add multiple of row i to k-th row ... */
2392       jmin = ili + 1; jmax = bi[i+1];
2393       if (jmin < jmax){
2394         for (j=jmin; j<jmax; j++) {
2395           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2396           rtmp_ptr = rtmp + bj[j]*16;
2397           u = ba + j*16;
2398           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2399           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2400           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2401           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2402 
2403           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2404           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2405           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2406           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2407 
2408           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2409           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2410           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2411           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2412 
2413           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2414           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2415           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2416           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2417         }
2418 
2419         /* ... add i to row list for next nonzero entry */
2420         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2421         j     = bj[jmin];
2422         jl[i] = jl[j]; jl[j] = i; /* update jl */
2423       }
2424       i = nexti;
2425     }
2426 
2427     /* save nonzero entries in k-th row of U ... */
2428 
2429     /* invert diagonal block */
2430     diag = ba+k*16;
2431     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2432     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2433 
2434     jmin = bi[k]; jmax = bi[k+1];
2435     if (jmin < jmax) {
2436       for (j=jmin; j<jmax; j++){
2437          vj = bj[j];           /* block col. index of U */
2438          u   = ba + j*16;
2439          rtmp_ptr = rtmp + vj*16;
2440          for (k1=0; k1<16; k1++){
2441            *u++        = *rtmp_ptr;
2442            *rtmp_ptr++ = 0.0;
2443          }
2444       }
2445 
2446       /* ... add k to row list for first nonzero entry in k-th row */
2447       il[k] = jmin;
2448       i     = bj[jmin];
2449       jl[k] = jl[i]; jl[i] = k;
2450     }
2451   }
2452 
2453   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2454   ierr = PetscFree(il);CHKERRQ(ierr);
2455   ierr = PetscFree(dk);CHKERRQ(ierr);
2456   if (a->permute){
2457     ierr = PetscFree(aa);CHKERRQ(ierr);
2458   }
2459 
2460   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2461   C->factor    = FACTOR_CHOLESKY;
2462   C->assembled = PETSC_TRUE;
2463   C->preallocated = PETSC_TRUE;
2464   PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2465   PetscFunctionReturn(0);
2466 }
2467 
2468 /* Version for when blocks are 3 by 3  */
2469 #undef __FUNC__
2470 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3"
2471 int MatCholeskyFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B)
2472 {
2473   Mat                C = *B;
2474   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2475   IS                 perm = b->row;
2476   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2477   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2478   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2479   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2480 
2481   PetscFunctionBegin;
2482 
2483   /* initialization */
2484 ierr = PetscMalloc(9*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
2485   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2486 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
2487   jl = il + mbs;
2488   for (i=0; i<mbs; i++) {
2489     jl[i] = mbs; il[0] = 0;
2490   }
2491 ierr = PetscMalloc(18*sizeof(MatScalar),&(  dk  ));CHKERRQ(ierr);
2492   uik = dk + 9;
2493   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2494 
2495   /* check permutation */
2496   if (!a->permute){
2497     ai = a->i; aj = a->j; aa = a->a;
2498   } else {
2499     ai = a->inew; aj = a->jnew;
2500     aa = (MatScalar*)PetscMalloc(9*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2501     ierr = PetscMemcpy(aa,a->a,9*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2502     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
2503     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2504 
2505     for (i=0; i<mbs; i++){
2506       jmin = ai[i]; jmax = ai[i+1];
2507       for (j=jmin; j<jmax; j++){
2508         while (a2anew[j] != j){
2509           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2510           for (k1=0; k1<9; k1++){
2511             dk[k1]       = aa[k*9+k1];
2512             aa[k*9+k1] = aa[j*9+k1];
2513             aa[j*9+k1] = dk[k1];
2514           }
2515         }
2516         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2517         if (i > aj[j]){
2518           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2519           ap = aa + j*9;                     /* ptr to the beginning of j-th block of aa */
2520           for (k=0; k<9; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2521           for (k=0; k<3; k++){               /* j-th block of aa <- dk^T */
2522             for (k1=0; k1<3; k1++) *ap++ = dk[k + 3*k1];
2523           }
2524         }
2525       }
2526     }
2527     ierr = PetscFree(a2anew);CHKERRA(ierr);
2528   }
2529 
2530   /* for each row k */
2531   for (k = 0; k<mbs; k++){
2532 
2533     /*initialize k-th row with elements nonzero in row perm(k) of A */
2534     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2535     if (jmin < jmax) {
2536       ap = aa + jmin*9;
2537       for (j = jmin; j < jmax; j++){
2538         vj = perm_ptr[aj[j]];         /* block col. index */
2539         rtmp_ptr = rtmp + vj*9;
2540         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2541       }
2542     }
2543 
2544     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2545     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2546     i = jl[k]; /* first row to be added to k_th row  */
2547 
2548     while (i < mbs){
2549       nexti = jl[i]; /* next row to be added to k_th row */
2550 
2551       /* compute multiplier */
2552       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2553 
2554       /* uik = -inv(Di)*U_bar(i,k) */
2555       diag = ba + i*9;
2556       u    = ba + ili*9;
2557 
2558       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2559       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2560       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2561 
2562       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2563       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2564       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2565 
2566       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2567       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2568       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2569 
2570       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2571       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2572       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2573       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2574 
2575       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2576       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2577       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2578 
2579       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2580       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2581       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2582 
2583       /* update -U(i,k) */
2584       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2585 
2586       /* add multiple of row i to k-th row ... */
2587       jmin = ili + 1; jmax = bi[i+1];
2588       if (jmin < jmax){
2589         for (j=jmin; j<jmax; j++) {
2590           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2591           rtmp_ptr = rtmp + bj[j]*9;
2592           u = ba + j*9;
2593           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2594           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2595           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2596 
2597           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2598           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2599           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2600 
2601           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2602           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2603           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2604         }
2605 
2606         /* ... add i to row list for next nonzero entry */
2607         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2608         j     = bj[jmin];
2609         jl[i] = jl[j]; jl[j] = i; /* update jl */
2610       }
2611       i = nexti;
2612     }
2613 
2614     /* save nonzero entries in k-th row of U ... */
2615 
2616     /* invert diagonal block */
2617     diag = ba+k*9;
2618     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2619     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2620 
2621     jmin = bi[k]; jmax = bi[k+1];
2622     if (jmin < jmax) {
2623       for (j=jmin; j<jmax; j++){
2624          vj = bj[j];           /* block col. index of U */
2625          u   = ba + j*9;
2626          rtmp_ptr = rtmp + vj*9;
2627          for (k1=0; k1<9; k1++){
2628            *u++        = *rtmp_ptr;
2629            *rtmp_ptr++ = 0.0;
2630          }
2631       }
2632 
2633       /* ... add k to row list for first nonzero entry in k-th row */
2634       il[k] = jmin;
2635       i     = bj[jmin];
2636       jl[k] = jl[i]; jl[i] = k;
2637     }
2638   }
2639 
2640   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2641   ierr = PetscFree(il);CHKERRQ(ierr);
2642   ierr = PetscFree(dk);CHKERRQ(ierr);
2643   if (a->permute){
2644     ierr = PetscFree(aa);CHKERRQ(ierr);
2645   }
2646 
2647   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2648   C->factor    = FACTOR_CHOLESKY;
2649   C->assembled = PETSC_TRUE;
2650   C->preallocated = PETSC_TRUE;
2651   PetscLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2652   PetscFunctionReturn(0);
2653 }
2654 
2655 /*
2656       Version for when blocks are 3 by 3 Using natural ordering
2657 */
2658 #undef __FUNC__
2659 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering"
2660 int MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B)
2661 {
2662   Mat                C = *B;
2663   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2664   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2665   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2666   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2667   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2668 
2669   PetscFunctionBegin;
2670 
2671   /* initialization */
2672 ierr = PetscMalloc(9*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
2673   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2674 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
2675   jl = il + mbs;
2676   for (i=0; i<mbs; i++) {
2677     jl[i] = mbs; il[0] = 0;
2678   }
2679 ierr = PetscMalloc(18*sizeof(MatScalar),&(  dk  ));CHKERRQ(ierr);
2680   uik = dk + 9;
2681 
2682   ai = a->i; aj = a->j; aa = a->a;
2683 
2684   /* for each row k */
2685   for (k = 0; k<mbs; k++){
2686 
2687     /*initialize k-th row with elements nonzero in row k of A */
2688     jmin = ai[k]; jmax = ai[k+1];
2689     if (jmin < jmax) {
2690       ap = aa + jmin*9;
2691       for (j = jmin; j < jmax; j++){
2692         vj = aj[j];         /* block col. index */
2693         rtmp_ptr = rtmp + vj*9;
2694         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2695       }
2696     }
2697 
2698     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2699     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2700     i = jl[k]; /* first row to be added to k_th row  */
2701 
2702     while (i < mbs){
2703       nexti = jl[i]; /* next row to be added to k_th row */
2704 
2705       /* compute multiplier */
2706       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2707 
2708       /* uik = -inv(Di)*U_bar(i,k) */
2709       diag = ba + i*9;
2710       u    = ba + ili*9;
2711 
2712       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2713       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2714       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2715 
2716       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2717       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2718       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2719 
2720       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2721       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2722       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2723 
2724       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2725       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2726       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2727       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2728 
2729       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2730       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2731       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2732 
2733       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2734       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2735       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2736 
2737       /* update -U(i,k) */
2738       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2739 
2740       /* add multiple of row i to k-th row ... */
2741       jmin = ili + 1; jmax = bi[i+1];
2742       if (jmin < jmax){
2743         for (j=jmin; j<jmax; j++) {
2744           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2745           rtmp_ptr = rtmp + bj[j]*9;
2746           u = ba + j*9;
2747           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2748           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2749           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2750 
2751           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2752           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2753           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2754 
2755           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2756           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2757           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2758         }
2759 
2760         /* ... add i to row list for next nonzero entry */
2761         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2762         j     = bj[jmin];
2763         jl[i] = jl[j]; jl[j] = i; /* update jl */
2764       }
2765       i = nexti;
2766     }
2767 
2768     /* save nonzero entries in k-th row of U ... */
2769 
2770     /* invert diagonal block */
2771     diag = ba+k*9;
2772     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2773     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2774 
2775     jmin = bi[k]; jmax = bi[k+1];
2776     if (jmin < jmax) {
2777       for (j=jmin; j<jmax; j++){
2778          vj = bj[j];           /* block col. index of U */
2779          u   = ba + j*9;
2780          rtmp_ptr = rtmp + vj*9;
2781          for (k1=0; k1<9; k1++){
2782            *u++        = *rtmp_ptr;
2783            *rtmp_ptr++ = 0.0;
2784          }
2785       }
2786 
2787       /* ... add k to row list for first nonzero entry in k-th row */
2788       il[k] = jmin;
2789       i     = bj[jmin];
2790       jl[k] = jl[i]; jl[i] = k;
2791     }
2792   }
2793 
2794   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2795   ierr = PetscFree(il);CHKERRQ(ierr);
2796   ierr = PetscFree(dk);CHKERRQ(ierr);
2797 
2798   C->factor    = FACTOR_CHOLESKY;
2799   C->assembled = PETSC_TRUE;
2800   C->preallocated = PETSC_TRUE;
2801   PetscLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2802   PetscFunctionReturn(0);
2803 }
2804 
2805 /*
2806     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
2807     Version for blocks 2 by 2.
2808 */
2809 #undef __FUNC__
2810 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
2811 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
2812 {
2813   Mat                C = *B;
2814   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2815   IS                 perm = b->row;
2816   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2817   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2818   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2819   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2820 
2821   PetscFunctionBegin;
2822 
2823   /* initialization */
2824   /* il and jl record the first nonzero element in each row of the accessing
2825      window U(0:k, k:mbs-1).
2826      jl:    list of rows to be added to uneliminated rows
2827             i>= k: jl(i) is the first row to be added to row i
2828             i<  k: jl(i) is the row following row i in some list of rows
2829             jl(i) = mbs indicates the end of a list
2830      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2831             row i of U */
2832 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
2833   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2834 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
2835   jl = il + mbs;
2836   for (i=0; i<mbs; i++) {
2837     jl[i] = mbs; il[0] = 0;
2838   }
2839 ierr = PetscMalloc(8*sizeof(MatScalar),&(  dk  ));CHKERRQ(ierr);
2840   uik = dk + 4;
2841   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2842 
2843   /* check permutation */
2844   if (!a->permute){
2845     ai = a->i; aj = a->j; aa = a->a;
2846   } else {
2847     ai = a->inew; aj = a->jnew;
2848     aa = (MatScalar*)PetscMalloc(4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2849     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2850     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
2851     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2852 
2853     for (i=0; i<mbs; i++){
2854       jmin = ai[i]; jmax = ai[i+1];
2855       for (j=jmin; j<jmax; j++){
2856         while (a2anew[j] != j){
2857           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2858           for (k1=0; k1<4; k1++){
2859             dk[k1]       = aa[k*4+k1];
2860             aa[k*4+k1] = aa[j*4+k1];
2861             aa[j*4+k1] = dk[k1];
2862           }
2863         }
2864         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2865         if (i > aj[j]){
2866           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2867           ap = aa + j*4;     /* ptr to the beginning of the block */
2868           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
2869           ap[1] = ap[2];
2870           ap[2] = dk[1];
2871         }
2872       }
2873     }
2874     ierr = PetscFree(a2anew);CHKERRA(ierr);
2875   }
2876 
2877   /* for each row k */
2878   for (k = 0; k<mbs; k++){
2879 
2880     /*initialize k-th row with elements nonzero in row perm(k) of A */
2881     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2882     if (jmin < jmax) {
2883       ap = aa + jmin*4;
2884       for (j = jmin; j < jmax; j++){
2885         vj = perm_ptr[aj[j]];         /* block col. index */
2886         rtmp_ptr = rtmp + vj*4;
2887         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
2888       }
2889     }
2890 
2891     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2892     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
2893     i = jl[k]; /* first row to be added to k_th row  */
2894 
2895     while (i < mbs){
2896       nexti = jl[i]; /* next row to be added to k_th row */
2897 
2898       /* compute multiplier */
2899       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2900 
2901       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
2902       diag = ba + i*4;
2903       u    = ba + ili*4;
2904       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
2905       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
2906       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
2907       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
2908 
2909       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
2910       dk[0] += uik[0]*u[0] + uik[1]*u[1];
2911       dk[1] += uik[2]*u[0] + uik[3]*u[1];
2912       dk[2] += uik[0]*u[2] + uik[1]*u[3];
2913       dk[3] += uik[2]*u[2] + uik[3]*u[3];
2914 
2915       /* update -U(i,k): ba[ili] = uik */
2916       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
2917 
2918       /* add multiple of row i to k-th row ... */
2919       jmin = ili + 1; jmax = bi[i+1];
2920       if (jmin < jmax){
2921         for (j=jmin; j<jmax; j++) {
2922           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
2923           rtmp_ptr = rtmp + bj[j]*4;
2924           u = ba + j*4;
2925           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
2926           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
2927           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
2928           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
2929         }
2930 
2931         /* ... add i to row list for next nonzero entry */
2932         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2933         j     = bj[jmin];
2934         jl[i] = jl[j]; jl[j] = i; /* update jl */
2935       }
2936       i = nexti;
2937     }
2938 
2939     /* save nonzero entries in k-th row of U ... */
2940 
2941     /* invert diagonal block */
2942     diag = ba+k*4;
2943     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
2944     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
2945 
2946     jmin = bi[k]; jmax = bi[k+1];
2947     if (jmin < jmax) {
2948       for (j=jmin; j<jmax; j++){
2949          vj = bj[j];           /* block col. index of U */
2950          u   = ba + j*4;
2951          rtmp_ptr = rtmp + vj*4;
2952          for (k1=0; k1<4; k1++){
2953            *u++        = *rtmp_ptr;
2954            *rtmp_ptr++ = 0.0;
2955          }
2956       }
2957 
2958       /* ... add k to row list for first nonzero entry in k-th row */
2959       il[k] = jmin;
2960       i     = bj[jmin];
2961       jl[k] = jl[i]; jl[i] = k;
2962     }
2963   }
2964 
2965   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2966   ierr = PetscFree(il);CHKERRQ(ierr);
2967   ierr = PetscFree(dk);CHKERRQ(ierr);
2968   if (a->permute) {
2969     ierr = PetscFree(aa);CHKERRQ(ierr);
2970   }
2971   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2972   C->factor    = FACTOR_CHOLESKY;
2973   C->assembled = PETSC_TRUE;
2974   C->preallocated = PETSC_TRUE;
2975   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2976   PetscFunctionReturn(0);
2977 }
2978 
2979 /*
2980       Version for when blocks are 2 by 2 Using natural ordering
2981 */
2982 #undef __FUNC__
2983 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
2984 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
2985 {
2986   Mat                C = *B;
2987   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2988   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2989   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2990   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2991   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2992 
2993   PetscFunctionBegin;
2994 
2995   /* initialization */
2996   /* il and jl record the first nonzero element in each row of the accessing
2997      window U(0:k, k:mbs-1).
2998      jl:    list of rows to be added to uneliminated rows
2999             i>= k: jl(i) is the first row to be added to row i
3000             i<  k: jl(i) is the row following row i in some list of rows
3001             jl(i) = mbs indicates the end of a list
3002      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3003             row i of U */
3004 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
3005   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
3006 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
3007   jl = il + mbs;
3008   for (i=0; i<mbs; i++) {
3009     jl[i] = mbs; il[0] = 0;
3010   }
3011 ierr = PetscMalloc(8*sizeof(MatScalar),&(  dk  ));CHKERRQ(ierr);
3012   uik = dk + 4;
3013 
3014   ai = a->i; aj = a->j; aa = a->a;
3015 
3016   /* for each row k */
3017   for (k = 0; k<mbs; k++){
3018 
3019     /*initialize k-th row with elements nonzero in row k of A */
3020     jmin = ai[k]; jmax = ai[k+1];
3021     if (jmin < jmax) {
3022       ap = aa + jmin*4;
3023       for (j = jmin; j < jmax; j++){
3024         vj = aj[j];         /* block col. index */
3025         rtmp_ptr = rtmp + vj*4;
3026         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
3027       }
3028     }
3029 
3030     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3031     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
3032     i = jl[k]; /* first row to be added to k_th row  */
3033 
3034     while (i < mbs){
3035       nexti = jl[i]; /* next row to be added to k_th row */
3036 
3037       /* compute multiplier */
3038       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3039 
3040       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
3041       diag = ba + i*4;
3042       u    = ba + ili*4;
3043       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
3044       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
3045       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
3046       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
3047 
3048       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
3049       dk[0] += uik[0]*u[0] + uik[1]*u[1];
3050       dk[1] += uik[2]*u[0] + uik[3]*u[1];
3051       dk[2] += uik[0]*u[2] + uik[1]*u[3];
3052       dk[3] += uik[2]*u[2] + uik[3]*u[3];
3053 
3054       /* update -U(i,k): ba[ili] = uik */
3055       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
3056 
3057       /* add multiple of row i to k-th row ... */
3058       jmin = ili + 1; jmax = bi[i+1];
3059       if (jmin < jmax){
3060         for (j=jmin; j<jmax; j++) {
3061           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
3062           rtmp_ptr = rtmp + bj[j]*4;
3063           u = ba + j*4;
3064           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
3065           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
3066           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
3067           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
3068         }
3069 
3070         /* ... add i to row list for next nonzero entry */
3071         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3072         j     = bj[jmin];
3073         jl[i] = jl[j]; jl[j] = i; /* update jl */
3074       }
3075       i = nexti;
3076     }
3077 
3078     /* save nonzero entries in k-th row of U ... */
3079 
3080     /* invert diagonal block */
3081     diag = ba+k*4;
3082     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
3083     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
3084 
3085     jmin = bi[k]; jmax = bi[k+1];
3086     if (jmin < jmax) {
3087       for (j=jmin; j<jmax; j++){
3088          vj = bj[j];           /* block col. index of U */
3089          u   = ba + j*4;
3090          rtmp_ptr = rtmp + vj*4;
3091          for (k1=0; k1<4; k1++){
3092            *u++        = *rtmp_ptr;
3093            *rtmp_ptr++ = 0.0;
3094          }
3095       }
3096 
3097       /* ... add k to row list for first nonzero entry in k-th row */
3098       il[k] = jmin;
3099       i     = bj[jmin];
3100       jl[k] = jl[i]; jl[i] = k;
3101     }
3102   }
3103 
3104   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3105   ierr = PetscFree(il);CHKERRQ(ierr);
3106   ierr = PetscFree(dk);CHKERRQ(ierr);
3107 
3108   C->factor    = FACTOR_CHOLESKY;
3109   C->assembled = PETSC_TRUE;
3110   C->preallocated = PETSC_TRUE;
3111   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
3112   PetscFunctionReturn(0);
3113 }
3114 
3115 /*
3116     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
3117     Version for blocks are 1 by 1.
3118 */
3119 #undef __FUNC__
3120 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
3121 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
3122 {
3123   Mat                C = *B;
3124   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
3125   IS                 ip = b->row;
3126   int                *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
3127   int                *ai,*aj,*r;
3128   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
3129   MatScalar          *rtmp;
3130   MatScalar          *ba = b->a,*aa,ak;
3131   MatScalar          dk,uikdi;
3132 
3133   PetscFunctionBegin;
3134   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
3135   if (!a->permute){
3136     ai = a->i; aj = a->j; aa = a->a;
3137   } else {
3138     ai = a->inew; aj = a->jnew;
3139     aa = (MatScalar*)PetscMalloc(ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
3140     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
3141     r   = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKERRQ(ierr);
3142     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
3143 
3144     jmin = ai[0]; jmax = ai[mbs];
3145     for (j=jmin; j<jmax; j++){
3146       while (r[j] != j){
3147         k = r[j]; r[j] = r[k]; r[k] = k;
3148         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
3149       }
3150     }
3151     ierr = PetscFree(r);CHKERRA(ierr);
3152   }
3153 
3154   /* initialization */
3155   /* il and jl record the first nonzero element in each row of the accessing
3156      window U(0:k, k:mbs-1).
3157      jl:    list of rows to be added to uneliminated rows
3158             i>= k: jl(i) is the first row to be added to row i
3159             i<  k: jl(i) is the row following row i in some list of rows
3160             jl(i) = mbs indicates the end of a list
3161      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3162             row i of U */
3163 ierr = PetscMalloc(mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
3164 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il ));CHKERRQ(ierr);
3165   jl = il + mbs;
3166   for (i=0; i<mbs; i++) {
3167     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
3168   }
3169 
3170   /* for each row k */
3171   for (k = 0; k<mbs; k++){
3172 
3173     /*initialize k-th row with elements nonzero in row perm(k) of A */
3174     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
3175     if (jmin < jmax) {
3176       for (j = jmin; j < jmax; j++){
3177         vj = rip[aj[j]];
3178         rtmp[vj] = aa[j];
3179       }
3180     }
3181 
3182     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3183     dk = rtmp[k];
3184     i = jl[k]; /* first row to be added to k_th row  */
3185 
3186     while (i < mbs){
3187       nexti = jl[i]; /* next row to be added to k_th row */
3188 
3189       /* compute multiplier, update D(k) and U(i,k) */
3190       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3191       uikdi = - ba[ili]*ba[i];
3192       dk += uikdi*ba[ili];
3193       ba[ili] = uikdi; /* -U(i,k) */
3194 
3195       /* add multiple of row i to k-th row ... */
3196       jmin = ili + 1; jmax = bi[i+1];
3197       if (jmin < jmax){
3198         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
3199         /* ... add i to row list for next nonzero entry */
3200         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3201         j     = bj[jmin];
3202         jl[i] = jl[j]; jl[j] = i; /* update jl */
3203       }
3204       i = nexti;
3205     }
3206 
3207     /* check for zero pivot and save diagoanl element */
3208     if (dk == 0.0){
3209       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
3210       /*
3211     }else if (PetscRealPart(dk) < 0){
3212       ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk);
3213       */
3214     }
3215 
3216     /* save nonzero entries in k-th row of U ... */
3217     ba[k] = 1.0/dk;
3218     jmin = bi[k]; jmax = bi[k+1];
3219     if (jmin < jmax) {
3220       for (j=jmin; j<jmax; j++){
3221          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
3222       }
3223       /* ... add k to row list for first nonzero entry in k-th row */
3224       il[k] = jmin;
3225       i     = bj[jmin];
3226       jl[k] = jl[i]; jl[i] = k;
3227     }
3228   }
3229 
3230   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3231   ierr = PetscFree(il);CHKERRQ(ierr);
3232   if (a->permute){
3233     ierr = PetscFree(aa);CHKERRQ(ierr);
3234   }
3235 
3236   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
3237   C->factor    = FACTOR_CHOLESKY;
3238   C->assembled = PETSC_TRUE;
3239   C->preallocated = PETSC_TRUE;
3240   PetscLogFlops(b->mbs);
3241   PetscFunctionReturn(0);
3242 }
3243 
3244 /*
3245   Version for when blocks are 1 by 1 Using natural ordering
3246 */
3247 #undef __FUNC__
3248 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
3249 int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
3250 {
3251   Mat                C = *B;
3252   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
3253   int                ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
3254   int                *ai,*aj;
3255   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
3256   MatScalar          *rtmp,*ba = b->a,*aa,dk,uikdi;
3257 
3258   PetscFunctionBegin;
3259 
3260   /* initialization */
3261   /* il and jl record the first nonzero element in each row of the accessing
3262      window U(0:k, k:mbs-1).
3263      jl:    list of rows to be added to uneliminated rows
3264             i>= k: jl(i) is the first row to be added to row i
3265             i<  k: jl(i) is the row following row i in some list of rows
3266             jl(i) = mbs indicates the end of a list
3267      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3268             row i of U */
3269 ierr = PetscMalloc(mbs*sizeof(MatScalar),&(  rtmp  ));CHKERRQ(ierr);
3270 ierr = PetscMalloc(2*mbs*sizeof(int),&(  il    ));CHKERRQ(ierr);
3271   jl    = il + mbs;
3272   for (i=0; i<mbs; i++) {
3273     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
3274   }
3275 
3276   ai = a->i; aj = a->j; aa = a->a;
3277 
3278   /* for each row k */
3279   for (k = 0; k<mbs; k++){
3280 
3281     /*initialize k-th row with elements nonzero in row perm(k) of A */
3282     jmin = ai[k]; jmax = ai[k+1];
3283     if (jmin < jmax) {
3284       for (j = jmin; j < jmax; j++){
3285         vj = aj[j];
3286         rtmp[vj] = aa[j];
3287       }
3288     }
3289 
3290     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3291     dk = rtmp[k];
3292     i = jl[k]; /* first row to be added to k_th row  */
3293 
3294     while (i < mbs){
3295       nexti = jl[i]; /* next row to be added to k_th row */
3296 
3297       /* compute multiplier, update D(k) and U(i,k) */
3298       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3299       uikdi = - ba[ili]*ba[i];
3300       dk += uikdi*ba[ili];
3301       ba[ili] = uikdi; /* -U(i,k) */
3302 
3303       /* add multiple of row i to k-th row ... */
3304       jmin = ili + 1; jmax = bi[i+1];
3305       if (jmin < jmax){
3306         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
3307         /* ... add i to row list for next nonzero entry */
3308         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3309         j     = bj[jmin];
3310         jl[i] = jl[j]; jl[j] = i; /* update jl */
3311       }
3312       i = nexti;
3313     }
3314 
3315     /* check for zero pivot and save diagoanl element */
3316     if (dk == 0.0){
3317       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
3318       /*
3319     }else if (PetscRealPart(dk) < 0){
3320       ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk);
3321       */
3322     }
3323 
3324     /* save nonzero entries in k-th row of U ... */
3325     ba[k] = 1.0/dk;
3326     jmin = bi[k]; jmax = bi[k+1];
3327     if (jmin < jmax) {
3328       for (j=jmin; j<jmax; j++){
3329          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
3330       }
3331       /* ... add k to row list for first nonzero entry in k-th row */
3332       il[k] = jmin;
3333       i     = bj[jmin];
3334       jl[k] = jl[i]; jl[i] = k;
3335     }
3336   }
3337 
3338   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3339   ierr = PetscFree(il);CHKERRQ(ierr);
3340 
3341   C->factor    = FACTOR_CHOLESKY;
3342   C->assembled = PETSC_TRUE;
3343   C->preallocated = PETSC_TRUE;
3344   PetscLogFlops(b->mbs);
3345   PetscFunctionReturn(0);
3346 }
3347 
3348 #undef __FUNC__
3349 #define __FUNC__ "MatCholeskyFactor_SeqSBAIJ"
3350 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f)
3351 {
3352   int ierr;
3353   Mat C;
3354 
3355   PetscFunctionBegin;
3356   ierr = MatCholeskyFactorSymbolic(A,perm,f,&C);CHKERRQ(ierr);
3357   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
3358   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
3359   PetscFunctionReturn(0);
3360 }
3361 
3362 
3363