xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 4669023fa3fe324d9ca9fb5cfb0591afc357609e)
1 /* Using Modified Sparse Row (MSR) storage.
2 See page 85, "Iterative Methods ..." by Saad. */
3 
4 /*$Id: sbaijfact.c,v 1.46 2000/11/05 20:01:21 hzhang Exp hzhang $*/
5 /*
6     Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
7 */
8 #include "sbaij.h"
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/ilu.h"
12 #include "include/petscis.h"
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
16 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B)
17 {
18   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
19   int         *rip,ierr,i,mbs = a->mbs,*ai,*aj;
20   int         *jutmp,bs = a->bs,bs2=a->bs2;
21   int         m,nzi,realloc = 0;
22   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
23   /* PetscTruth  ident; */
24 
25   PetscFunctionBegin;
26   PetscValidHeaderSpecific(perm,IS_COOKIE);
27   if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,"matrix must be square");
28 
29   /* check whether perm is the identity mapping */
30   /*
31   ierr = ISView(perm, VIEWER_STDOUT_SELF);CHKERRA(ierr);
32   ierr = ISIdentity(perm,&ident);CHKERRQ(ierr);
33   printf("ident = %d\n", ident);
34   */
35   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
36   for (i=0; i<mbs; i++){
37     if (rip[i] != i){
38       a->permute = PETSC_TRUE;
39       /* printf("non-trivial perm\n"); */
40       break;
41     }
42   }
43 
44   if (!a->permute){ /* without permutation */
45     ai = a->i; aj = a->j;
46   } else {          /* non-trivial permutation */
47     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRA(ierr);
48     ai = a->inew; aj = a->jnew;
49   }
50 
51   /* initialization */
52   /* Don't know how many column pointers are needed so estimate.
53      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
54   iu   = (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(iu);
55   umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
56   ju   = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju);
57   iu[0] = mbs+1;
58   juptr = mbs;
59   jl =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
60   q  =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q);
61   for (i=0; i<mbs; i++){
62     jl[i] = mbs; q[i] = 0;
63   }
64 
65   /* for each row k */
66   for (k=0; k<mbs; k++){
67     nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
68     q[k] = mbs;
69     /* initialize nonzero structure of k-th row to row rip[k] of A */
70     jmin = ai[rip[k]];
71     jmax = ai[rip[k]+1];
72     for (j=jmin; j<jmax; j++){
73       vj = rip[aj[j]]; /* col. value */
74       if(vj > k){
75         qm = k;
76         do {
77           m  = qm; qm = q[m];
78         } while(qm < vj);
79         if (qm == vj) {
80           printf(" error: duplicate entry in A\n"); break;
81         }
82         nzk++;
83         q[m] = vj;
84         q[vj] = qm;
85       } /* if(vj > k) */
86     } /* for (j=jmin; j<jmax; j++) */
87 
88     /* modify nonzero structure of k-th row by computing fill-in
89        for each row i to be merged in */
90     i = k;
91     i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
92     /* printf(" next pivot row i=%d\n",i); */
93     while (i < mbs){
94       /* merge row i into k-th row */
95       nzi = iu[i+1] - (iu[i]+1);
96       jmin = iu[i] + 1; jmax = iu[i] + nzi;
97       qm = k;
98       for (j=jmin; j<jmax+1; j++){
99         vj = ju[j];
100         do {
101           m = qm; qm = q[m];
102         } while (qm < vj);
103         if (qm != vj){
104          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
105         }
106       }
107       i = jl[i]; /* next pivot row */
108     }
109 
110     /* add k to row list for first nonzero element in k-th row */
111     if (nzk > 0){
112       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
113       jl[k] = jl[i]; jl[i] = k;
114     }
115     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
116 
117     /* allocate more space to ju if needed */
118     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
119       /* estimate how much additional space we will need */
120       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
121       /* just double the memory each time */
122       maxadd = umax;
123       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
124       umax += maxadd;
125 
126       /* allocate a longer ju */
127       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
128       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
129       ierr  = PetscFree(ju);CHKERRQ(ierr);
130       ju    = jutmp;
131       realloc++; /* count how many times we realloc */
132     }
133 
134     /* save nonzero structure of k-th row in ju */
135     i=k;
136     jumin = juptr + 1; juptr += nzk;
137     for (j=jumin; j<juptr+1; j++){
138       i=q[i];
139       ju[j]=i;
140     }
141   }
142 
143   if (ai[mbs] != 0) {
144     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
145     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
146     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
147     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
148     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
149   } else {
150      PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
151   }
152 
153   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
154   ierr = PetscFree(q);CHKERRQ(ierr);
155   ierr = PetscFree(jl);CHKERRQ(ierr);
156 
157   /* put together the new matrix */
158   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
159   /* PLogObjectParent(*B,iperm); */
160   b = (Mat_SeqSBAIJ*)(*B)->data;
161   ierr = PetscFree(b->imax);CHKERRQ(ierr);
162   b->singlemalloc = PETSC_FALSE;
163   /* the next line frees the default space generated by the Create() */
164   ierr = PetscFree(b->a);CHKERRQ(ierr);
165   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
166   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a);
167   b->j          = ju;
168   b->i          = iu;
169   b->diag       = 0;
170   b->ilen       = 0;
171   b->imax       = 0;
172   b->row        = perm;
173   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
174   b->icol       = perm;
175   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
176   b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
177   /* In b structure:  Free imax, ilen, old a, old j.
178      Allocate idnew, solve_work, new a, new j */
179   PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
180   b->s_maxnz = b->s_nz = iu[mbs];
181 
182   (*B)->factor                 = FACTOR_CHOLESKY;
183   (*B)->info.factor_mallocs    = realloc;
184   (*B)->info.fill_ratio_given  = f;
185   if (ai[mbs] != 0) {
186     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
187   } else {
188     (*B)->info.fill_ratio_needed = 0.0;
189   }
190 
191   PetscFunctionReturn(0);
192 }
193 
194 #undef __FUNC__
195 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
196 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
197 {
198   Mat                C = *B;
199   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
200   IS                 perm = b->row;
201   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
202   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
203   int                bs=a->bs,bs2 = a->bs2;
204   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
205   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
206   MatScalar          *W,*work;
207   int                *pivots;
208 
209   PetscFunctionBegin;
210 
211   /* initialization */
212   rtmp  = (MatScalar*)PetscMalloc(bs2*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
213   ierr  = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
214   il    = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
215   jl    = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
216   for (i=0; i<mbs; i++) {
217     jl[i] = mbs; il[0] = 0;
218   }
219   dk    = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(dk);
220   uik   = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(uik);
221   W     = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(W);
222   work  = (MatScalar*)PetscMalloc(bs*sizeof(MatScalar));CHKPTRQ(work);
223   pivots= (int*)PetscMalloc(bs*sizeof(int));CHKPTRQ(pivots);
224 
225   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
226 
227   /* check permutation */
228   if (!a->permute){
229     ai = a->i; aj = a->j; aa = a->a;
230   } else {
231     ai = a->inew; aj = a->jnew;
232     aa = (MatScalar*)PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
233     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
234     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
235     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
236 
237     for (i=0; i<mbs; i++){
238       jmin = ai[i]; jmax = ai[i+1];
239       for (j=jmin; j<jmax; j++){
240         while (a2anew[j] != j){
241           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
242           for (k1=0; k1<bs2; k1++){
243             dk[k1]       = aa[k*bs2+k1];
244             aa[k*bs2+k1] = aa[j*bs2+k1];
245             aa[j*bs2+k1] = dk[k1];
246           }
247         }
248         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
249         if (i > aj[j]){
250           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
251           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
252           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
253           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
254             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
255           }
256         }
257       }
258     }
259     ierr = PetscFree(a2anew);CHKERRA(ierr);
260   }
261 
262   /* for each row k */
263   for (k = 0; k<mbs; k++){
264 
265     /*initialize k-th row with elements nonzero in row perm(k) of A */
266     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
267     if (jmin < jmax) {
268       ap = aa + jmin*bs2;
269       for (j = jmin; j < jmax; j++){
270         vj = perm_ptr[aj[j]];         /* block col. index */
271         rtmp_ptr = rtmp + vj*bs2;
272         for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
273       }
274     }
275 
276     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
277     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
278     i = jl[k]; /* first row to be added to k_th row  */
279 
280     while (i < mbs){
281       nexti = jl[i]; /* next row to be added to k_th row */
282 
283       /* compute multiplier */
284       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
285 
286       /* uik = -inv(Di)*U_bar(i,k) */
287       diag = ba + i*bs2;
288       u    = ba + ili*bs2;
289       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
290       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
291 
292       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
293       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
294 
295       /* update -U(i,k) */
296       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
297 
298       /* add multiple of row i to k-th row ... */
299       jmin = ili + 1; jmax = bi[i+1];
300       if (jmin < jmax){
301         for (j=jmin; j<jmax; j++) {
302           /* rtmp += -U(i,k)^T * U_bar(i,j) */
303           rtmp_ptr = rtmp + bj[j]*bs2;
304           u = ba + j*bs2;
305           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
306         }
307 
308         /* ... add i to row list for next nonzero entry */
309         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
310         j     = bj[jmin];
311         jl[i] = jl[j]; jl[j] = i; /* update jl */
312       }
313       i = nexti;
314     }
315 
316     /* save nonzero entries in k-th row of U ... */
317 
318     /* invert diagonal block */
319     diag = ba+k*bs2;
320     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
321     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
322 
323     jmin = bi[k]; jmax = bi[k+1];
324     if (jmin < jmax) {
325       for (j=jmin; j<jmax; j++){
326          vj = bj[j];           /* block col. index of U */
327          u   = ba + j*bs2;
328          rtmp_ptr = rtmp + vj*bs2;
329          for (k1=0; k1<bs2; k1++){
330            *u++        = *rtmp_ptr;
331            *rtmp_ptr++ = 0.0;
332          }
333       }
334 
335       /* ... add k to row list for first nonzero entry in k-th row */
336       il[k] = jmin;
337       i     = bj[jmin];
338       jl[k] = jl[i]; jl[i] = k;
339     }
340   }
341 
342   ierr = PetscFree(rtmp);CHKERRQ(ierr);
343   ierr = PetscFree(il);CHKERRQ(ierr);
344   ierr = PetscFree(jl);CHKERRQ(ierr);
345   ierr = PetscFree(dk);CHKERRQ(ierr);
346   ierr = PetscFree(uik);CHKERRQ(ierr);
347   ierr = PetscFree(W);CHKERRQ(ierr);
348   ierr = PetscFree(work);CHKERRQ(ierr);
349   ierr = PetscFree(pivots);CHKERRQ(ierr);
350   if (a->permute){
351     ierr = PetscFree(aa);CHKERRQ(ierr);
352   }
353 
354   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
355   C->factor    = FACTOR_CHOLESKY;
356   C->assembled = PETSC_TRUE;
357   C->preallocated = PETSC_TRUE;
358   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
359   PetscFunctionReturn(0);
360 }
361 
362 /* Version for when blocks are 7 by 7 */
363 #undef __FUNC__
364 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7"
365 int MatCholeskyFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B)
366 {
367   Mat                C = *B;
368   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
369   IS                 perm = b->row;
370   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
371   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
372   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
373   MatScalar          *u,*d,*w,*wp;
374 
375   PetscFunctionBegin;
376   /* initialization */
377   w  = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w);
378   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
379   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
380   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
381   for (i=0; i<mbs; i++) {
382     jl[i] = mbs; il[0] = 0;
383   }
384   dk    = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(dk);
385   uik   = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(uik);
386   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
387 
388   /* check permutation */
389   if (!a->permute){
390     ai = a->i; aj = a->j; aa = a->a;
391   } else {
392     ai = a->inew; aj = a->jnew;
393     aa = (MatScalar*)PetscMalloc(49*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
394     ierr = PetscMemcpy(aa,a->a,49*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
395     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
396     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
397 
398     for (i=0; i<mbs; i++){
399       jmin = ai[i]; jmax = ai[i+1];
400       for (j=jmin; j<jmax; j++){
401         while (a2anew[j] != j){
402           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
403           for (k1=0; k1<49; k1++){
404             dk[k1]       = aa[k*49+k1];
405             aa[k*49+k1] = aa[j*49+k1];
406             aa[j*49+k1] = dk[k1];
407           }
408         }
409         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
410         if (i > aj[j]){
411           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
412           ap = aa + j*49;                     /* ptr to the beginning of j-th block of aa */
413           for (k=0; k<49; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
414           for (k=0; k<7; k++){               /* j-th block of aa <- dk^T */
415             for (k1=0; k1<7; k1++) *ap++ = dk[k + 7*k1];
416           }
417         }
418       }
419     }
420     ierr = PetscFree(a2anew);CHKERRA(ierr);
421   }
422 
423   /* for each row k */
424   for (k = 0; k<mbs; k++){
425 
426     /*initialize k-th row with elements nonzero in row perm(k) of A */
427     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
428     if (jmin < jmax) {
429       ap = aa + jmin*49;
430       for (j = jmin; j < jmax; j++){
431         vj = perm_ptr[aj[j]];         /* block col. index */
432         wp = w + vj*49;
433         for (i=0; i<49; i++) *wp++ = *ap++;
434       }
435     }
436 
437     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
438     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
439     i = jl[k]; /* first row to be added to k_th row  */
440 
441     while (i < mbs){
442       nexti = jl[i]; /* next row to be added to k_th row */
443 
444       /* compute multiplier */
445       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
446 
447       /* uik = -inv(Di)*U_bar(i,k) */
448       d = ba + i*49;
449       u    = ba + ili*49;
450 
451       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
452       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
453       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
454       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
455       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
456       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
457       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
458 
459       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
460       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
461       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
462       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
463       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
464       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
465       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
466 
467       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
468       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
469       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
470       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
471       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
472       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
473       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
474 
475       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
476       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
477       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
478       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
479       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
480       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
481       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
482 
483       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
484       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
485       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
486       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
487       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
488       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
489       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
490 
491       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
492       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
493       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
494       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
495       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
496       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
497       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
498 
499       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
500       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
501       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
502       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
503       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
504       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
505       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
506 
507       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
508       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
509       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
510       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
511       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
512       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
513       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
514       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
515 
516       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
517       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
518       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
519       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
520       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
521       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
522       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
523 
524       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
525       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
526       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
527       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
528       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
529       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
530       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
531 
532       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
533       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
534       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
535       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
536       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
537       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
538       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
539 
540       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
541       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
542       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
543       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
544       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
545       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
546       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
547 
548       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
549       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
550       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
551       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
552       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
553       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
554       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
555 
556       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
557       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
558       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
559       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
560       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
561       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
562       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
563 
564       /* update -U(i,k) */
565       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
566 
567       /* add multiple of row i to k-th row ... */
568       jmin = ili + 1; jmax = bi[i+1];
569       if (jmin < jmax){
570         for (j=jmin; j<jmax; j++) {
571           /* w += -U(i,k)^T * U_bar(i,j) */
572           wp = w + bj[j]*49;
573           u = ba + j*49;
574 
575           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
576           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
577           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
578           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
579           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
580           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
581           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
582 
583           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
584           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
585           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
586           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
587           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
588           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
589           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
590 
591           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
592           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
593           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
594           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
595           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
596           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
597           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
598 
599           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
600           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
601           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
602           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
603           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
604           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
605           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
606 
607           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
608           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
609           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
610           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
611           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
612           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
613           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
614 
615           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
616           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
617           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
618           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
619           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
620           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
621           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
622 
623           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
624           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
625           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
626           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
627           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
628           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
629           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
630         }
631 
632         /* ... add i to row list for next nonzero entry */
633         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
634         j     = bj[jmin];
635         jl[i] = jl[j]; jl[j] = i; /* update jl */
636       }
637       i = nexti;
638     }
639 
640     /* save nonzero entries in k-th row of U ... */
641 
642     /* invert diagonal block */
643     d = ba+k*49;
644     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
645     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
646 
647     jmin = bi[k]; jmax = bi[k+1];
648     if (jmin < jmax) {
649       for (j=jmin; j<jmax; j++){
650          vj = bj[j];           /* block col. index of U */
651          u   = ba + j*49;
652          wp = w + vj*49;
653          for (k1=0; k1<49; k1++){
654            *u++        = *wp;
655            *wp++ = 0.0;
656          }
657       }
658 
659       /* ... add k to row list for first nonzero entry in k-th row */
660       il[k] = jmin;
661       i     = bj[jmin];
662       jl[k] = jl[i]; jl[i] = k;
663     }
664   }
665 
666   ierr = PetscFree(w);CHKERRQ(ierr);
667   ierr = PetscFree(il);CHKERRQ(ierr);
668   ierr = PetscFree(jl);CHKERRQ(ierr);
669   ierr = PetscFree(dk);CHKERRQ(ierr);
670   ierr = PetscFree(uik);CHKERRQ(ierr);
671   if (a->permute){
672     ierr = PetscFree(aa);CHKERRQ(ierr);
673   }
674 
675   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
676   C->factor    = FACTOR_CHOLESKY;
677   C->assembled = PETSC_TRUE;
678   C->preallocated = PETSC_TRUE;
679   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
680   PetscFunctionReturn(0);
681 }
682 
683 /*
684       Version for when blocks are 7 by 7 Using natural ordering
685 */
686 #undef __FUNC__
687 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering"
688 int MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B)
689 {
690   Mat                C = *B;
691   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
692   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
693   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
694   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
695   MatScalar          *u,*d,*w,*wp;
696 
697   PetscFunctionBegin;
698   printf("called FactorNumeric_SeqSBAIJ_7_NaturalOrdering\n");
699   /* initialization */
700   w  = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w);
701   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
702   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
703   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
704   for (i=0; i<mbs; i++) {
705     jl[i] = mbs; il[0] = 0;
706   }
707   dk    = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(dk);
708   uik   = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(uik);
709 
710   ai = a->i; aj = a->j; aa = a->a;
711 
712   /* for each row k */
713   for (k = 0; k<mbs; k++){
714 
715     /*initialize k-th row with elements nonzero in row k of A */
716     jmin = ai[k]; jmax = ai[k+1];
717     if (jmin < jmax) {
718       ap = aa + jmin*49;
719       for (j = jmin; j < jmax; j++){
720         vj = aj[j];         /* block col. index */
721         wp = w + vj*49;
722         for (i=0; i<49; i++) *wp++ = *ap++;
723       }
724     }
725 
726     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
727     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
728     i = jl[k]; /* first row to be added to k_th row  */
729 
730     while (i < mbs){
731       nexti = jl[i]; /* next row to be added to k_th row */
732 
733       /* compute multiplier */
734       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
735 
736       /* uik = -inv(Di)*U_bar(i,k) */
737       d = ba + i*49;
738       u    = ba + ili*49;
739 
740       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
741       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
742       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
743       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
744       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
745       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
746       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
747 
748       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
749       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
750       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
751       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
752       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
753       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
754       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
755 
756       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
757       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
758       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
759       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
760       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
761       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
762       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
763 
764       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
765       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
766       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
767       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
768       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
769       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
770       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
771 
772       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
773       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
774       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
775       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
776       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
777       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
778       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
779 
780       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
781       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
782       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
783       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
784       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
785       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
786       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
787 
788       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
789       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
790       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
791       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
792       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
793       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
794       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
795 
796       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
797       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
798       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
799       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
800       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
801       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
802       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
803       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
804 
805       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
806       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
807       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
808       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
809       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
810       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
811       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
812 
813       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
814       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
815       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
816       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
817       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
818       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
819       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
820 
821       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
822       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
823       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
824       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
825       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
826       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
827       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
828 
829       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
830       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
831       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
832       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
833       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
834       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
835       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
836 
837       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
838       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
839       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
840       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
841       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
842       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
843       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
844 
845       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
846       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
847       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
848       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
849       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
850       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
851       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
852 
853       /* update -U(i,k) */
854       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
855 
856       /* add multiple of row i to k-th row ... */
857       jmin = ili + 1; jmax = bi[i+1];
858       if (jmin < jmax){
859         for (j=jmin; j<jmax; j++) {
860           /* w += -U(i,k)^T * U_bar(i,j) */
861           wp = w + bj[j]*49;
862           u = ba + j*49;
863 
864           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
865           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
866           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
867           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
868           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
869           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
870           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
871 
872           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
873           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
874           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
875           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
876           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
877           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
878           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
879 
880           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
881           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
882           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
883           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
884           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
885           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
886           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
887 
888           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
889           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
890           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
891           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
892           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
893           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
894           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
895 
896           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
897           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
898           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
899           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
900           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
901           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
902           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
903 
904           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
905           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
906           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
907           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
908           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
909           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
910           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
911 
912           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
913           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
914           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
915           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
916           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
917           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
918           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
919         }
920 
921         /* ... add i to row list for next nonzero entry */
922         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
923         j     = bj[jmin];
924         jl[i] = jl[j]; jl[j] = i; /* update jl */
925       }
926       i = nexti;
927     }
928 
929     /* save nonzero entries in k-th row of U ... */
930 
931     /* invert diagonal block */
932     d = ba+k*49;
933     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
934     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
935 
936     jmin = bi[k]; jmax = bi[k+1];
937     if (jmin < jmax) {
938       for (j=jmin; j<jmax; j++){
939          vj = bj[j];           /* block col. index of U */
940          u   = ba + j*49;
941          wp = w + vj*49;
942          for (k1=0; k1<49; k1++){
943            *u++        = *wp;
944            *wp++ = 0.0;
945          }
946       }
947 
948       /* ... add k to row list for first nonzero entry in k-th row */
949       il[k] = jmin;
950       i     = bj[jmin];
951       jl[k] = jl[i]; jl[i] = k;
952     }
953   }
954 
955   ierr = PetscFree(w);CHKERRQ(ierr);
956   ierr = PetscFree(il);CHKERRQ(ierr);
957   ierr = PetscFree(jl);CHKERRQ(ierr);
958   ierr = PetscFree(dk);CHKERRQ(ierr);
959   ierr = PetscFree(uik);CHKERRQ(ierr);
960 
961   C->factor    = FACTOR_CHOLESKY;
962   C->assembled = PETSC_TRUE;
963   C->preallocated = PETSC_TRUE;
964   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
965   PetscFunctionReturn(0);
966 }
967 
968 /* Version for when blocks are 6 by 6 */
969 #undef __FUNC__
970 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6"
971 int MatCholeskyFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B)
972 {
973   Mat                C = *B;
974   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
975   IS                 perm = b->row;
976   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
977   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
978   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
979   MatScalar          *u,*d,*w,*wp;
980 
981   PetscFunctionBegin;
982   /* initialization */
983   w  = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w);
984   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
985   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
986   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
987   for (i=0; i<mbs; i++) {
988     jl[i] = mbs; il[0] = 0;
989   }
990   dk    = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(dk);
991   uik   = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(uik);
992   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
993 
994   /* check permutation */
995   if (!a->permute){
996     ai = a->i; aj = a->j; aa = a->a;
997   } else {
998     ai = a->inew; aj = a->jnew;
999     aa = (MatScalar*)PetscMalloc(36*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
1000     ierr = PetscMemcpy(aa,a->a,36*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1001     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
1002     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1003 
1004     for (i=0; i<mbs; i++){
1005       jmin = ai[i]; jmax = ai[i+1];
1006       for (j=jmin; j<jmax; j++){
1007         while (a2anew[j] != j){
1008           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1009           for (k1=0; k1<36; k1++){
1010             dk[k1]       = aa[k*36+k1];
1011             aa[k*36+k1] = aa[j*36+k1];
1012             aa[j*36+k1] = dk[k1];
1013           }
1014         }
1015         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1016         if (i > aj[j]){
1017           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1018           ap = aa + j*36;                     /* ptr to the beginning of j-th block of aa */
1019           for (k=0; k<36; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1020           for (k=0; k<6; k++){               /* j-th block of aa <- dk^T */
1021             for (k1=0; k1<6; k1++) *ap++ = dk[k + 6*k1];
1022           }
1023         }
1024       }
1025     }
1026     ierr = PetscFree(a2anew);CHKERRA(ierr);
1027   }
1028 
1029   /* for each row k */
1030   for (k = 0; k<mbs; k++){
1031 
1032     /*initialize k-th row with elements nonzero in row perm(k) of A */
1033     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1034     if (jmin < jmax) {
1035       ap = aa + jmin*36;
1036       for (j = jmin; j < jmax; j++){
1037         vj = perm_ptr[aj[j]];         /* block col. index */
1038         wp = w + vj*36;
1039         for (i=0; i<36; i++) *wp++ = *ap++;
1040       }
1041     }
1042 
1043     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1044     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1045     i = jl[k]; /* first row to be added to k_th row  */
1046 
1047     while (i < mbs){
1048       nexti = jl[i]; /* next row to be added to k_th row */
1049 
1050       /* compute multiplier */
1051       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1052 
1053       /* uik = -inv(Di)*U_bar(i,k) */
1054       d = ba + i*36;
1055       u    = ba + ili*36;
1056 
1057       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1058       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1059       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1060       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1061       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1062       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1063 
1064       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1065       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1066       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1067       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1068       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1069       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1070 
1071       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1072       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1073       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1074       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1075       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1076       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1077 
1078       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1079       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1080       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1081       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1082       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1083       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1084 
1085       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1086       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1087       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1088       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1089       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1090       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1091 
1092       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1093       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1094       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1095       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1096       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1097       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1098 
1099       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1100       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1101       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1102       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1103       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1104       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1105       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1106 
1107       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1108       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1109       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1110       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1111       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1112       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1113 
1114       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1115       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1116       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1117       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1118       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1119       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1120 
1121       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1122       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1123       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1124       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1125       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1126       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1127 
1128       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1129       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1130       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1131       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1132       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1133       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1134 
1135       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1136       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1137       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1138       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1139       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1140       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1141 
1142       /* update -U(i,k) */
1143       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1144 
1145       /* add multiple of row i to k-th row ... */
1146       jmin = ili + 1; jmax = bi[i+1];
1147       if (jmin < jmax){
1148         for (j=jmin; j<jmax; j++) {
1149           /* w += -U(i,k)^T * U_bar(i,j) */
1150           wp = w + bj[j]*36;
1151           u = ba + j*36;
1152           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1153           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1154           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1155           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1156           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1157           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1158 
1159           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1160           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1161           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1162           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1163           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1164           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1165 
1166           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1167           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1168           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1169           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1170           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1171           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1172 
1173           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1174           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1175           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1176           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1177           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1178           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1179 
1180           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1181           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1182           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1183           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1184           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1185           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1186 
1187           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1188           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1189           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1190           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1191           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1192           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1193         }
1194 
1195         /* ... add i to row list for next nonzero entry */
1196         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1197         j     = bj[jmin];
1198         jl[i] = jl[j]; jl[j] = i; /* update jl */
1199       }
1200       i = nexti;
1201     }
1202 
1203     /* save nonzero entries in k-th row of U ... */
1204 
1205     /* invert diagonal block */
1206     d = ba+k*36;
1207     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1208     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1209 
1210     jmin = bi[k]; jmax = bi[k+1];
1211     if (jmin < jmax) {
1212       for (j=jmin; j<jmax; j++){
1213          vj = bj[j];           /* block col. index of U */
1214          u   = ba + j*36;
1215          wp = w + vj*36;
1216          for (k1=0; k1<36; k1++){
1217            *u++        = *wp;
1218            *wp++ = 0.0;
1219          }
1220       }
1221 
1222       /* ... add k to row list for first nonzero entry in k-th row */
1223       il[k] = jmin;
1224       i     = bj[jmin];
1225       jl[k] = jl[i]; jl[i] = k;
1226     }
1227   }
1228 
1229   ierr = PetscFree(w);CHKERRQ(ierr);
1230   ierr = PetscFree(il);CHKERRQ(ierr);
1231   ierr = PetscFree(jl);CHKERRQ(ierr);
1232   ierr = PetscFree(dk);CHKERRQ(ierr);
1233   ierr = PetscFree(uik);CHKERRQ(ierr);
1234   if (a->permute){
1235     ierr = PetscFree(aa);CHKERRQ(ierr);
1236   }
1237 
1238   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1239   C->factor    = FACTOR_CHOLESKY;
1240   C->assembled = PETSC_TRUE;
1241   C->preallocated = PETSC_TRUE;
1242   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 /*
1247       Version for when blocks are 6 by 6 Using natural ordering
1248 */
1249 #undef __FUNC__
1250 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering"
1251 int MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B)
1252 {
1253   Mat                C = *B;
1254   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1255   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1256   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1257   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1258   MatScalar          *u,*d,*w,*wp;
1259 
1260   PetscFunctionBegin;
1261   printf("called FactorNumeric_SeqSBAIJ_6_NaturalOrdering\n");
1262   /* initialization */
1263   w  = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w);
1264   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1265   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1266   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1267   for (i=0; i<mbs; i++) {
1268     jl[i] = mbs; il[0] = 0;
1269   }
1270   dk    = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(dk);
1271   uik   = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(uik);
1272 
1273   ai = a->i; aj = a->j; aa = a->a;
1274 
1275   /* for each row k */
1276   for (k = 0; k<mbs; k++){
1277 
1278     /*initialize k-th row with elements nonzero in row k of A */
1279     jmin = ai[k]; jmax = ai[k+1];
1280     if (jmin < jmax) {
1281       ap = aa + jmin*36;
1282       for (j = jmin; j < jmax; j++){
1283         vj = aj[j];         /* block col. index */
1284         wp = w + vj*36;
1285         for (i=0; i<36; i++) *wp++ = *ap++;
1286       }
1287     }
1288 
1289     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1290     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1291     i = jl[k]; /* first row to be added to k_th row  */
1292 
1293     while (i < mbs){
1294       nexti = jl[i]; /* next row to be added to k_th row */
1295 
1296       /* compute multiplier */
1297       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1298 
1299       /* uik = -inv(Di)*U_bar(i,k) */
1300       d = ba + i*36;
1301       u    = ba + ili*36;
1302 
1303       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1304       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1305       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1306       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1307       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1308       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1309 
1310       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1311       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1312       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1313       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1314       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1315       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1316 
1317       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1318       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1319       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1320       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1321       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1322       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1323 
1324       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1325       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1326       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1327       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1328       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1329       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1330 
1331       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1332       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1333       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1334       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1335       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1336       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1337 
1338       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1339       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1340       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1341       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1342       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1343       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1344 
1345       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1346       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1347       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1348       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1349       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1350       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1351       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1352 
1353       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1354       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1355       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1356       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1357       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1358       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1359 
1360       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1361       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1362       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1363       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1364       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1365       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1366 
1367       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1368       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1369       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1370       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1371       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1372       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1373 
1374       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1375       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1376       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1377       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1378       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1379       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1380 
1381       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1382       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1383       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1384       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1385       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1386       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1387 
1388       /* update -U(i,k) */
1389       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1390 
1391       /* add multiple of row i to k-th row ... */
1392       jmin = ili + 1; jmax = bi[i+1];
1393       if (jmin < jmax){
1394         for (j=jmin; j<jmax; j++) {
1395           /* w += -U(i,k)^T * U_bar(i,j) */
1396           wp = w + bj[j]*36;
1397           u = ba + j*36;
1398           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1399           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1400           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1401           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1402           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1403           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1404 
1405           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1406           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1407           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1408           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1409           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1410           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1411 
1412           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1413           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1414           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1415           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1416           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1417           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1418 
1419           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1420           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1421           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1422           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1423           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1424           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1425 
1426           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1427           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1428           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1429           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1430           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1431           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1432 
1433           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1434           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1435           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1436           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1437           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1438           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1439         }
1440 
1441         /* ... add i to row list for next nonzero entry */
1442         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1443         j     = bj[jmin];
1444         jl[i] = jl[j]; jl[j] = i; /* update jl */
1445       }
1446       i = nexti;
1447     }
1448 
1449     /* save nonzero entries in k-th row of U ... */
1450 
1451     /* invert diagonal block */
1452     d = ba+k*36;
1453     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1454     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1455 
1456     jmin = bi[k]; jmax = bi[k+1];
1457     if (jmin < jmax) {
1458       for (j=jmin; j<jmax; j++){
1459          vj = bj[j];           /* block col. index of U */
1460          u   = ba + j*36;
1461          wp = w + vj*36;
1462          for (k1=0; k1<36; k1++){
1463            *u++        = *wp;
1464            *wp++ = 0.0;
1465          }
1466       }
1467 
1468       /* ... add k to row list for first nonzero entry in k-th row */
1469       il[k] = jmin;
1470       i     = bj[jmin];
1471       jl[k] = jl[i]; jl[i] = k;
1472     }
1473   }
1474 
1475   ierr = PetscFree(w);CHKERRQ(ierr);
1476   ierr = PetscFree(il);CHKERRQ(ierr);
1477   ierr = PetscFree(jl);CHKERRQ(ierr);
1478   ierr = PetscFree(dk);CHKERRQ(ierr);
1479   ierr = PetscFree(uik);CHKERRQ(ierr);
1480 
1481   C->factor    = FACTOR_CHOLESKY;
1482   C->assembled = PETSC_TRUE;
1483   C->preallocated = PETSC_TRUE;
1484   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 /* Version for when blocks are 5 by 5  */
1489 #undef __FUNC__
1490 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5"
1491 int MatCholeskyFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B)
1492 {
1493   Mat                C = *B;
1494   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1495   IS                 perm = b->row;
1496   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1497   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1498   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1499   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1500 
1501   PetscFunctionBegin;
1502   /* initialization */
1503   rtmp  = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1504   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1505   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1506   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1507   for (i=0; i<mbs; i++) {
1508     jl[i] = mbs; il[0] = 0;
1509   }
1510   dk    = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(dk);
1511   uik   = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(uik);
1512   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
1513 
1514   /* check permutation */
1515   if (!a->permute){
1516     ai = a->i; aj = a->j; aa = a->a;
1517   } else {
1518     ai = a->inew; aj = a->jnew;
1519     aa = (MatScalar*)PetscMalloc(25*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
1520     ierr = PetscMemcpy(aa,a->a,25*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1521     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
1522     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1523 
1524     for (i=0; i<mbs; i++){
1525       jmin = ai[i]; jmax = ai[i+1];
1526       for (j=jmin; j<jmax; j++){
1527         while (a2anew[j] != j){
1528           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1529           for (k1=0; k1<25; k1++){
1530             dk[k1]       = aa[k*25+k1];
1531             aa[k*25+k1] = aa[j*25+k1];
1532             aa[j*25+k1] = dk[k1];
1533           }
1534         }
1535         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1536         if (i > aj[j]){
1537           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1538           ap = aa + j*25;                     /* ptr to the beginning of j-th block of aa */
1539           for (k=0; k<25; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1540           for (k=0; k<5; k++){               /* j-th block of aa <- dk^T */
1541             for (k1=0; k1<5; k1++) *ap++ = dk[k + 5*k1];
1542           }
1543         }
1544       }
1545     }
1546     ierr = PetscFree(a2anew);CHKERRA(ierr);
1547   }
1548 
1549   /* for each row k */
1550   for (k = 0; k<mbs; k++){
1551 
1552     /*initialize k-th row with elements nonzero in row perm(k) of A */
1553     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1554     if (jmin < jmax) {
1555       ap = aa + jmin*25;
1556       for (j = jmin; j < jmax; j++){
1557         vj = perm_ptr[aj[j]];         /* block col. index */
1558         rtmp_ptr = rtmp + vj*25;
1559         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1560       }
1561     }
1562 
1563     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1564     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1565     i = jl[k]; /* first row to be added to k_th row  */
1566 
1567     while (i < mbs){
1568       nexti = jl[i]; /* next row to be added to k_th row */
1569 
1570       /* compute multiplier */
1571       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1572 
1573       /* uik = -inv(Di)*U_bar(i,k) */
1574       d = ba + i*25;
1575       u    = ba + ili*25;
1576 
1577       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1578       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1579       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1580       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1581       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1582 
1583       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1584       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1585       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1586       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1587       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1588 
1589       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1590       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1591       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1592       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1593       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1594 
1595       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1596       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1597       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1598       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1599       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1600 
1601       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1602       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1603       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1604       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1605       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1606 
1607 
1608       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1609       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1610       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1611       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1612       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1613       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1614 
1615       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1616       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1617       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1618       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1619       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1620 
1621       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1622       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1623       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1624       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1625       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1626 
1627       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1628       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1629       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1630       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1631       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1632 
1633       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1634       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1635       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1636       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1637       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1638 
1639       /* update -U(i,k) */
1640       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
1641 
1642       /* add multiple of row i to k-th row ... */
1643       jmin = ili + 1; jmax = bi[i+1];
1644       if (jmin < jmax){
1645         for (j=jmin; j<jmax; j++) {
1646           /* rtmp += -U(i,k)^T * U_bar(i,j) */
1647           rtmp_ptr = rtmp + bj[j]*25;
1648           u = ba + j*25;
1649           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1650           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1651           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1652           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1653           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1654 
1655           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1656           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1657           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1658           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1659           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1660 
1661           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1662           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1663           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1664           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1665           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1666 
1667           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1668           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1669           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1670           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1671           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1672 
1673           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1674           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1675           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1676           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1677           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1678         }
1679 
1680         /* ... add i to row list for next nonzero entry */
1681         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1682         j     = bj[jmin];
1683         jl[i] = jl[j]; jl[j] = i; /* update jl */
1684       }
1685       i = nexti;
1686     }
1687 
1688     /* save nonzero entries in k-th row of U ... */
1689 
1690     /* invert diagonal block */
1691     d = ba+k*25;
1692     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
1693     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
1694 
1695     jmin = bi[k]; jmax = bi[k+1];
1696     if (jmin < jmax) {
1697       for (j=jmin; j<jmax; j++){
1698          vj = bj[j];           /* block col. index of U */
1699          u   = ba + j*25;
1700          rtmp_ptr = rtmp + vj*25;
1701          for (k1=0; k1<25; k1++){
1702            *u++        = *rtmp_ptr;
1703            *rtmp_ptr++ = 0.0;
1704          }
1705       }
1706 
1707       /* ... add k to row list for first nonzero entry in k-th row */
1708       il[k] = jmin;
1709       i     = bj[jmin];
1710       jl[k] = jl[i]; jl[i] = k;
1711     }
1712   }
1713 
1714   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1715   ierr = PetscFree(il);CHKERRQ(ierr);
1716   ierr = PetscFree(jl);CHKERRQ(ierr);
1717   ierr = PetscFree(dk);CHKERRQ(ierr);
1718   ierr = PetscFree(uik);CHKERRQ(ierr);
1719   if (a->permute){
1720     ierr = PetscFree(aa);CHKERRQ(ierr);
1721   }
1722 
1723   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1724   C->factor    = FACTOR_CHOLESKY;
1725   C->assembled = PETSC_TRUE;
1726   C->preallocated = PETSC_TRUE;
1727   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1728   PetscFunctionReturn(0);
1729 }
1730 
1731 /*
1732       Version for when blocks are 5 by 5 Using natural ordering
1733 */
1734 #undef __FUNC__
1735 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering"
1736 int MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B)
1737 {
1738   Mat                C = *B;
1739   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1740   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1741   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1742   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1743   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1744 
1745   PetscFunctionBegin;
1746   printf("called FactorNumeric_SeqSBAIJ_5_NaturalOrdering\n");
1747   /* initialization */
1748   rtmp  = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1749   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1750   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1751   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1752   for (i=0; i<mbs; i++) {
1753     jl[i] = mbs; il[0] = 0;
1754   }
1755   dk    = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(dk);
1756   uik   = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(uik);
1757 
1758   ai = a->i; aj = a->j; aa = a->a;
1759 
1760   /* for each row k */
1761   for (k = 0; k<mbs; k++){
1762 
1763     /*initialize k-th row with elements nonzero in row k of A */
1764     jmin = ai[k]; jmax = ai[k+1];
1765     if (jmin < jmax) {
1766       ap = aa + jmin*25;
1767       for (j = jmin; j < jmax; j++){
1768         vj = aj[j];         /* block col. index */
1769         rtmp_ptr = rtmp + vj*25;
1770         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1771       }
1772     }
1773 
1774     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1775     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1776     i = jl[k]; /* first row to be added to k_th row  */
1777 
1778     while (i < mbs){
1779       nexti = jl[i]; /* next row to be added to k_th row */
1780 
1781       /* compute multiplier */
1782       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1783 
1784       /* uik = -inv(Di)*U_bar(i,k) */
1785       d = ba + i*25;
1786       u    = ba + ili*25;
1787 
1788       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1789       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1790       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1791       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1792       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1793 
1794       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1795       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1796       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1797       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1798       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1799 
1800       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1801       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1802       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1803       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1804       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1805 
1806       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1807       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1808       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1809       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1810       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1811 
1812       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1813       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1814       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1815       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1816       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1817 
1818 
1819       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1820       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1821       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1822       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1823       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1824       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1825 
1826       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1827       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1828       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1829       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1830       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1831 
1832       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1833       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1834       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1835       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1836       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1837 
1838       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1839       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1840       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1841       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1842       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1843 
1844       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1845       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1846       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1847       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1848       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1849 
1850       /* update -U(i,k) */
1851       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
1852 
1853       /* add multiple of row i to k-th row ... */
1854       jmin = ili + 1; jmax = bi[i+1];
1855       if (jmin < jmax){
1856         for (j=jmin; j<jmax; j++) {
1857           /* rtmp += -U(i,k)^T * U_bar(i,j) */
1858           rtmp_ptr = rtmp + bj[j]*25;
1859           u = ba + j*25;
1860           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1861           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1862           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1863           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1864           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1865 
1866           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1867           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1868           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1869           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1870           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1871 
1872           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1873           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1874           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1875           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1876           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1877 
1878           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1879           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1880           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1881           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1882           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1883 
1884           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1885           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1886           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1887           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1888           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1889         }
1890 
1891         /* ... add i to row list for next nonzero entry */
1892         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1893         j     = bj[jmin];
1894         jl[i] = jl[j]; jl[j] = i; /* update jl */
1895       }
1896       i = nexti;
1897     }
1898 
1899     /* save nonzero entries in k-th row of U ... */
1900 
1901     /* invert diagonal block */
1902     d = ba+k*25;
1903     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
1904     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
1905 
1906     jmin = bi[k]; jmax = bi[k+1];
1907     if (jmin < jmax) {
1908       for (j=jmin; j<jmax; j++){
1909          vj = bj[j];           /* block col. index of U */
1910          u   = ba + j*25;
1911          rtmp_ptr = rtmp + vj*25;
1912          for (k1=0; k1<25; k1++){
1913            *u++        = *rtmp_ptr;
1914            *rtmp_ptr++ = 0.0;
1915          }
1916       }
1917 
1918       /* ... add k to row list for first nonzero entry in k-th row */
1919       il[k] = jmin;
1920       i     = bj[jmin];
1921       jl[k] = jl[i]; jl[i] = k;
1922     }
1923   }
1924 
1925   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1926   ierr = PetscFree(il);CHKERRQ(ierr);
1927   ierr = PetscFree(jl);CHKERRQ(ierr);
1928   ierr = PetscFree(dk);CHKERRQ(ierr);
1929   ierr = PetscFree(uik);CHKERRQ(ierr);
1930 
1931   C->factor    = FACTOR_CHOLESKY;
1932   C->assembled = PETSC_TRUE;
1933   C->preallocated = PETSC_TRUE;
1934   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 /*
1939       Version for when blocks are 4 by 4 Using natural ordering
1940 */
1941 #undef __FUNC__
1942 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering"
1943 int MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B)
1944 {
1945   Mat                C = *B;
1946   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1947   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1948   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1949   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1950   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
1951 
1952   PetscFunctionBegin;
1953   printf("called FactorNumeric_SeqSBAIJ_4_NaturalOrdering\n");
1954   /* initialization */
1955   rtmp  = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1956   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1957   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1958   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1959   for (i=0; i<mbs; i++) {
1960     jl[i] = mbs; il[0] = 0;
1961   }
1962   dk    = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(dk);
1963   uik   = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(uik);
1964 
1965   ai = a->i; aj = a->j; aa = a->a;
1966 
1967   /* for each row k */
1968   for (k = 0; k<mbs; k++){
1969 
1970     /*initialize k-th row with elements nonzero in row k of A */
1971     jmin = ai[k]; jmax = ai[k+1];
1972     if (jmin < jmax) {
1973       ap = aa + jmin*16;
1974       for (j = jmin; j < jmax; j++){
1975         vj = aj[j];         /* block col. index */
1976         rtmp_ptr = rtmp + vj*16;
1977         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
1978       }
1979     }
1980 
1981     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1982     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
1983     i = jl[k]; /* first row to be added to k_th row  */
1984 
1985     while (i < mbs){
1986       nexti = jl[i]; /* next row to be added to k_th row */
1987 
1988       /* compute multiplier */
1989       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1990 
1991       /* uik = -inv(Di)*U_bar(i,k) */
1992       diag = ba + i*16;
1993       u    = ba + ili*16;
1994 
1995       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
1996       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
1997       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
1998       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
1999 
2000       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
2001       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
2002       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
2003       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2004 
2005       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2006       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2007       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2008       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2009 
2010       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2011       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2012       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2013       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2014 
2015       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2016       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2017       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2018       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2019       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2020 
2021       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2022       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2023       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2024       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2025 
2026       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2027       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2028       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2029       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2030 
2031       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2032       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2033       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2034       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2035 
2036       /* update -U(i,k) */
2037       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2038 
2039       /* add multiple of row i to k-th row ... */
2040       jmin = ili + 1; jmax = bi[i+1];
2041       if (jmin < jmax){
2042         for (j=jmin; j<jmax; j++) {
2043           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2044           rtmp_ptr = rtmp + bj[j]*16;
2045           u = ba + j*16;
2046           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2047           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2048           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2049           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2050 
2051           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2052           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2053           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2054           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2055 
2056           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2057           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2058           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2059           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2060 
2061           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2062           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2063           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2064           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2065         }
2066 
2067         /* ... add i to row list for next nonzero entry */
2068         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2069         j     = bj[jmin];
2070         jl[i] = jl[j]; jl[j] = i; /* update jl */
2071       }
2072       i = nexti;
2073     }
2074 
2075     /* save nonzero entries in k-th row of U ... */
2076 
2077     /* invert diagonal block */
2078     diag = ba+k*16;
2079     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2080     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2081 
2082     jmin = bi[k]; jmax = bi[k+1];
2083     if (jmin < jmax) {
2084       for (j=jmin; j<jmax; j++){
2085          vj = bj[j];           /* block col. index of U */
2086          u   = ba + j*16;
2087          rtmp_ptr = rtmp + vj*16;
2088          for (k1=0; k1<16; k1++){
2089            *u++        = *rtmp_ptr;
2090            *rtmp_ptr++ = 0.0;
2091          }
2092       }
2093 
2094       /* ... add k to row list for first nonzero entry in k-th row */
2095       il[k] = jmin;
2096       i     = bj[jmin];
2097       jl[k] = jl[i]; jl[i] = k;
2098     }
2099   }
2100 
2101   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2102   ierr = PetscFree(il);CHKERRQ(ierr);
2103   ierr = PetscFree(jl);CHKERRQ(ierr);
2104   ierr = PetscFree(dk);CHKERRQ(ierr);
2105   ierr = PetscFree(uik);CHKERRQ(ierr);
2106 
2107   C->factor    = FACTOR_CHOLESKY;
2108   C->assembled = PETSC_TRUE;
2109   C->preallocated = PETSC_TRUE;
2110   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2111   PetscFunctionReturn(0);
2112 }
2113 
2114 /* Version for when blocks are 4 by 4  */
2115 #undef __FUNC__
2116 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4"
2117 int MatCholeskyFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B)
2118 {
2119   Mat                C = *B;
2120   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2121   IS                 perm = b->row;
2122   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2123   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2124   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2125   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2126 
2127   PetscFunctionBegin;
2128   /* initialization */
2129   rtmp  = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2130   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2131   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2132   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2133   for (i=0; i<mbs; i++) {
2134     jl[i] = mbs; il[0] = 0;
2135   }
2136   dk    = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(dk);
2137   uik   = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(uik);
2138   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2139 
2140   /* check permutation */
2141   if (!a->permute){
2142     ai = a->i; aj = a->j; aa = a->a;
2143   } else {
2144     ai = a->inew; aj = a->jnew;
2145     aa = (MatScalar*)PetscMalloc(16*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2146     ierr = PetscMemcpy(aa,a->a,16*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2147     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2148     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2149 
2150     for (i=0; i<mbs; i++){
2151       jmin = ai[i]; jmax = ai[i+1];
2152       for (j=jmin; j<jmax; j++){
2153         while (a2anew[j] != j){
2154           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2155           for (k1=0; k1<16; k1++){
2156             dk[k1]       = aa[k*16+k1];
2157             aa[k*16+k1] = aa[j*16+k1];
2158             aa[j*16+k1] = dk[k1];
2159           }
2160         }
2161         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2162         if (i > aj[j]){
2163           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2164           ap = aa + j*16;                     /* ptr to the beginning of j-th block of aa */
2165           for (k=0; k<16; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2166           for (k=0; k<4; k++){               /* j-th block of aa <- dk^T */
2167             for (k1=0; k1<4; k1++) *ap++ = dk[k + 4*k1];
2168           }
2169         }
2170       }
2171     }
2172     ierr = PetscFree(a2anew);CHKERRA(ierr);
2173   }
2174 
2175   /* for each row k */
2176   for (k = 0; k<mbs; k++){
2177 
2178     /*initialize k-th row with elements nonzero in row perm(k) of A */
2179     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2180     if (jmin < jmax) {
2181       ap = aa + jmin*16;
2182       for (j = jmin; j < jmax; j++){
2183         vj = perm_ptr[aj[j]];         /* block col. index */
2184         rtmp_ptr = rtmp + vj*16;
2185         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
2186       }
2187     }
2188 
2189     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2190     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
2191     i = jl[k]; /* first row to be added to k_th row  */
2192 
2193     while (i < mbs){
2194       nexti = jl[i]; /* next row to be added to k_th row */
2195 
2196       /* compute multiplier */
2197       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2198 
2199       /* uik = -inv(Di)*U_bar(i,k) */
2200       diag = ba + i*16;
2201       u    = ba + ili*16;
2202 
2203       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
2204       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
2205       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
2206       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
2207 
2208       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
2209       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
2210       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
2211       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2212 
2213       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2214       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2215       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2216       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2217 
2218       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2219       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2220       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2221       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2222 
2223       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2224       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2225       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2226       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2227       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2228 
2229       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2230       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2231       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2232       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2233 
2234       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2235       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2236       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2237       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2238 
2239       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2240       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2241       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2242       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2243 
2244       /* update -U(i,k) */
2245       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2246 
2247       /* add multiple of row i to k-th row ... */
2248       jmin = ili + 1; jmax = bi[i+1];
2249       if (jmin < jmax){
2250         for (j=jmin; j<jmax; j++) {
2251           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2252           rtmp_ptr = rtmp + bj[j]*16;
2253           u = ba + j*16;
2254           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2255           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2256           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2257           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2258 
2259           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2260           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2261           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2262           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2263 
2264           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2265           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2266           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2267           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2268 
2269           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2270           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2271           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2272           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2273         }
2274 
2275         /* ... add i to row list for next nonzero entry */
2276         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2277         j     = bj[jmin];
2278         jl[i] = jl[j]; jl[j] = i; /* update jl */
2279       }
2280       i = nexti;
2281     }
2282 
2283     /* save nonzero entries in k-th row of U ... */
2284 
2285     /* invert diagonal block */
2286     diag = ba+k*16;
2287     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2288     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2289 
2290     jmin = bi[k]; jmax = bi[k+1];
2291     if (jmin < jmax) {
2292       for (j=jmin; j<jmax; j++){
2293          vj = bj[j];           /* block col. index of U */
2294          u   = ba + j*16;
2295          rtmp_ptr = rtmp + vj*16;
2296          for (k1=0; k1<16; k1++){
2297            *u++        = *rtmp_ptr;
2298            *rtmp_ptr++ = 0.0;
2299          }
2300       }
2301 
2302       /* ... add k to row list for first nonzero entry in k-th row */
2303       il[k] = jmin;
2304       i     = bj[jmin];
2305       jl[k] = jl[i]; jl[i] = k;
2306     }
2307   }
2308 
2309   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2310   ierr = PetscFree(il);CHKERRQ(ierr);
2311   ierr = PetscFree(jl);CHKERRQ(ierr);
2312   ierr = PetscFree(dk);CHKERRQ(ierr);
2313   ierr = PetscFree(uik);CHKERRQ(ierr);
2314   if (a->permute){
2315     ierr = PetscFree(aa);CHKERRQ(ierr);
2316   }
2317 
2318   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2319   C->factor    = FACTOR_CHOLESKY;
2320   C->assembled = PETSC_TRUE;
2321   C->preallocated = PETSC_TRUE;
2322   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2323   PetscFunctionReturn(0);
2324 }
2325 
2326 /* Version for when blocks are 3 by 3  */
2327 #undef __FUNC__
2328 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3"
2329 int MatCholeskyFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B)
2330 {
2331   Mat                C = *B;
2332   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2333   IS                 perm = b->row;
2334   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2335   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2336   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2337   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2338 
2339   PetscFunctionBegin;
2340   /* printf("called FactorNumeric_SeqSBAIJ_3\n"); */
2341   /* initialization */
2342   rtmp  = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2343   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2344   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2345   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2346   for (i=0; i<mbs; i++) {
2347     jl[i] = mbs; il[0] = 0;
2348   }
2349   dk  = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(dk);
2350   uik = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(uik);
2351   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2352 
2353   /* check permutation */
2354   if (!a->permute){
2355     ai = a->i; aj = a->j; aa = a->a;
2356   } else {
2357     ai = a->inew; aj = a->jnew;
2358     aa = (MatScalar*)PetscMalloc(9*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2359     ierr = PetscMemcpy(aa,a->a,9*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2360     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2361     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2362 
2363     for (i=0; i<mbs; i++){
2364       jmin = ai[i]; jmax = ai[i+1];
2365       for (j=jmin; j<jmax; j++){
2366         while (a2anew[j] != j){
2367           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2368           for (k1=0; k1<9; k1++){
2369             dk[k1]       = aa[k*9+k1];
2370             aa[k*9+k1] = aa[j*9+k1];
2371             aa[j*9+k1] = dk[k1];
2372           }
2373         }
2374         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2375         if (i > aj[j]){
2376           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2377           ap = aa + j*9;                     /* ptr to the beginning of j-th block of aa */
2378           for (k=0; k<9; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2379           for (k=0; k<3; k++){               /* j-th block of aa <- dk^T */
2380             for (k1=0; k1<3; k1++) *ap++ = dk[k + 3*k1];
2381           }
2382         }
2383       }
2384     }
2385     ierr = PetscFree(a2anew);CHKERRA(ierr);
2386   }
2387 
2388   /* for each row k */
2389   for (k = 0; k<mbs; k++){
2390 
2391     /*initialize k-th row with elements nonzero in row perm(k) of A */
2392     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2393     if (jmin < jmax) {
2394       ap = aa + jmin*9;
2395       for (j = jmin; j < jmax; j++){
2396         vj = perm_ptr[aj[j]];         /* block col. index */
2397         rtmp_ptr = rtmp + vj*9;
2398         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2399       }
2400     }
2401 
2402     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2403     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2404     i = jl[k]; /* first row to be added to k_th row  */
2405 
2406     while (i < mbs){
2407       nexti = jl[i]; /* next row to be added to k_th row */
2408 
2409       /* compute multiplier */
2410       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2411 
2412       /* uik = -inv(Di)*U_bar(i,k) */
2413       diag = ba + i*9;
2414       u    = ba + ili*9;
2415 
2416       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2417       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2418       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2419 
2420       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2421       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2422       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2423 
2424       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2425       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2426       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2427 
2428       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2429       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2430       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2431       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2432 
2433       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2434       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2435       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2436 
2437       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2438       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2439       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2440 
2441       /* update -U(i,k) */
2442       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2443 
2444       /* add multiple of row i to k-th row ... */
2445       jmin = ili + 1; jmax = bi[i+1];
2446       if (jmin < jmax){
2447         for (j=jmin; j<jmax; j++) {
2448           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2449           rtmp_ptr = rtmp + bj[j]*9;
2450           u = ba + j*9;
2451           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2452           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2453           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2454 
2455           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2456           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2457           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2458 
2459           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2460           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2461           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2462         }
2463 
2464         /* ... add i to row list for next nonzero entry */
2465         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2466         j     = bj[jmin];
2467         jl[i] = jl[j]; jl[j] = i; /* update jl */
2468       }
2469       i = nexti;
2470     }
2471 
2472     /* save nonzero entries in k-th row of U ... */
2473 
2474     /* invert diagonal block */
2475     diag = ba+k*9;
2476     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2477     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2478 
2479     jmin = bi[k]; jmax = bi[k+1];
2480     if (jmin < jmax) {
2481       for (j=jmin; j<jmax; j++){
2482          vj = bj[j];           /* block col. index of U */
2483          u   = ba + j*9;
2484          rtmp_ptr = rtmp + vj*9;
2485          for (k1=0; k1<9; k1++){
2486            *u++        = *rtmp_ptr;
2487            *rtmp_ptr++ = 0.0;
2488          }
2489       }
2490 
2491       /* ... add k to row list for first nonzero entry in k-th row */
2492       il[k] = jmin;
2493       i     = bj[jmin];
2494       jl[k] = jl[i]; jl[i] = k;
2495     }
2496   }
2497 
2498   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2499   ierr = PetscFree(il);CHKERRQ(ierr);
2500   ierr = PetscFree(jl);CHKERRQ(ierr);
2501   ierr = PetscFree(dk);CHKERRQ(ierr);
2502   ierr = PetscFree(uik);CHKERRQ(ierr);
2503   if (a->permute){
2504     ierr = PetscFree(aa);CHKERRQ(ierr);
2505   }
2506 
2507   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2508   C->factor    = FACTOR_CHOLESKY;
2509   C->assembled = PETSC_TRUE;
2510   C->preallocated = PETSC_TRUE;
2511   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2512   PetscFunctionReturn(0);
2513 }
2514 
2515 /*
2516       Version for when blocks are 3 by 3 Using natural ordering
2517 */
2518 #undef __FUNC__
2519 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering"
2520 int MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B)
2521 {
2522   Mat                C = *B;
2523   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2524   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2525   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2526   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2527   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2528 
2529   PetscFunctionBegin;
2530   printf("called FactorNumeric_SeqSBAIJ_3_NaturalOrdering\n");
2531   /* initialization */
2532   rtmp  = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2533   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2534   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2535   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2536   for (i=0; i<mbs; i++) {
2537     jl[i] = mbs; il[0] = 0;
2538   }
2539   dk  = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(dk);
2540   uik = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(uik);
2541 
2542   ai = a->i; aj = a->j; aa = a->a;
2543 
2544   /* for each row k */
2545   for (k = 0; k<mbs; k++){
2546 
2547     /*initialize k-th row with elements nonzero in row k of A */
2548     jmin = ai[k]; jmax = ai[k+1];
2549     if (jmin < jmax) {
2550       ap = aa + jmin*9;
2551       for (j = jmin; j < jmax; j++){
2552         vj = aj[j];         /* block col. index */
2553         rtmp_ptr = rtmp + vj*9;
2554         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2555       }
2556     }
2557 
2558     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2559     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2560     i = jl[k]; /* first row to be added to k_th row  */
2561 
2562     while (i < mbs){
2563       nexti = jl[i]; /* next row to be added to k_th row */
2564 
2565       /* compute multiplier */
2566       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2567 
2568       /* uik = -inv(Di)*U_bar(i,k) */
2569       diag = ba + i*9;
2570       u    = ba + ili*9;
2571 
2572       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2573       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2574       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2575 
2576       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2577       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2578       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2579 
2580       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2581       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2582       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2583 
2584       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2585       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2586       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2587       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2588 
2589       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2590       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2591       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2592 
2593       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2594       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2595       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2596 
2597       /* update -U(i,k) */
2598       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2599 
2600       /* add multiple of row i to k-th row ... */
2601       jmin = ili + 1; jmax = bi[i+1];
2602       if (jmin < jmax){
2603         for (j=jmin; j<jmax; j++) {
2604           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2605           rtmp_ptr = rtmp + bj[j]*9;
2606           u = ba + j*9;
2607           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2608           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2609           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2610 
2611           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2612           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2613           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2614 
2615           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2616           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2617           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2618         }
2619 
2620         /* ... add i to row list for next nonzero entry */
2621         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2622         j     = bj[jmin];
2623         jl[i] = jl[j]; jl[j] = i; /* update jl */
2624       }
2625       i = nexti;
2626     }
2627 
2628     /* save nonzero entries in k-th row of U ... */
2629 
2630     /* invert diagonal block */
2631     diag = ba+k*9;
2632     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2633     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2634 
2635     jmin = bi[k]; jmax = bi[k+1];
2636     if (jmin < jmax) {
2637       for (j=jmin; j<jmax; j++){
2638          vj = bj[j];           /* block col. index of U */
2639          u   = ba + j*9;
2640          rtmp_ptr = rtmp + vj*9;
2641          for (k1=0; k1<9; k1++){
2642            *u++        = *rtmp_ptr;
2643            *rtmp_ptr++ = 0.0;
2644          }
2645       }
2646 
2647       /* ... add k to row list for first nonzero entry in k-th row */
2648       il[k] = jmin;
2649       i     = bj[jmin];
2650       jl[k] = jl[i]; jl[i] = k;
2651     }
2652   }
2653 
2654   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2655   ierr = PetscFree(il);CHKERRQ(ierr);
2656   ierr = PetscFree(jl);CHKERRQ(ierr);
2657   ierr = PetscFree(dk);CHKERRQ(ierr);
2658   ierr = PetscFree(uik);CHKERRQ(ierr);
2659 
2660   C->factor    = FACTOR_CHOLESKY;
2661   C->assembled = PETSC_TRUE;
2662   C->preallocated = PETSC_TRUE;
2663   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2664   PetscFunctionReturn(0);
2665 }
2666 
2667 /*
2668     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
2669     Version for blocks 2 by 2.
2670 */
2671 #undef __FUNC__
2672 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
2673 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
2674 {
2675   Mat                C = *B;
2676   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2677   IS                 perm = b->row;
2678   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2679   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2680   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2681   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2682 
2683   PetscFunctionBegin;
2684   /* printf("called FactorNumeric_SeqSBAIJ_2\n"); */
2685   /* initialization */
2686   /* il and jl record the first nonzero element in each row of the accessing
2687      window U(0:k, k:mbs-1).
2688      jl:    list of rows to be added to uneliminated rows
2689             i>= k: jl(i) is the first row to be added to row i
2690             i<  k: jl(i) is the row following row i in some list of rows
2691             jl(i) = mbs indicates the end of a list
2692      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2693             row i of U */
2694   rtmp  = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2695   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2696   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2697   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2698   for (i=0; i<mbs; i++) {
2699     jl[i] = mbs; il[0] = 0;
2700   }
2701   dk  = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(dk);
2702   uik = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(uik);
2703   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2704 
2705   /* check permutation */
2706   if (!a->permute){
2707     ai = a->i; aj = a->j; aa = a->a;
2708   } else {
2709     ai = a->inew; aj = a->jnew;
2710     aa = (MatScalar*)PetscMalloc(4*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2711     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2712     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2713     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2714 
2715     for (i=0; i<mbs; i++){
2716       jmin = ai[i]; jmax = ai[i+1];
2717       for (j=jmin; j<jmax; j++){
2718         while (a2anew[j] != j){
2719           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2720           for (k1=0; k1<4; k1++){
2721             dk[k1]       = aa[k*4+k1];
2722             aa[k*4+k1] = aa[j*4+k1];
2723             aa[j*4+k1] = dk[k1];
2724           }
2725         }
2726         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2727         if (i > aj[j]){
2728           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2729           ap = aa + j*4;     /* ptr to the beginning of the block */
2730           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
2731           ap[1] = ap[2];
2732           ap[2] = dk[1];
2733         }
2734       }
2735     }
2736     ierr = PetscFree(a2anew);CHKERRA(ierr);
2737   }
2738 
2739   /* for each row k */
2740   for (k = 0; k<mbs; k++){
2741 
2742     /*initialize k-th row with elements nonzero in row perm(k) of A */
2743     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2744     if (jmin < jmax) {
2745       ap = aa + jmin*4;
2746       for (j = jmin; j < jmax; j++){
2747         vj = perm_ptr[aj[j]];         /* block col. index */
2748         rtmp_ptr = rtmp + vj*4;
2749         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
2750       }
2751     }
2752 
2753     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2754     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
2755     i = jl[k]; /* first row to be added to k_th row  */
2756 
2757     while (i < mbs){
2758       nexti = jl[i]; /* next row to be added to k_th row */
2759 
2760       /* compute multiplier */
2761       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2762 
2763       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
2764       diag = ba + i*4;
2765       u    = ba + ili*4;
2766       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
2767       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
2768       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
2769       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
2770 
2771       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
2772       dk[0] += uik[0]*u[0] + uik[1]*u[1];
2773       dk[1] += uik[2]*u[0] + uik[3]*u[1];
2774       dk[2] += uik[0]*u[2] + uik[1]*u[3];
2775       dk[3] += uik[2]*u[2] + uik[3]*u[3];
2776 
2777       /* update -U(i,k): ba[ili] = uik */
2778       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
2779 
2780       /* add multiple of row i to k-th row ... */
2781       jmin = ili + 1; jmax = bi[i+1];
2782       if (jmin < jmax){
2783         for (j=jmin; j<jmax; j++) {
2784           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
2785           rtmp_ptr = rtmp + bj[j]*4;
2786           u = ba + j*4;
2787           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
2788           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
2789           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
2790           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
2791         }
2792 
2793         /* ... add i to row list for next nonzero entry */
2794         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2795         j     = bj[jmin];
2796         jl[i] = jl[j]; jl[j] = i; /* update jl */
2797       }
2798       i = nexti;
2799     }
2800 
2801     /* save nonzero entries in k-th row of U ... */
2802 
2803     /* invert diagonal block */
2804     diag = ba+k*4;
2805     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
2806     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
2807 
2808     jmin = bi[k]; jmax = bi[k+1];
2809     if (jmin < jmax) {
2810       for (j=jmin; j<jmax; j++){
2811          vj = bj[j];           /* block col. index of U */
2812          u   = ba + j*4;
2813          rtmp_ptr = rtmp + vj*4;
2814          for (k1=0; k1<4; k1++){
2815            *u++        = *rtmp_ptr;
2816            *rtmp_ptr++ = 0.0;
2817          }
2818       }
2819 
2820       /* ... add k to row list for first nonzero entry in k-th row */
2821       il[k] = jmin;
2822       i     = bj[jmin];
2823       jl[k] = jl[i]; jl[i] = k;
2824     }
2825   }
2826 
2827   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2828   ierr = PetscFree(il);CHKERRQ(ierr);
2829   ierr = PetscFree(jl);CHKERRQ(ierr);
2830   ierr = PetscFree(dk);CHKERRQ(ierr);
2831   ierr = PetscFree(uik);CHKERRQ(ierr);
2832   if (a->permute) {
2833     ierr = PetscFree(aa);CHKERRQ(ierr);
2834   }
2835   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2836   C->factor    = FACTOR_CHOLESKY;
2837   C->assembled = PETSC_TRUE;
2838   C->preallocated = PETSC_TRUE;
2839   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2840   PetscFunctionReturn(0);
2841 }
2842 
2843 /*
2844       Version for when blocks are 2 by 2 Using natural ordering
2845 */
2846 #undef __FUNC__
2847 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
2848 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
2849 {
2850   Mat                C = *B;
2851   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2852   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2853   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2854   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2855   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2856 
2857   PetscFunctionBegin;
2858   printf("called FactorNumeric_SeqSBAIJ_2_NaturalOrdering\n");
2859   /* initialization */
2860   /* il and jl record the first nonzero element in each row of the accessing
2861      window U(0:k, k:mbs-1).
2862      jl:    list of rows to be added to uneliminated rows
2863             i>= k: jl(i) is the first row to be added to row i
2864             i<  k: jl(i) is the row following row i in some list of rows
2865             jl(i) = mbs indicates the end of a list
2866      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2867             row i of U */
2868   rtmp  = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2869   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2870   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2871   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2872   for (i=0; i<mbs; i++) {
2873     jl[i] = mbs; il[0] = 0;
2874   }
2875   dk  = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(dk);
2876   uik = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(uik);
2877 
2878   ai = a->i; aj = a->j; aa = a->a;
2879 
2880   /* for each row k */
2881   for (k = 0; k<mbs; k++){
2882 
2883     /*initialize k-th row with elements nonzero in row k of A */
2884     jmin = ai[k]; jmax = ai[k+1];
2885     if (jmin < jmax) {
2886       ap = aa + jmin*4;
2887       for (j = jmin; j < jmax; j++){
2888         vj = aj[j];         /* block col. index */
2889         rtmp_ptr = rtmp + vj*4;
2890         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
2891       }
2892     }
2893 
2894     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2895     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
2896     i = jl[k]; /* first row to be added to k_th row  */
2897 
2898     while (i < mbs){
2899       nexti = jl[i]; /* next row to be added to k_th row */
2900 
2901       /* compute multiplier */
2902       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2903 
2904       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
2905       diag = ba + i*4;
2906       u    = ba + ili*4;
2907       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
2908       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
2909       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
2910       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
2911 
2912       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
2913       dk[0] += uik[0]*u[0] + uik[1]*u[1];
2914       dk[1] += uik[2]*u[0] + uik[3]*u[1];
2915       dk[2] += uik[0]*u[2] + uik[1]*u[3];
2916       dk[3] += uik[2]*u[2] + uik[3]*u[3];
2917 
2918       /* update -U(i,k): ba[ili] = uik */
2919       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
2920 
2921       /* add multiple of row i to k-th row ... */
2922       jmin = ili + 1; jmax = bi[i+1];
2923       if (jmin < jmax){
2924         for (j=jmin; j<jmax; j++) {
2925           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
2926           rtmp_ptr = rtmp + bj[j]*4;
2927           u = ba + j*4;
2928           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
2929           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
2930           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
2931           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
2932         }
2933 
2934         /* ... add i to row list for next nonzero entry */
2935         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2936         j     = bj[jmin];
2937         jl[i] = jl[j]; jl[j] = i; /* update jl */
2938       }
2939       i = nexti;
2940     }
2941 
2942     /* save nonzero entries in k-th row of U ... */
2943 
2944     /* invert diagonal block */
2945     diag = ba+k*4;
2946     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
2947     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
2948 
2949     jmin = bi[k]; jmax = bi[k+1];
2950     if (jmin < jmax) {
2951       for (j=jmin; j<jmax; j++){
2952          vj = bj[j];           /* block col. index of U */
2953          u   = ba + j*4;
2954          rtmp_ptr = rtmp + vj*4;
2955          for (k1=0; k1<4; k1++){
2956            *u++        = *rtmp_ptr;
2957            *rtmp_ptr++ = 0.0;
2958          }
2959       }
2960 
2961       /* ... add k to row list for first nonzero entry in k-th row */
2962       il[k] = jmin;
2963       i     = bj[jmin];
2964       jl[k] = jl[i]; jl[i] = k;
2965     }
2966   }
2967 
2968   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2969   ierr = PetscFree(il);CHKERRQ(ierr);
2970   ierr = PetscFree(jl);CHKERRQ(ierr);
2971   ierr = PetscFree(dk);CHKERRQ(ierr);
2972   ierr = PetscFree(uik);CHKERRQ(ierr);
2973 
2974   C->factor    = FACTOR_CHOLESKY;
2975   C->assembled = PETSC_TRUE;
2976   C->preallocated = PETSC_TRUE;
2977   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2978   PetscFunctionReturn(0);
2979 }
2980 
2981 /*
2982     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
2983     Version for blocks are 1 by 1.
2984 */
2985 #undef __FUNC__
2986 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
2987 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
2988 {
2989   Mat                C = *B;
2990   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2991   IS                 ip = b->row;
2992   int                *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
2993   int                *ai,*aj,*r;
2994   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
2995   MatScalar          *rtmp;
2996   MatScalar          *ba = b->a,*aa,ak;
2997   MatScalar          dk,uikdi;
2998 
2999   PetscFunctionBegin;
3000   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
3001   if (!a->permute){
3002     ai = a->i; aj = a->j; aa = a->a;
3003   } else {
3004     ai = a->inew; aj = a->jnew;
3005     aa = (MatScalar*)PetscMalloc(ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
3006     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
3007     r   = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(r);
3008     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
3009 
3010     jmin = ai[0]; jmax = ai[mbs];
3011     for (j=jmin; j<jmax; j++){
3012       while (r[j] != j){
3013         k = r[j]; r[j] = r[k]; r[k] = k;
3014         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
3015       }
3016     }
3017     ierr = PetscFree(r);CHKERRA(ierr);
3018   }
3019 
3020   /* initialization */
3021   /* il and jl record the first nonzero element in each row of the accessing
3022      window U(0:k, k:mbs-1).
3023      jl:    list of rows to be added to uneliminated rows
3024             i>= k: jl(i) is the first row to be added to row i
3025             i<  k: jl(i) is the row following row i in some list of rows
3026             jl(i) = mbs indicates the end of a list
3027      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3028             row i of U */
3029   rtmp  = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
3030   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
3031   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
3032   for (i=0; i<mbs; i++) {
3033     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
3034   }
3035 
3036   /* for each row k */
3037   for (k = 0; k<mbs; k++){
3038 
3039     /*initialize k-th row with elements nonzero in row perm(k) of A */
3040     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
3041     if (jmin < jmax) {
3042       for (j = jmin; j < jmax; j++){
3043         vj = rip[aj[j]];
3044         rtmp[vj] = aa[j];
3045       }
3046     }
3047 
3048     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3049     dk = rtmp[k];
3050     i = jl[k]; /* first row to be added to k_th row  */
3051 
3052     while (i < mbs){
3053       nexti = jl[i]; /* next row to be added to k_th row */
3054 
3055       /* compute multiplier, update D(k) and U(i,k) */
3056       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3057       uikdi = - ba[ili]*ba[i];
3058       dk += uikdi*ba[ili];
3059       ba[ili] = uikdi; /* -U(i,k) */
3060 
3061       /* add multiple of row i to k-th row ... */
3062       jmin = ili + 1; jmax = bi[i+1];
3063       if (jmin < jmax){
3064         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
3065         /* ... add i to row list for next nonzero entry */
3066         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3067         j     = bj[jmin];
3068         jl[i] = jl[j]; jl[j] = i; /* update jl */
3069       }
3070       i = nexti;
3071     }
3072 
3073     /* check for zero pivot and save diagoanl element */
3074     if (dk == 0.0){
3075       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
3076     }else if (PetscRealPart(dk) < 0){
3077       ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk);
3078     }
3079 
3080     /* save nonzero entries in k-th row of U ... */
3081     ba[k] = 1.0/dk;
3082     jmin = bi[k]; jmax = bi[k+1];
3083     if (jmin < jmax) {
3084       for (j=jmin; j<jmax; j++){
3085          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
3086       }
3087       /* ... add k to row list for first nonzero entry in k-th row */
3088       il[k] = jmin;
3089       i     = bj[jmin];
3090       jl[k] = jl[i]; jl[i] = k;
3091     }
3092   }
3093 
3094   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3095   ierr = PetscFree(il);CHKERRQ(ierr);
3096   ierr = PetscFree(jl);CHKERRQ(ierr);
3097   if (a->permute){
3098     ierr = PetscFree(aa);CHKERRQ(ierr);
3099   }
3100 
3101   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
3102   C->factor    = FACTOR_CHOLESKY;
3103   C->assembled = PETSC_TRUE;
3104   C->preallocated = PETSC_TRUE;
3105   PLogFlops(b->mbs);
3106   PetscFunctionReturn(0);
3107 }
3108 
3109 #undef __FUNC__
3110 #define __FUNC__ "MatCholeskyFactor_SeqSBAIJ"
3111 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f)
3112 {
3113   int ierr;
3114   Mat C;
3115 
3116   PetscFunctionBegin;
3117   ierr = MatCholeskyFactorSymbolic(A,perm,f,&C);CHKERRQ(ierr);
3118   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
3119   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
3120   PetscFunctionReturn(0);
3121 }
3122 
3123 
3124