xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 103535cf5aa016be7a0dbdf0f5c09f2e6e5a4916)
1 /* Using Modified Sparse Row (MSR) storage.
2 See page 85, "Iterative Methods ..." by Saad. */
3 
4 /*$Id: sbaijfact.c,v 1.44 2000/11/03 18:01:19 hzhang Exp hzhang $*/
5 /*
6     Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
7 */
8 #include "sbaij.h"
9 #include "src/mat/impls/baij/seq/baij.h"
10 #include "src/vec/vecimpl.h"
11 #include "src/inline/ilu.h"
12 #include "include/petscis.h"
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
16 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B)
17 {
18   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
19   int         *rip,ierr,i,mbs = a->mbs,*ai,*aj;
20   int         *jutmp,bs = a->bs,bs2=a->bs2;
21   int         m,nzi,realloc = 0;
22   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
23   /* PetscTruth  ident; */
24 
25   PetscFunctionBegin;
26   PetscValidHeaderSpecific(perm,IS_COOKIE);
27   if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,"matrix must be square");
28 
29   /* check whether perm is the identity mapping */
30   /*
31   ierr = ISView(perm, VIEWER_STDOUT_SELF);CHKERRA(ierr);
32   ierr = ISIdentity(perm,&ident);CHKERRQ(ierr);
33   printf("ident = %d\n", ident);
34   */
35   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
36   for (i=0; i<mbs; i++){
37     if (rip[i] != i){
38       a->permute = PETSC_TRUE;
39       /* printf("non-trivial perm\n"); */
40       break;
41     }
42   }
43 
44   if (!a->permute){ /* without permutation */
45     ai = a->i; aj = a->j;
46   } else {          /* non-trivial permutation */
47     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRA(ierr);
48     ai = a->inew; aj = a->jnew;
49   }
50 
51   /* initialization */
52   /* Don't know how many column pointers are needed so estimate.
53      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
54   iu   = (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(iu);
55   umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
56   ju   = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju);
57   iu[0] = mbs+1;
58   juptr = mbs;
59   jl =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
60   q  =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q);
61   for (i=0; i<mbs; i++){
62     jl[i] = mbs; q[i] = 0;
63   }
64 
65   /* for each row k */
66   for (k=0; k<mbs; k++){
67     nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
68     q[k] = mbs;
69     /* initialize nonzero structure of k-th row to row rip[k] of A */
70     jmin = ai[rip[k]];
71     jmax = ai[rip[k]+1];
72     for (j=jmin; j<jmax; j++){
73       vj = rip[aj[j]]; /* col. value */
74       if(vj > k){
75         qm = k;
76         do {
77           m  = qm; qm = q[m];
78         } while(qm < vj);
79         if (qm == vj) {
80           printf(" error: duplicate entry in A\n"); break;
81         }
82         nzk++;
83         q[m] = vj;
84         q[vj] = qm;
85       } /* if(vj > k) */
86     } /* for (j=jmin; j<jmax; j++) */
87 
88     /* modify nonzero structure of k-th row by computing fill-in
89        for each row i to be merged in */
90     i = k;
91     i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
92     /* printf(" next pivot row i=%d\n",i); */
93     while (i < mbs){
94       /* merge row i into k-th row */
95       nzi = iu[i+1] - (iu[i]+1);
96       jmin = iu[i] + 1; jmax = iu[i] + nzi;
97       qm = k;
98       for (j=jmin; j<jmax+1; j++){
99         vj = ju[j];
100         do {
101           m = qm; qm = q[m];
102         } while (qm < vj);
103         if (qm != vj){
104          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
105         }
106       }
107       i = jl[i]; /* next pivot row */
108     }
109 
110     /* add k to row list for first nonzero element in k-th row */
111     if (nzk > 0){
112       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
113       jl[k] = jl[i]; jl[i] = k;
114     }
115     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
116 
117     /* allocate more space to ju if needed */
118     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
119       /* estimate how much additional space we will need */
120       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
121       /* just double the memory each time */
122       maxadd = umax;
123       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
124       umax += maxadd;
125 
126       /* allocate a longer ju */
127       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
128       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
129       ierr  = PetscFree(ju);CHKERRQ(ierr);
130       ju    = jutmp;
131       realloc++; /* count how many times we realloc */
132     }
133 
134     /* save nonzero structure of k-th row in ju */
135     i=k;
136     jumin = juptr + 1; juptr += nzk;
137     for (j=jumin; j<juptr+1; j++){
138       i=q[i];
139       ju[j]=i;
140     }
141   }
142 
143   if (ai[mbs] != 0) {
144     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
145     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
146     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
147     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
148     PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
149   } else {
150      PLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
151   }
152 
153   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
154   ierr = PetscFree(q);CHKERRQ(ierr);
155   ierr = PetscFree(jl);CHKERRQ(ierr);
156 
157   /* put together the new matrix */
158   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
159   /* PLogObjectParent(*B,iperm); */
160   b = (Mat_SeqSBAIJ*)(*B)->data;
161   ierr = PetscFree(b->imax);CHKERRQ(ierr);
162   b->singlemalloc = PETSC_FALSE;
163   /* the next line frees the default space generated by the Create() */
164   ierr = PetscFree(b->a);CHKERRQ(ierr);
165   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
166   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a);
167   b->j          = ju;
168   b->i          = iu;
169   b->diag       = 0;
170   b->ilen       = 0;
171   b->imax       = 0;
172   b->row        = perm;
173   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
174   b->icol       = perm;
175   ierr          = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
176   b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
177   /* In b structure:  Free imax, ilen, old a, old j.
178      Allocate idnew, solve_work, new a, new j */
179   PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
180   b->s_maxnz = b->s_nz = iu[mbs];
181 
182   (*B)->factor                 = FACTOR_CHOLESKY;
183   (*B)->info.factor_mallocs    = realloc;
184   (*B)->info.fill_ratio_given  = f;
185   if (ai[mbs] != 0) {
186     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
187   } else {
188     (*B)->info.fill_ratio_needed = 0.0;
189   }
190 
191   PetscFunctionReturn(0);
192 }
193 
194 #undef __FUNC__
195 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
196 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
197 {
198   Mat                C = *B;
199   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
200   IS                 perm = b->row;
201   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
202   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
203   int                bs=a->bs,bs2 = a->bs2;
204   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
205   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
206   MatScalar          *W,*work;
207   int                *pivots;
208 
209   PetscFunctionBegin;
210 
211   /* initialization */
212   rtmp  = (MatScalar*)PetscMalloc(bs2*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
213   ierr  = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
214   il    = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
215   jl    = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
216   for (i=0; i<mbs; i++) {
217     jl[i] = mbs; il[0] = 0;
218   }
219   dk    = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(dk);
220   uik   = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(uik);
221   W     = (MatScalar*)PetscMalloc(bs2*sizeof(MatScalar));CHKPTRQ(W);
222   work  = (MatScalar*)PetscMalloc(bs*sizeof(MatScalar));CHKPTRQ(work);
223   pivots= (int*)PetscMalloc(bs*sizeof(int));CHKPTRQ(pivots);
224 
225   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
226 
227   /* check permutation */
228   if (!a->permute){
229     ai = a->i; aj = a->j; aa = a->a;
230   } else {
231     ai = a->inew; aj = a->jnew;
232     aa = (MatScalar*)PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
233     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
234     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
235     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
236 
237     for (i=0; i<mbs; i++){
238       jmin = ai[i]; jmax = ai[i+1];
239       for (j=jmin; j<jmax; j++){
240         while (a2anew[j] != j){
241           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
242           for (k1=0; k1<bs2; k1++){
243             dk[k1]       = aa[k*bs2+k1];
244             aa[k*bs2+k1] = aa[j*bs2+k1];
245             aa[j*bs2+k1] = dk[k1];
246           }
247         }
248         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
249         if (i > aj[j]){
250           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
251           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
252           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
253           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
254             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
255           }
256         }
257       }
258     }
259     ierr = PetscFree(a2anew);CHKERRA(ierr);
260   }
261 
262   /* for each row k */
263   for (k = 0; k<mbs; k++){
264 
265     /*initialize k-th row with elements nonzero in row perm(k) of A */
266     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
267     if (jmin < jmax) {
268       ap = aa + jmin*bs2;
269       for (j = jmin; j < jmax; j++){
270         vj = perm_ptr[aj[j]];         /* block col. index */
271         rtmp_ptr = rtmp + vj*bs2;
272         for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
273       }
274     }
275 
276     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
277     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
278     i = jl[k]; /* first row to be added to k_th row  */
279 
280     while (i < mbs){
281       nexti = jl[i]; /* next row to be added to k_th row */
282 
283       /* compute multiplier */
284       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
285 
286       /* uik = -inv(Di)*U_bar(i,k) */
287       diag = ba + i*bs2;
288       u    = ba + ili*bs2;
289       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
290       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
291 
292       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
293       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
294 
295       /* update -U(i,k) */
296       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
297 
298       /* add multiple of row i to k-th row ... */
299       jmin = ili + 1; jmax = bi[i+1];
300       if (jmin < jmax){
301         for (j=jmin; j<jmax; j++) {
302           /* rtmp += -U(i,k)^T * U_bar(i,j) */
303           rtmp_ptr = rtmp + bj[j]*bs2;
304           u = ba + j*bs2;
305           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
306         }
307 
308         /* ... add i to row list for next nonzero entry */
309         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
310         j     = bj[jmin];
311         jl[i] = jl[j]; jl[j] = i; /* update jl */
312       }
313       i = nexti;
314     }
315 
316     /* save nonzero entries in k-th row of U ... */
317 
318     /* invert diagonal block */
319     diag = ba+k*bs2;
320     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
321     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
322 
323     jmin = bi[k]; jmax = bi[k+1];
324     if (jmin < jmax) {
325       for (j=jmin; j<jmax; j++){
326          vj = bj[j];           /* block col. index of U */
327          u   = ba + j*bs2;
328          rtmp_ptr = rtmp + vj*bs2;
329          for (k1=0; k1<bs2; k1++){
330            *u++        = *rtmp_ptr;
331            *rtmp_ptr++ = 0.0;
332          }
333       }
334 
335       /* ... add k to row list for first nonzero entry in k-th row */
336       il[k] = jmin;
337       i     = bj[jmin];
338       jl[k] = jl[i]; jl[i] = k;
339     }
340   }
341 
342   ierr = PetscFree(rtmp);CHKERRQ(ierr);
343   ierr = PetscFree(il);CHKERRQ(ierr);
344   ierr = PetscFree(jl);CHKERRQ(ierr);
345   ierr = PetscFree(dk);CHKERRQ(ierr);
346   ierr = PetscFree(uik);CHKERRQ(ierr);
347   ierr = PetscFree(W);CHKERRQ(ierr);
348   ierr = PetscFree(work);CHKERRQ(ierr);
349   ierr = PetscFree(pivots);CHKERRQ(ierr);
350   if (a->permute){
351     ierr = PetscFree(aa);CHKERRQ(ierr);
352   }
353 
354   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
355   C->factor    = FACTOR_CHOLESKY;
356   C->assembled = PETSC_TRUE;
357   C->preallocated = PETSC_TRUE;
358   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
359   PetscFunctionReturn(0);
360 }
361 
362 /* Version for when blocks are 7 by 7 */
363 #undef __FUNC__
364 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7"
365 int MatCholeskyFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B)
366 {
367   Mat                C = *B;
368   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
369   IS                 perm = b->row;
370   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
371   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
372   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
373   MatScalar          *u,*d,*w,*wp;
374 
375   PetscFunctionBegin;
376   /* initialization */
377   w  = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w);
378   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
379   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
380   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
381   for (i=0; i<mbs; i++) {
382     jl[i] = mbs; il[0] = 0;
383   }
384   dk    = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(dk);
385   uik   = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(uik);
386   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
387 
388   /* check permutation */
389   if (!a->permute){
390     ai = a->i; aj = a->j; aa = a->a;
391   } else {
392     ai = a->inew; aj = a->jnew;
393     aa = (MatScalar*)PetscMalloc(49*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
394     ierr = PetscMemcpy(aa,a->a,49*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
395     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
396     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
397 
398     for (i=0; i<mbs; i++){
399       jmin = ai[i]; jmax = ai[i+1];
400       for (j=jmin; j<jmax; j++){
401         while (a2anew[j] != j){
402           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
403           for (k1=0; k1<49; k1++){
404             dk[k1]       = aa[k*49+k1];
405             aa[k*49+k1] = aa[j*49+k1];
406             aa[j*49+k1] = dk[k1];
407           }
408         }
409         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
410         if (i > aj[j]){
411           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
412           ap = aa + j*49;                     /* ptr to the beginning of j-th block of aa */
413           for (k=0; k<49; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
414           for (k=0; k<7; k++){               /* j-th block of aa <- dk^T */
415             for (k1=0; k1<7; k1++) *ap++ = dk[k + 7*k1];
416           }
417         }
418       }
419     }
420     ierr = PetscFree(a2anew);CHKERRA(ierr);
421   }
422 
423   /* for each row k */
424   for (k = 0; k<mbs; k++){
425 
426     /*initialize k-th row with elements nonzero in row perm(k) of A */
427     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
428     if (jmin < jmax) {
429       ap = aa + jmin*49;
430       for (j = jmin; j < jmax; j++){
431         vj = perm_ptr[aj[j]];         /* block col. index */
432         wp = w + vj*49;
433         for (i=0; i<49; i++) *wp++ = *ap++;
434       }
435     }
436 
437     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
438     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
439     i = jl[k]; /* first row to be added to k_th row  */
440 
441     while (i < mbs){
442       nexti = jl[i]; /* next row to be added to k_th row */
443 
444       /* compute multiplier */
445       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
446 
447       /* uik = -inv(Di)*U_bar(i,k) */
448       d = ba + i*49;
449       u    = ba + ili*49;
450 
451       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
452       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
453       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
454       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
455       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
456       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
457       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
458 
459       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
460       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
461       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
462       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
463       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
464       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
465       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
466 
467       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
468       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
469       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
470       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
471       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
472       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
473       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
474 
475       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
476       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
477       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
478       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
479       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
480       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
481       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
482 
483       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
484       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
485       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
486       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
487       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
488       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
489       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
490 
491       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
492       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
493       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
494       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
495       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
496       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
497       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
498 
499       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
500       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
501       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
502       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
503       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
504       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
505       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
506 
507       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
508       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
509       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
510       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
511       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
512       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
513       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
514       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
515 
516       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
517       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
518       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
519       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
520       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
521       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
522       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
523 
524       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
525       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
526       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
527       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
528       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
529       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
530       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
531 
532       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
533       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
534       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
535       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
536       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
537       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
538       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
539 
540       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
541       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
542       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
543       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
544       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
545       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
546       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
547 
548       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
549       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
550       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
551       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
552       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
553       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
554       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
555 
556       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
557       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
558       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
559       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
560       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
561       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
562       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
563 
564       /* update -U(i,k) */
565       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
566 
567       /* add multiple of row i to k-th row ... */
568       jmin = ili + 1; jmax = bi[i+1];
569       if (jmin < jmax){
570         for (j=jmin; j<jmax; j++) {
571           /* w += -U(i,k)^T * U_bar(i,j) */
572           wp = w + bj[j]*49;
573           u = ba + j*49;
574 
575           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
576           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
577           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
578           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
579           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
580           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
581           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
582 
583           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
584           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
585           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
586           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
587           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
588           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
589           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
590 
591           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
592           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
593           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
594           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
595           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
596           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
597           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
598 
599           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
600           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
601           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
602           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
603           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
604           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
605           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
606 
607           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
608           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
609           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
610           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
611           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
612           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
613           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
614 
615           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
616           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
617           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
618           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
619           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
620           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
621           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
622 
623           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
624           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
625           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
626           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
627           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
628           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
629           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
630         }
631 
632         /* ... add i to row list for next nonzero entry */
633         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
634         j     = bj[jmin];
635         jl[i] = jl[j]; jl[j] = i; /* update jl */
636       }
637       i = nexti;
638     }
639 
640     /* save nonzero entries in k-th row of U ... */
641 
642     /* invert diagonal block */
643     d = ba+k*49;
644     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
645     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
646 
647     jmin = bi[k]; jmax = bi[k+1];
648     if (jmin < jmax) {
649       for (j=jmin; j<jmax; j++){
650          vj = bj[j];           /* block col. index of U */
651          u   = ba + j*49;
652          wp = w + vj*49;
653          for (k1=0; k1<49; k1++){
654            *u++        = *wp;
655            *wp++ = 0.0;
656          }
657       }
658 
659       /* ... add k to row list for first nonzero entry in k-th row */
660       il[k] = jmin;
661       i     = bj[jmin];
662       jl[k] = jl[i]; jl[i] = k;
663     }
664   }
665 
666   ierr = PetscFree(w);CHKERRQ(ierr);
667   ierr = PetscFree(il);CHKERRQ(ierr);
668   ierr = PetscFree(jl);CHKERRQ(ierr);
669   ierr = PetscFree(dk);CHKERRQ(ierr);
670   ierr = PetscFree(uik);CHKERRQ(ierr);
671   if (a->permute){
672     ierr = PetscFree(aa);CHKERRQ(ierr);
673   }
674 
675   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
676   C->factor    = FACTOR_CHOLESKY;
677   C->assembled = PETSC_TRUE;
678   C->preallocated = PETSC_TRUE;
679   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
680   PetscFunctionReturn(0);
681 }
682 
683 /*
684       Version for when blocks are 7 by 7 Using natural ordering
685 */
686 #undef __FUNC__
687 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering"
688 int MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B)
689 {
690   Mat                C = *B;
691   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
692   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
693   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
694   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
695   MatScalar          *u,*d,*w,*wp;
696 
697   PetscFunctionBegin;
698   /* initialization */
699   w  = (MatScalar*)PetscMalloc(49*mbs*sizeof(MatScalar));CHKPTRQ(w);
700   ierr = PetscMemzero(w,49*mbs*sizeof(MatScalar));CHKERRQ(ierr);
701   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
702   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
703   for (i=0; i<mbs; i++) {
704     jl[i] = mbs; il[0] = 0;
705   }
706   dk    = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(dk);
707   uik   = (MatScalar*)PetscMalloc(49*sizeof(MatScalar));CHKPTRQ(uik);
708 
709   ai = a->i; aj = a->j; aa = a->a;
710 
711   /* for each row k */
712   for (k = 0; k<mbs; k++){
713 
714     /*initialize k-th row with elements nonzero in row k of A */
715     jmin = ai[k]; jmax = ai[k+1];
716     if (jmin < jmax) {
717       ap = aa + jmin*49;
718       for (j = jmin; j < jmax; j++){
719         vj = aj[j];         /* block col. index */
720         wp = w + vj*49;
721         for (i=0; i<49; i++) *wp++ = *ap++;
722       }
723     }
724 
725     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
726     ierr = PetscMemcpy(dk,w+k*49,49*sizeof(MatScalar));CHKERRQ(ierr);
727     i = jl[k]; /* first row to be added to k_th row  */
728 
729     while (i < mbs){
730       nexti = jl[i]; /* next row to be added to k_th row */
731 
732       /* compute multiplier */
733       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
734 
735       /* uik = -inv(Di)*U_bar(i,k) */
736       d = ba + i*49;
737       u    = ba + ili*49;
738 
739       uik[0] = -(d[0]*u[0] + d[7]*u[1]+ d[14]*u[2]+ d[21]*u[3]+ d[28]*u[4]+ d[35]*u[5]+ d[42]*u[6]);
740       uik[1] = -(d[1]*u[0] + d[8]*u[1]+ d[15]*u[2]+ d[22]*u[3]+ d[29]*u[4]+ d[36]*u[5]+ d[43]*u[6]);
741       uik[2] = -(d[2]*u[0] + d[9]*u[1]+ d[16]*u[2]+ d[23]*u[3]+ d[30]*u[4]+ d[37]*u[5]+ d[44]*u[6]);
742       uik[3] = -(d[3]*u[0]+ d[10]*u[1]+ d[17]*u[2]+ d[24]*u[3]+ d[31]*u[4]+ d[38]*u[5]+ d[45]*u[6]);
743       uik[4] = -(d[4]*u[0]+ d[11]*u[1]+ d[18]*u[2]+ d[25]*u[3]+ d[32]*u[4]+ d[39]*u[5]+ d[46]*u[6]);
744       uik[5] = -(d[5]*u[0]+ d[12]*u[1]+ d[19]*u[2]+ d[26]*u[3]+ d[33]*u[4]+ d[40]*u[5]+ d[47]*u[6]);
745       uik[6] = -(d[6]*u[0]+ d[13]*u[1]+ d[20]*u[2]+ d[27]*u[3]+ d[34]*u[4]+ d[41]*u[5]+ d[48]*u[6]);
746 
747       uik[7] = -(d[0]*u[7] + d[7]*u[8]+ d[14]*u[9]+ d[21]*u[10]+ d[28]*u[11]+ d[35]*u[12]+ d[42]*u[13]);
748       uik[8] = -(d[1]*u[7] + d[8]*u[8]+ d[15]*u[9]+ d[22]*u[10]+ d[29]*u[11]+ d[36]*u[12]+ d[43]*u[13]);
749       uik[9] = -(d[2]*u[7] + d[9]*u[8]+ d[16]*u[9]+ d[23]*u[10]+ d[30]*u[11]+ d[37]*u[12]+ d[44]*u[13]);
750       uik[10]= -(d[3]*u[7]+ d[10]*u[8]+ d[17]*u[9]+ d[24]*u[10]+ d[31]*u[11]+ d[38]*u[12]+ d[45]*u[13]);
751       uik[11]= -(d[4]*u[7]+ d[11]*u[8]+ d[18]*u[9]+ d[25]*u[10]+ d[32]*u[11]+ d[39]*u[12]+ d[46]*u[13]);
752       uik[12]= -(d[5]*u[7]+ d[12]*u[8]+ d[19]*u[9]+ d[26]*u[10]+ d[33]*u[11]+ d[40]*u[12]+ d[47]*u[13]);
753       uik[13]= -(d[6]*u[7]+ d[13]*u[8]+ d[20]*u[9]+ d[27]*u[10]+ d[34]*u[11]+ d[41]*u[12]+ d[48]*u[13]);
754 
755       uik[14]= -(d[0]*u[14] + d[7]*u[15]+ d[14]*u[16]+ d[21]*u[17]+ d[28]*u[18]+ d[35]*u[19]+ d[42]*u[20]);
756       uik[15]= -(d[1]*u[14] + d[8]*u[15]+ d[15]*u[16]+ d[22]*u[17]+ d[29]*u[18]+ d[36]*u[19]+ d[43]*u[20]);
757       uik[16]= -(d[2]*u[14] + d[9]*u[15]+ d[16]*u[16]+ d[23]*u[17]+ d[30]*u[18]+ d[37]*u[19]+ d[44]*u[20]);
758       uik[17]= -(d[3]*u[14]+ d[10]*u[15]+ d[17]*u[16]+ d[24]*u[17]+ d[31]*u[18]+ d[38]*u[19]+ d[45]*u[20]);
759       uik[18]= -(d[4]*u[14]+ d[11]*u[15]+ d[18]*u[16]+ d[25]*u[17]+ d[32]*u[18]+ d[39]*u[19]+ d[46]*u[20]);
760       uik[19]= -(d[5]*u[14]+ d[12]*u[15]+ d[19]*u[16]+ d[26]*u[17]+ d[33]*u[18]+ d[40]*u[19]+ d[47]*u[20]);
761       uik[20]= -(d[6]*u[14]+ d[13]*u[15]+ d[20]*u[16]+ d[27]*u[17]+ d[34]*u[18]+ d[41]*u[19]+ d[48]*u[20]);
762 
763       uik[21]= -(d[0]*u[21] + d[7]*u[22]+ d[14]*u[23]+ d[21]*u[24]+ d[28]*u[25]+ d[35]*u[26]+ d[42]*u[27]);
764       uik[22]= -(d[1]*u[21] + d[8]*u[22]+ d[15]*u[23]+ d[22]*u[24]+ d[29]*u[25]+ d[36]*u[26]+ d[43]*u[27]);
765       uik[23]= -(d[2]*u[21] + d[9]*u[22]+ d[16]*u[23]+ d[23]*u[24]+ d[30]*u[25]+ d[37]*u[26]+ d[44]*u[27]);
766       uik[24]= -(d[3]*u[21]+ d[10]*u[22]+ d[17]*u[23]+ d[24]*u[24]+ d[31]*u[25]+ d[38]*u[26]+ d[45]*u[27]);
767       uik[25]= -(d[4]*u[21]+ d[11]*u[22]+ d[18]*u[23]+ d[25]*u[24]+ d[32]*u[25]+ d[39]*u[26]+ d[46]*u[27]);
768       uik[26]= -(d[5]*u[21]+ d[12]*u[22]+ d[19]*u[23]+ d[26]*u[24]+ d[33]*u[25]+ d[40]*u[26]+ d[47]*u[27]);
769       uik[27]= -(d[6]*u[21]+ d[13]*u[22]+ d[20]*u[23]+ d[27]*u[24]+ d[34]*u[25]+ d[41]*u[26]+ d[48]*u[27]);
770 
771       uik[28]= -(d[0]*u[28] + d[7]*u[29]+ d[14]*u[30]+ d[21]*u[31]+ d[28]*u[32]+ d[35]*u[33]+ d[42]*u[34]);
772       uik[29]= -(d[1]*u[28] + d[8]*u[29]+ d[15]*u[30]+ d[22]*u[31]+ d[29]*u[32]+ d[36]*u[33]+ d[43]*u[34]);
773       uik[30]= -(d[2]*u[28] + d[9]*u[29]+ d[16]*u[30]+ d[23]*u[31]+ d[30]*u[32]+ d[37]*u[33]+ d[44]*u[34]);
774       uik[31]= -(d[3]*u[28]+ d[10]*u[29]+ d[17]*u[30]+ d[24]*u[31]+ d[31]*u[32]+ d[38]*u[33]+ d[45]*u[34]);
775       uik[32]= -(d[4]*u[28]+ d[11]*u[29]+ d[18]*u[30]+ d[25]*u[31]+ d[32]*u[32]+ d[39]*u[33]+ d[46]*u[34]);
776       uik[33]= -(d[5]*u[28]+ d[12]*u[29]+ d[19]*u[30]+ d[26]*u[31]+ d[33]*u[32]+ d[40]*u[33]+ d[47]*u[34]);
777       uik[34]= -(d[6]*u[28]+ d[13]*u[29]+ d[20]*u[30]+ d[27]*u[31]+ d[34]*u[32]+ d[41]*u[33]+ d[48]*u[34]);
778 
779       uik[35]= -(d[0]*u[35] + d[7]*u[36]+ d[14]*u[37]+ d[21]*u[38]+ d[28]*u[39]+ d[35]*u[40]+ d[42]*u[41]);
780       uik[36]= -(d[1]*u[35] + d[8]*u[36]+ d[15]*u[37]+ d[22]*u[38]+ d[29]*u[39]+ d[36]*u[40]+ d[43]*u[41]);
781       uik[37]= -(d[2]*u[35] + d[9]*u[36]+ d[16]*u[37]+ d[23]*u[38]+ d[30]*u[39]+ d[37]*u[40]+ d[44]*u[41]);
782       uik[38]= -(d[3]*u[35]+ d[10]*u[36]+ d[17]*u[37]+ d[24]*u[38]+ d[31]*u[39]+ d[38]*u[40]+ d[45]*u[41]);
783       uik[39]= -(d[4]*u[35]+ d[11]*u[36]+ d[18]*u[37]+ d[25]*u[38]+ d[32]*u[39]+ d[39]*u[40]+ d[46]*u[41]);
784       uik[40]= -(d[5]*u[35]+ d[12]*u[36]+ d[19]*u[37]+ d[26]*u[38]+ d[33]*u[39]+ d[40]*u[40]+ d[47]*u[41]);
785       uik[41]= -(d[6]*u[35]+ d[13]*u[36]+ d[20]*u[37]+ d[27]*u[38]+ d[34]*u[39]+ d[41]*u[40]+ d[48]*u[41]);
786 
787       uik[42]= -(d[0]*u[42] + d[7]*u[43]+ d[14]*u[44]+ d[21]*u[45]+ d[28]*u[46]+ d[35]*u[47]+ d[42]*u[48]);
788       uik[43]= -(d[1]*u[42] + d[8]*u[43]+ d[15]*u[44]+ d[22]*u[45]+ d[29]*u[46]+ d[36]*u[47]+ d[43]*u[48]);
789       uik[44]= -(d[2]*u[42] + d[9]*u[43]+ d[16]*u[44]+ d[23]*u[45]+ d[30]*u[46]+ d[37]*u[47]+ d[44]*u[48]);
790       uik[45]= -(d[3]*u[42]+ d[10]*u[43]+ d[17]*u[44]+ d[24]*u[45]+ d[31]*u[46]+ d[38]*u[47]+ d[45]*u[48]);
791       uik[46]= -(d[4]*u[42]+ d[11]*u[43]+ d[18]*u[44]+ d[25]*u[45]+ d[32]*u[46]+ d[39]*u[47]+ d[46]*u[48]);
792       uik[47]= -(d[5]*u[42]+ d[12]*u[43]+ d[19]*u[44]+ d[26]*u[45]+ d[33]*u[46]+ d[40]*u[47]+ d[47]*u[48]);
793       uik[48]= -(d[6]*u[42]+ d[13]*u[43]+ d[20]*u[44]+ d[27]*u[45]+ d[34]*u[46]+ d[41]*u[47]+ d[48]*u[48]);
794 
795       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
796       dk[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
797       dk[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
798       dk[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
799       dk[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
800       dk[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
801       dk[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
802       dk[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
803 
804       dk[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
805       dk[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
806       dk[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
807       dk[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
808       dk[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
809       dk[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
810       dk[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
811 
812       dk[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
813       dk[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
814       dk[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
815       dk[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
816       dk[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
817       dk[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
818       dk[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
819 
820       dk[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
821       dk[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
822       dk[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
823       dk[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
824       dk[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
825       dk[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
826       dk[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
827 
828       dk[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
829       dk[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
830       dk[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
831       dk[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
832       dk[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
833       dk[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
834       dk[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
835 
836       dk[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
837       dk[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
838       dk[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
839       dk[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
840       dk[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
841       dk[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
842       dk[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
843 
844       dk[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
845       dk[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
846       dk[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
847       dk[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
848       dk[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
849       dk[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
850       dk[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
851 
852       /* update -U(i,k) */
853       ierr = PetscMemcpy(ba+ili*49,uik,49*sizeof(MatScalar));CHKERRQ(ierr);
854 
855       /* add multiple of row i to k-th row ... */
856       jmin = ili + 1; jmax = bi[i+1];
857       if (jmin < jmax){
858         for (j=jmin; j<jmax; j++) {
859           /* w += -U(i,k)^T * U_bar(i,j) */
860           wp = w + bj[j]*49;
861           u = ba + j*49;
862 
863           wp[0]+=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6];
864           wp[1]+=  uik[7]*u[0] + uik[8]*u[1] + uik[9]*u[2]+ uik[10]*u[3]+ uik[11]*u[4]+ uik[12]*u[5]+ uik[13]*u[6];
865           wp[2]+= uik[14]*u[0]+ uik[15]*u[1]+ uik[16]*u[2]+ uik[17]*u[3]+ uik[18]*u[4]+ uik[19]*u[5]+ uik[20]*u[6];
866           wp[3]+= uik[21]*u[0]+ uik[22]*u[1]+ uik[23]*u[2]+ uik[24]*u[3]+ uik[25]*u[4]+ uik[26]*u[5]+ uik[27]*u[6];
867           wp[4]+= uik[28]*u[0]+ uik[29]*u[1]+ uik[30]*u[2]+ uik[31]*u[3]+ uik[32]*u[4]+ uik[33]*u[5]+ uik[34]*u[6];
868           wp[5]+= uik[35]*u[0]+ uik[36]*u[1]+ uik[37]*u[2]+ uik[38]*u[3]+ uik[39]*u[4]+ uik[40]*u[5]+ uik[41]*u[6];
869           wp[6]+= uik[42]*u[0]+ uik[43]*u[1]+ uik[44]*u[2]+ uik[45]*u[3]+ uik[46]*u[4]+ uik[47]*u[5]+ uik[48]*u[6];
870 
871           wp[7]+=  uik[0]*u[7] + uik[1]*u[8] + uik[2]*u[9] + uik[3]*u[10] + uik[4]*u[11] + uik[5]*u[12] + uik[6]*u[13];
872           wp[8]+=  uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13];
873           wp[9]+= uik[14]*u[7]+ uik[15]*u[8]+ uik[16]*u[9]+ uik[17]*u[10]+ uik[18]*u[11]+ uik[19]*u[12]+ uik[20]*u[13];
874           wp[10]+=uik[21]*u[7]+ uik[22]*u[8]+ uik[23]*u[9]+ uik[24]*u[10]+ uik[25]*u[11]+ uik[26]*u[12]+ uik[27]*u[13];
875           wp[11]+=uik[28]*u[7]+ uik[29]*u[8]+ uik[30]*u[9]+ uik[31]*u[10]+ uik[32]*u[11]+ uik[33]*u[12]+ uik[34]*u[13];
876           wp[12]+=uik[35]*u[7]+ uik[36]*u[8]+ uik[37]*u[9]+ uik[38]*u[10]+ uik[39]*u[11]+ uik[40]*u[12]+ uik[41]*u[13];
877           wp[13]+=uik[42]*u[7]+ uik[43]*u[8]+ uik[44]*u[9]+ uik[45]*u[10]+ uik[46]*u[11]+ uik[47]*u[12]+ uik[48]*u[13];
878 
879           wp[14]+=  uik[0]*u[14] + uik[1]*u[15] + uik[2]*u[16] + uik[3]*u[17] + uik[4]*u[18] + uik[5]*u[19] + uik[6]*u[20];
880           wp[15]+=  uik[7]*u[14] + uik[8]*u[15] + uik[9]*u[16]+ uik[10]*u[17]+ uik[11]*u[18]+ uik[12]*u[19]+ uik[13]*u[20];
881           wp[16]+= uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20];
882           wp[17]+= uik[21]*u[14]+ uik[22]*u[15]+ uik[23]*u[16]+ uik[24]*u[17]+ uik[25]*u[18]+ uik[26]*u[19]+ uik[27]*u[20];
883           wp[18]+= uik[28]*u[14]+ uik[29]*u[15]+ uik[30]*u[16]+ uik[31]*u[17]+ uik[32]*u[18]+ uik[33]*u[19]+ uik[34]*u[20];
884           wp[19]+= uik[35]*u[14]+ uik[36]*u[15]+ uik[37]*u[16]+ uik[38]*u[17]+ uik[39]*u[18]+ uik[40]*u[19]+ uik[41]*u[20];
885           wp[20]+= uik[42]*u[14]+ uik[43]*u[15]+ uik[44]*u[16]+ uik[45]*u[17]+ uik[46]*u[18]+ uik[47]*u[19]+ uik[48]*u[20];
886 
887           wp[21]+=  uik[0]*u[21] + uik[1]*u[22] + uik[2]*u[23] + uik[3]*u[24] + uik[4]*u[25] + uik[5]*u[26] + uik[6]*u[27];
888           wp[22]+=  uik[7]*u[21] + uik[8]*u[22] + uik[9]*u[23]+ uik[10]*u[24]+ uik[11]*u[25]+ uik[12]*u[26]+ uik[13]*u[27];
889           wp[23]+= uik[14]*u[21]+ uik[15]*u[22]+ uik[16]*u[23]+ uik[17]*u[24]+ uik[18]*u[25]+ uik[19]*u[26]+ uik[20]*u[27];
890           wp[24]+= uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27];
891           wp[25]+= uik[28]*u[21]+ uik[29]*u[22]+ uik[30]*u[23]+ uik[31]*u[24]+ uik[32]*u[25]+ uik[33]*u[26]+ uik[34]*u[27];
892           wp[26]+= uik[35]*u[21]+ uik[36]*u[22]+ uik[37]*u[23]+ uik[38]*u[24]+ uik[39]*u[25]+ uik[40]*u[26]+ uik[41]*u[27];
893           wp[27]+= uik[42]*u[21]+ uik[43]*u[22]+ uik[44]*u[23]+ uik[45]*u[24]+ uik[46]*u[25]+ uik[47]*u[26]+ uik[48]*u[27];
894 
895           wp[28]+=  uik[0]*u[28] + uik[1]*u[29] + uik[2]*u[30] + uik[3]*u[31] + uik[4]*u[32] + uik[5]*u[33] + uik[6]*u[34];
896           wp[29]+=  uik[7]*u[28] + uik[8]*u[29] + uik[9]*u[30]+ uik[10]*u[31]+ uik[11]*u[32]+ uik[12]*u[33]+ uik[13]*u[34];
897           wp[30]+= uik[14]*u[28]+ uik[15]*u[29]+ uik[16]*u[30]+ uik[17]*u[31]+ uik[18]*u[32]+ uik[19]*u[33]+ uik[20]*u[34];
898           wp[31]+= uik[21]*u[28]+ uik[22]*u[29]+ uik[23]*u[30]+ uik[24]*u[31]+ uik[25]*u[32]+ uik[26]*u[33]+ uik[27]*u[34];
899           wp[32]+= uik[28]*u[28]+ uik[29]*u[29]+ uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34];
900           wp[33]+= uik[35]*u[28]+ uik[36]*u[29]+ uik[37]*u[30]+ uik[38]*u[31]+ uik[39]*u[32]+ uik[40]*u[33]+ uik[41]*u[34];
901           wp[34]+= uik[42]*u[28]+ uik[43]*u[29]+ uik[44]*u[30]+ uik[45]*u[31]+ uik[46]*u[32]+ uik[47]*u[33]+ uik[48]*u[34];
902 
903           wp[35]+=  uik[0]*u[35] + uik[1]*u[36] + uik[2]*u[37] + uik[3]*u[38] + uik[4]*u[39] + uik[5]*u[40] + uik[6]*u[41];
904           wp[36]+=  uik[7]*u[35] + uik[8]*u[36] + uik[9]*u[37]+ uik[10]*u[38]+ uik[11]*u[39]+ uik[12]*u[40]+ uik[13]*u[41];
905           wp[37]+= uik[14]*u[35]+ uik[15]*u[36]+ uik[16]*u[37]+ uik[17]*u[38]+ uik[18]*u[39]+ uik[19]*u[40]+ uik[20]*u[41];
906           wp[38]+= uik[21]*u[35]+ uik[22]*u[36]+ uik[23]*u[37]+ uik[24]*u[38]+ uik[25]*u[39]+ uik[26]*u[40]+ uik[27]*u[41];
907           wp[39]+= uik[28]*u[35]+ uik[29]*u[36]+ uik[30]*u[37]+ uik[31]*u[38]+ uik[32]*u[39]+ uik[33]*u[40]+ uik[34]*u[41];
908           wp[40]+= uik[35]*u[35]+ uik[36]*u[36]+ uik[37]*u[37]+ uik[38]*u[38]+ uik[39]*u[39]+ uik[40]*u[40]+ uik[41]*u[41];
909           wp[41]+= uik[42]*u[35]+ uik[43]*u[36]+ uik[44]*u[37]+ uik[45]*u[38]+ uik[46]*u[39]+ uik[47]*u[40]+ uik[48]*u[41];
910 
911           wp[42]+=  uik[0]*u[42] + uik[1]*u[43] + uik[2]*u[44] + uik[3]*u[45] + uik[4]*u[46] + uik[5]*u[47] + uik[6]*u[48];
912           wp[43]+=  uik[7]*u[42] + uik[8]*u[43] + uik[9]*u[44]+ uik[10]*u[45]+ uik[11]*u[46]+ uik[12]*u[47]+ uik[13]*u[48];
913           wp[44]+= uik[14]*u[42]+ uik[15]*u[43]+ uik[16]*u[44]+ uik[17]*u[45]+ uik[18]*u[46]+ uik[19]*u[47]+ uik[20]*u[48];
914           wp[45]+= uik[21]*u[42]+ uik[22]*u[43]+ uik[23]*u[44]+ uik[24]*u[45]+ uik[25]*u[46]+ uik[26]*u[47]+ uik[27]*u[48];
915           wp[46]+= uik[28]*u[42]+ uik[29]*u[43]+ uik[30]*u[44]+ uik[31]*u[45]+ uik[32]*u[46]+ uik[33]*u[47]+ uik[34]*u[48];
916           wp[47]+= uik[35]*u[42]+ uik[36]*u[43]+ uik[37]*u[44]+ uik[38]*u[45]+ uik[39]*u[46]+ uik[40]*u[47]+ uik[41]*u[48];
917           wp[48]+= uik[42]*u[42]+ uik[43]*u[43]+ uik[44]*u[44]+ uik[45]*u[45]+ uik[46]*u[46]+ uik[47]*u[47]+ uik[48]*u[48];
918         }
919 
920         /* ... add i to row list for next nonzero entry */
921         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
922         j     = bj[jmin];
923         jl[i] = jl[j]; jl[j] = i; /* update jl */
924       }
925       i = nexti;
926     }
927 
928     /* save nonzero entries in k-th row of U ... */
929 
930     /* invert diagonal block */
931     d = ba+k*49;
932     ierr = PetscMemcpy(d,dk,49*sizeof(MatScalar));CHKERRQ(ierr);
933     ierr = Kernel_A_gets_inverse_A_7(d);CHKERRQ(ierr);
934 
935     jmin = bi[k]; jmax = bi[k+1];
936     if (jmin < jmax) {
937       for (j=jmin; j<jmax; j++){
938          vj = bj[j];           /* block col. index of U */
939          u   = ba + j*49;
940          wp = w + vj*49;
941          for (k1=0; k1<49; k1++){
942            *u++        = *wp;
943            *wp++ = 0.0;
944          }
945       }
946 
947       /* ... add k to row list for first nonzero entry in k-th row */
948       il[k] = jmin;
949       i     = bj[jmin];
950       jl[k] = jl[i]; jl[i] = k;
951     }
952   }
953 
954   ierr = PetscFree(w);CHKERRQ(ierr);
955   ierr = PetscFree(il);CHKERRQ(ierr);
956   ierr = PetscFree(jl);CHKERRQ(ierr);
957   ierr = PetscFree(dk);CHKERRQ(ierr);
958   ierr = PetscFree(uik);CHKERRQ(ierr);
959 
960   C->factor    = FACTOR_CHOLESKY;
961   C->assembled = PETSC_TRUE;
962   C->preallocated = PETSC_TRUE;
963   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
964   PetscFunctionReturn(0);
965 }
966 
967 /* Version for when blocks are 6 by 6 */
968 #undef __FUNC__
969 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6"
970 int MatCholeskyFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B)
971 {
972   Mat                C = *B;
973   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
974   IS                 perm = b->row;
975   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
976   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
977   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
978   MatScalar          *u,*d,*w,*wp;
979 
980   PetscFunctionBegin;
981   /* initialization */
982   w  = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w);
983   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
984   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
985   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
986   for (i=0; i<mbs; i++) {
987     jl[i] = mbs; il[0] = 0;
988   }
989   dk    = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(dk);
990   uik   = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(uik);
991   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
992 
993   /* check permutation */
994   if (!a->permute){
995     ai = a->i; aj = a->j; aa = a->a;
996   } else {
997     ai = a->inew; aj = a->jnew;
998     aa = (MatScalar*)PetscMalloc(36*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
999     ierr = PetscMemcpy(aa,a->a,36*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1000     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
1001     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1002 
1003     for (i=0; i<mbs; i++){
1004       jmin = ai[i]; jmax = ai[i+1];
1005       for (j=jmin; j<jmax; j++){
1006         while (a2anew[j] != j){
1007           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1008           for (k1=0; k1<36; k1++){
1009             dk[k1]       = aa[k*36+k1];
1010             aa[k*36+k1] = aa[j*36+k1];
1011             aa[j*36+k1] = dk[k1];
1012           }
1013         }
1014         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1015         if (i > aj[j]){
1016           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1017           ap = aa + j*36;                     /* ptr to the beginning of j-th block of aa */
1018           for (k=0; k<36; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1019           for (k=0; k<6; k++){               /* j-th block of aa <- dk^T */
1020             for (k1=0; k1<6; k1++) *ap++ = dk[k + 6*k1];
1021           }
1022         }
1023       }
1024     }
1025     ierr = PetscFree(a2anew);CHKERRA(ierr);
1026   }
1027 
1028   /* for each row k */
1029   for (k = 0; k<mbs; k++){
1030 
1031     /*initialize k-th row with elements nonzero in row perm(k) of A */
1032     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1033     if (jmin < jmax) {
1034       ap = aa + jmin*36;
1035       for (j = jmin; j < jmax; j++){
1036         vj = perm_ptr[aj[j]];         /* block col. index */
1037         wp = w + vj*36;
1038         for (i=0; i<36; i++) *wp++ = *ap++;
1039       }
1040     }
1041 
1042     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1043     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1044     i = jl[k]; /* first row to be added to k_th row  */
1045 
1046     while (i < mbs){
1047       nexti = jl[i]; /* next row to be added to k_th row */
1048 
1049       /* compute multiplier */
1050       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1051 
1052       /* uik = -inv(Di)*U_bar(i,k) */
1053       d = ba + i*36;
1054       u    = ba + ili*36;
1055 
1056       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1057       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1058       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1059       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1060       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1061       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1062 
1063       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1064       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1065       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1066       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1067       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1068       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1069 
1070       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1071       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1072       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1073       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1074       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1075       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1076 
1077       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1078       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1079       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1080       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1081       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1082       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1083 
1084       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1085       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1086       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1087       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1088       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1089       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1090 
1091       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1092       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1093       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1094       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1095       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1096       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1097 
1098       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1099       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1100       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1101       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1102       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1103       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1104       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1105 
1106       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1107       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1108       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1109       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1110       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1111       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1112 
1113       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1114       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1115       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1116       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1117       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1118       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1119 
1120       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1121       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1122       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1123       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1124       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1125       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1126 
1127       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1128       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1129       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1130       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1131       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1132       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1133 
1134       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1135       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1136       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1137       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1138       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1139       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1140 
1141       /* update -U(i,k) */
1142       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1143 
1144       /* add multiple of row i to k-th row ... */
1145       jmin = ili + 1; jmax = bi[i+1];
1146       if (jmin < jmax){
1147         for (j=jmin; j<jmax; j++) {
1148           /* w += -U(i,k)^T * U_bar(i,j) */
1149           wp = w + bj[j]*36;
1150           u = ba + j*36;
1151           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1152           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1153           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1154           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1155           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1156           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1157 
1158           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1159           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1160           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1161           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1162           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1163           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1164 
1165           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1166           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1167           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1168           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1169           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1170           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1171 
1172           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1173           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1174           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1175           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1176           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1177           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1178 
1179           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1180           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1181           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1182           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1183           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1184           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1185 
1186           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1187           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1188           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1189           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1190           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1191           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1192         }
1193 
1194         /* ... add i to row list for next nonzero entry */
1195         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1196         j     = bj[jmin];
1197         jl[i] = jl[j]; jl[j] = i; /* update jl */
1198       }
1199       i = nexti;
1200     }
1201 
1202     /* save nonzero entries in k-th row of U ... */
1203 
1204     /* invert diagonal block */
1205     d = ba+k*36;
1206     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1207     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1208 
1209     jmin = bi[k]; jmax = bi[k+1];
1210     if (jmin < jmax) {
1211       for (j=jmin; j<jmax; j++){
1212          vj = bj[j];           /* block col. index of U */
1213          u   = ba + j*36;
1214          wp = w + vj*36;
1215          for (k1=0; k1<36; k1++){
1216            *u++        = *wp;
1217            *wp++ = 0.0;
1218          }
1219       }
1220 
1221       /* ... add k to row list for first nonzero entry in k-th row */
1222       il[k] = jmin;
1223       i     = bj[jmin];
1224       jl[k] = jl[i]; jl[i] = k;
1225     }
1226   }
1227 
1228   ierr = PetscFree(w);CHKERRQ(ierr);
1229   ierr = PetscFree(il);CHKERRQ(ierr);
1230   ierr = PetscFree(jl);CHKERRQ(ierr);
1231   ierr = PetscFree(dk);CHKERRQ(ierr);
1232   ierr = PetscFree(uik);CHKERRQ(ierr);
1233   if (a->permute){
1234     ierr = PetscFree(aa);CHKERRQ(ierr);
1235   }
1236 
1237   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1238   C->factor    = FACTOR_CHOLESKY;
1239   C->assembled = PETSC_TRUE;
1240   C->preallocated = PETSC_TRUE;
1241   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 /*
1246       Version for when blocks are 6 by 6 Using natural ordering
1247 */
1248 #undef __FUNC__
1249 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering"
1250 int MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B)
1251 {
1252   Mat                C = *B;
1253   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1254   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1255   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1256   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1257   MatScalar          *u,*d,*w,*wp;
1258 
1259   PetscFunctionBegin;
1260   /* initialization */
1261   w  = (MatScalar*)PetscMalloc(36*mbs*sizeof(MatScalar));CHKPTRQ(w);
1262   ierr = PetscMemzero(w,36*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1263   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1264   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1265   for (i=0; i<mbs; i++) {
1266     jl[i] = mbs; il[0] = 0;
1267   }
1268   dk    = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(dk);
1269   uik   = (MatScalar*)PetscMalloc(36*sizeof(MatScalar));CHKPTRQ(uik);
1270 
1271   ai = a->i; aj = a->j; aa = a->a;
1272 
1273   /* for each row k */
1274   for (k = 0; k<mbs; k++){
1275 
1276     /*initialize k-th row with elements nonzero in row k of A */
1277     jmin = ai[k]; jmax = ai[k+1];
1278     if (jmin < jmax) {
1279       ap = aa + jmin*36;
1280       for (j = jmin; j < jmax; j++){
1281         vj = aj[j];         /* block col. index */
1282         wp = w + vj*36;
1283         for (i=0; i<36; i++) *wp++ = *ap++;
1284       }
1285     }
1286 
1287     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1288     ierr = PetscMemcpy(dk,w+k*36,36*sizeof(MatScalar));CHKERRQ(ierr);
1289     i = jl[k]; /* first row to be added to k_th row  */
1290 
1291     while (i < mbs){
1292       nexti = jl[i]; /* next row to be added to k_th row */
1293 
1294       /* compute multiplier */
1295       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1296 
1297       /* uik = -inv(Di)*U_bar(i,k) */
1298       d = ba + i*36;
1299       u    = ba + ili*36;
1300 
1301       uik[0] = -(d[0]*u[0] + d[6]*u[1] + d[12]*u[2] + d[18]*u[3] + d[24]*u[4] + d[30]*u[5]);
1302       uik[1] = -(d[1]*u[0] + d[7]*u[1] + d[13]*u[2] + d[19]*u[3] + d[25]*u[4] + d[31]*u[5]);
1303       uik[2] = -(d[2]*u[0] + d[8]*u[1] + d[14]*u[2] + d[20]*u[3] + d[26]*u[4] + d[32]*u[5]);
1304       uik[3] = -(d[3]*u[0] + d[9]*u[1] + d[15]*u[2] + d[21]*u[3] + d[27]*u[4] + d[33]*u[5]);
1305       uik[4] = -(d[4]*u[0]+ d[10]*u[1] + d[16]*u[2] + d[22]*u[3] + d[28]*u[4] + d[34]*u[5]);
1306       uik[5] = -(d[5]*u[0]+ d[11]*u[1] + d[17]*u[2] + d[23]*u[3] + d[29]*u[4] + d[35]*u[5]);
1307 
1308       uik[6] = -(d[0]*u[6] + d[6]*u[7] + d[12]*u[8] + d[18]*u[9] + d[24]*u[10] + d[30]*u[11]);
1309       uik[7] = -(d[1]*u[6] + d[7]*u[7] + d[13]*u[8] + d[19]*u[9] + d[25]*u[10] + d[31]*u[11]);
1310       uik[8] = -(d[2]*u[6] + d[8]*u[7] + d[14]*u[8] + d[20]*u[9] + d[26]*u[10] + d[32]*u[11]);
1311       uik[9] = -(d[3]*u[6] + d[9]*u[7] + d[15]*u[8] + d[21]*u[9] + d[27]*u[10] + d[33]*u[11]);
1312       uik[10]= -(d[4]*u[6]+ d[10]*u[7] + d[16]*u[8] + d[22]*u[9] + d[28]*u[10] + d[34]*u[11]);
1313       uik[11]= -(d[5]*u[6]+ d[11]*u[7] + d[17]*u[8] + d[23]*u[9] + d[29]*u[10] + d[35]*u[11]);
1314 
1315       uik[12] = -(d[0]*u[12] + d[6]*u[13] + d[12]*u[14] + d[18]*u[15] + d[24]*u[16] + d[30]*u[17]);
1316       uik[13] = -(d[1]*u[12] + d[7]*u[13] + d[13]*u[14] + d[19]*u[15] + d[25]*u[16] + d[31]*u[17]);
1317       uik[14] = -(d[2]*u[12] + d[8]*u[13] + d[14]*u[14] + d[20]*u[15] + d[26]*u[16] + d[32]*u[17]);
1318       uik[15] = -(d[3]*u[12] + d[9]*u[13] + d[15]*u[14] + d[21]*u[15] + d[27]*u[16] + d[33]*u[17]);
1319       uik[16] = -(d[4]*u[12]+ d[10]*u[13] + d[16]*u[14] + d[22]*u[15] + d[28]*u[16] + d[34]*u[17]);
1320       uik[17] = -(d[5]*u[12]+ d[11]*u[13] + d[17]*u[14] + d[23]*u[15] + d[29]*u[16] + d[35]*u[17]);
1321 
1322       uik[18] = -(d[0]*u[18] + d[6]*u[19] + d[12]*u[20] + d[18]*u[21] + d[24]*u[22] + d[30]*u[23]);
1323       uik[19] = -(d[1]*u[18] + d[7]*u[19] + d[13]*u[20] + d[19]*u[21] + d[25]*u[22] + d[31]*u[23]);
1324       uik[20] = -(d[2]*u[18] + d[8]*u[19] + d[14]*u[20] + d[20]*u[21] + d[26]*u[22] + d[32]*u[23]);
1325       uik[21] = -(d[3]*u[18] + d[9]*u[19] + d[15]*u[20] + d[21]*u[21] + d[27]*u[22] + d[33]*u[23]);
1326       uik[22] = -(d[4]*u[18]+ d[10]*u[19] + d[16]*u[20] + d[22]*u[21] + d[28]*u[22] + d[34]*u[23]);
1327       uik[23] = -(d[5]*u[18]+ d[11]*u[19] + d[17]*u[20] + d[23]*u[21] + d[29]*u[22] + d[35]*u[23]);
1328 
1329       uik[24] = -(d[0]*u[24] + d[6]*u[25] + d[12]*u[26] + d[18]*u[27] + d[24]*u[28] + d[30]*u[29]);
1330       uik[25] = -(d[1]*u[24] + d[7]*u[25] + d[13]*u[26] + d[19]*u[27] + d[25]*u[28] + d[31]*u[29]);
1331       uik[26] = -(d[2]*u[24] + d[8]*u[25] + d[14]*u[26] + d[20]*u[27] + d[26]*u[28] + d[32]*u[29]);
1332       uik[27] = -(d[3]*u[24] + d[9]*u[25] + d[15]*u[26] + d[21]*u[27] + d[27]*u[28] + d[33]*u[29]);
1333       uik[28] = -(d[4]*u[24]+ d[10]*u[25] + d[16]*u[26] + d[22]*u[27] + d[28]*u[28] + d[34]*u[29]);
1334       uik[29] = -(d[5]*u[24]+ d[11]*u[25] + d[17]*u[26] + d[23]*u[27] + d[29]*u[28] + d[35]*u[29]);
1335 
1336       uik[30] = -(d[0]*u[30] + d[6]*u[31] + d[12]*u[32] + d[18]*u[33] + d[24]*u[34] + d[30]*u[35]);
1337       uik[31] = -(d[1]*u[30] + d[7]*u[31] + d[13]*u[32] + d[19]*u[33] + d[25]*u[34] + d[31]*u[35]);
1338       uik[32] = -(d[2]*u[30] + d[8]*u[31] + d[14]*u[32] + d[20]*u[33] + d[26]*u[34] + d[32]*u[35]);
1339       uik[33] = -(d[3]*u[30] + d[9]*u[31] + d[15]*u[32] + d[21]*u[33] + d[27]*u[34] + d[33]*u[35]);
1340       uik[34] = -(d[4]*u[30]+ d[10]*u[31] + d[16]*u[32] + d[22]*u[33] + d[28]*u[34] + d[34]*u[35]);
1341       uik[35] = -(d[5]*u[30]+ d[11]*u[31] + d[17]*u[32] + d[23]*u[33] + d[29]*u[34] + d[35]*u[35]);
1342 
1343       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1344       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1345       dk[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1346       dk[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1347       dk[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1348       dk[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1349       dk[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1350 
1351       dk[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1352       dk[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1353       dk[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1354       dk[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1355       dk[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1356       dk[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1357 
1358       dk[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1359       dk[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1360       dk[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1361       dk[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1362       dk[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1363       dk[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1364 
1365       dk[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1366       dk[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1367       dk[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1368       dk[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1369       dk[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1370       dk[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1371 
1372       dk[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1373       dk[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1374       dk[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1375       dk[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1376       dk[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1377       dk[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1378 
1379       dk[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1380       dk[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1381       dk[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1382       dk[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1383       dk[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1384       dk[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1385 
1386       /* update -U(i,k) */
1387       ierr = PetscMemcpy(ba+ili*36,uik,36*sizeof(MatScalar));CHKERRQ(ierr);
1388 
1389       /* add multiple of row i to k-th row ... */
1390       jmin = ili + 1; jmax = bi[i+1];
1391       if (jmin < jmax){
1392         for (j=jmin; j<jmax; j++) {
1393           /* w += -U(i,k)^T * U_bar(i,j) */
1394           wp = w + bj[j]*36;
1395           u = ba + j*36;
1396           wp[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
1397           wp[1] +=  uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2] + uik[9]*u[3]+ uik[10]*u[4]+ uik[11]*u[5];
1398           wp[2] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3]+ uik[16]*u[4]+ uik[17]*u[5];
1399           wp[3] += uik[18]*u[0]+ uik[19]*u[1]+ uik[20]*u[2]+ uik[21]*u[3]+ uik[22]*u[4]+ uik[23]*u[5];
1400           wp[4] += uik[24]*u[0]+ uik[25]*u[1]+ uik[26]*u[2]+ uik[27]*u[3]+ uik[28]*u[4]+ uik[29]*u[5];
1401           wp[5] += uik[30]*u[0]+ uik[31]*u[1]+ uik[32]*u[2]+ uik[33]*u[3]+ uik[34]*u[4]+ uik[35]*u[5];
1402 
1403           wp[6] +=  uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8] + uik[3]*u[9] + uik[4]*u[10] + uik[5]*u[11];
1404           wp[7] +=  uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9]+ uik[10]*u[10]+ uik[11]*u[11];
1405           wp[8] += uik[12]*u[6]+ uik[13]*u[7]+ uik[14]*u[8]+ uik[15]*u[9]+ uik[16]*u[10]+ uik[17]*u[11];
1406           wp[9] += uik[18]*u[6]+ uik[19]*u[7]+ uik[20]*u[8]+ uik[21]*u[9]+ uik[22]*u[10]+ uik[23]*u[11];
1407           wp[10]+= uik[24]*u[6]+ uik[25]*u[7]+ uik[26]*u[8]+ uik[27]*u[9]+ uik[28]*u[10]+ uik[29]*u[11];
1408           wp[11]+= uik[30]*u[6]+ uik[31]*u[7]+ uik[32]*u[8]+ uik[33]*u[9]+ uik[34]*u[10]+ uik[35]*u[11];
1409 
1410           wp[12]+=  uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15] + uik[4]*u[16] + uik[5]*u[17];
1411           wp[13]+=  uik[6]*u[12] + uik[7]*u[13] + uik[8]*u[14] + uik[9]*u[15]+ uik[10]*u[16]+ uik[11]*u[17];
1412           wp[14]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17];
1413           wp[15]+= uik[18]*u[12]+ uik[19]*u[13]+ uik[20]*u[14]+ uik[21]*u[15]+ uik[22]*u[16]+ uik[23]*u[17];
1414           wp[16]+= uik[24]*u[12]+ uik[25]*u[13]+ uik[26]*u[14]+ uik[27]*u[15]+ uik[28]*u[16]+ uik[29]*u[17];
1415           wp[17]+= uik[30]*u[12]+ uik[31]*u[13]+ uik[32]*u[14]+ uik[33]*u[15]+ uik[34]*u[16]+ uik[35]*u[17];
1416 
1417           wp[18]+=  uik[0]*u[18] + uik[1]*u[19] + uik[2]*u[20] + uik[3]*u[21] + uik[4]*u[22] + uik[5]*u[23];
1418           wp[19]+=  uik[6]*u[18] + uik[7]*u[19] + uik[8]*u[20] + uik[9]*u[21]+ uik[10]*u[22]+ uik[11]*u[23];
1419           wp[20]+= uik[12]*u[18]+ uik[13]*u[19]+ uik[14]*u[20]+ uik[15]*u[21]+ uik[16]*u[22]+ uik[17]*u[23];
1420           wp[21]+= uik[18]*u[18]+ uik[19]*u[19]+ uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23];
1421           wp[22]+= uik[24]*u[18]+ uik[25]*u[19]+ uik[26]*u[20]+ uik[27]*u[21]+ uik[28]*u[22]+ uik[29]*u[23];
1422           wp[23]+= uik[30]*u[18]+ uik[31]*u[19]+ uik[32]*u[20]+ uik[33]*u[21]+ uik[34]*u[22]+ uik[35]*u[23];
1423 
1424           wp[24]+=  uik[0]*u[24] + uik[1]*u[25] + uik[2]*u[26] + uik[3]*u[27] + uik[4]*u[28] + uik[5]*u[29];
1425           wp[25]+=  uik[6]*u[24] + uik[7]*u[25] + uik[8]*u[26] + uik[9]*u[27]+ uik[10]*u[28]+ uik[11]*u[29];
1426           wp[26]+= uik[12]*u[24]+ uik[13]*u[25]+ uik[14]*u[26]+ uik[15]*u[27]+ uik[16]*u[28]+ uik[17]*u[29];
1427           wp[27]+= uik[18]*u[24]+ uik[19]*u[25]+ uik[20]*u[26]+ uik[21]*u[27]+ uik[22]*u[28]+ uik[23]*u[29];
1428           wp[28]+= uik[24]*u[24]+ uik[25]*u[25]+ uik[26]*u[26]+ uik[27]*u[27]+ uik[28]*u[28]+ uik[29]*u[29];
1429           wp[29]+= uik[30]*u[24]+ uik[31]*u[25]+ uik[32]*u[26]+ uik[33]*u[27]+ uik[34]*u[28]+ uik[35]*u[29];
1430 
1431           wp[30]+=  uik[0]*u[30] + uik[1]*u[31] + uik[2]*u[32] + uik[3]*u[33] + uik[4]*u[34] + uik[5]*u[35];
1432           wp[31]+=  uik[6]*u[30] + uik[7]*u[31] + uik[8]*u[32] + uik[9]*u[33]+ uik[10]*u[34]+ uik[11]*u[35];
1433           wp[32]+= uik[12]*u[30]+ uik[13]*u[31]+ uik[14]*u[32]+ uik[15]*u[33]+ uik[16]*u[34]+ uik[17]*u[35];
1434           wp[33]+= uik[18]*u[30]+ uik[19]*u[31]+ uik[20]*u[32]+ uik[21]*u[33]+ uik[22]*u[34]+ uik[23]*u[35];
1435           wp[34]+= uik[24]*u[30]+ uik[25]*u[31]+ uik[26]*u[32]+ uik[27]*u[33]+ uik[28]*u[34]+ uik[29]*u[35];
1436           wp[35]+= uik[30]*u[30]+ uik[31]*u[31]+ uik[32]*u[32]+ uik[33]*u[33]+ uik[34]*u[34]+ uik[35]*u[35];
1437         }
1438 
1439         /* ... add i to row list for next nonzero entry */
1440         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1441         j     = bj[jmin];
1442         jl[i] = jl[j]; jl[j] = i; /* update jl */
1443       }
1444       i = nexti;
1445     }
1446 
1447     /* save nonzero entries in k-th row of U ... */
1448 
1449     /* invert diagonal block */
1450     d = ba+k*36;
1451     ierr = PetscMemcpy(d,dk,36*sizeof(MatScalar));CHKERRQ(ierr);
1452     ierr = Kernel_A_gets_inverse_A_6(d);CHKERRQ(ierr);
1453 
1454     jmin = bi[k]; jmax = bi[k+1];
1455     if (jmin < jmax) {
1456       for (j=jmin; j<jmax; j++){
1457          vj = bj[j];           /* block col. index of U */
1458          u   = ba + j*36;
1459          wp = w + vj*36;
1460          for (k1=0; k1<36; k1++){
1461            *u++        = *wp;
1462            *wp++ = 0.0;
1463          }
1464       }
1465 
1466       /* ... add k to row list for first nonzero entry in k-th row */
1467       il[k] = jmin;
1468       i     = bj[jmin];
1469       jl[k] = jl[i]; jl[i] = k;
1470     }
1471   }
1472 
1473   ierr = PetscFree(w);CHKERRQ(ierr);
1474   ierr = PetscFree(il);CHKERRQ(ierr);
1475   ierr = PetscFree(jl);CHKERRQ(ierr);
1476   ierr = PetscFree(dk);CHKERRQ(ierr);
1477   ierr = PetscFree(uik);CHKERRQ(ierr);
1478 
1479   C->factor    = FACTOR_CHOLESKY;
1480   C->assembled = PETSC_TRUE;
1481   C->preallocated = PETSC_TRUE;
1482   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1483   PetscFunctionReturn(0);
1484 }
1485 
1486 /* Version for when blocks are 5 by 5  */
1487 #undef __FUNC__
1488 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5"
1489 int MatCholeskyFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B)
1490 {
1491   Mat                C = *B;
1492   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1493   IS                 perm = b->row;
1494   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1495   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1496   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1497   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1498 
1499   PetscFunctionBegin;
1500   /* initialization */
1501   rtmp  = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1502   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1503   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1504   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1505   for (i=0; i<mbs; i++) {
1506     jl[i] = mbs; il[0] = 0;
1507   }
1508   dk    = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(dk);
1509   uik   = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(uik);
1510   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
1511 
1512   /* check permutation */
1513   if (!a->permute){
1514     ai = a->i; aj = a->j; aa = a->a;
1515   } else {
1516     ai = a->inew; aj = a->jnew;
1517     aa = (MatScalar*)PetscMalloc(25*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
1518     ierr = PetscMemcpy(aa,a->a,25*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1519     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
1520     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1521 
1522     for (i=0; i<mbs; i++){
1523       jmin = ai[i]; jmax = ai[i+1];
1524       for (j=jmin; j<jmax; j++){
1525         while (a2anew[j] != j){
1526           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
1527           for (k1=0; k1<25; k1++){
1528             dk[k1]       = aa[k*25+k1];
1529             aa[k*25+k1] = aa[j*25+k1];
1530             aa[j*25+k1] = dk[k1];
1531           }
1532         }
1533         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
1534         if (i > aj[j]){
1535           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
1536           ap = aa + j*25;                     /* ptr to the beginning of j-th block of aa */
1537           for (k=0; k<25; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
1538           for (k=0; k<5; k++){               /* j-th block of aa <- dk^T */
1539             for (k1=0; k1<5; k1++) *ap++ = dk[k + 5*k1];
1540           }
1541         }
1542       }
1543     }
1544     ierr = PetscFree(a2anew);CHKERRA(ierr);
1545   }
1546 
1547   /* for each row k */
1548   for (k = 0; k<mbs; k++){
1549 
1550     /*initialize k-th row with elements nonzero in row perm(k) of A */
1551     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
1552     if (jmin < jmax) {
1553       ap = aa + jmin*25;
1554       for (j = jmin; j < jmax; j++){
1555         vj = perm_ptr[aj[j]];         /* block col. index */
1556         rtmp_ptr = rtmp + vj*25;
1557         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1558       }
1559     }
1560 
1561     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1562     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1563     i = jl[k]; /* first row to be added to k_th row  */
1564 
1565     while (i < mbs){
1566       nexti = jl[i]; /* next row to be added to k_th row */
1567 
1568       /* compute multiplier */
1569       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1570 
1571       /* uik = -inv(Di)*U_bar(i,k) */
1572       d = ba + i*25;
1573       u    = ba + ili*25;
1574 
1575       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1576       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1577       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1578       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1579       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1580 
1581       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1582       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1583       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1584       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1585       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1586 
1587       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1588       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1589       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1590       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1591       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1592 
1593       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1594       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1595       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1596       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1597       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1598 
1599       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1600       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1601       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1602       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1603       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1604 
1605 
1606       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1607       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1608       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1609       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1610       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1611       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1612 
1613       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1614       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1615       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1616       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1617       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1618 
1619       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1620       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1621       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1622       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1623       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1624 
1625       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1626       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1627       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1628       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1629       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1630 
1631       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1632       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1633       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1634       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1635       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1636 
1637       /* update -U(i,k) */
1638       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
1639 
1640       /* add multiple of row i to k-th row ... */
1641       jmin = ili + 1; jmax = bi[i+1];
1642       if (jmin < jmax){
1643         for (j=jmin; j<jmax; j++) {
1644           /* rtmp += -U(i,k)^T * U_bar(i,j) */
1645           rtmp_ptr = rtmp + bj[j]*25;
1646           u = ba + j*25;
1647           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1648           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1649           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1650           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1651           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1652 
1653           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1654           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1655           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1656           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1657           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1658 
1659           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1660           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1661           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1662           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1663           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1664 
1665           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1666           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1667           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1668           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1669           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1670 
1671           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1672           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1673           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1674           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1675           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1676         }
1677 
1678         /* ... add i to row list for next nonzero entry */
1679         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1680         j     = bj[jmin];
1681         jl[i] = jl[j]; jl[j] = i; /* update jl */
1682       }
1683       i = nexti;
1684     }
1685 
1686     /* save nonzero entries in k-th row of U ... */
1687 
1688     /* invert diagonal block */
1689     d = ba+k*25;
1690     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
1691     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
1692 
1693     jmin = bi[k]; jmax = bi[k+1];
1694     if (jmin < jmax) {
1695       for (j=jmin; j<jmax; j++){
1696          vj = bj[j];           /* block col. index of U */
1697          u   = ba + j*25;
1698          rtmp_ptr = rtmp + vj*25;
1699          for (k1=0; k1<25; k1++){
1700            *u++        = *rtmp_ptr;
1701            *rtmp_ptr++ = 0.0;
1702          }
1703       }
1704 
1705       /* ... add k to row list for first nonzero entry in k-th row */
1706       il[k] = jmin;
1707       i     = bj[jmin];
1708       jl[k] = jl[i]; jl[i] = k;
1709     }
1710   }
1711 
1712   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1713   ierr = PetscFree(il);CHKERRQ(ierr);
1714   ierr = PetscFree(jl);CHKERRQ(ierr);
1715   ierr = PetscFree(dk);CHKERRQ(ierr);
1716   ierr = PetscFree(uik);CHKERRQ(ierr);
1717   if (a->permute){
1718     ierr = PetscFree(aa);CHKERRQ(ierr);
1719   }
1720 
1721   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1722   C->factor    = FACTOR_CHOLESKY;
1723   C->assembled = PETSC_TRUE;
1724   C->preallocated = PETSC_TRUE;
1725   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 /*
1730       Version for when blocks are 5 by 5 Using natural ordering
1731 */
1732 #undef __FUNC__
1733 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering"
1734 int MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B)
1735 {
1736   Mat                C = *B;
1737   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1738   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1739   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1740   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1741   MatScalar          *u,*d,*rtmp,*rtmp_ptr;
1742 
1743   PetscFunctionBegin;
1744   /* initialization */
1745   rtmp  = (MatScalar*)PetscMalloc(25*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1746   ierr = PetscMemzero(rtmp,25*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1747   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1748   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1749   for (i=0; i<mbs; i++) {
1750     jl[i] = mbs; il[0] = 0;
1751   }
1752   dk    = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(dk);
1753   uik   = (MatScalar*)PetscMalloc(25*sizeof(MatScalar));CHKPTRQ(uik);
1754 
1755   ai = a->i; aj = a->j; aa = a->a;
1756 
1757   /* for each row k */
1758   for (k = 0; k<mbs; k++){
1759 
1760     /*initialize k-th row with elements nonzero in row k of A */
1761     jmin = ai[k]; jmax = ai[k+1];
1762     if (jmin < jmax) {
1763       ap = aa + jmin*25;
1764       for (j = jmin; j < jmax; j++){
1765         vj = aj[j];         /* block col. index */
1766         rtmp_ptr = rtmp + vj*25;
1767         for (i=0; i<25; i++) *rtmp_ptr++ = *ap++;
1768       }
1769     }
1770 
1771     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1772     ierr = PetscMemcpy(dk,rtmp+k*25,25*sizeof(MatScalar));CHKERRQ(ierr);
1773     i = jl[k]; /* first row to be added to k_th row  */
1774 
1775     while (i < mbs){
1776       nexti = jl[i]; /* next row to be added to k_th row */
1777 
1778       /* compute multiplier */
1779       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1780 
1781       /* uik = -inv(Di)*U_bar(i,k) */
1782       d = ba + i*25;
1783       u    = ba + ili*25;
1784 
1785       uik[0] = -(d[0]*u[0] + d[5]*u[1] + d[10]*u[2] + d[15]*u[3] + d[20]*u[4]);
1786       uik[1] = -(d[1]*u[0] + d[6]*u[1] + d[11]*u[2] + d[16]*u[3] + d[21]*u[4]);
1787       uik[2] = -(d[2]*u[0] + d[7]*u[1] + d[12]*u[2] + d[17]*u[3] + d[22]*u[4]);
1788       uik[3] = -(d[3]*u[0] + d[8]*u[1] + d[13]*u[2] + d[18]*u[3] + d[23]*u[4]);
1789       uik[4] = -(d[4]*u[0] + d[9]*u[1] + d[14]*u[2] + d[19]*u[3] + d[24]*u[4]);
1790 
1791       uik[5] = -(d[0]*u[5] + d[5]*u[6] + d[10]*u[7] + d[15]*u[8] + d[20]*u[9]);
1792       uik[6] = -(d[1]*u[5] + d[6]*u[6] + d[11]*u[7] + d[16]*u[8] + d[21]*u[9]);
1793       uik[7] = -(d[2]*u[5] + d[7]*u[6] + d[12]*u[7] + d[17]*u[8] + d[22]*u[9]);
1794       uik[8] = -(d[3]*u[5] + d[8]*u[6] + d[13]*u[7] + d[18]*u[8] + d[23]*u[9]);
1795       uik[9] = -(d[4]*u[5] + d[9]*u[6] + d[14]*u[7] + d[19]*u[8] + d[24]*u[9]);
1796 
1797       uik[10]= -(d[0]*u[10] + d[5]*u[11] + d[10]*u[12] + d[15]*u[13] + d[20]*u[14]);
1798       uik[11]= -(d[1]*u[10] + d[6]*u[11] + d[11]*u[12] + d[16]*u[13] + d[21]*u[14]);
1799       uik[12]= -(d[2]*u[10] + d[7]*u[11] + d[12]*u[12] + d[17]*u[13] + d[22]*u[14]);
1800       uik[13]= -(d[3]*u[10] + d[8]*u[11] + d[13]*u[12] + d[18]*u[13] + d[23]*u[14]);
1801       uik[14]= -(d[4]*u[10] + d[9]*u[11] + d[14]*u[12] + d[19]*u[13] + d[24]*u[14]);
1802 
1803       uik[15]= -(d[0]*u[15] + d[5]*u[16] + d[10]*u[17] + d[15]*u[18] + d[20]*u[19]);
1804       uik[16]= -(d[1]*u[15] + d[6]*u[16] + d[11]*u[17] + d[16]*u[18] + d[21]*u[19]);
1805       uik[17]= -(d[2]*u[15] + d[7]*u[16] + d[12]*u[17] + d[17]*u[18] + d[22]*u[19]);
1806       uik[18]= -(d[3]*u[15] + d[8]*u[16] + d[13]*u[17] + d[18]*u[18] + d[23]*u[19]);
1807       uik[19]= -(d[4]*u[15] + d[9]*u[16] + d[14]*u[17] + d[19]*u[18] + d[24]*u[19]);
1808 
1809       uik[20]= -(d[0]*u[20] + d[5]*u[21] + d[10]*u[22] + d[15]*u[23] + d[20]*u[24]);
1810       uik[21]= -(d[1]*u[20] + d[6]*u[21] + d[11]*u[22] + d[16]*u[23] + d[21]*u[24]);
1811       uik[22]= -(d[2]*u[20] + d[7]*u[21] + d[12]*u[22] + d[17]*u[23] + d[22]*u[24]);
1812       uik[23]= -(d[3]*u[20] + d[8]*u[21] + d[13]*u[22] + d[18]*u[23] + d[23]*u[24]);
1813       uik[24]= -(d[4]*u[20] + d[9]*u[21] + d[14]*u[22] + d[19]*u[23] + d[24]*u[24]);
1814 
1815 
1816       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
1817       dk[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1818       dk[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1819       dk[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1820       dk[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1821       dk[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1822 
1823       dk[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1824       dk[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1825       dk[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1826       dk[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1827       dk[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1828 
1829       dk[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1830       dk[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1831       dk[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1832       dk[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1833       dk[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1834 
1835       dk[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1836       dk[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1837       dk[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1838       dk[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1839       dk[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1840 
1841       dk[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1842       dk[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1843       dk[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1844       dk[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1845       dk[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1846 
1847       /* update -U(i,k) */
1848       ierr = PetscMemcpy(ba+ili*25,uik,25*sizeof(MatScalar));CHKERRQ(ierr);
1849 
1850       /* add multiple of row i to k-th row ... */
1851       jmin = ili + 1; jmax = bi[i+1];
1852       if (jmin < jmax){
1853         for (j=jmin; j<jmax; j++) {
1854           /* rtmp += -U(i,k)^T * U_bar(i,j) */
1855           rtmp_ptr = rtmp + bj[j]*25;
1856           u = ba + j*25;
1857           rtmp_ptr[0] +=  uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3] + uik[4]*u[4];
1858           rtmp_ptr[1] +=  uik[5]*u[0] + uik[6]*u[1] + uik[7]*u[2] + uik[8]*u[3] + uik[9]*u[4];
1859           rtmp_ptr[2] += uik[10]*u[0]+ uik[11]*u[1]+ uik[12]*u[2]+ uik[13]*u[3]+ uik[14]*u[4];
1860           rtmp_ptr[3] += uik[15]*u[0]+ uik[16]*u[1]+ uik[17]*u[2]+ uik[18]*u[3]+ uik[19]*u[4];
1861           rtmp_ptr[4] += uik[20]*u[0]+ uik[21]*u[1]+ uik[22]*u[2]+ uik[23]*u[3]+ uik[24]*u[4];
1862 
1863           rtmp_ptr[5] +=  uik[0]*u[5] + uik[1]*u[6] + uik[2]*u[7] + uik[3]*u[8] + uik[4]*u[9];
1864           rtmp_ptr[6] +=  uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8] + uik[9]*u[9];
1865           rtmp_ptr[7] += uik[10]*u[5]+ uik[11]*u[6]+ uik[12]*u[7]+ uik[13]*u[8]+ uik[14]*u[9];
1866           rtmp_ptr[8] += uik[15]*u[5]+ uik[16]*u[6]+ uik[17]*u[7]+ uik[18]*u[8]+ uik[19]*u[9];
1867           rtmp_ptr[9] += uik[20]*u[5]+ uik[21]*u[6]+ uik[22]*u[7]+ uik[23]*u[8]+ uik[24]*u[9];
1868 
1869           rtmp_ptr[10] +=  uik[0]*u[10] + uik[1]*u[11] + uik[2]*u[12] + uik[3]*u[13] + uik[4]*u[14];
1870           rtmp_ptr[11] +=  uik[5]*u[10] + uik[6]*u[11] + uik[7]*u[12] + uik[8]*u[13] + uik[9]*u[14];
1871           rtmp_ptr[12] += uik[10]*u[10]+ uik[11]*u[11]+ uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14];
1872           rtmp_ptr[13] += uik[15]*u[10]+ uik[16]*u[11]+ uik[17]*u[12]+ uik[18]*u[13]+ uik[19]*u[14];
1873           rtmp_ptr[14] += uik[20]*u[10]+ uik[21]*u[11]+ uik[22]*u[12]+ uik[23]*u[13]+ uik[24]*u[14];
1874 
1875           rtmp_ptr[15] +=  uik[0]*u[15] + uik[1]*u[16] + uik[2]*u[17] + uik[3]*u[18] + uik[4]*u[19];
1876           rtmp_ptr[16] +=  uik[5]*u[15] + uik[6]*u[16] + uik[7]*u[17] + uik[8]*u[18] + uik[9]*u[19];
1877           rtmp_ptr[17] += uik[10]*u[15]+ uik[11]*u[16]+ uik[12]*u[17]+ uik[13]*u[18]+ uik[14]*u[19];
1878           rtmp_ptr[18] += uik[15]*u[15]+ uik[16]*u[16]+ uik[17]*u[17]+ uik[18]*u[18]+ uik[19]*u[19];
1879           rtmp_ptr[19] += uik[20]*u[15]+ uik[21]*u[16]+ uik[22]*u[17]+ uik[23]*u[18]+ uik[24]*u[19];
1880 
1881           rtmp_ptr[20] +=  uik[0]*u[20] + uik[1]*u[21] + uik[2]*u[22] + uik[3]*u[23] + uik[4]*u[24];
1882           rtmp_ptr[21] +=  uik[5]*u[20] + uik[6]*u[21] + uik[7]*u[22] + uik[8]*u[23] + uik[9]*u[24];
1883           rtmp_ptr[22] += uik[10]*u[20]+ uik[11]*u[21]+ uik[12]*u[22]+ uik[13]*u[23]+ uik[14]*u[24];
1884           rtmp_ptr[23] += uik[15]*u[20]+ uik[16]*u[21]+ uik[17]*u[22]+ uik[18]*u[23]+ uik[19]*u[24];
1885           rtmp_ptr[24] += uik[20]*u[20]+ uik[21]*u[21]+ uik[22]*u[22]+ uik[23]*u[23]+ uik[24]*u[24];
1886         }
1887 
1888         /* ... add i to row list for next nonzero entry */
1889         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1890         j     = bj[jmin];
1891         jl[i] = jl[j]; jl[j] = i; /* update jl */
1892       }
1893       i = nexti;
1894     }
1895 
1896     /* save nonzero entries in k-th row of U ... */
1897 
1898     /* invert diagonal block */
1899     d = ba+k*25;
1900     ierr = PetscMemcpy(d,dk,25*sizeof(MatScalar));CHKERRQ(ierr);
1901     ierr = Kernel_A_gets_inverse_A_5(d);CHKERRQ(ierr);
1902 
1903     jmin = bi[k]; jmax = bi[k+1];
1904     if (jmin < jmax) {
1905       for (j=jmin; j<jmax; j++){
1906          vj = bj[j];           /* block col. index of U */
1907          u   = ba + j*25;
1908          rtmp_ptr = rtmp + vj*25;
1909          for (k1=0; k1<25; k1++){
1910            *u++        = *rtmp_ptr;
1911            *rtmp_ptr++ = 0.0;
1912          }
1913       }
1914 
1915       /* ... add k to row list for first nonzero entry in k-th row */
1916       il[k] = jmin;
1917       i     = bj[jmin];
1918       jl[k] = jl[i]; jl[i] = k;
1919     }
1920   }
1921 
1922   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1923   ierr = PetscFree(il);CHKERRQ(ierr);
1924   ierr = PetscFree(jl);CHKERRQ(ierr);
1925   ierr = PetscFree(dk);CHKERRQ(ierr);
1926   ierr = PetscFree(uik);CHKERRQ(ierr);
1927 
1928   C->factor    = FACTOR_CHOLESKY;
1929   C->assembled = PETSC_TRUE;
1930   C->preallocated = PETSC_TRUE;
1931   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1932   PetscFunctionReturn(0);
1933 }
1934 
1935 /*
1936       Version for when blocks are 4 by 4 Using natural ordering
1937 */
1938 #undef __FUNC__
1939 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering"
1940 int MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B)
1941 {
1942   Mat                C = *B;
1943   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1944   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1945   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1946   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
1947   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
1948 
1949   PetscFunctionBegin;
1950   /* initialization */
1951   rtmp  = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
1952   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1953   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
1954   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
1955   for (i=0; i<mbs; i++) {
1956     jl[i] = mbs; il[0] = 0;
1957   }
1958   dk    = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(dk);
1959   uik   = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(uik);
1960 
1961   ai = a->i; aj = a->j; aa = a->a;
1962 
1963   /* for each row k */
1964   for (k = 0; k<mbs; k++){
1965 
1966     /*initialize k-th row with elements nonzero in row k of A */
1967     jmin = ai[k]; jmax = ai[k+1];
1968     if (jmin < jmax) {
1969       ap = aa + jmin*16;
1970       for (j = jmin; j < jmax; j++){
1971         vj = aj[j];         /* block col. index */
1972         rtmp_ptr = rtmp + vj*16;
1973         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
1974       }
1975     }
1976 
1977     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1978     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
1979     i = jl[k]; /* first row to be added to k_th row  */
1980 
1981     while (i < mbs){
1982       nexti = jl[i]; /* next row to be added to k_th row */
1983 
1984       /* compute multiplier */
1985       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1986 
1987       /* uik = -inv(Di)*U_bar(i,k) */
1988       diag = ba + i*16;
1989       u    = ba + ili*16;
1990 
1991       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
1992       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
1993       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
1994       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
1995 
1996       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
1997       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
1998       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
1999       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2000 
2001       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2002       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2003       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2004       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2005 
2006       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2007       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2008       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2009       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2010 
2011       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2012       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2013       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2014       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2015       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2016 
2017       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2018       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2019       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2020       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2021 
2022       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2023       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2024       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2025       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2026 
2027       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2028       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2029       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2030       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2031 
2032       /* update -U(i,k) */
2033       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2034 
2035       /* add multiple of row i to k-th row ... */
2036       jmin = ili + 1; jmax = bi[i+1];
2037       if (jmin < jmax){
2038         for (j=jmin; j<jmax; j++) {
2039           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2040           rtmp_ptr = rtmp + bj[j]*16;
2041           u = ba + j*16;
2042           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2043           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2044           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2045           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2046 
2047           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2048           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2049           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2050           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2051 
2052           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2053           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2054           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2055           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2056 
2057           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2058           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2059           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2060           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2061         }
2062 
2063         /* ... add i to row list for next nonzero entry */
2064         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2065         j     = bj[jmin];
2066         jl[i] = jl[j]; jl[j] = i; /* update jl */
2067       }
2068       i = nexti;
2069     }
2070 
2071     /* save nonzero entries in k-th row of U ... */
2072 
2073     /* invert diagonal block */
2074     diag = ba+k*16;
2075     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2076     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2077 
2078     jmin = bi[k]; jmax = bi[k+1];
2079     if (jmin < jmax) {
2080       for (j=jmin; j<jmax; j++){
2081          vj = bj[j];           /* block col. index of U */
2082          u   = ba + j*16;
2083          rtmp_ptr = rtmp + vj*16;
2084          for (k1=0; k1<16; k1++){
2085            *u++        = *rtmp_ptr;
2086            *rtmp_ptr++ = 0.0;
2087          }
2088       }
2089 
2090       /* ... add k to row list for first nonzero entry in k-th row */
2091       il[k] = jmin;
2092       i     = bj[jmin];
2093       jl[k] = jl[i]; jl[i] = k;
2094     }
2095   }
2096 
2097   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2098   ierr = PetscFree(il);CHKERRQ(ierr);
2099   ierr = PetscFree(jl);CHKERRQ(ierr);
2100   ierr = PetscFree(dk);CHKERRQ(ierr);
2101   ierr = PetscFree(uik);CHKERRQ(ierr);
2102 
2103   C->factor    = FACTOR_CHOLESKY;
2104   C->assembled = PETSC_TRUE;
2105   C->preallocated = PETSC_TRUE;
2106   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2107   PetscFunctionReturn(0);
2108 }
2109 
2110 /* Version for when blocks are 4 by 4  */
2111 #undef __FUNC__
2112 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_4"
2113 int MatCholeskyFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B)
2114 {
2115   Mat                C = *B;
2116   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2117   IS                 perm = b->row;
2118   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2119   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2120   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2121   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2122 
2123   PetscFunctionBegin;
2124   /* initialization */
2125   rtmp  = (MatScalar*)PetscMalloc(16*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2126   ierr = PetscMemzero(rtmp,16*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2127   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2128   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2129   for (i=0; i<mbs; i++) {
2130     jl[i] = mbs; il[0] = 0;
2131   }
2132   dk    = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(dk);
2133   uik   = (MatScalar*)PetscMalloc(16*sizeof(MatScalar));CHKPTRQ(uik);
2134   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2135 
2136   /* check permutation */
2137   if (!a->permute){
2138     ai = a->i; aj = a->j; aa = a->a;
2139   } else {
2140     ai = a->inew; aj = a->jnew;
2141     aa = (MatScalar*)PetscMalloc(16*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2142     ierr = PetscMemcpy(aa,a->a,16*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2143     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2144     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2145 
2146     for (i=0; i<mbs; i++){
2147       jmin = ai[i]; jmax = ai[i+1];
2148       for (j=jmin; j<jmax; j++){
2149         while (a2anew[j] != j){
2150           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2151           for (k1=0; k1<16; k1++){
2152             dk[k1]       = aa[k*16+k1];
2153             aa[k*16+k1] = aa[j*16+k1];
2154             aa[j*16+k1] = dk[k1];
2155           }
2156         }
2157         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2158         if (i > aj[j]){
2159           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2160           ap = aa + j*16;                     /* ptr to the beginning of j-th block of aa */
2161           for (k=0; k<16; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2162           for (k=0; k<4; k++){               /* j-th block of aa <- dk^T */
2163             for (k1=0; k1<4; k1++) *ap++ = dk[k + 4*k1];
2164           }
2165         }
2166       }
2167     }
2168     ierr = PetscFree(a2anew);CHKERRA(ierr);
2169   }
2170 
2171   /* for each row k */
2172   for (k = 0; k<mbs; k++){
2173 
2174     /*initialize k-th row with elements nonzero in row perm(k) of A */
2175     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2176     if (jmin < jmax) {
2177       ap = aa + jmin*16;
2178       for (j = jmin; j < jmax; j++){
2179         vj = perm_ptr[aj[j]];         /* block col. index */
2180         rtmp_ptr = rtmp + vj*16;
2181         for (i=0; i<16; i++) *rtmp_ptr++ = *ap++;
2182       }
2183     }
2184 
2185     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2186     ierr = PetscMemcpy(dk,rtmp+k*16,16*sizeof(MatScalar));CHKERRQ(ierr);
2187     i = jl[k]; /* first row to be added to k_th row  */
2188 
2189     while (i < mbs){
2190       nexti = jl[i]; /* next row to be added to k_th row */
2191 
2192       /* compute multiplier */
2193       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2194 
2195       /* uik = -inv(Di)*U_bar(i,k) */
2196       diag = ba + i*16;
2197       u    = ba + ili*16;
2198 
2199       uik[0] = -(diag[0]*u[0] + diag[4]*u[1] + diag[8]*u[2] + diag[12]*u[3]);
2200       uik[1] = -(diag[1]*u[0] + diag[5]*u[1] + diag[9]*u[2] + diag[13]*u[3]);
2201       uik[2] = -(diag[2]*u[0] + diag[6]*u[1] + diag[10]*u[2]+ diag[14]*u[3]);
2202       uik[3] = -(diag[3]*u[0] + diag[7]*u[1] + diag[11]*u[2]+ diag[15]*u[3]);
2203 
2204       uik[4] = -(diag[0]*u[4] + diag[4]*u[5] + diag[8]*u[6] + diag[12]*u[7]);
2205       uik[5] = -(diag[1]*u[4] + diag[5]*u[5] + diag[9]*u[6] + diag[13]*u[7]);
2206       uik[6] = -(diag[2]*u[4] + diag[6]*u[5] + diag[10]*u[6]+ diag[14]*u[7]);
2207       uik[7] = -(diag[3]*u[4] + diag[7]*u[5] + diag[11]*u[6]+ diag[15]*u[7]);
2208 
2209       uik[8] = -(diag[0]*u[8] + diag[4]*u[9] + diag[8]*u[10] + diag[12]*u[11]);
2210       uik[9] = -(diag[1]*u[8] + diag[5]*u[9] + diag[9]*u[10] + diag[13]*u[11]);
2211       uik[10]= -(diag[2]*u[8] + diag[6]*u[9] + diag[10]*u[10]+ diag[14]*u[11]);
2212       uik[11]= -(diag[3]*u[8] + diag[7]*u[9] + diag[11]*u[10]+ diag[15]*u[11]);
2213 
2214       uik[12]= -(diag[0]*u[12] + diag[4]*u[13] + diag[8]*u[14] + diag[12]*u[15]);
2215       uik[13]= -(diag[1]*u[12] + diag[5]*u[13] + diag[9]*u[14] + diag[13]*u[15]);
2216       uik[14]= -(diag[2]*u[12] + diag[6]*u[13] + diag[10]*u[14]+ diag[14]*u[15]);
2217       uik[15]= -(diag[3]*u[12] + diag[7]*u[13] + diag[11]*u[14]+ diag[15]*u[15]);
2218 
2219       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2220       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2221       dk[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2222       dk[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2223       dk[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2224 
2225       dk[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2226       dk[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2227       dk[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2228       dk[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2229 
2230       dk[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2231       dk[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2232       dk[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2233       dk[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2234 
2235       dk[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2236       dk[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2237       dk[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2238       dk[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2239 
2240       /* update -U(i,k) */
2241       ierr = PetscMemcpy(ba+ili*16,uik,16*sizeof(MatScalar));CHKERRQ(ierr);
2242 
2243       /* add multiple of row i to k-th row ... */
2244       jmin = ili + 1; jmax = bi[i+1];
2245       if (jmin < jmax){
2246         for (j=jmin; j<jmax; j++) {
2247           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2248           rtmp_ptr = rtmp + bj[j]*16;
2249           u = ba + j*16;
2250           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2] + uik[3]*u[3];
2251           rtmp_ptr[1] += uik[4]*u[0] + uik[5]*u[1] + uik[6]*u[2] + uik[7]*u[3];
2252           rtmp_ptr[2] += uik[8]*u[0] + uik[9]*u[1] + uik[10]*u[2]+ uik[11]*u[3];
2253           rtmp_ptr[3] += uik[12]*u[0]+ uik[13]*u[1]+ uik[14]*u[2]+ uik[15]*u[3];
2254 
2255           rtmp_ptr[4] += uik[0]*u[4] + uik[1]*u[5] + uik[2]*u[6] + uik[3]*u[7];
2256           rtmp_ptr[5] += uik[4]*u[4] + uik[5]*u[5] + uik[6]*u[6] + uik[7]*u[7];
2257           rtmp_ptr[6] += uik[8]*u[4] + uik[9]*u[5] + uik[10]*u[6]+ uik[11]*u[7];
2258           rtmp_ptr[7] += uik[12]*u[4]+ uik[13]*u[5]+ uik[14]*u[6]+ uik[15]*u[7];
2259 
2260           rtmp_ptr[8] += uik[0]*u[8] + uik[1]*u[9] + uik[2]*u[10] + uik[3]*u[11];
2261           rtmp_ptr[9] += uik[4]*u[8] + uik[5]*u[9] + uik[6]*u[10] + uik[7]*u[11];
2262           rtmp_ptr[10]+= uik[8]*u[8] + uik[9]*u[9] + uik[10]*u[10]+ uik[11]*u[11];
2263           rtmp_ptr[11]+= uik[12]*u[8]+ uik[13]*u[9]+ uik[14]*u[10]+ uik[15]*u[11];
2264 
2265           rtmp_ptr[12]+= uik[0]*u[12] + uik[1]*u[13] + uik[2]*u[14] + uik[3]*u[15];
2266           rtmp_ptr[13]+= uik[4]*u[12] + uik[5]*u[13] + uik[6]*u[14] + uik[7]*u[15];
2267           rtmp_ptr[14]+= uik[8]*u[12] + uik[9]*u[13] + uik[10]*u[14]+ uik[11]*u[15];
2268           rtmp_ptr[15]+= uik[12]*u[12]+ uik[13]*u[13]+ uik[14]*u[14]+ uik[15]*u[15];
2269         }
2270 
2271         /* ... add i to row list for next nonzero entry */
2272         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2273         j     = bj[jmin];
2274         jl[i] = jl[j]; jl[j] = i; /* update jl */
2275       }
2276       i = nexti;
2277     }
2278 
2279     /* save nonzero entries in k-th row of U ... */
2280 
2281     /* invert diagonal block */
2282     diag = ba+k*16;
2283     ierr = PetscMemcpy(diag,dk,16*sizeof(MatScalar));CHKERRQ(ierr);
2284     ierr = Kernel_A_gets_inverse_A_4(diag);CHKERRQ(ierr);
2285 
2286     jmin = bi[k]; jmax = bi[k+1];
2287     if (jmin < jmax) {
2288       for (j=jmin; j<jmax; j++){
2289          vj = bj[j];           /* block col. index of U */
2290          u   = ba + j*16;
2291          rtmp_ptr = rtmp + vj*16;
2292          for (k1=0; k1<16; k1++){
2293            *u++        = *rtmp_ptr;
2294            *rtmp_ptr++ = 0.0;
2295          }
2296       }
2297 
2298       /* ... add k to row list for first nonzero entry in k-th row */
2299       il[k] = jmin;
2300       i     = bj[jmin];
2301       jl[k] = jl[i]; jl[i] = k;
2302     }
2303   }
2304 
2305   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2306   ierr = PetscFree(il);CHKERRQ(ierr);
2307   ierr = PetscFree(jl);CHKERRQ(ierr);
2308   ierr = PetscFree(dk);CHKERRQ(ierr);
2309   ierr = PetscFree(uik);CHKERRQ(ierr);
2310   if (a->permute){
2311     ierr = PetscFree(aa);CHKERRQ(ierr);
2312   }
2313 
2314   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2315   C->factor    = FACTOR_CHOLESKY;
2316   C->assembled = PETSC_TRUE;
2317   C->preallocated = PETSC_TRUE;
2318   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
2319   PetscFunctionReturn(0);
2320 }
2321 
2322 /* Version for when blocks are 3 by 3  */
2323 #undef __FUNC__
2324 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3"
2325 int MatCholeskyFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B)
2326 {
2327   Mat                C = *B;
2328   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2329   IS                 perm = b->row;
2330   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2331   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2332   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2333   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2334 
2335   PetscFunctionBegin;
2336   /* initialization */
2337   rtmp  = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2338   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2339   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2340   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2341   for (i=0; i<mbs; i++) {
2342     jl[i] = mbs; il[0] = 0;
2343   }
2344   dk  = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(dk);
2345   uik = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(uik);
2346   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2347 
2348   /* check permutation */
2349   if (!a->permute){
2350     ai = a->i; aj = a->j; aa = a->a;
2351   } else {
2352     ai = a->inew; aj = a->jnew;
2353     aa = (MatScalar*)PetscMalloc(9*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2354     ierr = PetscMemcpy(aa,a->a,9*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2355     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2356     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2357 
2358     for (i=0; i<mbs; i++){
2359       jmin = ai[i]; jmax = ai[i+1];
2360       for (j=jmin; j<jmax; j++){
2361         while (a2anew[j] != j){
2362           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2363           for (k1=0; k1<9; k1++){
2364             dk[k1]       = aa[k*9+k1];
2365             aa[k*9+k1] = aa[j*9+k1];
2366             aa[j*9+k1] = dk[k1];
2367           }
2368         }
2369         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2370         if (i > aj[j]){
2371           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2372           ap = aa + j*9;                     /* ptr to the beginning of j-th block of aa */
2373           for (k=0; k<9; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
2374           for (k=0; k<3; k++){               /* j-th block of aa <- dk^T */
2375             for (k1=0; k1<3; k1++) *ap++ = dk[k + 3*k1];
2376           }
2377         }
2378       }
2379     }
2380     ierr = PetscFree(a2anew);CHKERRA(ierr);
2381   }
2382 
2383   /* for each row k */
2384   for (k = 0; k<mbs; k++){
2385 
2386     /*initialize k-th row with elements nonzero in row perm(k) of A */
2387     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2388     if (jmin < jmax) {
2389       ap = aa + jmin*9;
2390       for (j = jmin; j < jmax; j++){
2391         vj = perm_ptr[aj[j]];         /* block col. index */
2392         rtmp_ptr = rtmp + vj*9;
2393         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2394       }
2395     }
2396 
2397     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2398     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2399     i = jl[k]; /* first row to be added to k_th row  */
2400 
2401     while (i < mbs){
2402       nexti = jl[i]; /* next row to be added to k_th row */
2403 
2404       /* compute multiplier */
2405       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2406 
2407       /* uik = -inv(Di)*U_bar(i,k) */
2408       diag = ba + i*9;
2409       u    = ba + ili*9;
2410 
2411       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2412       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2413       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2414 
2415       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2416       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2417       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2418 
2419       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2420       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2421       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2422 
2423       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2424       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2425       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2426       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2427 
2428       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2429       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2430       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2431 
2432       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2433       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2434       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2435 
2436       /* update -U(i,k) */
2437       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2438 
2439       /* add multiple of row i to k-th row ... */
2440       jmin = ili + 1; jmax = bi[i+1];
2441       if (jmin < jmax){
2442         for (j=jmin; j<jmax; j++) {
2443           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2444           rtmp_ptr = rtmp + bj[j]*9;
2445           u = ba + j*9;
2446           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2447           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2448           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2449 
2450           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2451           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2452           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2453 
2454           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2455           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2456           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2457         }
2458 
2459         /* ... add i to row list for next nonzero entry */
2460         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2461         j     = bj[jmin];
2462         jl[i] = jl[j]; jl[j] = i; /* update jl */
2463       }
2464       i = nexti;
2465     }
2466 
2467     /* save nonzero entries in k-th row of U ... */
2468 
2469     /* invert diagonal block */
2470     diag = ba+k*9;
2471     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2472     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2473 
2474     jmin = bi[k]; jmax = bi[k+1];
2475     if (jmin < jmax) {
2476       for (j=jmin; j<jmax; j++){
2477          vj = bj[j];           /* block col. index of U */
2478          u   = ba + j*9;
2479          rtmp_ptr = rtmp + vj*9;
2480          for (k1=0; k1<9; k1++){
2481            *u++        = *rtmp_ptr;
2482            *rtmp_ptr++ = 0.0;
2483          }
2484       }
2485 
2486       /* ... add k to row list for first nonzero entry in k-th row */
2487       il[k] = jmin;
2488       i     = bj[jmin];
2489       jl[k] = jl[i]; jl[i] = k;
2490     }
2491   }
2492 
2493   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2494   ierr = PetscFree(il);CHKERRQ(ierr);
2495   ierr = PetscFree(jl);CHKERRQ(ierr);
2496   ierr = PetscFree(dk);CHKERRQ(ierr);
2497   ierr = PetscFree(uik);CHKERRQ(ierr);
2498   if (a->permute){
2499     ierr = PetscFree(aa);CHKERRQ(ierr);
2500   }
2501 
2502   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2503   C->factor    = FACTOR_CHOLESKY;
2504   C->assembled = PETSC_TRUE;
2505   C->preallocated = PETSC_TRUE;
2506   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2507   PetscFunctionReturn(0);
2508 }
2509 
2510 /*
2511       Version for when blocks are 3 by 3 Using natural ordering
2512 */
2513 #undef __FUNC__
2514 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering"
2515 int MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B)
2516 {
2517   Mat                C = *B;
2518   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2519   IS                 perm = b->row;
2520   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2521   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2522   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2523   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2524 
2525   PetscFunctionBegin;
2526   /* initialization */
2527   rtmp  = (MatScalar*)PetscMalloc(9*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2528   ierr = PetscMemzero(rtmp,9*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2529   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2530   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2531   for (i=0; i<mbs; i++) {
2532     jl[i] = mbs; il[0] = 0;
2533   }
2534   dk  = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(dk);
2535   uik = (MatScalar*)PetscMalloc(9*sizeof(MatScalar));CHKPTRQ(uik);
2536 
2537   ai = a->i; aj = a->j; aa = a->a;
2538 
2539   /* for each row k */
2540   for (k = 0; k<mbs; k++){
2541 
2542     /*initialize k-th row with elements nonzero in row k of A */
2543     jmin = ai[k]; jmax = ai[k+1];
2544     if (jmin < jmax) {
2545       ap = aa + jmin*9;
2546       for (j = jmin; j < jmax; j++){
2547         vj = aj[j];         /* block col. index */
2548         rtmp_ptr = rtmp + vj*9;
2549         for (i=0; i<9; i++) *rtmp_ptr++ = *ap++;
2550       }
2551     }
2552 
2553     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2554     ierr = PetscMemcpy(dk,rtmp+k*9,9*sizeof(MatScalar));CHKERRQ(ierr);
2555     i = jl[k]; /* first row to be added to k_th row  */
2556 
2557     while (i < mbs){
2558       nexti = jl[i]; /* next row to be added to k_th row */
2559 
2560       /* compute multiplier */
2561       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2562 
2563       /* uik = -inv(Di)*U_bar(i,k) */
2564       diag = ba + i*9;
2565       u    = ba + ili*9;
2566 
2567       uik[0] = -(diag[0]*u[0] + diag[3]*u[1] + diag[6]*u[2]);
2568       uik[1] = -(diag[1]*u[0] + diag[4]*u[1] + diag[7]*u[2]);
2569       uik[2] = -(diag[2]*u[0] + diag[5]*u[1] + diag[8]*u[2]);
2570 
2571       uik[3] = -(diag[0]*u[3] + diag[3]*u[4] + diag[6]*u[5]);
2572       uik[4] = -(diag[1]*u[3] + diag[4]*u[4] + diag[7]*u[5]);
2573       uik[5] = -(diag[2]*u[3] + diag[5]*u[4] + diag[8]*u[5]);
2574 
2575       uik[6] = -(diag[0]*u[6] + diag[3]*u[7] + diag[6]*u[8]);
2576       uik[7] = -(diag[1]*u[6] + diag[4]*u[7] + diag[7]*u[8]);
2577       uik[8] = -(diag[2]*u[6] + diag[5]*u[7] + diag[8]*u[8]);
2578 
2579       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
2580       dk[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2581       dk[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2582       dk[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2583 
2584       dk[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2585       dk[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2586       dk[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2587 
2588       dk[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2589       dk[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2590       dk[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2591 
2592       /* update -U(i,k) */
2593       ierr = PetscMemcpy(ba+ili*9,uik,9*sizeof(MatScalar));CHKERRQ(ierr);
2594 
2595       /* add multiple of row i to k-th row ... */
2596       jmin = ili + 1; jmax = bi[i+1];
2597       if (jmin < jmax){
2598         for (j=jmin; j<jmax; j++) {
2599           /* rtmp += -U(i,k)^T * U_bar(i,j) */
2600           rtmp_ptr = rtmp + bj[j]*9;
2601           u = ba + j*9;
2602           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1] + uik[2]*u[2];
2603           rtmp_ptr[1] += uik[3]*u[0] + uik[4]*u[1] + uik[5]*u[2];
2604           rtmp_ptr[2] += uik[6]*u[0] + uik[7]*u[1] + uik[8]*u[2];
2605 
2606           rtmp_ptr[3] += uik[0]*u[3] + uik[1]*u[4] + uik[2]*u[5];
2607           rtmp_ptr[4] += uik[3]*u[3] + uik[4]*u[4] + uik[5]*u[5];
2608           rtmp_ptr[5] += uik[6]*u[3] + uik[7]*u[4] + uik[8]*u[5];
2609 
2610           rtmp_ptr[6] += uik[0]*u[6] + uik[1]*u[7] + uik[2]*u[8];
2611           rtmp_ptr[7] += uik[3]*u[6] + uik[4]*u[7] + uik[5]*u[8];
2612           rtmp_ptr[8] += uik[6]*u[6] + uik[7]*u[7] + uik[8]*u[8];
2613         }
2614 
2615         /* ... add i to row list for next nonzero entry */
2616         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2617         j     = bj[jmin];
2618         jl[i] = jl[j]; jl[j] = i; /* update jl */
2619       }
2620       i = nexti;
2621     }
2622 
2623     /* save nonzero entries in k-th row of U ... */
2624 
2625     /* invert diagonal block */
2626     diag = ba+k*9;
2627     ierr = PetscMemcpy(diag,dk,9*sizeof(MatScalar));CHKERRQ(ierr);
2628     ierr = Kernel_A_gets_inverse_A_3(diag);CHKERRQ(ierr);
2629 
2630     jmin = bi[k]; jmax = bi[k+1];
2631     if (jmin < jmax) {
2632       for (j=jmin; j<jmax; j++){
2633          vj = bj[j];           /* block col. index of U */
2634          u   = ba + j*9;
2635          rtmp_ptr = rtmp + vj*9;
2636          for (k1=0; k1<9; k1++){
2637            *u++        = *rtmp_ptr;
2638            *rtmp_ptr++ = 0.0;
2639          }
2640       }
2641 
2642       /* ... add k to row list for first nonzero entry in k-th row */
2643       il[k] = jmin;
2644       i     = bj[jmin];
2645       jl[k] = jl[i]; jl[i] = k;
2646     }
2647   }
2648 
2649   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2650   ierr = PetscFree(il);CHKERRQ(ierr);
2651   ierr = PetscFree(jl);CHKERRQ(ierr);
2652   ierr = PetscFree(dk);CHKERRQ(ierr);
2653   ierr = PetscFree(uik);CHKERRQ(ierr);
2654 
2655   C->factor    = FACTOR_CHOLESKY;
2656   C->assembled = PETSC_TRUE;
2657   C->preallocated = PETSC_TRUE;
2658   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2659   PetscFunctionReturn(0);
2660 }
2661 
2662 /*
2663     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
2664     Version for blocks 2 by 2.
2665 */
2666 #undef __FUNC__
2667 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
2668 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
2669 {
2670   Mat                C = *B;
2671   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2672   IS                 perm = b->row;
2673   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2674   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2675   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2676   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2677 
2678   PetscFunctionBegin;
2679   /* printf("called Numeric_SeqSBAIJ_2\n"); */
2680   /* initialization */
2681   /* il and jl record the first nonzero element in each row of the accessing
2682      window U(0:k, k:mbs-1).
2683      jl:    list of rows to be added to uneliminated rows
2684             i>= k: jl(i) is the first row to be added to row i
2685             i<  k: jl(i) is the row following row i in some list of rows
2686             jl(i) = mbs indicates the end of a list
2687      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2688             row i of U */
2689   rtmp  = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2690   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2691   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2692   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2693   for (i=0; i<mbs; i++) {
2694     jl[i] = mbs; il[0] = 0;
2695   }
2696   dk  = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(dk);
2697   uik = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(uik);
2698   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
2699 
2700   /* check permutation */
2701   if (!a->permute){
2702     ai = a->i; aj = a->j; aa = a->a;
2703   } else {
2704     ai = a->inew; aj = a->jnew;
2705     aa = (MatScalar*)PetscMalloc(4*ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
2706     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2707     a2anew  = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(a2anew);
2708     ierr= PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
2709 
2710     for (i=0; i<mbs; i++){
2711       jmin = ai[i]; jmax = ai[i+1];
2712       for (j=jmin; j<jmax; j++){
2713         while (a2anew[j] != j){
2714           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
2715           for (k1=0; k1<4; k1++){
2716             dk[k1]       = aa[k*4+k1];
2717             aa[k*4+k1] = aa[j*4+k1];
2718             aa[j*4+k1] = dk[k1];
2719           }
2720         }
2721         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
2722         if (i > aj[j]){
2723           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
2724           ap = aa + j*4;     /* ptr to the beginning of the block */
2725           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
2726           ap[1] = ap[2];
2727           ap[2] = dk[1];
2728         }
2729       }
2730     }
2731     ierr = PetscFree(a2anew);CHKERRA(ierr);
2732   }
2733 
2734   /* for each row k */
2735   for (k = 0; k<mbs; k++){
2736 
2737     /*initialize k-th row with elements nonzero in row perm(k) of A */
2738     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
2739     if (jmin < jmax) {
2740       ap = aa + jmin*4;
2741       for (j = jmin; j < jmax; j++){
2742         vj = perm_ptr[aj[j]];         /* block col. index */
2743         rtmp_ptr = rtmp + vj*4;
2744         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
2745       }
2746     }
2747 
2748     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2749     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
2750     i = jl[k]; /* first row to be added to k_th row  */
2751 
2752     while (i < mbs){
2753       nexti = jl[i]; /* next row to be added to k_th row */
2754 
2755       /* compute multiplier */
2756       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2757 
2758       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
2759       diag = ba + i*4;
2760       u    = ba + ili*4;
2761       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
2762       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
2763       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
2764       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
2765 
2766       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
2767       dk[0] += uik[0]*u[0] + uik[1]*u[1];
2768       dk[1] += uik[2]*u[0] + uik[3]*u[1];
2769       dk[2] += uik[0]*u[2] + uik[1]*u[3];
2770       dk[3] += uik[2]*u[2] + uik[3]*u[3];
2771 
2772       /* update -U(i,k): ba[ili] = uik */
2773       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
2774 
2775       /* add multiple of row i to k-th row ... */
2776       jmin = ili + 1; jmax = bi[i+1];
2777       if (jmin < jmax){
2778         for (j=jmin; j<jmax; j++) {
2779           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
2780           rtmp_ptr = rtmp + bj[j]*4;
2781           u = ba + j*4;
2782           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
2783           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
2784           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
2785           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
2786         }
2787 
2788         /* ... add i to row list for next nonzero entry */
2789         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2790         j     = bj[jmin];
2791         jl[i] = jl[j]; jl[j] = i; /* update jl */
2792       }
2793       i = nexti;
2794     }
2795 
2796     /* save nonzero entries in k-th row of U ... */
2797 
2798     /* invert diagonal block */
2799     diag = ba+k*4;
2800     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
2801     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
2802 
2803     jmin = bi[k]; jmax = bi[k+1];
2804     if (jmin < jmax) {
2805       for (j=jmin; j<jmax; j++){
2806          vj = bj[j];           /* block col. index of U */
2807          u   = ba + j*4;
2808          rtmp_ptr = rtmp + vj*4;
2809          for (k1=0; k1<4; k1++){
2810            *u++        = *rtmp_ptr;
2811            *rtmp_ptr++ = 0.0;
2812          }
2813       }
2814 
2815       /* ... add k to row list for first nonzero entry in k-th row */
2816       il[k] = jmin;
2817       i     = bj[jmin];
2818       jl[k] = jl[i]; jl[i] = k;
2819     }
2820   }
2821 
2822   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2823   ierr = PetscFree(il);CHKERRQ(ierr);
2824   ierr = PetscFree(jl);CHKERRQ(ierr);
2825   ierr = PetscFree(dk);CHKERRQ(ierr);
2826   ierr = PetscFree(uik);CHKERRQ(ierr);
2827   if (a->permute) {
2828     ierr = PetscFree(aa);CHKERRQ(ierr);
2829   }
2830   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
2831   C->factor    = FACTOR_CHOLESKY;
2832   C->assembled = PETSC_TRUE;
2833   C->preallocated = PETSC_TRUE;
2834   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2835   PetscFunctionReturn(0);
2836 }
2837 
2838 /*
2839       Version for when blocks are 2 by 2 Using natural ordering
2840 */
2841 #undef __FUNC__
2842 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
2843 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
2844 {
2845   Mat                C = *B;
2846   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2847   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
2848   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
2849   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
2850   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
2851 
2852   PetscFunctionBegin;
2853   /* printf("called Numeric_SeqSBAIJ_2_NaturalOrdering\n"); */
2854   /* initialization */
2855   /* il and jl record the first nonzero element in each row of the accessing
2856      window U(0:k, k:mbs-1).
2857      jl:    list of rows to be added to uneliminated rows
2858             i>= k: jl(i) is the first row to be added to row i
2859             i<  k: jl(i) is the row following row i in some list of rows
2860             jl(i) = mbs indicates the end of a list
2861      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2862             row i of U */
2863   rtmp  = (MatScalar*)PetscMalloc(4*mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2864   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
2865   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2866   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2867   for (i=0; i<mbs; i++) {
2868     jl[i] = mbs; il[0] = 0;
2869   }
2870   dk  = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(dk);
2871   uik = (MatScalar*)PetscMalloc(4*sizeof(MatScalar));CHKPTRQ(uik);
2872 
2873   ai = a->i; aj = a->j; aa = a->a;
2874 
2875   /* for each row k */
2876   for (k = 0; k<mbs; k++){
2877 
2878     /*initialize k-th row with elements nonzero in row k of A */
2879     jmin = ai[k]; jmax = ai[k+1];
2880     if (jmin < jmax) {
2881       ap = aa + jmin*4;
2882       for (j = jmin; j < jmax; j++){
2883         vj = aj[j];         /* block col. index */
2884         rtmp_ptr = rtmp + vj*4;
2885         for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
2886       }
2887     }
2888 
2889     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
2890     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
2891     i = jl[k]; /* first row to be added to k_th row  */
2892 
2893     while (i < mbs){
2894       nexti = jl[i]; /* next row to be added to k_th row */
2895 
2896       /* compute multiplier */
2897       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2898 
2899       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
2900       diag = ba + i*4;
2901       u    = ba + ili*4;
2902       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
2903       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
2904       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
2905       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
2906 
2907       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
2908       dk[0] += uik[0]*u[0] + uik[1]*u[1];
2909       dk[1] += uik[2]*u[0] + uik[3]*u[1];
2910       dk[2] += uik[0]*u[2] + uik[1]*u[3];
2911       dk[3] += uik[2]*u[2] + uik[3]*u[3];
2912 
2913       /* update -U(i,k): ba[ili] = uik */
2914       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
2915 
2916       /* add multiple of row i to k-th row ... */
2917       jmin = ili + 1; jmax = bi[i+1];
2918       if (jmin < jmax){
2919         for (j=jmin; j<jmax; j++) {
2920           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
2921           rtmp_ptr = rtmp + bj[j]*4;
2922           u = ba + j*4;
2923           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
2924           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
2925           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
2926           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
2927         }
2928 
2929         /* ... add i to row list for next nonzero entry */
2930         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2931         j     = bj[jmin];
2932         jl[i] = jl[j]; jl[j] = i; /* update jl */
2933       }
2934       i = nexti;
2935     }
2936 
2937     /* save nonzero entries in k-th row of U ... */
2938 
2939     /* invert diagonal block */
2940     diag = ba+k*4;
2941     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
2942     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
2943 
2944     jmin = bi[k]; jmax = bi[k+1];
2945     if (jmin < jmax) {
2946       for (j=jmin; j<jmax; j++){
2947          vj = bj[j];           /* block col. index of U */
2948          u   = ba + j*4;
2949          rtmp_ptr = rtmp + vj*4;
2950          for (k1=0; k1<4; k1++){
2951            *u++        = *rtmp_ptr;
2952            *rtmp_ptr++ = 0.0;
2953          }
2954       }
2955 
2956       /* ... add k to row list for first nonzero entry in k-th row */
2957       il[k] = jmin;
2958       i     = bj[jmin];
2959       jl[k] = jl[i]; jl[i] = k;
2960     }
2961   }
2962 
2963   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2964   ierr = PetscFree(il);CHKERRQ(ierr);
2965   ierr = PetscFree(jl);CHKERRQ(ierr);
2966   ierr = PetscFree(dk);CHKERRQ(ierr);
2967   ierr = PetscFree(uik);CHKERRQ(ierr);
2968 
2969   C->factor    = FACTOR_CHOLESKY;
2970   C->assembled = PETSC_TRUE;
2971   C->preallocated = PETSC_TRUE;
2972   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2973   PetscFunctionReturn(0);
2974 }
2975 
2976 /*
2977     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
2978     Version for blocks are 1 by 1.
2979 */
2980 #undef __FUNC__
2981 #define __FUNC__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
2982 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
2983 {
2984   Mat                C = *B;
2985   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2986   IS                 ip = b->row;
2987   int                *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
2988   int                *ai,*aj,*r;
2989   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
2990   MatScalar          *rtmp;
2991   MatScalar          *ba = b->a,*aa,ak;
2992   MatScalar          dk,uikdi;
2993 
2994   PetscFunctionBegin;
2995   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
2996   if (!a->permute){
2997     ai = a->i; aj = a->j; aa = a->a;
2998   } else {
2999     ai = a->inew; aj = a->jnew;
3000     aa = (MatScalar*)PetscMalloc(ai[mbs]*sizeof(MatScalar));CHKPTRQ(aa);
3001     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
3002     r   = (int*)PetscMalloc(ai[mbs]*sizeof(int));CHKPTRQ(r);
3003     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
3004 
3005     jmin = ai[0]; jmax = ai[mbs];
3006     for (j=jmin; j<jmax; j++){
3007       while (r[j] != j){
3008         k = r[j]; r[j] = r[k]; r[k] = k;
3009         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
3010       }
3011     }
3012     ierr = PetscFree(r);CHKERRA(ierr);
3013   }
3014 
3015   /* initialization */
3016   /* il and jl record the first nonzero element in each row of the accessing
3017      window U(0:k, k:mbs-1).
3018      jl:    list of rows to be added to uneliminated rows
3019             i>= k: jl(i) is the first row to be added to row i
3020             i<  k: jl(i) is the row following row i in some list of rows
3021             jl(i) = mbs indicates the end of a list
3022      il(i): points to the first nonzero element in columns k,...,mbs-1 of
3023             row i of U */
3024   rtmp  = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
3025   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
3026   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
3027   for (i=0; i<mbs; i++) {
3028     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
3029   }
3030 
3031   /* for each row k */
3032   for (k = 0; k<mbs; k++){
3033 
3034     /*initialize k-th row with elements nonzero in row perm(k) of A */
3035     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
3036     if (jmin < jmax) {
3037       for (j = jmin; j < jmax; j++){
3038         vj = rip[aj[j]];
3039         rtmp[vj] = aa[j];
3040       }
3041     }
3042 
3043     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
3044     dk = rtmp[k];
3045     i = jl[k]; /* first row to be added to k_th row  */
3046 
3047     while (i < mbs){
3048       nexti = jl[i]; /* next row to be added to k_th row */
3049 
3050       /* compute multiplier, update D(k) and U(i,k) */
3051       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
3052       uikdi = - ba[ili]*ba[i];
3053       dk += uikdi*ba[ili];
3054       ba[ili] = uikdi; /* -U(i,k) */
3055 
3056       /* add multiple of row i to k-th row ... */
3057       jmin = ili + 1; jmax = bi[i+1];
3058       if (jmin < jmax){
3059         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
3060         /* ... add i to row list for next nonzero entry */
3061         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
3062         j     = bj[jmin];
3063         jl[i] = jl[j]; jl[j] = i; /* update jl */
3064       }
3065       i = nexti;
3066     }
3067 
3068     /* check for zero pivot and save diagoanl element */
3069     if (dk == 0.0){
3070       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
3071     }else if (PetscRealPart(dk) < 0){
3072       ierr = PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %g\n",k,dk);
3073     }
3074 
3075     /* save nonzero entries in k-th row of U ... */
3076     ba[k] = 1.0/dk;
3077     jmin = bi[k]; jmax = bi[k+1];
3078     if (jmin < jmax) {
3079       for (j=jmin; j<jmax; j++){
3080          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
3081       }
3082       /* ... add k to row list for first nonzero entry in k-th row */
3083       il[k] = jmin;
3084       i     = bj[jmin];
3085       jl[k] = jl[i]; jl[i] = k;
3086     }
3087   }
3088 
3089   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3090   ierr = PetscFree(il);CHKERRQ(ierr);
3091   ierr = PetscFree(jl);CHKERRQ(ierr);
3092   if (a->permute){
3093     ierr = PetscFree(aa);CHKERRQ(ierr);
3094   }
3095 
3096   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
3097   C->factor    = FACTOR_CHOLESKY;
3098   C->assembled = PETSC_TRUE;
3099   C->preallocated = PETSC_TRUE;
3100   PLogFlops(b->mbs);
3101   PetscFunctionReturn(0);
3102 }
3103 
3104 #undef __FUNC__
3105 #define __FUNC__ "MatCholeskyFactor_SeqSBAIJ"
3106 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f)
3107 {
3108   int ierr;
3109   Mat C;
3110 
3111   PetscFunctionBegin;
3112   ierr = MatCholeskyFactorSymbolic(A,perm,f,&C);CHKERRQ(ierr);
3113   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
3114   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
3115   PetscFunctionReturn(0);
3116 }
3117 
3118 
3119