xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision f37cbb0890cfcc075bf42ae2b4760d76e6f1168d)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/mat/impls/sbaij/seq/sbaij.h"
4 #include "src/inline/ilu.h"
5 #include "include/petscis.h"
6 
7 #if !defined(PETSC_USE_COMPLEX)
8 /*
9   input:
10    F -- numeric factor
11   output:
12    nneg, nzero, npos: matrix inertia
13 */
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
17 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,int *nneig,int *nzero,int *npos)
18 {
19   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
20   PetscScalar  *dd = fact_ptr->a;
21   int          mbs=fact_ptr->mbs,bs=fact_ptr->bs,i,nneig_tmp,npos_tmp,
22                *fi = fact_ptr->i;
23 
24   PetscFunctionBegin;
25   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %d >1 yet",bs);
26   nneig_tmp = 0; npos_tmp = 0;
27   for (i=0; i<mbs; i++){
28     if (PetscRealPart(dd[*fi]) > 0.0){
29       npos_tmp++;
30     } else if (PetscRealPart(dd[*fi]) < 0.0){
31       nneig_tmp++;
32     }
33     fi++;
34   }
35   if (nneig) *nneig = nneig_tmp;
36   if (npos)  *npos  = npos_tmp;
37   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
38 
39   PetscFunctionReturn(0);
40 }
41 #endif /* !defined(PETSC_USE_COMPLEX) */
42 
43 /* Using Modified Sparse Row (MSR) storage.
44 See page 85, "Iterative Methods ..." by Saad. */
45 /*
46     Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
47 */
48 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
49 #undef __FUNCT__
50 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
51 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
52 {
53   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
54   PetscErrorCode ierr;
55   int          *rip,i,mbs = a->mbs,*ai,*aj;
56   int          *jutmp,bs = a->bs,bs2=a->bs2;
57   int          m,realloc = 0,prow;
58   int          *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
59   int          *il,ili,nextprow;
60   PetscReal    f = info->fill;
61   PetscTruth   perm_identity;
62 
63   PetscFunctionBegin;
64   /* check whether perm is the identity mapping */
65   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
66 
67   /* -- inplace factorization, i.e., use sbaij for *B -- */
68   if (perm_identity && bs==1 ){
69     if (!perm_identity) a->permute = PETSC_TRUE;
70 
71   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
72 
73   if (perm_identity){ /* without permutation */
74     ai = a->i; aj = a->j;
75   } else {            /* non-trivial permutation */
76     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
77     ai = a->inew; aj = a->jnew;
78   }
79 
80   /* initialization */
81   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr);
82   umax  = (int)(f*ai[mbs] + 1);
83   ierr  = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr);
84   iu[0] = 0;
85   juidx = 0; /* index for ju */
86   ierr  = PetscMalloc((3*mbs+1)*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */
87   q     = jl + mbs;   /* linked list for col index of active row */
88   il    = q  + mbs;
89   for (i=0; i<mbs; i++){
90     jl[i] = mbs;
91     q[i]  = 0;
92     il[i] = 0;
93   }
94 
95   /* for each row k */
96   for (k=0; k<mbs; k++){
97     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
98     q[k] = mbs;
99     /* initialize nonzero structure of k-th row to row rip[k] of A */
100     jmin = ai[rip[k]] +1; /* exclude diag[k] */
101     jmax = ai[rip[k]+1];
102     for (j=jmin; j<jmax; j++){
103       vj = rip[aj[j]]; /* col. value */
104       if(vj > k){
105         qm = k;
106         do {
107           m  = qm; qm = q[m];
108         } while(qm < vj);
109         if (qm == vj) {
110           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
111         }
112         nzk++;
113         q[m]  = vj;
114         q[vj] = qm;
115       } /* if(vj > k) */
116     } /* for (j=jmin; j<jmax; j++) */
117 
118     /* modify nonzero structure of k-th row by computing fill-in
119        for each row i to be merged in */
120     prow = k;
121     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
122 
123     while (prow < k){
124       nextprow = jl[prow];
125 
126       /* merge row prow into k-th row */
127       ili = il[prow];
128       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
129       jmax = iu[prow+1];
130       qm = k;
131       for (j=jmin; j<jmax; j++){
132         vj = ju[j];
133         do {
134           m = qm; qm = q[m];
135         } while (qm < vj);
136         if (qm != vj){  /* a fill */
137           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
138         }
139       } /* end of for (j=jmin; j<jmax; j++) */
140       if (jmin < jmax){
141         il[prow] = jmin;
142         j = ju[jmin];
143         jl[prow] = jl[j]; jl[j] = prow;  /* update jl */
144       }
145       prow = nextprow;
146     }
147 
148     /* update il and jl */
149     if (nzk > 0){
150       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
151       jl[k] = jl[i]; jl[i] = k;
152       il[k] = iu[k] + 1;
153     }
154     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */
155 
156     /* allocate more space to ju if needed */
157     if (iu[k+1] > umax) {
158       /* estimate how much additional space we will need */
159       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
160       /* just double the memory each time */
161       maxadd = umax;
162       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
163       umax += maxadd;
164 
165       /* allocate a longer ju */
166       ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
167       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
168       ierr = PetscFree(ju);CHKERRQ(ierr);
169       ju   = jutmp;
170       realloc++; /* count how many times we realloc */
171     }
172 
173     /* save nonzero structure of k-th row in ju */
174     ju[juidx++] = k; /* diag[k] */
175     i = k;
176     while (nzk --) {
177       i           = q[i];
178       ju[juidx++] = i;
179     }
180   }
181 
182   if (ai[mbs] != 0) {
183     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
184     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
185     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
186     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
187     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
188   } else {
189      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
190   }
191 
192   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
193   /* ierr = PetscFree(q);CHKERRQ(ierr); */
194   ierr = PetscFree(jl);CHKERRQ(ierr);
195 
196   /* put together the new matrix */
197   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
198   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
199   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
200 
201   /* PetscLogObjectParent(*B,iperm); */
202   b = (Mat_SeqSBAIJ*)(*B)->data;
203   ierr = PetscFree(b->imax);CHKERRQ(ierr);
204   b->singlemalloc = PETSC_FALSE;
205   /* the next line frees the default space generated by the Create() */
206   ierr = PetscFree(b->a);CHKERRQ(ierr);
207   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
208   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
209   b->j    = ju;
210   b->i    = iu;
211   b->diag = 0;
212   b->ilen = 0;
213   b->imax = 0;
214   b->row  = perm;
215   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
216   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
217   b->icol = perm;
218   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
219   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
220   /* In b structure:  Free imax, ilen, old a, old j.
221      Allocate idnew, solve_work, new a, new j */
222   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
223   b->maxnz = b->nz = iu[mbs];
224 
225   (*B)->factor                 = FACTOR_CHOLESKY;
226   (*B)->info.factor_mallocs    = realloc;
227   (*B)->info.fill_ratio_given  = f;
228   if (ai[mbs] != 0) {
229     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
230   } else {
231     (*B)->info.fill_ratio_needed = 0.0;
232   }
233 
234 
235   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
236   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
237   (*B)->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
238   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
239 
240   PetscFunctionReturn(0);
241   }
242   /* -----------  end of new code --------------------*/
243 
244 
245   if (!perm_identity) a->permute = PETSC_TRUE;
246 
247   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
248 
249   if (perm_identity){ /* without permutation */
250     ai = a->i; aj = a->j;
251   } else {            /* non-trivial permutation */
252     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
253     ai = a->inew; aj = a->jnew;
254   }
255 
256   /* initialization */
257   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr);
258   umax  = (int)(f*ai[mbs] + 1); umax += mbs + 1;
259   ierr  = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr);
260   iu[0] = mbs+1;
261   juidx = mbs + 1; /* index for ju */
262   ierr  = PetscMalloc(2*mbs*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for pivot row */
263   q     = jl + mbs;   /* linked list for col index */
264   for (i=0; i<mbs; i++){
265     jl[i] = mbs;
266     q[i] = 0;
267   }
268 
269   /* for each row k */
270   for (k=0; k<mbs; k++){
271     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
272     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
273     q[k] = mbs;
274     /* initialize nonzero structure of k-th row to row rip[k] of A */
275     jmin = ai[rip[k]] +1; /* exclude diag[k] */
276     jmax = ai[rip[k]+1];
277     for (j=jmin; j<jmax; j++){
278       vj = rip[aj[j]]; /* col. value */
279       if(vj > k){
280         qm = k;
281         do {
282           m  = qm; qm = q[m];
283         } while(qm < vj);
284         if (qm == vj) {
285           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
286         }
287         nzk++;
288         q[m]  = vj;
289         q[vj] = qm;
290       } /* if(vj > k) */
291     } /* for (j=jmin; j<jmax; j++) */
292 
293     /* modify nonzero structure of k-th row by computing fill-in
294        for each row i to be merged in */
295     prow = k;
296     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
297 
298     while (prow < k){
299       /* merge row prow into k-th row */
300       jmin = iu[prow] + 1; jmax = iu[prow+1];
301       qm = k;
302       for (j=jmin; j<jmax; j++){
303         vj = ju[j];
304         do {
305           m = qm; qm = q[m];
306         } while (qm < vj);
307         if (qm != vj){
308          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
309         }
310       }
311       prow = jl[prow]; /* next pivot row */
312     }
313 
314     /* add k to row list for first nonzero element in k-th row */
315     if (nzk > 0){
316       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
317       jl[k] = jl[i]; jl[i] = k;
318     }
319     iu[k+1] = iu[k] + nzk;
320 
321     /* allocate more space to ju if needed */
322     if (iu[k+1] > umax) {
323       /* estimate how much additional space we will need */
324       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
325       /* just double the memory each time */
326       maxadd = umax;
327       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
328       umax += maxadd;
329 
330       /* allocate a longer ju */
331       ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
332       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
333       ierr = PetscFree(ju);CHKERRQ(ierr);
334       ju   = jutmp;
335       realloc++; /* count how many times we realloc */
336     }
337 
338     /* save nonzero structure of k-th row in ju */
339     i=k;
340     while (nzk --) {
341       i           = q[i];
342       ju[juidx++] = i;
343     }
344   }
345 
346   if (ai[mbs] != 0) {
347     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
348     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
349     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
350     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
351     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
352   } else {
353      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
354   }
355 
356   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
357   /* ierr = PetscFree(q);CHKERRQ(ierr); */
358   ierr = PetscFree(jl);CHKERRQ(ierr);
359 
360   /* put together the new matrix */
361   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
362   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
363   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
364 
365   /* PetscLogObjectParent(*B,iperm); */
366   b = (Mat_SeqSBAIJ*)(*B)->data;
367   ierr = PetscFree(b->imax);CHKERRQ(ierr);
368   b->singlemalloc = PETSC_FALSE;
369   /* the next line frees the default space generated by the Create() */
370   ierr = PetscFree(b->a);CHKERRQ(ierr);
371   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
372   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
373   b->j    = ju;
374   b->i    = iu;
375   b->diag = 0;
376   b->ilen = 0;
377   b->imax = 0;
378   b->row  = perm;
379   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
380   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
381   b->icol = perm;
382   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
383   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
384   /* In b structure:  Free imax, ilen, old a, old j.
385      Allocate idnew, solve_work, new a, new j */
386   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
387   b->maxnz = b->nz = iu[mbs];
388 
389   (*B)->factor                 = FACTOR_CHOLESKY;
390   (*B)->info.factor_mallocs    = realloc;
391   (*B)->info.fill_ratio_given  = f;
392   if (ai[mbs] != 0) {
393     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
394   } else {
395     (*B)->info.fill_ratio_needed = 0.0;
396   }
397 
398   if (perm_identity){
399     switch (bs) {
400       case 1:
401         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
402         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
403         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
404         break;
405       case 2:
406         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
407         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
408         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
409         break;
410       case 3:
411         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
412         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
413         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
414         break;
415       case 4:
416         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
417         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
418         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
419         break;
420       case 5:
421         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
422         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
423         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
424         break;
425       case 6:
426         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
427         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
428         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
429         break;
430       case 7:
431         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
432         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
433         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
434       break;
435       default:
436         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
437         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
438         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
439       break;
440     }
441   }
442 
443   PetscFunctionReturn(0);
444 }
445 
446 
447 #undef __FUNCT__
448 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
449 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
450 {
451   Mat                C = *B;
452   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
453   IS                 perm = b->row;
454   PetscErrorCode ierr;
455   int                *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
456   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
457   int                bs=a->bs,bs2 = a->bs2;
458   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
459   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
460   MatScalar          *work;
461   int                *pivots;
462 
463   PetscFunctionBegin;
464 
465   /* initialization */
466   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
467   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
468   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
469   jl   = il + mbs;
470   for (i=0; i<mbs; i++) {
471     jl[i] = mbs; il[0] = 0;
472   }
473   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
474   uik  = dk + bs2;
475   work = uik + bs2;
476   ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr);
477 
478   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
479 
480   /* check permutation */
481   if (!a->permute){
482     ai = a->i; aj = a->j; aa = a->a;
483   } else {
484     ai   = a->inew; aj = a->jnew;
485     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
486     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
487     ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr);
488     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
489 
490     for (i=0; i<mbs; i++){
491       jmin = ai[i]; jmax = ai[i+1];
492       for (j=jmin; j<jmax; j++){
493         while (a2anew[j] != j){
494           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
495           for (k1=0; k1<bs2; k1++){
496             dk[k1]       = aa[k*bs2+k1];
497             aa[k*bs2+k1] = aa[j*bs2+k1];
498             aa[j*bs2+k1] = dk[k1];
499           }
500         }
501         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
502         if (i > aj[j]){
503           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
504           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
505           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
506           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
507             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
508           }
509         }
510       }
511     }
512     ierr = PetscFree(a2anew);CHKERRQ(ierr);
513   }
514 
515   /* for each row k */
516   for (k = 0; k<mbs; k++){
517 
518     /*initialize k-th row with elements nonzero in row perm(k) of A */
519     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
520 
521     ap = aa + jmin*bs2;
522     for (j = jmin; j < jmax; j++){
523       vj = perm_ptr[aj[j]];         /* block col. index */
524       rtmp_ptr = rtmp + vj*bs2;
525       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
526     }
527 
528     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
529     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
530     i = jl[k]; /* first row to be added to k_th row  */
531 
532     while (i < k){
533       nexti = jl[i]; /* next row to be added to k_th row */
534 
535       /* compute multiplier */
536       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
537 
538       /* uik = -inv(Di)*U_bar(i,k) */
539       diag = ba + i*bs2;
540       u    = ba + ili*bs2;
541       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
542       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
543 
544       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
545       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
546 
547       /* update -U(i,k) */
548       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
549 
550       /* add multiple of row i to k-th row ... */
551       jmin = ili + 1; jmax = bi[i+1];
552       if (jmin < jmax){
553         for (j=jmin; j<jmax; j++) {
554           /* rtmp += -U(i,k)^T * U_bar(i,j) */
555           rtmp_ptr = rtmp + bj[j]*bs2;
556           u = ba + j*bs2;
557           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
558         }
559 
560         /* ... add i to row list for next nonzero entry */
561         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
562         j     = bj[jmin];
563         jl[i] = jl[j]; jl[j] = i; /* update jl */
564       }
565       i = nexti;
566     }
567 
568     /* save nonzero entries in k-th row of U ... */
569 
570     /* invert diagonal block */
571     diag = ba+k*bs2;
572     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
573     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
574 
575     jmin = bi[k]; jmax = bi[k+1];
576     if (jmin < jmax) {
577       for (j=jmin; j<jmax; j++){
578          vj = bj[j];           /* block col. index of U */
579          u   = ba + j*bs2;
580          rtmp_ptr = rtmp + vj*bs2;
581          for (k1=0; k1<bs2; k1++){
582            *u++        = *rtmp_ptr;
583            *rtmp_ptr++ = 0.0;
584          }
585       }
586 
587       /* ... add k to row list for first nonzero entry in k-th row */
588       il[k] = jmin;
589       i     = bj[jmin];
590       jl[k] = jl[i]; jl[i] = k;
591     }
592   }
593 
594   ierr = PetscFree(rtmp);CHKERRQ(ierr);
595   ierr = PetscFree(il);CHKERRQ(ierr);
596   ierr = PetscFree(dk);CHKERRQ(ierr);
597   ierr = PetscFree(pivots);CHKERRQ(ierr);
598   if (a->permute){
599     ierr = PetscFree(aa);CHKERRQ(ierr);
600   }
601 
602   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
603   C->factor       = FACTOR_CHOLESKY;
604   C->assembled    = PETSC_TRUE;
605   C->preallocated = PETSC_TRUE;
606   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
607   PetscFunctionReturn(0);
608 }
609 
610 #undef __FUNCT__
611 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
612 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
613 {
614   Mat                C = *B;
615   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
616   PetscErrorCode ierr;
617   int i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
618   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
619   int                bs=a->bs,bs2 = a->bs2;
620   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
621   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
622   MatScalar          *work;
623   int                *pivots;
624 
625   PetscFunctionBegin;
626 
627   /* initialization */
628 
629   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
630   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
631   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
632   jl   = il + mbs;
633   for (i=0; i<mbs; i++) {
634     jl[i] = mbs; il[0] = 0;
635   }
636   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
637   uik  = dk + bs2;
638   work = uik + bs2;
639   ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr);
640 
641   ai = a->i; aj = a->j; aa = a->a;
642 
643   /* for each row k */
644   for (k = 0; k<mbs; k++){
645 
646     /*initialize k-th row with elements nonzero in row k of A */
647     jmin = ai[k]; jmax = ai[k+1];
648     ap = aa + jmin*bs2;
649     for (j = jmin; j < jmax; j++){
650       vj = aj[j];         /* block col. index */
651       rtmp_ptr = rtmp + vj*bs2;
652       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
653     }
654 
655     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
656     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
657     i = jl[k]; /* first row to be added to k_th row  */
658 
659     while (i < k){
660       nexti = jl[i]; /* next row to be added to k_th row */
661 
662       /* compute multiplier */
663       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
664 
665       /* uik = -inv(Di)*U_bar(i,k) */
666       diag = ba + i*bs2;
667       u    = ba + ili*bs2;
668       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
669       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
670 
671       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
672       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
673 
674       /* update -U(i,k) */
675       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
676 
677       /* add multiple of row i to k-th row ... */
678       jmin = ili + 1; jmax = bi[i+1];
679       if (jmin < jmax){
680         for (j=jmin; j<jmax; j++) {
681           /* rtmp += -U(i,k)^T * U_bar(i,j) */
682           rtmp_ptr = rtmp + bj[j]*bs2;
683           u = ba + j*bs2;
684           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
685         }
686 
687         /* ... add i to row list for next nonzero entry */
688         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
689         j     = bj[jmin];
690         jl[i] = jl[j]; jl[j] = i; /* update jl */
691       }
692       i = nexti;
693     }
694 
695     /* save nonzero entries in k-th row of U ... */
696 
697     /* invert diagonal block */
698     diag = ba+k*bs2;
699     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
700     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
701 
702     jmin = bi[k]; jmax = bi[k+1];
703     if (jmin < jmax) {
704       for (j=jmin; j<jmax; j++){
705          vj = bj[j];           /* block col. index of U */
706          u   = ba + j*bs2;
707          rtmp_ptr = rtmp + vj*bs2;
708          for (k1=0; k1<bs2; k1++){
709            *u++        = *rtmp_ptr;
710            *rtmp_ptr++ = 0.0;
711          }
712       }
713 
714       /* ... add k to row list for first nonzero entry in k-th row */
715       il[k] = jmin;
716       i     = bj[jmin];
717       jl[k] = jl[i]; jl[i] = k;
718     }
719   }
720 
721   ierr = PetscFree(rtmp);CHKERRQ(ierr);
722   ierr = PetscFree(il);CHKERRQ(ierr);
723   ierr = PetscFree(dk);CHKERRQ(ierr);
724   ierr = PetscFree(pivots);CHKERRQ(ierr);
725 
726   C->factor    = FACTOR_CHOLESKY;
727   C->assembled = PETSC_TRUE;
728   C->preallocated = PETSC_TRUE;
729   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
730   PetscFunctionReturn(0);
731 }
732 
733 /*
734     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
735     Version for blocks 2 by 2.
736 */
737 #undef __FUNCT__
738 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
739 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
740 {
741   Mat                C = *B;
742   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
743   IS                 perm = b->row;
744   PetscErrorCode ierr;
745   int                *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
746   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
747   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
748   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
749 
750   PetscFunctionBegin;
751 
752   /* initialization */
753   /* il and jl record the first nonzero element in each row of the accessing
754      window U(0:k, k:mbs-1).
755      jl:    list of rows to be added to uneliminated rows
756             i>= k: jl(i) is the first row to be added to row i
757             i<  k: jl(i) is the row following row i in some list of rows
758             jl(i) = mbs indicates the end of a list
759      il(i): points to the first nonzero element in columns k,...,mbs-1 of
760             row i of U */
761   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
762   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
763   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
764   jl   = il + mbs;
765   for (i=0; i<mbs; i++) {
766     jl[i] = mbs; il[0] = 0;
767   }
768   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
769   uik  = dk + 4;
770   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
771 
772   /* check permutation */
773   if (!a->permute){
774     ai = a->i; aj = a->j; aa = a->a;
775   } else {
776     ai   = a->inew; aj = a->jnew;
777     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
778     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
779     ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr);
780     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
781 
782     for (i=0; i<mbs; i++){
783       jmin = ai[i]; jmax = ai[i+1];
784       for (j=jmin; j<jmax; j++){
785         while (a2anew[j] != j){
786           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
787           for (k1=0; k1<4; k1++){
788             dk[k1]       = aa[k*4+k1];
789             aa[k*4+k1] = aa[j*4+k1];
790             aa[j*4+k1] = dk[k1];
791           }
792         }
793         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
794         if (i > aj[j]){
795           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
796           ap = aa + j*4;     /* ptr to the beginning of the block */
797           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
798           ap[1] = ap[2];
799           ap[2] = dk[1];
800         }
801       }
802     }
803     ierr = PetscFree(a2anew);CHKERRQ(ierr);
804   }
805 
806   /* for each row k */
807   for (k = 0; k<mbs; k++){
808 
809     /*initialize k-th row with elements nonzero in row perm(k) of A */
810     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
811     ap = aa + jmin*4;
812     for (j = jmin; j < jmax; j++){
813       vj = perm_ptr[aj[j]];         /* block col. index */
814       rtmp_ptr = rtmp + vj*4;
815       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
816     }
817 
818     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
819     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
820     i = jl[k]; /* first row to be added to k_th row  */
821 
822     while (i < k){
823       nexti = jl[i]; /* next row to be added to k_th row */
824 
825       /* compute multiplier */
826       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
827 
828       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
829       diag = ba + i*4;
830       u    = ba + ili*4;
831       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
832       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
833       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
834       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
835 
836       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
837       dk[0] += uik[0]*u[0] + uik[1]*u[1];
838       dk[1] += uik[2]*u[0] + uik[3]*u[1];
839       dk[2] += uik[0]*u[2] + uik[1]*u[3];
840       dk[3] += uik[2]*u[2] + uik[3]*u[3];
841 
842       /* update -U(i,k): ba[ili] = uik */
843       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
844 
845       /* add multiple of row i to k-th row ... */
846       jmin = ili + 1; jmax = bi[i+1];
847       if (jmin < jmax){
848         for (j=jmin; j<jmax; j++) {
849           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
850           rtmp_ptr = rtmp + bj[j]*4;
851           u = ba + j*4;
852           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
853           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
854           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
855           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
856         }
857 
858         /* ... add i to row list for next nonzero entry */
859         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
860         j     = bj[jmin];
861         jl[i] = jl[j]; jl[j] = i; /* update jl */
862       }
863       i = nexti;
864     }
865 
866     /* save nonzero entries in k-th row of U ... */
867 
868     /* invert diagonal block */
869     diag = ba+k*4;
870     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
871     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
872 
873     jmin = bi[k]; jmax = bi[k+1];
874     if (jmin < jmax) {
875       for (j=jmin; j<jmax; j++){
876          vj = bj[j];           /* block col. index of U */
877          u   = ba + j*4;
878          rtmp_ptr = rtmp + vj*4;
879          for (k1=0; k1<4; k1++){
880            *u++        = *rtmp_ptr;
881            *rtmp_ptr++ = 0.0;
882          }
883       }
884 
885       /* ... add k to row list for first nonzero entry in k-th row */
886       il[k] = jmin;
887       i     = bj[jmin];
888       jl[k] = jl[i]; jl[i] = k;
889     }
890   }
891 
892   ierr = PetscFree(rtmp);CHKERRQ(ierr);
893   ierr = PetscFree(il);CHKERRQ(ierr);
894   ierr = PetscFree(dk);CHKERRQ(ierr);
895   if (a->permute) {
896     ierr = PetscFree(aa);CHKERRQ(ierr);
897   }
898   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
899   C->factor    = FACTOR_CHOLESKY;
900   C->assembled = PETSC_TRUE;
901   C->preallocated = PETSC_TRUE;
902   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
903   PetscFunctionReturn(0);
904 }
905 
906 /*
907       Version for when blocks are 2 by 2 Using natural ordering
908 */
909 #undef __FUNCT__
910 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
911 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
912 {
913   Mat                C = *B;
914   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
915   PetscErrorCode ierr;
916   int i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
917   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
918   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
919   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
920 
921   PetscFunctionBegin;
922 
923   /* initialization */
924   /* il and jl record the first nonzero element in each row of the accessing
925      window U(0:k, k:mbs-1).
926      jl:    list of rows to be added to uneliminated rows
927             i>= k: jl(i) is the first row to be added to row i
928             i<  k: jl(i) is the row following row i in some list of rows
929             jl(i) = mbs indicates the end of a list
930      il(i): points to the first nonzero element in columns k,...,mbs-1 of
931             row i of U */
932   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
933   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
934   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
935   jl   = il + mbs;
936   for (i=0; i<mbs; i++) {
937     jl[i] = mbs; il[0] = 0;
938   }
939   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
940   uik  = dk + 4;
941 
942   ai = a->i; aj = a->j; aa = a->a;
943 
944   /* for each row k */
945   for (k = 0; k<mbs; k++){
946 
947     /*initialize k-th row with elements nonzero in row k of A */
948     jmin = ai[k]; jmax = ai[k+1];
949     ap = aa + jmin*4;
950     for (j = jmin; j < jmax; j++){
951       vj = aj[j];         /* block col. index */
952       rtmp_ptr = rtmp + vj*4;
953       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
954     }
955 
956     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
957     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
958     i = jl[k]; /* first row to be added to k_th row  */
959 
960     while (i < k){
961       nexti = jl[i]; /* next row to be added to k_th row */
962 
963       /* compute multiplier */
964       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
965 
966       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
967       diag = ba + i*4;
968       u    = ba + ili*4;
969       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
970       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
971       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
972       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
973 
974       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
975       dk[0] += uik[0]*u[0] + uik[1]*u[1];
976       dk[1] += uik[2]*u[0] + uik[3]*u[1];
977       dk[2] += uik[0]*u[2] + uik[1]*u[3];
978       dk[3] += uik[2]*u[2] + uik[3]*u[3];
979 
980       /* update -U(i,k): ba[ili] = uik */
981       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
982 
983       /* add multiple of row i to k-th row ... */
984       jmin = ili + 1; jmax = bi[i+1];
985       if (jmin < jmax){
986         for (j=jmin; j<jmax; j++) {
987           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
988           rtmp_ptr = rtmp + bj[j]*4;
989           u = ba + j*4;
990           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
991           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
992           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
993           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
994         }
995 
996         /* ... add i to row list for next nonzero entry */
997         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
998         j     = bj[jmin];
999         jl[i] = jl[j]; jl[j] = i; /* update jl */
1000       }
1001       i = nexti;
1002     }
1003 
1004     /* save nonzero entries in k-th row of U ... */
1005 
1006     /* invert diagonal block */
1007     diag = ba+k*4;
1008     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1009     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1010 
1011     jmin = bi[k]; jmax = bi[k+1];
1012     if (jmin < jmax) {
1013       for (j=jmin; j<jmax; j++){
1014          vj = bj[j];           /* block col. index of U */
1015          u   = ba + j*4;
1016          rtmp_ptr = rtmp + vj*4;
1017          for (k1=0; k1<4; k1++){
1018            *u++        = *rtmp_ptr;
1019            *rtmp_ptr++ = 0.0;
1020          }
1021       }
1022 
1023       /* ... add k to row list for first nonzero entry in k-th row */
1024       il[k] = jmin;
1025       i     = bj[jmin];
1026       jl[k] = jl[i]; jl[i] = k;
1027     }
1028   }
1029 
1030   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1031   ierr = PetscFree(il);CHKERRQ(ierr);
1032   ierr = PetscFree(dk);CHKERRQ(ierr);
1033 
1034   C->factor    = FACTOR_CHOLESKY;
1035   C->assembled = PETSC_TRUE;
1036   C->preallocated = PETSC_TRUE;
1037   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 /*
1042     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
1043     Version for blocks are 1 by 1.
1044 */
1045 #undef __FUNCT__
1046 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1047 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
1048 {
1049   Mat                C = *B;
1050   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1051   IS                 ip = b->row;
1052   PetscErrorCode ierr;
1053   int                *rip,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
1054   int                *ai,*aj,*r;
1055   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
1056   MatScalar          *rtmp;
1057   MatScalar          *ba = b->a,*aa,ak;
1058   MatScalar          dk,uikdi;
1059 
1060   PetscFunctionBegin;
1061   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1062   if (!a->permute){
1063     ai = a->i; aj = a->j; aa = a->a;
1064   } else {
1065     ai = a->inew; aj = a->jnew;
1066     ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1067     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1068     ierr = PetscMalloc(ai[mbs]*sizeof(int),&r);CHKERRQ(ierr);
1069     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1070 
1071     jmin = ai[0]; jmax = ai[mbs];
1072     for (j=jmin; j<jmax; j++){
1073       while (r[j] != j){
1074         k = r[j]; r[j] = r[k]; r[k] = k;
1075         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
1076       }
1077     }
1078     ierr = PetscFree(r);CHKERRQ(ierr);
1079   }
1080 
1081   /* initialization */
1082   /* il and jl record the first nonzero element in each row of the accessing
1083      window U(0:k, k:mbs-1).
1084      jl:    list of rows to be added to uneliminated rows
1085             i>= k: jl(i) is the first row to be added to row i
1086             i<  k: jl(i) is the row following row i in some list of rows
1087             jl(i) = mbs indicates the end of a list
1088      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1089             row i of U */
1090   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1091   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
1092   jl   = il + mbs;
1093   for (i=0; i<mbs; i++) {
1094     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1095   }
1096 
1097   /* for each row k */
1098   for (k = 0; k<mbs; k++){
1099 
1100     /*initialize k-th row with elements nonzero in row perm(k) of A */
1101     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1102 
1103     for (j = jmin; j < jmax; j++){
1104       vj = rip[aj[j]];
1105       rtmp[vj] = aa[j];
1106     }
1107 
1108     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1109     dk = rtmp[k];
1110     i = jl[k]; /* first row to be added to k_th row  */
1111 
1112     while (i < k){
1113       nexti = jl[i]; /* next row to be added to k_th row */
1114 
1115       /* compute multiplier, update D(k) and U(i,k) */
1116       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1117       uikdi = - ba[ili]*ba[i];
1118       dk += uikdi*ba[ili];
1119       ba[ili] = uikdi; /* -U(i,k) */
1120 
1121       /* add multiple of row i to k-th row ... */
1122       jmin = ili + 1; jmax = bi[i+1];
1123       if (jmin < jmax){
1124         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1125         /* ... add i to row list for next nonzero entry */
1126         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1127         j     = bj[jmin];
1128         jl[i] = jl[j]; jl[j] = i; /* update jl */
1129       }
1130       i = nexti;
1131     }
1132 
1133     /* check for zero pivot and save diagoanl element */
1134     if (dk == 0.0){
1135       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1136       /*
1137     } else if (PetscRealPart(dk) < 0.0){
1138       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk);
1139       */
1140     }
1141 
1142     /* save nonzero entries in k-th row of U ... */
1143     ba[k] = 1.0/dk;
1144     jmin = bi[k]; jmax = bi[k+1];
1145     if (jmin < jmax) {
1146       for (j=jmin; j<jmax; j++){
1147          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1148       }
1149       /* ... add k to row list for first nonzero entry in k-th row */
1150       il[k] = jmin;
1151       i     = bj[jmin];
1152       jl[k] = jl[i]; jl[i] = k;
1153     }
1154   }
1155 
1156   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1157   ierr = PetscFree(il);CHKERRQ(ierr);
1158   if (a->permute){
1159     ierr = PetscFree(aa);CHKERRQ(ierr);
1160   }
1161 
1162   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1163   C->factor    = FACTOR_CHOLESKY;
1164   C->assembled = PETSC_TRUE;
1165   C->preallocated = PETSC_TRUE;
1166   PetscLogFlops(b->mbs);
1167   PetscFunctionReturn(0);
1168 }
1169 
1170 /*
1171   Version for when blocks are 1 by 1 Using natural ordering
1172 */
1173 #undef __FUNCT__
1174 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1175 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
1176 {
1177   Mat                C = *B;
1178   Mat_SeqSBAIJ       *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1179   PetscErrorCode ierr;
1180   int i,j,mbs = a->mbs;
1181   int                *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1182   int                k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1183   MatScalar          *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1184   PetscReal          damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount;
1185   PetscTruth         damp,chshift;
1186   int                nshift=0;
1187 
1188   PetscFunctionBegin;
1189   /* initialization */
1190   /* il and jl record the first nonzero element in each row of the accessing
1191      window U(0:k, k:mbs-1).
1192      jl:    list of rows to be added to uneliminated rows
1193             i>= k: jl(i) is the first row to be added to row i
1194             i<  k: jl(i) is the row following row i in some list of rows
1195             jl(i) = mbs indicates the end of a list
1196      il(i): points to the first nonzero element in U(i,k:mbs-1)
1197   */
1198   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1199   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
1200   jl   = il + mbs;
1201 
1202   shift_amount = 0;
1203   do {
1204     damp = PETSC_FALSE;
1205     chshift = PETSC_FALSE;
1206     for (i=0; i<mbs; i++) {
1207       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1208     }
1209 
1210     for (k = 0; k<mbs; k++){ /* row k */
1211     /*initialize k-th row with elements nonzero in row perm(k) of A */
1212       nz   = ai[k+1] - ai[k];
1213       acol = aj + ai[k];
1214       aval = aa + ai[k];
1215       bval = ba + bi[k];
1216       while (nz -- ){
1217         rtmp[*acol++] = *aval++;
1218         *bval++       = 0.0; /* for in-place factorization */
1219       }
1220       /* damp the diagonal of the matrix */
1221       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
1222 
1223       /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1224       dk = rtmp[k];
1225       i  = jl[k]; /* first row to be added to k_th row  */
1226 
1227       while (i < k){
1228         nexti = jl[i]; /* next row to be added to k_th row */
1229 
1230         /* compute multiplier, update D(k) and U(i,k) */
1231         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1232         uikdi = - ba[ili]*ba[bi[i]];
1233         dk   += uikdi*ba[ili];
1234         ba[ili] = uikdi; /* -U(i,k) */
1235 
1236         /* add multiple of row i to k-th row ... */
1237         jmin = ili + 1;
1238         nz   = bi[i+1] - jmin;
1239         if (nz > 0){
1240           bcol = bj + jmin;
1241           bval = ba + jmin;
1242           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1243           /* ... add i to row list for next nonzero entry */
1244           il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1245           j     = bj[jmin];
1246           jl[i] = jl[j]; jl[j] = i; /* update jl */
1247         }
1248         i = nexti;
1249       }
1250 
1251       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1252 	/* calculate a shift that would make this row diagonally dominant */
1253 	PetscReal rs = PetscAbs(PetscRealPart(dk));
1254 	jmin      = bi[k]+1;
1255 	nz        = bi[k+1] - jmin;
1256 	if (nz){
1257 	  bcol = bj + jmin;
1258 	  bval = ba + jmin;
1259 	  while (nz--){
1260 	    rs += PetscAbsScalar(rtmp[*bcol++]);
1261 	  }
1262 	}
1263 	/* if this shift is less than the previous, just up the previous
1264 	   one by a bit */
1265 	shift_amount = PetscMax(rs,1.1*shift_amount);
1266 	chshift  = PETSC_TRUE;
1267 	/* Unlike in the ILU case there is no exit condition on nshift:
1268 	   we increase the shift until it converges. There is no guarantee that
1269 	   this algorithm converges faster or slower, or is better or worse
1270 	   than the ILU algorithm. */
1271 	nshift++;
1272 	break;
1273       }
1274       if (PetscRealPart(dk) < zeropivot){
1275         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1276         if (damping > 0.0) {
1277           if (ndamp) damping *= 2.0;
1278           damp = PETSC_TRUE;
1279           ndamp++;
1280           break;
1281         } else if (PetscAbsScalar(dk) < zeropivot){
1282           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1283         } else {
1284           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %d of Cholesky factorization\n",PetscRealPart(dk),k);
1285         }
1286       }
1287 
1288       /* save nonzero entries in k-th row of U ... */
1289       /* printf("%d, dk: %g, 1/dk: %g\n",k,dk,1/dk); */
1290       ba[bi[k]] = 1.0/dk;
1291       jmin      = bi[k]+1;
1292       nz        = bi[k+1] - jmin;
1293       if (nz){
1294         bcol = bj + jmin;
1295         bval = ba + jmin;
1296         while (nz--){
1297           *bval++       = rtmp[*bcol];
1298           rtmp[*bcol++] = 0.0;
1299         }
1300         /* ... add k to row list for first nonzero entry in k-th row */
1301         il[k] = jmin;
1302         i     = bj[jmin];
1303         jl[k] = jl[i]; jl[i] = k;
1304       }
1305     } /* end of for (k = 0; k<mbs; k++) */
1306   } while (damp||chshift);
1307   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1308   ierr = PetscFree(il);CHKERRQ(ierr);
1309 
1310   C->factor       = FACTOR_CHOLESKY;
1311   C->assembled    = PETSC_TRUE;
1312   C->preallocated = PETSC_TRUE;
1313   PetscLogFlops(b->mbs);
1314   if (ndamp) {
1315     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %d damping value %g\n",ndamp,damping);
1316   }
1317  if (nshift) {
1318     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %d shifts\n",nshift);
1319   }
1320 
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 #undef __FUNCT__
1325 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1326 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1327 {
1328   PetscErrorCode ierr;
1329   Mat C;
1330 
1331   PetscFunctionBegin;
1332   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1333   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
1334   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 
1339