xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 9ee9b77b2cca71d2a08ebc06fc4da7cd72e3025a)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/mat/impls/sbaij/seq/sbaij.h"
4 #include "src/inline/ilu.h"
5 #include "include/petscis.h"
6 
7 #if !defined(PETSC_USE_COMPLEX)
8 /*
9   input:
10    F -- numeric factor
11   output:
12    nneg, nzero, npos: matrix inertia
13 */
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
17 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
18 {
19   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
20   PetscScalar  *dd = fact_ptr->a;
21   PetscInt     mbs=fact_ptr->mbs,bs=F->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
22 
23   PetscFunctionBegin;
24   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
25   nneig_tmp = 0; npos_tmp = 0;
26   for (i=0; i<mbs; i++){
27     if (PetscRealPart(dd[*fi]) > 0.0){
28       npos_tmp++;
29     } else if (PetscRealPart(dd[*fi]) < 0.0){
30       nneig_tmp++;
31     }
32     fi++;
33   }
34   if (nneig) *nneig = nneig_tmp;
35   if (npos)  *npos  = npos_tmp;
36   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
37 
38   PetscFunctionReturn(0);
39 }
40 #endif /* !defined(PETSC_USE_COMPLEX) */
41 
42 /*
43   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
44   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
45 */
46 #undef __FUNCT__
47 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
48 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
49 {
50   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
51   PetscErrorCode ierr;
52   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
53   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
54   PetscInt       m,reallocs = 0,prow;
55   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
56   PetscReal      f = info->fill;
57   PetscTruth     perm_identity;
58 
59   PetscFunctionBegin;
60   /* check whether perm is the identity mapping */
61   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
62   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
63 
64   if (perm_identity){ /* without permutation */
65     a->permute = PETSC_FALSE;
66     ai = a->i; aj = a->j;
67   } else {            /* non-trivial permutation */
68     a->permute = PETSC_TRUE;
69     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
70     ai = a->inew; aj = a->jnew;
71   }
72 
73   /* initialization */
74   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
75   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
76   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
77   iu[0] = mbs+1;
78   juidx = mbs + 1; /* index for ju */
79   ierr  = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */
80   q     = jl + mbs;   /* linked list for col index */
81   for (i=0; i<mbs; i++){
82     jl[i] = mbs;
83     q[i] = 0;
84   }
85 
86   /* for each row k */
87   for (k=0; k<mbs; k++){
88     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
89     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
90     q[k] = mbs;
91     /* initialize nonzero structure of k-th row to row rip[k] of A */
92     jmin = ai[rip[k]] +1; /* exclude diag[k] */
93     jmax = ai[rip[k]+1];
94     for (j=jmin; j<jmax; j++){
95       vj = rip[aj[j]]; /* col. value */
96       if(vj > k){
97         qm = k;
98         do {
99           m  = qm; qm = q[m];
100         } while(qm < vj);
101         if (qm == vj) {
102           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
103         }
104         nzk++;
105         q[m]  = vj;
106         q[vj] = qm;
107       } /* if(vj > k) */
108     } /* for (j=jmin; j<jmax; j++) */
109 
110     /* modify nonzero structure of k-th row by computing fill-in
111        for each row i to be merged in */
112     prow = k;
113     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
114 
115     while (prow < k){
116       /* merge row prow into k-th row */
117       jmin = iu[prow] + 1; jmax = iu[prow+1];
118       qm = k;
119       for (j=jmin; j<jmax; j++){
120         vj = ju[j];
121         do {
122           m = qm; qm = q[m];
123         } while (qm < vj);
124         if (qm != vj){
125          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
126         }
127       }
128       prow = jl[prow]; /* next pivot row */
129     }
130 
131     /* add k to row list for first nonzero element in k-th row */
132     if (nzk > 0){
133       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134       jl[k] = jl[i]; jl[i] = k;
135     }
136     iu[k+1] = iu[k] + nzk;
137 
138     /* allocate more space to ju if needed */
139     if (iu[k+1] > umax) {
140       /* estimate how much additional space we will need */
141       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
142       /* just double the memory each time */
143       maxadd = umax;
144       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
145       umax += maxadd;
146 
147       /* allocate a longer ju */
148       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
149       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
150       ierr = PetscFree(ju);CHKERRQ(ierr);
151       ju   = jutmp;
152       reallocs++; /* count how many times we realloc */
153     }
154 
155     /* save nonzero structure of k-th row in ju */
156     i=k;
157     while (nzk --) {
158       i           = q[i];
159       ju[juidx++] = i;
160     }
161   }
162 
163   if (ai[mbs] != 0) {
164     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
165     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
166     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
167     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
168     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
169   } else {
170      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
171   }
172 
173   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
174   ierr = PetscFree(jl);CHKERRQ(ierr);
175 
176   /* put together the new matrix */
177   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
178   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
179   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
180 
181   /* PetscLogObjectParent(*B,iperm); */
182   b = (Mat_SeqSBAIJ*)(*B)->data;
183   ierr = PetscFree(b->imax);CHKERRQ(ierr);
184   b->singlemalloc = PETSC_FALSE;
185   /* the next line frees the default space generated by the Create() */
186   ierr = PetscFree(b->a);CHKERRQ(ierr);
187   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
188   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
189   b->j    = ju;
190   b->i    = iu;
191   b->diag = 0;
192   b->ilen = 0;
193   b->imax = 0;
194   b->row  = perm;
195   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
196   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
197   b->icol = perm;
198   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
199   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
200   /* In b structure:  Free imax, ilen, old a, old j.
201      Allocate idnew, solve_work, new a, new j */
202   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
203   b->maxnz = b->nz = iu[mbs];
204 
205   (*B)->factor                 = FACTOR_CHOLESKY;
206   (*B)->info.factor_mallocs    = reallocs;
207   (*B)->info.fill_ratio_given  = f;
208   if (ai[mbs] != 0) {
209     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
210   } else {
211     (*B)->info.fill_ratio_needed = 0.0;
212   }
213 
214   if (perm_identity){
215     switch (bs) {
216       case 1:
217         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
218         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
219         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
220         break;
221       case 2:
222         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
223         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
224         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
225         break;
226       case 3:
227         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
228         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
229         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
230         break;
231       case 4:
232         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
233         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
234         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
235         break;
236       case 5:
237         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
238         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
239         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
240         break;
241       case 6:
242         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
243         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
244         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
245         break;
246       case 7:
247         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
248         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
249         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
250       break;
251       default:
252         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
253         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
254         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
255       break;
256     }
257   }
258   PetscFunctionReturn(0);
259 }
260 /*
261     Symbolic U^T*D*U factorization for SBAIJ format.
262 */
263 #undef __FUNCT__
264 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
265 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
266 {
267   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
268   PetscErrorCode ierr;
269   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
270   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
271   PetscInt       m,reallocs = 0,prow;
272   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
273   PetscInt       *il,ili,nextprow;
274   PetscReal      f = info->fill;
275   PetscTruth     perm_identity;
276 
277   PetscFunctionBegin;
278   /*
279    This code originally uses Modified Sparse Row (MSR) storage
280    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
281    Then it is rewritten so the factor B takes seqsbaij format. However the associated
282    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
283    thus the original code in MSR format is still used for these cases.
284    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
285    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
286   */
287 
288   /* check whether perm is the identity mapping */
289   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
290   if (bs>1 || !perm_identity){
291     a->permute = PETSC_TRUE;
292     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,B);CHKERRQ(ierr);
293     PetscFunctionReturn(0);
294   }
295 
296   /* At present, the code below works only for perm_identity=PETSC_TRUE */
297   printf(" MatCholeskyFactorSymbolic_SeqSBAIJ\n");
298   a->permute = PETSC_FALSE;
299 
300   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
301   ai = a->i; aj = a->j;
302 
303   /* initialization */
304   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
305   umax  = (PetscInt)(f*ai[mbs] + 1);
306   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
307   iu[0] = 0;
308   juidx = 0; /* index for ju */
309   ierr  = PetscMalloc((3*mbs+1)*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */
310   q     = jl + mbs;   /* linked list for col index of active row */
311   il    = q  + mbs;
312   for (i=0; i<mbs; i++){
313     jl[i] = mbs;
314     q[i]  = 0;
315     il[i] = 0;
316   }
317 
318   /* for each row k */
319   for (k=0; k<mbs; k++){
320     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
321     q[k] = mbs;
322     /* initialize nonzero structure of k-th row to row rip[k] of A */
323     jmin = ai[rip[k]] +1; /* exclude diag[k] */
324     jmax = ai[rip[k]+1];
325     for (j=jmin; j<jmax; j++){
326       vj = rip[aj[j]]; /* col. value */
327       if(vj > k){
328         qm = k;
329         do {
330           m  = qm; qm = q[m];
331         } while(qm < vj);
332         if (qm == vj) {
333           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
334         }
335         nzk++;
336         q[m]  = vj;
337         q[vj] = qm;
338       } /* if(vj > k) */
339     } /* for (j=jmin; j<jmax; j++) */
340 
341     /* modify nonzero structure of k-th row by computing fill-in
342        for each row i to be merged in */
343     prow = k;
344     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
345 
346     while (prow < k){
347       nextprow = jl[prow];
348 
349       /* merge row prow into k-th row */
350       ili = il[prow];
351       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
352       jmax = iu[prow+1];
353       qm = k;
354       for (j=jmin; j<jmax; j++){
355         vj = ju[j];
356         do {
357           m = qm; qm = q[m];
358         } while (qm < vj);
359         if (qm != vj){  /* a fill */
360           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
361         }
362       } /* end of for (j=jmin; j<jmax; j++) */
363       if (jmin < jmax){
364         il[prow] = jmin;
365         j = ju[jmin];
366         jl[prow] = jl[j]; jl[j] = prow;  /* update jl */
367       }
368       prow = nextprow;
369     }
370 
371     /* update il and jl */
372     if (nzk > 0){
373       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
374       jl[k] = jl[i]; jl[i] = k;
375       il[k] = iu[k] + 1;
376     }
377     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */
378 
379     /* allocate more space to ju if needed */
380     if (iu[k+1] > umax) {
381       /* estimate how much additional space we will need */
382       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
383       /* just double the memory each time */
384       maxadd = umax;
385       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
386       umax += maxadd;
387 
388       /* allocate a longer ju */
389       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
390       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
391       ierr = PetscFree(ju);CHKERRQ(ierr);
392       ju   = jutmp;
393       reallocs++; /* count how many times we realloc */
394     }
395 
396     /* save nonzero structure of k-th row in ju */
397     ju[juidx++] = k; /* diag[k] */
398     i = k;
399     while (nzk --) {
400       i           = q[i];
401       ju[juidx++] = i;
402     }
403   }
404 
405   if (ai[mbs] != 0) {
406     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
407     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
408     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
409     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
410     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
411   } else {
412      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
413   }
414 
415   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
416   /* ierr = PetscFree(q);CHKERRQ(ierr); */
417   ierr = PetscFree(jl);CHKERRQ(ierr);
418 
419   /* put together the new matrix */
420   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
421   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
422   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
423 
424   /* PetscLogObjectParent(*B,iperm); */
425   b = (Mat_SeqSBAIJ*)(*B)->data;
426   ierr = PetscFree(b->imax);CHKERRQ(ierr);
427   b->singlemalloc = PETSC_FALSE;
428   /* the next line frees the default space generated by the Create() */
429   ierr = PetscFree(b->a);CHKERRQ(ierr);
430   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
431   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
432   b->j    = ju;
433   b->i    = iu;
434   b->diag = 0;
435   b->ilen = 0;
436   b->imax = 0;
437   b->row  = perm;
438   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
439   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
440   b->icol = perm;
441   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
442   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
443   /* In b structure:  Free imax, ilen, old a, old j.
444      Allocate idnew, solve_work, new a, new j */
445   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
446   b->maxnz = b->nz = iu[mbs];
447 
448   (*B)->factor                 = FACTOR_CHOLESKY;
449   (*B)->info.factor_mallocs    = reallocs;
450   (*B)->info.fill_ratio_given  = f;
451   if (ai[mbs] != 0) {
452     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
453   } else {
454     (*B)->info.fill_ratio_needed = 0.0;
455   }
456 
457   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
458   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
459   (*B)->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
460   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
461   PetscFunctionReturn(0);
462 }
463 #undef __FUNCT__
464 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
465 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
466 {
467   Mat            C = *B;
468   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
469   IS             perm = b->row;
470   PetscErrorCode ierr;
471   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
472   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
473   PetscInt       bs=A->bs,bs2 = a->bs2;
474   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
475   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
476   MatScalar      *work;
477   PetscInt       *pivots;
478 
479   PetscFunctionBegin;
480   /* initialization */
481   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
482   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
483   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
484   jl   = il + mbs;
485   for (i=0; i<mbs; i++) {
486     jl[i] = mbs; il[0] = 0;
487   }
488   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
489   uik  = dk + bs2;
490   work = uik + bs2;
491   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
492 
493   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
494 
495   /* check permutation */
496   if (!a->permute){
497     ai = a->i; aj = a->j; aa = a->a;
498   } else {
499     ai   = a->inew; aj = a->jnew;
500     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
501     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
502     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
503     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
504 
505     for (i=0; i<mbs; i++){
506       jmin = ai[i]; jmax = ai[i+1];
507       for (j=jmin; j<jmax; j++){
508         while (a2anew[j] != j){
509           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
510           for (k1=0; k1<bs2; k1++){
511             dk[k1]       = aa[k*bs2+k1];
512             aa[k*bs2+k1] = aa[j*bs2+k1];
513             aa[j*bs2+k1] = dk[k1];
514           }
515         }
516         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
517         if (i > aj[j]){
518           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
519           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
520           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
521           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
522             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
523           }
524         }
525       }
526     }
527     ierr = PetscFree(a2anew);CHKERRQ(ierr);
528   }
529 
530   /* for each row k */
531   for (k = 0; k<mbs; k++){
532 
533     /*initialize k-th row with elements nonzero in row perm(k) of A */
534     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
535 
536     ap = aa + jmin*bs2;
537     for (j = jmin; j < jmax; j++){
538       vj = perm_ptr[aj[j]];         /* block col. index */
539       rtmp_ptr = rtmp + vj*bs2;
540       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
541     }
542 
543     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
544     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
545     i = jl[k]; /* first row to be added to k_th row  */
546 
547     while (i < k){
548       nexti = jl[i]; /* next row to be added to k_th row */
549 
550       /* compute multiplier */
551       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
552 
553       /* uik = -inv(Di)*U_bar(i,k) */
554       diag = ba + i*bs2;
555       u    = ba + ili*bs2;
556       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
557       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
558 
559       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
560       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
561 
562       /* update -U(i,k) */
563       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
564 
565       /* add multiple of row i to k-th row ... */
566       jmin = ili + 1; jmax = bi[i+1];
567       if (jmin < jmax){
568         for (j=jmin; j<jmax; j++) {
569           /* rtmp += -U(i,k)^T * U_bar(i,j) */
570           rtmp_ptr = rtmp + bj[j]*bs2;
571           u = ba + j*bs2;
572           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
573         }
574 
575         /* ... add i to row list for next nonzero entry */
576         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
577         j     = bj[jmin];
578         jl[i] = jl[j]; jl[j] = i; /* update jl */
579       }
580       i = nexti;
581     }
582 
583     /* save nonzero entries in k-th row of U ... */
584 
585     /* invert diagonal block */
586     diag = ba+k*bs2;
587     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
588     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
589 
590     jmin = bi[k]; jmax = bi[k+1];
591     if (jmin < jmax) {
592       for (j=jmin; j<jmax; j++){
593          vj = bj[j];           /* block col. index of U */
594          u   = ba + j*bs2;
595          rtmp_ptr = rtmp + vj*bs2;
596          for (k1=0; k1<bs2; k1++){
597            *u++        = *rtmp_ptr;
598            *rtmp_ptr++ = 0.0;
599          }
600       }
601 
602       /* ... add k to row list for first nonzero entry in k-th row */
603       il[k] = jmin;
604       i     = bj[jmin];
605       jl[k] = jl[i]; jl[i] = k;
606     }
607   }
608 
609   ierr = PetscFree(rtmp);CHKERRQ(ierr);
610   ierr = PetscFree(il);CHKERRQ(ierr);
611   ierr = PetscFree(dk);CHKERRQ(ierr);
612   ierr = PetscFree(pivots);CHKERRQ(ierr);
613   if (a->permute){
614     ierr = PetscFree(aa);CHKERRQ(ierr);
615   }
616 
617   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
618   C->factor       = FACTOR_CHOLESKY;
619   C->assembled    = PETSC_TRUE;
620   C->preallocated = PETSC_TRUE;
621   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
622   PetscFunctionReturn(0);
623 }
624 
625 #undef __FUNCT__
626 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
627 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
628 {
629   Mat            C = *B;
630   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
631   PetscErrorCode ierr;
632   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
633   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
634   PetscInt       bs=A->bs,bs2 = a->bs2;
635   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
636   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
637   MatScalar      *work;
638   PetscInt       *pivots;
639 
640   PetscFunctionBegin;
641   /* initialization */
642 
643   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
644   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
645   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
646   jl   = il + mbs;
647   for (i=0; i<mbs; i++) {
648     jl[i] = mbs; il[0] = 0;
649   }
650   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
651   uik  = dk + bs2;
652   work = uik + bs2;
653   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
654 
655   ai = a->i; aj = a->j; aa = a->a;
656 
657   /* for each row k */
658   for (k = 0; k<mbs; k++){
659 
660     /*initialize k-th row with elements nonzero in row k of A */
661     jmin = ai[k]; jmax = ai[k+1];
662     ap = aa + jmin*bs2;
663     for (j = jmin; j < jmax; j++){
664       vj = aj[j];         /* block col. index */
665       rtmp_ptr = rtmp + vj*bs2;
666       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
667     }
668 
669     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
670     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
671     i = jl[k]; /* first row to be added to k_th row  */
672 
673     while (i < k){
674       nexti = jl[i]; /* next row to be added to k_th row */
675 
676       /* compute multiplier */
677       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
678 
679       /* uik = -inv(Di)*U_bar(i,k) */
680       diag = ba + i*bs2;
681       u    = ba + ili*bs2;
682       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
683       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
684 
685       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
686       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
687 
688       /* update -U(i,k) */
689       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
690 
691       /* add multiple of row i to k-th row ... */
692       jmin = ili + 1; jmax = bi[i+1];
693       if (jmin < jmax){
694         for (j=jmin; j<jmax; j++) {
695           /* rtmp += -U(i,k)^T * U_bar(i,j) */
696           rtmp_ptr = rtmp + bj[j]*bs2;
697           u = ba + j*bs2;
698           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
699         }
700 
701         /* ... add i to row list for next nonzero entry */
702         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
703         j     = bj[jmin];
704         jl[i] = jl[j]; jl[j] = i; /* update jl */
705       }
706       i = nexti;
707     }
708 
709     /* save nonzero entries in k-th row of U ... */
710 
711     /* invert diagonal block */
712     diag = ba+k*bs2;
713     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
714     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
715 
716     jmin = bi[k]; jmax = bi[k+1];
717     if (jmin < jmax) {
718       for (j=jmin; j<jmax; j++){
719          vj = bj[j];           /* block col. index of U */
720          u   = ba + j*bs2;
721          rtmp_ptr = rtmp + vj*bs2;
722          for (k1=0; k1<bs2; k1++){
723            *u++        = *rtmp_ptr;
724            *rtmp_ptr++ = 0.0;
725          }
726       }
727 
728       /* ... add k to row list for first nonzero entry in k-th row */
729       il[k] = jmin;
730       i     = bj[jmin];
731       jl[k] = jl[i]; jl[i] = k;
732     }
733   }
734 
735   ierr = PetscFree(rtmp);CHKERRQ(ierr);
736   ierr = PetscFree(il);CHKERRQ(ierr);
737   ierr = PetscFree(dk);CHKERRQ(ierr);
738   ierr = PetscFree(pivots);CHKERRQ(ierr);
739 
740   C->factor    = FACTOR_CHOLESKY;
741   C->assembled = PETSC_TRUE;
742   C->preallocated = PETSC_TRUE;
743   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
744   PetscFunctionReturn(0);
745 }
746 
747 /*
748     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
749     Version for blocks 2 by 2.
750 */
751 #undef __FUNCT__
752 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
753 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
754 {
755   Mat            C = *B;
756   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
757   IS             perm = b->row;
758   PetscErrorCode ierr;
759   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
760   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
761   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
762   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
763 
764   PetscFunctionBegin;
765   /* initialization */
766   /* il and jl record the first nonzero element in each row of the accessing
767      window U(0:k, k:mbs-1).
768      jl:    list of rows to be added to uneliminated rows
769             i>= k: jl(i) is the first row to be added to row i
770             i<  k: jl(i) is the row following row i in some list of rows
771             jl(i) = mbs indicates the end of a list
772      il(i): points to the first nonzero element in columns k,...,mbs-1 of
773             row i of U */
774   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
775   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
776   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
777   jl   = il + mbs;
778   for (i=0; i<mbs; i++) {
779     jl[i] = mbs; il[0] = 0;
780   }
781   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
782   uik  = dk + 4;
783   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
784 
785   /* check permutation */
786   if (!a->permute){
787     ai = a->i; aj = a->j; aa = a->a;
788   } else {
789     ai   = a->inew; aj = a->jnew;
790     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
791     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
792     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
793     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
794 
795     for (i=0; i<mbs; i++){
796       jmin = ai[i]; jmax = ai[i+1];
797       for (j=jmin; j<jmax; j++){
798         while (a2anew[j] != j){
799           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
800           for (k1=0; k1<4; k1++){
801             dk[k1]       = aa[k*4+k1];
802             aa[k*4+k1] = aa[j*4+k1];
803             aa[j*4+k1] = dk[k1];
804           }
805         }
806         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
807         if (i > aj[j]){
808           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
809           ap = aa + j*4;     /* ptr to the beginning of the block */
810           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
811           ap[1] = ap[2];
812           ap[2] = dk[1];
813         }
814       }
815     }
816     ierr = PetscFree(a2anew);CHKERRQ(ierr);
817   }
818 
819   /* for each row k */
820   for (k = 0; k<mbs; k++){
821 
822     /*initialize k-th row with elements nonzero in row perm(k) of A */
823     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
824     ap = aa + jmin*4;
825     for (j = jmin; j < jmax; j++){
826       vj = perm_ptr[aj[j]];         /* block col. index */
827       rtmp_ptr = rtmp + vj*4;
828       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
829     }
830 
831     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
832     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
833     i = jl[k]; /* first row to be added to k_th row  */
834 
835     while (i < k){
836       nexti = jl[i]; /* next row to be added to k_th row */
837 
838       /* compute multiplier */
839       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
840 
841       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
842       diag = ba + i*4;
843       u    = ba + ili*4;
844       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
845       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
846       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
847       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
848 
849       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
850       dk[0] += uik[0]*u[0] + uik[1]*u[1];
851       dk[1] += uik[2]*u[0] + uik[3]*u[1];
852       dk[2] += uik[0]*u[2] + uik[1]*u[3];
853       dk[3] += uik[2]*u[2] + uik[3]*u[3];
854 
855       /* update -U(i,k): ba[ili] = uik */
856       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
857 
858       /* add multiple of row i to k-th row ... */
859       jmin = ili + 1; jmax = bi[i+1];
860       if (jmin < jmax){
861         for (j=jmin; j<jmax; j++) {
862           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
863           rtmp_ptr = rtmp + bj[j]*4;
864           u = ba + j*4;
865           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
866           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
867           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
868           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
869         }
870 
871         /* ... add i to row list for next nonzero entry */
872         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
873         j     = bj[jmin];
874         jl[i] = jl[j]; jl[j] = i; /* update jl */
875       }
876       i = nexti;
877     }
878 
879     /* save nonzero entries in k-th row of U ... */
880 
881     /* invert diagonal block */
882     diag = ba+k*4;
883     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
884     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
885 
886     jmin = bi[k]; jmax = bi[k+1];
887     if (jmin < jmax) {
888       for (j=jmin; j<jmax; j++){
889          vj = bj[j];           /* block col. index of U */
890          u   = ba + j*4;
891          rtmp_ptr = rtmp + vj*4;
892          for (k1=0; k1<4; k1++){
893            *u++        = *rtmp_ptr;
894            *rtmp_ptr++ = 0.0;
895          }
896       }
897 
898       /* ... add k to row list for first nonzero entry in k-th row */
899       il[k] = jmin;
900       i     = bj[jmin];
901       jl[k] = jl[i]; jl[i] = k;
902     }
903   }
904 
905   ierr = PetscFree(rtmp);CHKERRQ(ierr);
906   ierr = PetscFree(il);CHKERRQ(ierr);
907   ierr = PetscFree(dk);CHKERRQ(ierr);
908   if (a->permute) {
909     ierr = PetscFree(aa);CHKERRQ(ierr);
910   }
911   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
912   C->factor    = FACTOR_CHOLESKY;
913   C->assembled = PETSC_TRUE;
914   C->preallocated = PETSC_TRUE;
915   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
916   PetscFunctionReturn(0);
917 }
918 
919 /*
920       Version for when blocks are 2 by 2 Using natural ordering
921 */
922 #undef __FUNCT__
923 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
924 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
925 {
926   Mat            C = *B;
927   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
928   PetscErrorCode ierr;
929   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
930   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
931   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
932   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
933 
934   PetscFunctionBegin;
935   /* initialization */
936   /* il and jl record the first nonzero element in each row of the accessing
937      window U(0:k, k:mbs-1).
938      jl:    list of rows to be added to uneliminated rows
939             i>= k: jl(i) is the first row to be added to row i
940             i<  k: jl(i) is the row following row i in some list of rows
941             jl(i) = mbs indicates the end of a list
942      il(i): points to the first nonzero element in columns k,...,mbs-1 of
943             row i of U */
944   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
945   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
946   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
947   jl   = il + mbs;
948   for (i=0; i<mbs; i++) {
949     jl[i] = mbs; il[0] = 0;
950   }
951   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
952   uik  = dk + 4;
953 
954   ai = a->i; aj = a->j; aa = a->a;
955 
956   /* for each row k */
957   for (k = 0; k<mbs; k++){
958 
959     /*initialize k-th row with elements nonzero in row k of A */
960     jmin = ai[k]; jmax = ai[k+1];
961     ap = aa + jmin*4;
962     for (j = jmin; j < jmax; j++){
963       vj = aj[j];         /* block col. index */
964       rtmp_ptr = rtmp + vj*4;
965       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
966     }
967 
968     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
969     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
970     i = jl[k]; /* first row to be added to k_th row  */
971 
972     while (i < k){
973       nexti = jl[i]; /* next row to be added to k_th row */
974 
975       /* compute multiplier */
976       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
977 
978       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
979       diag = ba + i*4;
980       u    = ba + ili*4;
981       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
982       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
983       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
984       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
985 
986       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
987       dk[0] += uik[0]*u[0] + uik[1]*u[1];
988       dk[1] += uik[2]*u[0] + uik[3]*u[1];
989       dk[2] += uik[0]*u[2] + uik[1]*u[3];
990       dk[3] += uik[2]*u[2] + uik[3]*u[3];
991 
992       /* update -U(i,k): ba[ili] = uik */
993       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
994 
995       /* add multiple of row i to k-th row ... */
996       jmin = ili + 1; jmax = bi[i+1];
997       if (jmin < jmax){
998         for (j=jmin; j<jmax; j++) {
999           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1000           rtmp_ptr = rtmp + bj[j]*4;
1001           u = ba + j*4;
1002           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1003           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1004           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1005           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1006         }
1007 
1008         /* ... add i to row list for next nonzero entry */
1009         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1010         j     = bj[jmin];
1011         jl[i] = jl[j]; jl[j] = i; /* update jl */
1012       }
1013       i = nexti;
1014     }
1015 
1016     /* save nonzero entries in k-th row of U ... */
1017 
1018     /* invert diagonal block */
1019     diag = ba+k*4;
1020     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1021     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1022 
1023     jmin = bi[k]; jmax = bi[k+1];
1024     if (jmin < jmax) {
1025       for (j=jmin; j<jmax; j++){
1026          vj = bj[j];           /* block col. index of U */
1027          u   = ba + j*4;
1028          rtmp_ptr = rtmp + vj*4;
1029          for (k1=0; k1<4; k1++){
1030            *u++        = *rtmp_ptr;
1031            *rtmp_ptr++ = 0.0;
1032          }
1033       }
1034 
1035       /* ... add k to row list for first nonzero entry in k-th row */
1036       il[k] = jmin;
1037       i     = bj[jmin];
1038       jl[k] = jl[i]; jl[i] = k;
1039     }
1040   }
1041 
1042   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1043   ierr = PetscFree(il);CHKERRQ(ierr);
1044   ierr = PetscFree(dk);CHKERRQ(ierr);
1045 
1046   C->factor    = FACTOR_CHOLESKY;
1047   C->assembled = PETSC_TRUE;
1048   C->preallocated = PETSC_TRUE;
1049   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1050   PetscFunctionReturn(0);
1051 }
1052 
1053 /*
1054     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
1055     Version for blocks are 1 by 1.
1056 */
1057 #undef __FUNCT__
1058 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1059 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
1060 {
1061   Mat            C = *B;
1062   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1063   IS             ip = b->row;
1064   PetscErrorCode ierr;
1065   PetscInt       *rip,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
1066   PetscInt       *ai,*aj,*r;
1067   PetscInt       k,jmin,jmax,*jl,*il,vj,nexti,ili;
1068   MatScalar      *rtmp;
1069   MatScalar      *ba = b->a,*aa,ak;
1070   MatScalar      dk,uikdi;
1071 
1072   PetscFunctionBegin;
1073   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1074   if (!a->permute){
1075     ai = a->i; aj = a->j; aa = a->a;
1076   } else {
1077     ai = a->inew; aj = a->jnew;
1078     ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1079     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1080     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&r);CHKERRQ(ierr);
1081     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
1082 
1083     jmin = ai[0]; jmax = ai[mbs];
1084     for (j=jmin; j<jmax; j++){
1085       while (r[j] != j){
1086         k = r[j]; r[j] = r[k]; r[k] = k;
1087         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
1088       }
1089     }
1090     ierr = PetscFree(r);CHKERRQ(ierr);
1091   }
1092 
1093   /* initialization */
1094   /* il and jl record the first nonzero element in each row of the accessing
1095      window U(0:k, k:mbs-1).
1096      jl:    list of rows to be added to uneliminated rows
1097             i>= k: jl(i) is the first row to be added to row i
1098             i<  k: jl(i) is the row following row i in some list of rows
1099             jl(i) = mbs indicates the end of a list
1100      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1101             row i of U */
1102   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1103   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1104   jl   = il + mbs;
1105   for (i=0; i<mbs; i++) {
1106     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1107   }
1108 
1109   /* for each row k */
1110   for (k = 0; k<mbs; k++){
1111 
1112     /*initialize k-th row with elements nonzero in row perm(k) of A */
1113     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1114 
1115     for (j = jmin; j < jmax; j++){
1116       vj = rip[aj[j]];
1117       rtmp[vj] = aa[j];
1118     }
1119 
1120     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1121     dk = rtmp[k];
1122     i = jl[k]; /* first row to be added to k_th row  */
1123 
1124     while (i < k){
1125       nexti = jl[i]; /* next row to be added to k_th row */
1126 
1127       /* compute multiplier, update D(k) and U(i,k) */
1128       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1129       uikdi = - ba[ili]*ba[i];
1130       dk += uikdi*ba[ili];
1131       ba[ili] = uikdi; /* -U(i,k) */
1132 
1133       /* add multiple of row i to k-th row ... */
1134       jmin = ili + 1; jmax = bi[i+1];
1135       if (jmin < jmax){
1136         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1137         /* ... add i to row list for next nonzero entry */
1138         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1139         j     = bj[jmin];
1140         jl[i] = jl[j]; jl[j] = i; /* update jl */
1141       }
1142       i = nexti;
1143     }
1144 
1145     /* check for zero pivot and save diagoanl element */
1146     if (dk == 0.0){
1147       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1148       /*
1149     } else if (PetscRealPart(dk) < 0.0){
1150       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk);
1151       */
1152     }
1153 
1154     /* save nonzero entries in k-th row of U ... */
1155     ba[k] = 1.0/dk;
1156     jmin = bi[k]; jmax = bi[k+1];
1157     if (jmin < jmax) {
1158       for (j=jmin; j<jmax; j++){
1159          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1160       }
1161       /* ... add k to row list for first nonzero entry in k-th row */
1162       il[k] = jmin;
1163       i     = bj[jmin];
1164       jl[k] = jl[i]; jl[i] = k;
1165     }
1166   }
1167 
1168   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1169   ierr = PetscFree(il);CHKERRQ(ierr);
1170   if (a->permute){
1171     ierr = PetscFree(aa);CHKERRQ(ierr);
1172   }
1173 
1174   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1175   C->factor    = FACTOR_CHOLESKY;
1176   C->assembled = PETSC_TRUE;
1177   C->preallocated = PETSC_TRUE;
1178   PetscLogFlops(b->mbs);
1179   PetscFunctionReturn(0);
1180 }
1181 
1182 /*
1183   Version for when blocks are 1 by 1 Using natural ordering
1184 */
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1187 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
1188 {
1189   Mat            C = *B;
1190   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1191   PetscErrorCode ierr;
1192   PetscInt       i,j,mbs = a->mbs;
1193   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1194   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1195   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1196   PetscReal      damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount;
1197   PetscTruth     damp,chshift;
1198   PetscInt       nshift=0;
1199 
1200   PetscFunctionBegin;
1201   /* initialization */
1202   /* il and jl record the first nonzero element in each row of the accessing
1203      window U(0:k, k:mbs-1).
1204      jl:    list of rows to be added to uneliminated rows
1205             i>= k: jl(i) is the first row to be added to row i
1206             i<  k: jl(i) is the row following row i in some list of rows
1207             jl(i) = mbs indicates the end of a list
1208      il(i): points to the first nonzero element in U(i,k:mbs-1)
1209   */
1210   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1211   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1212   jl   = il + mbs;
1213 
1214   shift_amount = 0;
1215   do {
1216     damp = PETSC_FALSE;
1217     chshift = PETSC_FALSE;
1218     for (i=0; i<mbs; i++) {
1219       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1220     }
1221 
1222     for (k = 0; k<mbs; k++){ /* row k */
1223     /*initialize k-th row with elements nonzero in row perm(k) of A */
1224       nz   = ai[k+1] - ai[k];
1225       acol = aj + ai[k];
1226       aval = aa + ai[k];
1227       bval = ba + bi[k];
1228       while (nz -- ){
1229         rtmp[*acol++] = *aval++;
1230         *bval++       = 0.0; /* for in-place factorization */
1231       }
1232       /* damp the diagonal of the matrix */
1233       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
1234 
1235       /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1236       dk = rtmp[k];
1237       i  = jl[k]; /* first row to be added to k_th row  */
1238 
1239       while (i < k){
1240         nexti = jl[i]; /* next row to be added to k_th row */
1241 
1242         /* compute multiplier, update D(k) and U(i,k) */
1243         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1244         uikdi = - ba[ili]*ba[bi[i]];
1245         dk   += uikdi*ba[ili];
1246         ba[ili] = uikdi; /* -U(i,k) */
1247 
1248         /* add multiple of row i to k-th row ... */
1249         jmin = ili + 1;
1250         nz   = bi[i+1] - jmin;
1251         if (nz > 0){
1252           bcol = bj + jmin;
1253           bval = ba + jmin;
1254           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1255           /* ... add i to row list for next nonzero entry */
1256           il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1257           j     = bj[jmin];
1258           jl[i] = jl[j]; jl[j] = i; /* update jl */
1259         }
1260         i = nexti;
1261       }
1262 
1263       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1264 	/* calculate a shift that would make this row diagonally dominant */
1265 	PetscReal rs = PetscAbs(PetscRealPart(dk));
1266 	jmin      = bi[k]+1;
1267 	nz        = bi[k+1] - jmin;
1268 	if (nz){
1269 	  bcol = bj + jmin;
1270 	  bval = ba + jmin;
1271 	  while (nz--){
1272 	    rs += PetscAbsScalar(rtmp[*bcol++]);
1273 	  }
1274 	}
1275 	/* if this shift is less than the previous, just up the previous
1276 	   one by a bit */
1277 	shift_amount = PetscMax(rs,1.1*shift_amount);
1278 	chshift  = PETSC_TRUE;
1279 	/* Unlike in the ILU case there is no exit condition on nshift:
1280 	   we increase the shift until it converges. There is no guarantee that
1281 	   this algorithm converges faster or slower, or is better or worse
1282 	   than the ILU algorithm. */
1283 	nshift++;
1284 	break;
1285       }
1286       if (PetscRealPart(dk) < zeropivot){
1287         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1288         if (damping > 0.0) {
1289           if (ndamp) damping *= 2.0;
1290           damp = PETSC_TRUE;
1291           ndamp++;
1292           break;
1293         } else if (PetscAbsScalar(dk) < zeropivot){
1294           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1295         } else {
1296           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %D of Cholesky factorization\n",PetscRealPart(dk),k);
1297         }
1298       }
1299 
1300       /* save nonzero entries in k-th row of U ... */
1301       /* printf("%D, dk: %g, 1/dk: %g\n",k,dk,1/dk); */
1302       ba[bi[k]] = 1.0/dk;
1303       jmin      = bi[k]+1;
1304       nz        = bi[k+1] - jmin;
1305       if (nz){
1306         bcol = bj + jmin;
1307         bval = ba + jmin;
1308         while (nz--){
1309           *bval++       = rtmp[*bcol];
1310           rtmp[*bcol++] = 0.0;
1311         }
1312         /* ... add k to row list for first nonzero entry in k-th row */
1313         il[k] = jmin;
1314         i     = bj[jmin];
1315         jl[k] = jl[i]; jl[i] = k;
1316       }
1317     } /* end of for (k = 0; k<mbs; k++) */
1318   } while (damp||chshift);
1319   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1320   ierr = PetscFree(il);CHKERRQ(ierr);
1321 
1322   C->factor       = FACTOR_CHOLESKY;
1323   C->assembled    = PETSC_TRUE;
1324   C->preallocated = PETSC_TRUE;
1325   PetscLogFlops(b->mbs);
1326   if (ndamp) {
1327     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %D damping value %g\n",ndamp,damping);
1328   }
1329  if (nshift) {
1330     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %D shifts\n",nshift);
1331   }
1332 
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 #undef __FUNCT__
1337 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1338 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1339 {
1340   PetscErrorCode ierr;
1341   Mat            C;
1342 
1343   PetscFunctionBegin;
1344   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1345   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
1346   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 
1351