xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision d4e9f3b6443ec535fe5e409f785fa3cf019ceb6c)
1 /*$Id: sbaijfact.c,v 1.61 2001/08/06 21:15:47 bsmith Exp $*/
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/mat/impls/sbaij/seq/sbaij.h"
5 #include "src/vec/vecimpl.h"
6 #include "src/inline/ilu.h"
7 #include "include/petscis.h"
8 
9 #if !defined(PETSC_USE_COMPLEX)
10 /*
11   input:
12    F -- numeric factor
13   output:
14    nneg, nzero, npos: matrix inertia
15 */
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
19 int MatGetInertia_SeqSBAIJ(Mat F,int *nneig,int *nzero,int *npos)
20 {
21   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
22   PetscScalar  *dd = fact_ptr->a;
23   int          m = F->m,i;
24 
25   PetscFunctionBegin;
26   if (nneig){
27     *nneig = 0;
28     for (i=0; i<m; i++){
29       if (PetscRealPart(dd[i]) < 0.0) (*nneig)++;
30     }
31   }
32   if (nzero){
33     *nzero = 0;
34     for (i=0; i<m; i++){
35       if (PetscRealPart(dd[i]) == 0.0) (*nzero)++;
36     }
37   }
38   if (npos){
39     *npos = 0;
40     for (i=0; i<m; i++){
41       if (PetscRealPart(dd[i]) > 0.0) (*npos)++;
42     }
43   }
44   PetscFunctionReturn(0);
45 }
46 #endif /* !defined(PETSC_USE_COMPLEX) */
47 
48 /* Using Modified Sparse Row (MSR) storage.
49 See page 85, "Iterative Methods ..." by Saad. */
50 /*
51     Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
52 */
53 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
54 #undef __FUNCT__
55 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
56 int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
57 {
58   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
59   int          *rip,ierr,i,mbs = a->mbs,*ai,*aj;
60   int          *jutmp,bs = a->bs,bs2=a->bs2;
61   int          m,realloc = 0,prow;
62   int          *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
63   int          *il,ili,nextprow;
64   PetscReal    f = info->fill;
65   PetscTruth   perm_identity;
66 
67   PetscFunctionBegin;
68   /* check whether perm is the identity mapping */
69   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
70 
71   /* -- inplace factorization, i.e., use sbaij for *B -- */
72   if (perm_identity && bs==1 ){
73     if (!perm_identity) a->permute = PETSC_TRUE;
74 
75   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
76 
77   if (perm_identity){ /* without permutation */
78     ai = a->i; aj = a->j;
79   } else {            /* non-trivial permutation */
80     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
81     ai = a->inew; aj = a->jnew;
82   }
83 
84   /* initialization */
85   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr);
86   umax  = (int)(f*ai[mbs] + 1);
87   ierr  = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr);
88   iu[0] = 0;
89   juidx = 0; /* index for ju */
90   ierr  = PetscMalloc((3*mbs+1)*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */
91   q     = jl + mbs;   /* linked list for col index of active row */
92   il    = q  + mbs;
93   for (i=0; i<mbs; i++){
94     jl[i] = mbs;
95     q[i]  = 0;
96     il[i] = 0;
97   }
98 
99   /* for each row k */
100   for (k=0; k<mbs; k++){
101     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
102     q[k] = mbs;
103     /* initialize nonzero structure of k-th row to row rip[k] of A */
104     jmin = ai[rip[k]] +1; /* exclude diag[k] */
105     jmax = ai[rip[k]+1];
106     for (j=jmin; j<jmax; j++){
107       vj = rip[aj[j]]; /* col. value */
108       if(vj > k){
109         qm = k;
110         do {
111           m  = qm; qm = q[m];
112         } while(qm < vj);
113         if (qm == vj) {
114           SETERRQ(1," error: duplicate entry in A\n");
115         }
116         nzk++;
117         q[m]  = vj;
118         q[vj] = qm;
119       } /* if(vj > k) */
120     } /* for (j=jmin; j<jmax; j++) */
121 
122     /* modify nonzero structure of k-th row by computing fill-in
123        for each row i to be merged in */
124     prow = k;
125     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
126 
127     while (prow < k){
128       nextprow = jl[prow];
129 
130       /* merge row prow into k-th row */
131       ili = il[prow];
132       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
133       jmax = iu[prow+1];
134       qm = k;
135       for (j=jmin; j<jmax; j++){
136         vj = ju[j];
137         do {
138           m = qm; qm = q[m];
139         } while (qm < vj);
140         if (qm != vj){  /* a fill */
141           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
142         }
143       } /* end of for (j=jmin; j<jmax; j++) */
144       if (jmin < jmax){
145         il[prow] = jmin;
146         j = ju[jmin];
147         jl[prow] = jl[j]; jl[j] = prow;  /* update jl */
148       }
149       prow = nextprow;
150     }
151 
152     /* update il and jl */
153     if (nzk > 0){
154       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
155       jl[k] = jl[i]; jl[i] = k;
156       il[k] = iu[k] + 1;
157     }
158     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */
159 
160     /* allocate more space to ju if needed */
161     if (iu[k+1] > umax) {
162       /* estimate how much additional space we will need */
163       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
164       /* just double the memory each time */
165       maxadd = umax;
166       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
167       umax += maxadd;
168 
169       /* allocate a longer ju */
170       ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
171       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
172       ierr = PetscFree(ju);CHKERRQ(ierr);
173       ju   = jutmp;
174       realloc++; /* count how many times we realloc */
175     }
176 
177     /* save nonzero structure of k-th row in ju */
178     ju[juidx++] = k; /* diag[k] */
179     i = k;
180     while (nzk --) {
181       i           = q[i];
182       ju[juidx++] = i;
183     }
184   }
185 
186   if (ai[mbs] != 0) {
187     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
188     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
189     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
190     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
191     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
192   } else {
193      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
194   }
195 
196   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
197   /* ierr = PetscFree(q);CHKERRQ(ierr); */
198   ierr = PetscFree(jl);CHKERRQ(ierr);
199 
200   /* put together the new matrix */
201   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
202   /* PetscLogObjectParent(*B,iperm); */
203   b = (Mat_SeqSBAIJ*)(*B)->data;
204   ierr = PetscFree(b->imax);CHKERRQ(ierr);
205   b->singlemalloc = PETSC_FALSE;
206   /* the next line frees the default space generated by the Create() */
207   ierr = PetscFree(b->a);CHKERRQ(ierr);
208   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
209   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
210   b->j    = ju;
211   b->i    = iu;
212   b->diag = 0;
213   b->ilen = 0;
214   b->imax = 0;
215   b->row  = perm;
216   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
217   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
218   b->icol = perm;
219   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
220   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
221   /* In b structure:  Free imax, ilen, old a, old j.
222      Allocate idnew, solve_work, new a, new j */
223   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
224   b->s_maxnz = b->s_nz = iu[mbs];
225 
226   (*B)->factor                 = FACTOR_CHOLESKY;
227   (*B)->info.factor_mallocs    = realloc;
228   (*B)->info.fill_ratio_given  = f;
229   if (ai[mbs] != 0) {
230     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
231   } else {
232     (*B)->info.fill_ratio_needed = 0.0;
233   }
234 
235 
236   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
237   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
238   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
239 
240   PetscFunctionReturn(0);
241   }
242   /* -----------  end of new code --------------------*/
243 
244 
245   if (!perm_identity) a->permute = PETSC_TRUE;
246 
247   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
248 
249   if (perm_identity){ /* without permutation */
250     ai = a->i; aj = a->j;
251   } else {            /* non-trivial permutation */
252     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
253     ai = a->inew; aj = a->jnew;
254   }
255 
256   /* initialization */
257   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr);
258   umax  = (int)(f*ai[mbs] + 1); umax += mbs + 1;
259   ierr  = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr);
260   iu[0] = mbs+1;
261   juidx = mbs + 1; /* index for ju */
262   ierr  = PetscMalloc(2*mbs*sizeof(int),&jl);CHKERRQ(ierr); /* linked list for pivot row */
263   q     = jl + mbs;   /* linked list for col index */
264   for (i=0; i<mbs; i++){
265     jl[i] = mbs;
266     q[i] = 0;
267   }
268 
269   /* for each row k */
270   for (k=0; k<mbs; k++){
271     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
272     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
273     q[k] = mbs;
274     /* initialize nonzero structure of k-th row to row rip[k] of A */
275     jmin = ai[rip[k]] +1; /* exclude diag[k] */
276     jmax = ai[rip[k]+1];
277     for (j=jmin; j<jmax; j++){
278       vj = rip[aj[j]]; /* col. value */
279       if(vj > k){
280         qm = k;
281         do {
282           m  = qm; qm = q[m];
283         } while(qm < vj);
284         if (qm == vj) {
285           SETERRQ(1," error: duplicate entry in A\n");
286         }
287         nzk++;
288         q[m]  = vj;
289         q[vj] = qm;
290       } /* if(vj > k) */
291     } /* for (j=jmin; j<jmax; j++) */
292 
293     /* modify nonzero structure of k-th row by computing fill-in
294        for each row i to be merged in */
295     prow = k;
296     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
297 
298     while (prow < k){
299       /* merge row prow into k-th row */
300       jmin = iu[prow] + 1; jmax = iu[prow+1];
301       qm = k;
302       for (j=jmin; j<jmax; j++){
303         vj = ju[j];
304         do {
305           m = qm; qm = q[m];
306         } while (qm < vj);
307         if (qm != vj){
308          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
309         }
310       }
311       prow = jl[prow]; /* next pivot row */
312     }
313 
314     /* add k to row list for first nonzero element in k-th row */
315     if (nzk > 0){
316       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
317       jl[k] = jl[i]; jl[i] = k;
318     }
319     iu[k+1] = iu[k] + nzk;
320 
321     /* allocate more space to ju if needed */
322     if (iu[k+1] > umax) {
323       /* estimate how much additional space we will need */
324       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
325       /* just double the memory each time */
326       maxadd = umax;
327       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
328       umax += maxadd;
329 
330       /* allocate a longer ju */
331       ierr = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
332       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
333       ierr = PetscFree(ju);CHKERRQ(ierr);
334       ju   = jutmp;
335       realloc++; /* count how many times we realloc */
336     }
337 
338     /* save nonzero structure of k-th row in ju */
339     i=k;
340     while (nzk --) {
341       i           = q[i];
342       ju[juidx++] = i;
343     }
344   }
345 
346   if (ai[mbs] != 0) {
347     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
348     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
349     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
350     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
351     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
352   } else {
353      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
354   }
355 
356   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
357   /* ierr = PetscFree(q);CHKERRQ(ierr); */
358   ierr = PetscFree(jl);CHKERRQ(ierr);
359 
360   /* put together the new matrix */
361   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
362   /* PetscLogObjectParent(*B,iperm); */
363   b = (Mat_SeqSBAIJ*)(*B)->data;
364   ierr = PetscFree(b->imax);CHKERRQ(ierr);
365   b->singlemalloc = PETSC_FALSE;
366   /* the next line frees the default space generated by the Create() */
367   ierr = PetscFree(b->a);CHKERRQ(ierr);
368   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
369   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
370   b->j    = ju;
371   b->i    = iu;
372   b->diag = 0;
373   b->ilen = 0;
374   b->imax = 0;
375   b->row  = perm;
376   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
377   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
378   b->icol = perm;
379   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
380   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
381   /* In b structure:  Free imax, ilen, old a, old j.
382      Allocate idnew, solve_work, new a, new j */
383   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
384   b->s_maxnz = b->s_nz = iu[mbs];
385 
386   (*B)->factor                 = FACTOR_CHOLESKY;
387   (*B)->info.factor_mallocs    = realloc;
388   (*B)->info.fill_ratio_given  = f;
389   if (ai[mbs] != 0) {
390     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
391   } else {
392     (*B)->info.fill_ratio_needed = 0.0;
393   }
394 
395   if (perm_identity){
396     switch (bs) {
397       case 1:
398         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
399         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
400         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
401         break;
402       case 2:
403         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
404         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
405         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
406         break;
407       case 3:
408         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
409         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
410         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
411         break;
412       case 4:
413         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
414         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
415         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
416         break;
417       case 5:
418         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
419         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
420         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
421         break;
422       case 6:
423         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
424         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
425         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
426         break;
427       case 7:
428         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
429         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
430         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
431       break;
432       default:
433         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
434         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
435         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
436       break;
437     }
438   }
439 
440   PetscFunctionReturn(0);
441 }
442 
443 
444 #undef __FUNCT__
445 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
446 int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
447 {
448   Mat                C = *B;
449   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
450   IS                 perm = b->row;
451   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
452   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
453   int                bs=a->bs,bs2 = a->bs2;
454   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
455   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
456   MatScalar          *work;
457   int                *pivots;
458 
459   PetscFunctionBegin;
460 
461   /* initialization */
462   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
463   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
464   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
465   jl   = il + mbs;
466   for (i=0; i<mbs; i++) {
467     jl[i] = mbs; il[0] = 0;
468   }
469   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
470   uik  = dk + bs2;
471   work = uik + bs2;
472   ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr);
473 
474   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
475 
476   /* check permutation */
477   if (!a->permute){
478     ai = a->i; aj = a->j; aa = a->a;
479   } else {
480     ai   = a->inew; aj = a->jnew;
481     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
482     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
483     ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr);
484     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
485 
486     for (i=0; i<mbs; i++){
487       jmin = ai[i]; jmax = ai[i+1];
488       for (j=jmin; j<jmax; j++){
489         while (a2anew[j] != j){
490           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
491           for (k1=0; k1<bs2; k1++){
492             dk[k1]       = aa[k*bs2+k1];
493             aa[k*bs2+k1] = aa[j*bs2+k1];
494             aa[j*bs2+k1] = dk[k1];
495           }
496         }
497         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
498         if (i > aj[j]){
499           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
500           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
501           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
502           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
503             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
504           }
505         }
506       }
507     }
508     ierr = PetscFree(a2anew);CHKERRQ(ierr);
509   }
510 
511   /* for each row k */
512   for (k = 0; k<mbs; k++){
513 
514     /*initialize k-th row with elements nonzero in row perm(k) of A */
515     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
516 
517     ap = aa + jmin*bs2;
518     for (j = jmin; j < jmax; j++){
519       vj = perm_ptr[aj[j]];         /* block col. index */
520       rtmp_ptr = rtmp + vj*bs2;
521       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
522     }
523 
524     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
525     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
526     i = jl[k]; /* first row to be added to k_th row  */
527 
528     while (i < k){
529       nexti = jl[i]; /* next row to be added to k_th row */
530 
531       /* compute multiplier */
532       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
533 
534       /* uik = -inv(Di)*U_bar(i,k) */
535       diag = ba + i*bs2;
536       u    = ba + ili*bs2;
537       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
538       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
539 
540       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
541       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
542 
543       /* update -U(i,k) */
544       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
545 
546       /* add multiple of row i to k-th row ... */
547       jmin = ili + 1; jmax = bi[i+1];
548       if (jmin < jmax){
549         for (j=jmin; j<jmax; j++) {
550           /* rtmp += -U(i,k)^T * U_bar(i,j) */
551           rtmp_ptr = rtmp + bj[j]*bs2;
552           u = ba + j*bs2;
553           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
554         }
555 
556         /* ... add i to row list for next nonzero entry */
557         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
558         j     = bj[jmin];
559         jl[i] = jl[j]; jl[j] = i; /* update jl */
560       }
561       i = nexti;
562     }
563 
564     /* save nonzero entries in k-th row of U ... */
565 
566     /* invert diagonal block */
567     diag = ba+k*bs2;
568     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
569     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
570 
571     jmin = bi[k]; jmax = bi[k+1];
572     if (jmin < jmax) {
573       for (j=jmin; j<jmax; j++){
574          vj = bj[j];           /* block col. index of U */
575          u   = ba + j*bs2;
576          rtmp_ptr = rtmp + vj*bs2;
577          for (k1=0; k1<bs2; k1++){
578            *u++        = *rtmp_ptr;
579            *rtmp_ptr++ = 0.0;
580          }
581       }
582 
583       /* ... add k to row list for first nonzero entry in k-th row */
584       il[k] = jmin;
585       i     = bj[jmin];
586       jl[k] = jl[i]; jl[i] = k;
587     }
588   }
589 
590   ierr = PetscFree(rtmp);CHKERRQ(ierr);
591   ierr = PetscFree(il);CHKERRQ(ierr);
592   ierr = PetscFree(dk);CHKERRQ(ierr);
593   ierr = PetscFree(pivots);CHKERRQ(ierr);
594   if (a->permute){
595     ierr = PetscFree(aa);CHKERRQ(ierr);
596   }
597 
598   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
599   C->factor       = FACTOR_CHOLESKY;
600   C->assembled    = PETSC_TRUE;
601   C->preallocated = PETSC_TRUE;
602   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
603   PetscFunctionReturn(0);
604 }
605 
606 #undef __FUNCT__
607 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
608 int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
609 {
610   Mat                C = *B;
611   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
612   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
613   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
614   int                bs=a->bs,bs2 = a->bs2;
615   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
616   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
617   MatScalar          *work;
618   int                *pivots;
619 
620   PetscFunctionBegin;
621 
622   /* initialization */
623 
624   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
625   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
626   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
627   jl   = il + mbs;
628   for (i=0; i<mbs; i++) {
629     jl[i] = mbs; il[0] = 0;
630   }
631   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
632   uik  = dk + bs2;
633   work = uik + bs2;
634   ierr = PetscMalloc(bs*sizeof(int),&pivots);CHKERRQ(ierr);
635 
636   ai = a->i; aj = a->j; aa = a->a;
637 
638   /* for each row k */
639   for (k = 0; k<mbs; k++){
640 
641     /*initialize k-th row with elements nonzero in row k of A */
642     jmin = ai[k]; jmax = ai[k+1];
643     ap = aa + jmin*bs2;
644     for (j = jmin; j < jmax; j++){
645       vj = aj[j];         /* block col. index */
646       rtmp_ptr = rtmp + vj*bs2;
647       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
648     }
649 
650     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
651     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
652     i = jl[k]; /* first row to be added to k_th row  */
653 
654     while (i < k){
655       nexti = jl[i]; /* next row to be added to k_th row */
656 
657       /* compute multiplier */
658       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
659 
660       /* uik = -inv(Di)*U_bar(i,k) */
661       diag = ba + i*bs2;
662       u    = ba + ili*bs2;
663       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
664       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
665 
666       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
667       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
668 
669       /* update -U(i,k) */
670       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
671 
672       /* add multiple of row i to k-th row ... */
673       jmin = ili + 1; jmax = bi[i+1];
674       if (jmin < jmax){
675         for (j=jmin; j<jmax; j++) {
676           /* rtmp += -U(i,k)^T * U_bar(i,j) */
677           rtmp_ptr = rtmp + bj[j]*bs2;
678           u = ba + j*bs2;
679           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
680         }
681 
682         /* ... add i to row list for next nonzero entry */
683         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
684         j     = bj[jmin];
685         jl[i] = jl[j]; jl[j] = i; /* update jl */
686       }
687       i = nexti;
688     }
689 
690     /* save nonzero entries in k-th row of U ... */
691 
692     /* invert diagonal block */
693     diag = ba+k*bs2;
694     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
695     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
696 
697     jmin = bi[k]; jmax = bi[k+1];
698     if (jmin < jmax) {
699       for (j=jmin; j<jmax; j++){
700          vj = bj[j];           /* block col. index of U */
701          u   = ba + j*bs2;
702          rtmp_ptr = rtmp + vj*bs2;
703          for (k1=0; k1<bs2; k1++){
704            *u++        = *rtmp_ptr;
705            *rtmp_ptr++ = 0.0;
706          }
707       }
708 
709       /* ... add k to row list for first nonzero entry in k-th row */
710       il[k] = jmin;
711       i     = bj[jmin];
712       jl[k] = jl[i]; jl[i] = k;
713     }
714   }
715 
716   ierr = PetscFree(rtmp);CHKERRQ(ierr);
717   ierr = PetscFree(il);CHKERRQ(ierr);
718   ierr = PetscFree(dk);CHKERRQ(ierr);
719   ierr = PetscFree(pivots);CHKERRQ(ierr);
720 
721   C->factor    = FACTOR_CHOLESKY;
722   C->assembled = PETSC_TRUE;
723   C->preallocated = PETSC_TRUE;
724   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
725   PetscFunctionReturn(0);
726 }
727 
728 /*
729     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
730     Version for blocks 2 by 2.
731 */
732 #undef __FUNCT__
733 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
734 int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
735 {
736   Mat                C = *B;
737   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
738   IS                 perm = b->row;
739   int                *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
740   int                *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
741   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
742   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
743 
744   PetscFunctionBegin;
745 
746   /* initialization */
747   /* il and jl record the first nonzero element in each row of the accessing
748      window U(0:k, k:mbs-1).
749      jl:    list of rows to be added to uneliminated rows
750             i>= k: jl(i) is the first row to be added to row i
751             i<  k: jl(i) is the row following row i in some list of rows
752             jl(i) = mbs indicates the end of a list
753      il(i): points to the first nonzero element in columns k,...,mbs-1 of
754             row i of U */
755   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
756   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
757   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
758   jl   = il + mbs;
759   for (i=0; i<mbs; i++) {
760     jl[i] = mbs; il[0] = 0;
761   }
762   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
763   uik  = dk + 4;
764   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
765 
766   /* check permutation */
767   if (!a->permute){
768     ai = a->i; aj = a->j; aa = a->a;
769   } else {
770     ai   = a->inew; aj = a->jnew;
771     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
772     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
773     ierr = PetscMalloc(ai[mbs]*sizeof(int),&a2anew);CHKERRQ(ierr);
774     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
775 
776     for (i=0; i<mbs; i++){
777       jmin = ai[i]; jmax = ai[i+1];
778       for (j=jmin; j<jmax; j++){
779         while (a2anew[j] != j){
780           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
781           for (k1=0; k1<4; k1++){
782             dk[k1]       = aa[k*4+k1];
783             aa[k*4+k1] = aa[j*4+k1];
784             aa[j*4+k1] = dk[k1];
785           }
786         }
787         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
788         if (i > aj[j]){
789           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
790           ap = aa + j*4;     /* ptr to the beginning of the block */
791           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
792           ap[1] = ap[2];
793           ap[2] = dk[1];
794         }
795       }
796     }
797     ierr = PetscFree(a2anew);CHKERRQ(ierr);
798   }
799 
800   /* for each row k */
801   for (k = 0; k<mbs; k++){
802 
803     /*initialize k-th row with elements nonzero in row perm(k) of A */
804     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
805     ap = aa + jmin*4;
806     for (j = jmin; j < jmax; j++){
807       vj = perm_ptr[aj[j]];         /* block col. index */
808       rtmp_ptr = rtmp + vj*4;
809       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
810     }
811 
812     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
813     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
814     i = jl[k]; /* first row to be added to k_th row  */
815 
816     while (i < k){
817       nexti = jl[i]; /* next row to be added to k_th row */
818 
819       /* compute multiplier */
820       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
821 
822       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
823       diag = ba + i*4;
824       u    = ba + ili*4;
825       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
826       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
827       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
828       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
829 
830       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
831       dk[0] += uik[0]*u[0] + uik[1]*u[1];
832       dk[1] += uik[2]*u[0] + uik[3]*u[1];
833       dk[2] += uik[0]*u[2] + uik[1]*u[3];
834       dk[3] += uik[2]*u[2] + uik[3]*u[3];
835 
836       /* update -U(i,k): ba[ili] = uik */
837       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
838 
839       /* add multiple of row i to k-th row ... */
840       jmin = ili + 1; jmax = bi[i+1];
841       if (jmin < jmax){
842         for (j=jmin; j<jmax; j++) {
843           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
844           rtmp_ptr = rtmp + bj[j]*4;
845           u = ba + j*4;
846           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
847           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
848           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
849           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
850         }
851 
852         /* ... add i to row list for next nonzero entry */
853         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
854         j     = bj[jmin];
855         jl[i] = jl[j]; jl[j] = i; /* update jl */
856       }
857       i = nexti;
858     }
859 
860     /* save nonzero entries in k-th row of U ... */
861 
862     /* invert diagonal block */
863     diag = ba+k*4;
864     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
865     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
866 
867     jmin = bi[k]; jmax = bi[k+1];
868     if (jmin < jmax) {
869       for (j=jmin; j<jmax; j++){
870          vj = bj[j];           /* block col. index of U */
871          u   = ba + j*4;
872          rtmp_ptr = rtmp + vj*4;
873          for (k1=0; k1<4; k1++){
874            *u++        = *rtmp_ptr;
875            *rtmp_ptr++ = 0.0;
876          }
877       }
878 
879       /* ... add k to row list for first nonzero entry in k-th row */
880       il[k] = jmin;
881       i     = bj[jmin];
882       jl[k] = jl[i]; jl[i] = k;
883     }
884   }
885 
886   ierr = PetscFree(rtmp);CHKERRQ(ierr);
887   ierr = PetscFree(il);CHKERRQ(ierr);
888   ierr = PetscFree(dk);CHKERRQ(ierr);
889   if (a->permute) {
890     ierr = PetscFree(aa);CHKERRQ(ierr);
891   }
892   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
893   C->factor    = FACTOR_CHOLESKY;
894   C->assembled = PETSC_TRUE;
895   C->preallocated = PETSC_TRUE;
896   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
897   PetscFunctionReturn(0);
898 }
899 
900 /*
901       Version for when blocks are 2 by 2 Using natural ordering
902 */
903 #undef __FUNCT__
904 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
905 int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
906 {
907   Mat                C = *B;
908   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
909   int                ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
910   int                *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
911   MatScalar          *ba = b->a,*aa,*ap,*dk,*uik;
912   MatScalar          *u,*diag,*rtmp,*rtmp_ptr;
913 
914   PetscFunctionBegin;
915 
916   /* initialization */
917   /* il and jl record the first nonzero element in each row of the accessing
918      window U(0:k, k:mbs-1).
919      jl:    list of rows to be added to uneliminated rows
920             i>= k: jl(i) is the first row to be added to row i
921             i<  k: jl(i) is the row following row i in some list of rows
922             jl(i) = mbs indicates the end of a list
923      il(i): points to the first nonzero element in columns k,...,mbs-1 of
924             row i of U */
925   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
926   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
927   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
928   jl   = il + mbs;
929   for (i=0; i<mbs; i++) {
930     jl[i] = mbs; il[0] = 0;
931   }
932   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
933   uik  = dk + 4;
934 
935   ai = a->i; aj = a->j; aa = a->a;
936 
937   /* for each row k */
938   for (k = 0; k<mbs; k++){
939 
940     /*initialize k-th row with elements nonzero in row k of A */
941     jmin = ai[k]; jmax = ai[k+1];
942     ap = aa + jmin*4;
943     for (j = jmin; j < jmax; j++){
944       vj = aj[j];         /* block col. index */
945       rtmp_ptr = rtmp + vj*4;
946       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
947     }
948 
949     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
950     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
951     i = jl[k]; /* first row to be added to k_th row  */
952 
953     while (i < k){
954       nexti = jl[i]; /* next row to be added to k_th row */
955 
956       /* compute multiplier */
957       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
958 
959       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
960       diag = ba + i*4;
961       u    = ba + ili*4;
962       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
963       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
964       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
965       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
966 
967       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
968       dk[0] += uik[0]*u[0] + uik[1]*u[1];
969       dk[1] += uik[2]*u[0] + uik[3]*u[1];
970       dk[2] += uik[0]*u[2] + uik[1]*u[3];
971       dk[3] += uik[2]*u[2] + uik[3]*u[3];
972 
973       /* update -U(i,k): ba[ili] = uik */
974       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
975 
976       /* add multiple of row i to k-th row ... */
977       jmin = ili + 1; jmax = bi[i+1];
978       if (jmin < jmax){
979         for (j=jmin; j<jmax; j++) {
980           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
981           rtmp_ptr = rtmp + bj[j]*4;
982           u = ba + j*4;
983           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
984           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
985           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
986           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
987         }
988 
989         /* ... add i to row list for next nonzero entry */
990         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
991         j     = bj[jmin];
992         jl[i] = jl[j]; jl[j] = i; /* update jl */
993       }
994       i = nexti;
995     }
996 
997     /* save nonzero entries in k-th row of U ... */
998 
999     /* invert diagonal block */
1000     diag = ba+k*4;
1001     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1002     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1003 
1004     jmin = bi[k]; jmax = bi[k+1];
1005     if (jmin < jmax) {
1006       for (j=jmin; j<jmax; j++){
1007          vj = bj[j];           /* block col. index of U */
1008          u   = ba + j*4;
1009          rtmp_ptr = rtmp + vj*4;
1010          for (k1=0; k1<4; k1++){
1011            *u++        = *rtmp_ptr;
1012            *rtmp_ptr++ = 0.0;
1013          }
1014       }
1015 
1016       /* ... add k to row list for first nonzero entry in k-th row */
1017       il[k] = jmin;
1018       i     = bj[jmin];
1019       jl[k] = jl[i]; jl[i] = k;
1020     }
1021   }
1022 
1023   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1024   ierr = PetscFree(il);CHKERRQ(ierr);
1025   ierr = PetscFree(dk);CHKERRQ(ierr);
1026 
1027   C->factor    = FACTOR_CHOLESKY;
1028   C->assembled = PETSC_TRUE;
1029   C->preallocated = PETSC_TRUE;
1030   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 /*
1035     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
1036     Version for blocks are 1 by 1.
1037 */
1038 #undef __FUNCT__
1039 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1040 int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
1041 {
1042   Mat                C = *B;
1043   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1044   IS                 ip = b->row;
1045   int                *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
1046   int                *ai,*aj,*r;
1047   int                k,jmin,jmax,*jl,*il,vj,nexti,ili;
1048   MatScalar          *rtmp;
1049   MatScalar          *ba = b->a,*aa,ak;
1050   MatScalar          dk,uikdi;
1051 
1052   PetscFunctionBegin;
1053   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1054   if (!a->permute){
1055     ai = a->i; aj = a->j; aa = a->a;
1056   } else {
1057     ai = a->inew; aj = a->jnew;
1058     ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1059     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1060     ierr = PetscMalloc(ai[mbs]*sizeof(int),&r);CHKERRQ(ierr);
1061     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));CHKERRQ(ierr);
1062 
1063     jmin = ai[0]; jmax = ai[mbs];
1064     for (j=jmin; j<jmax; j++){
1065       while (r[j] != j){
1066         k = r[j]; r[j] = r[k]; r[k] = k;
1067         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
1068       }
1069     }
1070     ierr = PetscFree(r);CHKERRQ(ierr);
1071   }
1072 
1073   /* initialization */
1074   /* il and jl record the first nonzero element in each row of the accessing
1075      window U(0:k, k:mbs-1).
1076      jl:    list of rows to be added to uneliminated rows
1077             i>= k: jl(i) is the first row to be added to row i
1078             i<  k: jl(i) is the row following row i in some list of rows
1079             jl(i) = mbs indicates the end of a list
1080      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1081             row i of U */
1082   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1083   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
1084   jl   = il + mbs;
1085   for (i=0; i<mbs; i++) {
1086     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1087   }
1088 
1089   /* for each row k */
1090   for (k = 0; k<mbs; k++){
1091 
1092     /*initialize k-th row with elements nonzero in row perm(k) of A */
1093     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1094 
1095     for (j = jmin; j < jmax; j++){
1096       vj = rip[aj[j]];
1097       rtmp[vj] = aa[j];
1098     }
1099 
1100     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1101     dk = rtmp[k];
1102     i = jl[k]; /* first row to be added to k_th row  */
1103 
1104     while (i < k){
1105       nexti = jl[i]; /* next row to be added to k_th row */
1106 
1107       /* compute multiplier, update D(k) and U(i,k) */
1108       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1109       uikdi = - ba[ili]*ba[i];
1110       dk += uikdi*ba[ili];
1111       ba[ili] = uikdi; /* -U(i,k) */
1112 
1113       /* add multiple of row i to k-th row ... */
1114       jmin = ili + 1; jmax = bi[i+1];
1115       if (jmin < jmax){
1116         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1117         /* ... add i to row list for next nonzero entry */
1118         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1119         j     = bj[jmin];
1120         jl[i] = jl[j]; jl[j] = i; /* update jl */
1121       }
1122       i = nexti;
1123     }
1124 
1125     /* check for zero pivot and save diagoanl element */
1126     if (dk == 0.0){
1127       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1128       /*
1129     } else if (PetscRealPart(dk) < 0.0){
1130       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk);
1131       */
1132     }
1133 
1134     /* save nonzero entries in k-th row of U ... */
1135     ba[k] = 1.0/dk;
1136     jmin = bi[k]; jmax = bi[k+1];
1137     if (jmin < jmax) {
1138       for (j=jmin; j<jmax; j++){
1139          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1140       }
1141       /* ... add k to row list for first nonzero entry in k-th row */
1142       il[k] = jmin;
1143       i     = bj[jmin];
1144       jl[k] = jl[i]; jl[i] = k;
1145     }
1146   }
1147 
1148   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1149   ierr = PetscFree(il);CHKERRQ(ierr);
1150   if (a->permute){
1151     ierr = PetscFree(aa);CHKERRQ(ierr);
1152   }
1153 
1154   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1155   C->factor    = FACTOR_CHOLESKY;
1156   C->assembled = PETSC_TRUE;
1157   C->preallocated = PETSC_TRUE;
1158   PetscLogFlops(b->mbs);
1159   PetscFunctionReturn(0);
1160 }
1161 
1162 /*
1163   Version for when blocks are 1 by 1 Using natural ordering
1164 */
1165 #undef __FUNCT__
1166 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1167 int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
1168 {
1169   Mat                C = *B;
1170   Mat_SeqSBAIJ       *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1171   int                ierr,i,j,mbs = a->mbs;
1172   int                *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1173   int                k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1174   MatScalar          *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1175   PetscReal          damping=b->factor_damping, zeropivot=b->factor_zeropivot;
1176   PetscTruth         damp;
1177 
1178   PetscFunctionBegin;
1179   /* initialization */
1180   /* il and jl record the first nonzero element in each row of the accessing
1181      window U(0:k, k:mbs-1).
1182      jl:    list of rows to be added to uneliminated rows
1183             i>= k: jl(i) is the first row to be added to row i
1184             i<  k: jl(i) is the row following row i in some list of rows
1185             jl(i) = mbs indicates the end of a list
1186      il(i): points to the first nonzero element in U(i,k:mbs-1)
1187   */
1188   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1189   ierr = PetscMalloc(2*mbs*sizeof(int),&il);CHKERRQ(ierr);
1190   jl   = il + mbs;
1191 
1192   do {
1193     damp = PETSC_FALSE;
1194     for (i=0; i<mbs; i++) {
1195       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1196     }
1197 
1198     for (k = 0; k<mbs; k++){ /* row k */
1199     /*initialize k-th row with elements nonzero in row perm(k) of A */
1200       nz   = ai[k+1] - ai[k];
1201       acol = aj + ai[k];
1202       aval = aa + ai[k];
1203       bval = ba + bi[k];
1204       while (nz -- ){
1205         rtmp[*acol++] = *aval++;
1206         *bval++       = 0.0; /* for in-place factorization */
1207       }
1208       /* damp the diagonal of the matrix */
1209       if (ndamp) rtmp[k] += damping;
1210 
1211       /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1212       dk = rtmp[k];
1213       i  = jl[k]; /* first row to be added to k_th row  */
1214 
1215       while (i < k){
1216         nexti = jl[i]; /* next row to be added to k_th row */
1217 
1218         /* compute multiplier, update D(k) and U(i,k) */
1219         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1220         uikdi = - ba[ili]*ba[bi[i]];
1221         dk   += uikdi*ba[ili];
1222         ba[ili] = uikdi; /* -U(i,k) */
1223 
1224         /* add multiple of row i to k-th row ... */
1225         jmin = ili + 1;
1226         nz   = bi[i+1] - jmin;
1227         if (nz > 0){
1228           bcol = bj + jmin;
1229           bval = ba + jmin;
1230           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1231           /* ... add i to row list for next nonzero entry */
1232           il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1233           j     = bj[jmin];
1234           jl[i] = jl[j]; jl[j] = i; /* update jl */
1235         }
1236         i = nexti;
1237       }
1238 
1239       /* check for zero pivot and save diagoanl element */
1240       if (dk < zeropivot){
1241         if (damping > 0.0) {
1242           if (ndamp) damping *= 2.0;
1243           damp = PETSC_TRUE;
1244           ndamp++;
1245           break;
1246         } else if (PetscAbsScalar(dk) < zeropivot){
1247           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",k,dk,zeropivot);
1248         } else {
1249           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %d of Cholesky factorization\n",1./PetscRealPart(dk),k);
1250         }
1251       }
1252 
1253       /* save nonzero entries in k-th row of U ... */
1254       ba[bi[k]] = 1.0/dk;
1255       jmin      = bi[k]+1;
1256       nz        = bi[k+1] - jmin;
1257       if (nz){
1258         bcol = bj + jmin;
1259         bval = ba + jmin;
1260         while (nz--){
1261           *bval++       = rtmp[*bcol];
1262           rtmp[*bcol++] = 0.0;
1263         }
1264         /* ... add k to row list for first nonzero entry in k-th row */
1265         il[k] = jmin;
1266         i     = bj[jmin];
1267         jl[k] = jl[i]; jl[i] = k;
1268       }
1269     } /* end of for (k = 0; k<mbs; k++) */
1270   } while (damp);
1271   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1272   ierr = PetscFree(il);CHKERRQ(ierr);
1273 
1274   C->factor       = FACTOR_CHOLESKY;
1275   C->assembled    = PETSC_TRUE;
1276   C->preallocated = PETSC_TRUE;
1277   PetscLogFlops(b->mbs);
1278   if (ndamp) {
1279     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %d damping value %g\n",ndamp,damping);
1280   }
1281   PetscFunctionReturn(0);
1282 }
1283 
1284 #undef __FUNCT__
1285 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1286 int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1287 {
1288   int ierr;
1289   Mat C;
1290 
1291   PetscFunctionBegin;
1292   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1293   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
1294   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 
1299