xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 7a48dd6f2aa3b8ed7e706edfa21a0730bd5df65d)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/mat/impls/sbaij/seq/sbaij.h"
4 #include "src/inline/ilu.h"
5 #include "include/petscis.h"
6 
7 #if !defined(PETSC_USE_COMPLEX)
8 /*
9   input:
10    F -- numeric factor
11   output:
12    nneg, nzero, npos: matrix inertia
13 */
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
17 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
18 {
19   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
20   PetscScalar  *dd = fact_ptr->a;
21   PetscInt     mbs=fact_ptr->mbs,bs=F->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
22 
23   PetscFunctionBegin;
24   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
25   nneig_tmp = 0; npos_tmp = 0;
26   for (i=0; i<mbs; i++){
27     if (PetscRealPart(dd[*fi]) > 0.0){
28       npos_tmp++;
29     } else if (PetscRealPart(dd[*fi]) < 0.0){
30       nneig_tmp++;
31     }
32     fi++;
33   }
34   if (nneig) *nneig = nneig_tmp;
35   if (npos)  *npos  = npos_tmp;
36   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
37 
38   PetscFunctionReturn(0);
39 }
40 #endif /* !defined(PETSC_USE_COMPLEX) */
41 
42 /*
43   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
44   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
45 */
46 #undef __FUNCT__
47 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
48 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
49 {
50   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
51   PetscErrorCode ierr;
52   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
53   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
54   PetscInt       m,reallocs = 0,prow;
55   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
56   PetscReal      f = info->fill;
57   PetscTruth     perm_identity;
58 
59   PetscFunctionBegin;
60   /* check whether perm is the identity mapping */
61   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
62   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
63 
64   if (perm_identity){ /* without permutation */
65     a->permute = PETSC_FALSE;
66     ai = a->i; aj = a->j;
67   } else {            /* non-trivial permutation */
68     a->permute = PETSC_TRUE;
69     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
70     ai = a->inew; aj = a->jnew;
71   }
72 
73   /* initialization */
74   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
75   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
76   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
77   iu[0] = mbs+1;
78   juidx = mbs + 1; /* index for ju */
79   ierr  = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */
80   q     = jl + mbs;   /* linked list for col index */
81   for (i=0; i<mbs; i++){
82     jl[i] = mbs;
83     q[i] = 0;
84   }
85 
86   /* for each row k */
87   for (k=0; k<mbs; k++){
88     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
89     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
90     q[k] = mbs;
91     /* initialize nonzero structure of k-th row to row rip[k] of A */
92     jmin = ai[rip[k]] +1; /* exclude diag[k] */
93     jmax = ai[rip[k]+1];
94     for (j=jmin; j<jmax; j++){
95       vj = rip[aj[j]]; /* col. value */
96       if(vj > k){
97         qm = k;
98         do {
99           m  = qm; qm = q[m];
100         } while(qm < vj);
101         if (qm == vj) {
102           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
103         }
104         nzk++;
105         q[m]  = vj;
106         q[vj] = qm;
107       } /* if(vj > k) */
108     } /* for (j=jmin; j<jmax; j++) */
109 
110     /* modify nonzero structure of k-th row by computing fill-in
111        for each row i to be merged in */
112     prow = k;
113     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
114 
115     while (prow < k){
116       /* merge row prow into k-th row */
117       jmin = iu[prow] + 1; jmax = iu[prow+1];
118       qm = k;
119       for (j=jmin; j<jmax; j++){
120         vj = ju[j];
121         do {
122           m = qm; qm = q[m];
123         } while (qm < vj);
124         if (qm != vj){
125          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
126         }
127       }
128       prow = jl[prow]; /* next pivot row */
129     }
130 
131     /* add k to row list for first nonzero element in k-th row */
132     if (nzk > 0){
133       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134       jl[k] = jl[i]; jl[i] = k;
135     }
136     iu[k+1] = iu[k] + nzk;
137 
138     /* allocate more space to ju if needed */
139     if (iu[k+1] > umax) {
140       /* estimate how much additional space we will need */
141       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
142       /* just double the memory each time */
143       maxadd = umax;
144       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
145       umax += maxadd;
146 
147       /* allocate a longer ju */
148       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
149       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
150       ierr = PetscFree(ju);CHKERRQ(ierr);
151       ju   = jutmp;
152       reallocs++; /* count how many times we realloc */
153     }
154 
155     /* save nonzero structure of k-th row in ju */
156     i=k;
157     while (nzk --) {
158       i           = q[i];
159       ju[juidx++] = i;
160     }
161   }
162 
163   if (ai[mbs] != 0) {
164     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
165     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
166     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
167     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
168     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
169   } else {
170      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
171   }
172 
173   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
174   ierr = PetscFree(jl);CHKERRQ(ierr);
175 
176   /* put together the new matrix */
177   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
178   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
179   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
180 
181   /* PetscLogObjectParent(*B,iperm); */
182   b = (Mat_SeqSBAIJ*)(*B)->data;
183   ierr = PetscFree(b->imax);CHKERRQ(ierr);
184   b->singlemalloc = PETSC_FALSE;
185   /* the next line frees the default space generated by the Create() */
186   ierr = PetscFree(b->a);CHKERRQ(ierr);
187   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
188   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
189   b->j    = ju;
190   b->i    = iu;
191   b->diag = 0;
192   b->ilen = 0;
193   b->imax = 0;
194   b->row  = perm;
195   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
196   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
197   b->icol = perm;
198   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
199   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
200   /* In b structure:  Free imax, ilen, old a, old j.
201      Allocate idnew, solve_work, new a, new j */
202   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
203   b->maxnz = b->nz = iu[mbs];
204 
205   (*B)->factor                 = FACTOR_CHOLESKY;
206   (*B)->info.factor_mallocs    = reallocs;
207   (*B)->info.fill_ratio_given  = f;
208   if (ai[mbs] != 0) {
209     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
210   } else {
211     (*B)->info.fill_ratio_needed = 0.0;
212   }
213 
214   if (perm_identity){
215     switch (bs) {
216       case 1:
217         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
218         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
219         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
220         break;
221       case 2:
222         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
223         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
224         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
225         break;
226       case 3:
227         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
228         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
229         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
230         break;
231       case 4:
232         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
233         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
234         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
235         break;
236       case 5:
237         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
238         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
239         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
240         break;
241       case 6:
242         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
243         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
244         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
245         break;
246       case 7:
247         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
248         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
249         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
250       break;
251       default:
252         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
253         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
254         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
255       break;
256     }
257   }
258   PetscFunctionReturn(0);
259 }
260 /*
261     Symbolic U^T*D*U factorization for SBAIJ format.
262 */
263 #undef __FUNCT__
264 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
265 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
266 {
267   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
268   PetscErrorCode ierr;
269   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
270   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
271   PetscInt       m,reallocs = 0,prow;
272   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
273   PetscInt       *il,ili,nextprow;
274   PetscReal      f = info->fill;
275   PetscTruth     perm_identity;
276 
277   PetscFunctionBegin;
278   /*
279    This code originally uses Modified Sparse Row (MSR) storage
280    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
281    Then it is rewritten so the factor B takes seqsbaij format. However the associated
282    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
283    thus the original code in MSR format is still used for these cases.
284    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
285    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
286   */
287 
288   /* check whether perm is the identity mapping */
289   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
290   if (bs>1 || !perm_identity){
291     a->permute = PETSC_TRUE;
292     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,B);CHKERRQ(ierr);
293     PetscFunctionReturn(0);
294   }
295 
296   /* At present, the code below works only for perm_identity=PETSC_TRUE */
297   a->permute = PETSC_FALSE;
298 
299   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
300   ai = a->i; aj = a->j;
301 
302   /* initialization */
303   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
304   umax  = (PetscInt)(f*ai[mbs] + 1);
305   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
306   iu[0] = 0;
307   juidx = 0; /* index for ju */
308   ierr  = PetscMalloc((3*mbs+1)*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */
309   q     = jl + mbs;   /* linked list for col index of active row */
310   il    = q  + mbs;
311   for (i=0; i<mbs; i++){
312     jl[i] = mbs;
313     q[i]  = 0;
314     il[i] = 0;
315   }
316 
317   /* for each row k */
318   for (k=0; k<mbs; k++){
319     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
320     q[k] = mbs;
321     /* initialize nonzero structure of k-th row to row rip[k] of A */
322     jmin = ai[rip[k]] +1; /* exclude diag[k] */
323     jmax = ai[rip[k]+1];
324     for (j=jmin; j<jmax; j++){
325       vj = rip[aj[j]]; /* col. value */
326       if(vj > k){
327         qm = k;
328         do {
329           m  = qm; qm = q[m];
330         } while(qm < vj);
331         if (qm == vj) {
332           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
333         }
334         nzk++;
335         q[m]  = vj;
336         q[vj] = qm;
337       } /* if(vj > k) */
338     } /* for (j=jmin; j<jmax; j++) */
339 
340     /* modify nonzero structure of k-th row by computing fill-in
341        for each row i to be merged in */
342     prow = k;
343     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
344 
345     while (prow < k){
346       nextprow = jl[prow];
347 
348       /* merge row prow into k-th row */
349       ili = il[prow];
350       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
351       jmax = iu[prow+1];
352       qm = k;
353       for (j=jmin; j<jmax; j++){
354         vj = ju[j];
355         do {
356           m = qm; qm = q[m];
357         } while (qm < vj);
358         if (qm != vj){  /* a fill */
359           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
360         }
361       } /* end of for (j=jmin; j<jmax; j++) */
362       if (jmin < jmax){
363         il[prow] = jmin;
364         j = ju[jmin];
365         jl[prow] = jl[j]; jl[j] = prow;  /* update jl */
366       }
367       prow = nextprow;
368     }
369 
370     /* update il and jl */
371     if (nzk > 0){
372       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
373       jl[k] = jl[i]; jl[i] = k;
374       il[k] = iu[k] + 1;
375     }
376     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */
377 
378     /* allocate more space to ju if needed */
379     if (iu[k+1] > umax) {
380       /* estimate how much additional space we will need */
381       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
382       /* just double the memory each time */
383       maxadd = umax;
384       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
385       umax += maxadd;
386 
387       /* allocate a longer ju */
388       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
389       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
390       ierr = PetscFree(ju);CHKERRQ(ierr);
391       ju   = jutmp;
392       reallocs++; /* count how many times we realloc */
393     }
394 
395     /* save nonzero structure of k-th row in ju */
396     ju[juidx++] = k; /* diag[k] */
397     i = k;
398     while (nzk --) {
399       i           = q[i];
400       ju[juidx++] = i;
401     }
402   }
403 
404   if (ai[mbs] != 0) {
405     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
406     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
407     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
408     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
409     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
410   } else {
411      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
412   }
413 
414   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
415   /* ierr = PetscFree(q);CHKERRQ(ierr); */
416   ierr = PetscFree(jl);CHKERRQ(ierr);
417 
418   /* put together the new matrix */
419   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
420   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
421   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
422 
423   /* PetscLogObjectParent(*B,iperm); */
424   b = (Mat_SeqSBAIJ*)(*B)->data;
425   ierr = PetscFree(b->imax);CHKERRQ(ierr);
426   b->singlemalloc = PETSC_FALSE;
427   /* the next line frees the default space generated by the Create() */
428   ierr = PetscFree(b->a);CHKERRQ(ierr);
429   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
430   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
431   b->j    = ju;
432   b->i    = iu;
433   b->diag = 0;
434   b->ilen = 0;
435   b->imax = 0;
436   b->row  = perm;
437   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
438   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
439   b->icol = perm;
440   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
441   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
442   /* In b structure:  Free imax, ilen, old a, old j.
443      Allocate idnew, solve_work, new a, new j */
444   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
445   b->maxnz = b->nz = iu[mbs];
446 
447   (*B)->factor                 = FACTOR_CHOLESKY;
448   (*B)->info.factor_mallocs    = reallocs;
449   (*B)->info.fill_ratio_given  = f;
450   if (ai[mbs] != 0) {
451     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
452   } else {
453     (*B)->info.fill_ratio_needed = 0.0;
454   }
455 
456   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
457   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
458   (*B)->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
459   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
460   PetscFunctionReturn(0);
461 }
462 #undef __FUNCT__
463 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
464 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
465 {
466   Mat            C = *B;
467   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
468   IS             perm = b->row;
469   PetscErrorCode ierr;
470   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
471   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
472   PetscInt       bs=A->bs,bs2 = a->bs2;
473   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
474   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
475   MatScalar      *work;
476   PetscInt       *pivots;
477 
478   PetscFunctionBegin;
479   /* initialization */
480   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
481   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
482   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
483   jl   = il + mbs;
484   for (i=0; i<mbs; i++) {
485     jl[i] = mbs; il[0] = 0;
486   }
487   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
488   uik  = dk + bs2;
489   work = uik + bs2;
490   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
491 
492   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
493 
494   /* check permutation */
495   if (!a->permute){
496     ai = a->i; aj = a->j; aa = a->a;
497   } else {
498     ai   = a->inew; aj = a->jnew;
499     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
500     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
501     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
502     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
503 
504     for (i=0; i<mbs; i++){
505       jmin = ai[i]; jmax = ai[i+1];
506       for (j=jmin; j<jmax; j++){
507         while (a2anew[j] != j){
508           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
509           for (k1=0; k1<bs2; k1++){
510             dk[k1]       = aa[k*bs2+k1];
511             aa[k*bs2+k1] = aa[j*bs2+k1];
512             aa[j*bs2+k1] = dk[k1];
513           }
514         }
515         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
516         if (i > aj[j]){
517           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
518           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
519           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
520           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
521             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
522           }
523         }
524       }
525     }
526     ierr = PetscFree(a2anew);CHKERRQ(ierr);
527   }
528 
529   /* for each row k */
530   for (k = 0; k<mbs; k++){
531 
532     /*initialize k-th row with elements nonzero in row perm(k) of A */
533     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
534 
535     ap = aa + jmin*bs2;
536     for (j = jmin; j < jmax; j++){
537       vj = perm_ptr[aj[j]];         /* block col. index */
538       rtmp_ptr = rtmp + vj*bs2;
539       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
540     }
541 
542     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
543     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
544     i = jl[k]; /* first row to be added to k_th row  */
545 
546     while (i < k){
547       nexti = jl[i]; /* next row to be added to k_th row */
548 
549       /* compute multiplier */
550       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
551 
552       /* uik = -inv(Di)*U_bar(i,k) */
553       diag = ba + i*bs2;
554       u    = ba + ili*bs2;
555       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
556       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
557 
558       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
559       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
560 
561       /* update -U(i,k) */
562       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
563 
564       /* add multiple of row i to k-th row ... */
565       jmin = ili + 1; jmax = bi[i+1];
566       if (jmin < jmax){
567         for (j=jmin; j<jmax; j++) {
568           /* rtmp += -U(i,k)^T * U_bar(i,j) */
569           rtmp_ptr = rtmp + bj[j]*bs2;
570           u = ba + j*bs2;
571           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
572         }
573 
574         /* ... add i to row list for next nonzero entry */
575         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
576         j     = bj[jmin];
577         jl[i] = jl[j]; jl[j] = i; /* update jl */
578       }
579       i = nexti;
580     }
581 
582     /* save nonzero entries in k-th row of U ... */
583 
584     /* invert diagonal block */
585     diag = ba+k*bs2;
586     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
587     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
588 
589     jmin = bi[k]; jmax = bi[k+1];
590     if (jmin < jmax) {
591       for (j=jmin; j<jmax; j++){
592          vj = bj[j];           /* block col. index of U */
593          u   = ba + j*bs2;
594          rtmp_ptr = rtmp + vj*bs2;
595          for (k1=0; k1<bs2; k1++){
596            *u++        = *rtmp_ptr;
597            *rtmp_ptr++ = 0.0;
598          }
599       }
600 
601       /* ... add k to row list for first nonzero entry in k-th row */
602       il[k] = jmin;
603       i     = bj[jmin];
604       jl[k] = jl[i]; jl[i] = k;
605     }
606   }
607 
608   ierr = PetscFree(rtmp);CHKERRQ(ierr);
609   ierr = PetscFree(il);CHKERRQ(ierr);
610   ierr = PetscFree(dk);CHKERRQ(ierr);
611   ierr = PetscFree(pivots);CHKERRQ(ierr);
612   if (a->permute){
613     ierr = PetscFree(aa);CHKERRQ(ierr);
614   }
615 
616   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
617   C->factor       = FACTOR_CHOLESKY;
618   C->assembled    = PETSC_TRUE;
619   C->preallocated = PETSC_TRUE;
620   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
621   PetscFunctionReturn(0);
622 }
623 
624 #undef __FUNCT__
625 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
626 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
627 {
628   Mat            C = *B;
629   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
630   PetscErrorCode ierr;
631   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
632   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
633   PetscInt       bs=A->bs,bs2 = a->bs2;
634   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
635   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
636   MatScalar      *work;
637   PetscInt       *pivots;
638 
639   PetscFunctionBegin;
640   /* initialization */
641 
642   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
643   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
644   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
645   jl   = il + mbs;
646   for (i=0; i<mbs; i++) {
647     jl[i] = mbs; il[0] = 0;
648   }
649   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
650   uik  = dk + bs2;
651   work = uik + bs2;
652   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
653 
654   ai = a->i; aj = a->j; aa = a->a;
655 
656   /* for each row k */
657   for (k = 0; k<mbs; k++){
658 
659     /*initialize k-th row with elements nonzero in row k of A */
660     jmin = ai[k]; jmax = ai[k+1];
661     ap = aa + jmin*bs2;
662     for (j = jmin; j < jmax; j++){
663       vj = aj[j];         /* block col. index */
664       rtmp_ptr = rtmp + vj*bs2;
665       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
666     }
667 
668     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
669     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
670     i = jl[k]; /* first row to be added to k_th row  */
671 
672     while (i < k){
673       nexti = jl[i]; /* next row to be added to k_th row */
674 
675       /* compute multiplier */
676       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
677 
678       /* uik = -inv(Di)*U_bar(i,k) */
679       diag = ba + i*bs2;
680       u    = ba + ili*bs2;
681       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
682       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
683 
684       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
685       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
686 
687       /* update -U(i,k) */
688       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
689 
690       /* add multiple of row i to k-th row ... */
691       jmin = ili + 1; jmax = bi[i+1];
692       if (jmin < jmax){
693         for (j=jmin; j<jmax; j++) {
694           /* rtmp += -U(i,k)^T * U_bar(i,j) */
695           rtmp_ptr = rtmp + bj[j]*bs2;
696           u = ba + j*bs2;
697           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
698         }
699 
700         /* ... add i to row list for next nonzero entry */
701         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
702         j     = bj[jmin];
703         jl[i] = jl[j]; jl[j] = i; /* update jl */
704       }
705       i = nexti;
706     }
707 
708     /* save nonzero entries in k-th row of U ... */
709 
710     /* invert diagonal block */
711     diag = ba+k*bs2;
712     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
713     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
714 
715     jmin = bi[k]; jmax = bi[k+1];
716     if (jmin < jmax) {
717       for (j=jmin; j<jmax; j++){
718          vj = bj[j];           /* block col. index of U */
719          u   = ba + j*bs2;
720          rtmp_ptr = rtmp + vj*bs2;
721          for (k1=0; k1<bs2; k1++){
722            *u++        = *rtmp_ptr;
723            *rtmp_ptr++ = 0.0;
724          }
725       }
726 
727       /* ... add k to row list for first nonzero entry in k-th row */
728       il[k] = jmin;
729       i     = bj[jmin];
730       jl[k] = jl[i]; jl[i] = k;
731     }
732   }
733 
734   ierr = PetscFree(rtmp);CHKERRQ(ierr);
735   ierr = PetscFree(il);CHKERRQ(ierr);
736   ierr = PetscFree(dk);CHKERRQ(ierr);
737   ierr = PetscFree(pivots);CHKERRQ(ierr);
738 
739   C->factor    = FACTOR_CHOLESKY;
740   C->assembled = PETSC_TRUE;
741   C->preallocated = PETSC_TRUE;
742   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
743   PetscFunctionReturn(0);
744 }
745 
746 /*
747     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
748     Version for blocks 2 by 2.
749 */
750 #undef __FUNCT__
751 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
752 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
753 {
754   Mat            C = *B;
755   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
756   IS             perm = b->row;
757   PetscErrorCode ierr;
758   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
759   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
760   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
761   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
762 
763   PetscFunctionBegin;
764   /* initialization */
765   /* il and jl record the first nonzero element in each row of the accessing
766      window U(0:k, k:mbs-1).
767      jl:    list of rows to be added to uneliminated rows
768             i>= k: jl(i) is the first row to be added to row i
769             i<  k: jl(i) is the row following row i in some list of rows
770             jl(i) = mbs indicates the end of a list
771      il(i): points to the first nonzero element in columns k,...,mbs-1 of
772             row i of U */
773   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
774   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
775   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
776   jl   = il + mbs;
777   for (i=0; i<mbs; i++) {
778     jl[i] = mbs; il[0] = 0;
779   }
780   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
781   uik  = dk + 4;
782   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
783 
784   /* check permutation */
785   if (!a->permute){
786     ai = a->i; aj = a->j; aa = a->a;
787   } else {
788     ai   = a->inew; aj = a->jnew;
789     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
790     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
791     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
792     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
793 
794     for (i=0; i<mbs; i++){
795       jmin = ai[i]; jmax = ai[i+1];
796       for (j=jmin; j<jmax; j++){
797         while (a2anew[j] != j){
798           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
799           for (k1=0; k1<4; k1++){
800             dk[k1]       = aa[k*4+k1];
801             aa[k*4+k1] = aa[j*4+k1];
802             aa[j*4+k1] = dk[k1];
803           }
804         }
805         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
806         if (i > aj[j]){
807           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
808           ap = aa + j*4;     /* ptr to the beginning of the block */
809           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
810           ap[1] = ap[2];
811           ap[2] = dk[1];
812         }
813       }
814     }
815     ierr = PetscFree(a2anew);CHKERRQ(ierr);
816   }
817 
818   /* for each row k */
819   for (k = 0; k<mbs; k++){
820 
821     /*initialize k-th row with elements nonzero in row perm(k) of A */
822     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
823     ap = aa + jmin*4;
824     for (j = jmin; j < jmax; j++){
825       vj = perm_ptr[aj[j]];         /* block col. index */
826       rtmp_ptr = rtmp + vj*4;
827       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
828     }
829 
830     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
831     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
832     i = jl[k]; /* first row to be added to k_th row  */
833 
834     while (i < k){
835       nexti = jl[i]; /* next row to be added to k_th row */
836 
837       /* compute multiplier */
838       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
839 
840       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
841       diag = ba + i*4;
842       u    = ba + ili*4;
843       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
844       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
845       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
846       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
847 
848       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
849       dk[0] += uik[0]*u[0] + uik[1]*u[1];
850       dk[1] += uik[2]*u[0] + uik[3]*u[1];
851       dk[2] += uik[0]*u[2] + uik[1]*u[3];
852       dk[3] += uik[2]*u[2] + uik[3]*u[3];
853 
854       /* update -U(i,k): ba[ili] = uik */
855       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
856 
857       /* add multiple of row i to k-th row ... */
858       jmin = ili + 1; jmax = bi[i+1];
859       if (jmin < jmax){
860         for (j=jmin; j<jmax; j++) {
861           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
862           rtmp_ptr = rtmp + bj[j]*4;
863           u = ba + j*4;
864           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
865           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
866           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
867           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
868         }
869 
870         /* ... add i to row list for next nonzero entry */
871         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
872         j     = bj[jmin];
873         jl[i] = jl[j]; jl[j] = i; /* update jl */
874       }
875       i = nexti;
876     }
877 
878     /* save nonzero entries in k-th row of U ... */
879 
880     /* invert diagonal block */
881     diag = ba+k*4;
882     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
883     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
884 
885     jmin = bi[k]; jmax = bi[k+1];
886     if (jmin < jmax) {
887       for (j=jmin; j<jmax; j++){
888          vj = bj[j];           /* block col. index of U */
889          u   = ba + j*4;
890          rtmp_ptr = rtmp + vj*4;
891          for (k1=0; k1<4; k1++){
892            *u++        = *rtmp_ptr;
893            *rtmp_ptr++ = 0.0;
894          }
895       }
896 
897       /* ... add k to row list for first nonzero entry in k-th row */
898       il[k] = jmin;
899       i     = bj[jmin];
900       jl[k] = jl[i]; jl[i] = k;
901     }
902   }
903 
904   ierr = PetscFree(rtmp);CHKERRQ(ierr);
905   ierr = PetscFree(il);CHKERRQ(ierr);
906   ierr = PetscFree(dk);CHKERRQ(ierr);
907   if (a->permute) {
908     ierr = PetscFree(aa);CHKERRQ(ierr);
909   }
910   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
911   C->factor    = FACTOR_CHOLESKY;
912   C->assembled = PETSC_TRUE;
913   C->preallocated = PETSC_TRUE;
914   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
915   PetscFunctionReturn(0);
916 }
917 
918 /*
919       Version for when blocks are 2 by 2 Using natural ordering
920 */
921 #undef __FUNCT__
922 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
923 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
924 {
925   Mat            C = *B;
926   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
927   PetscErrorCode ierr;
928   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
929   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
930   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
931   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
932 
933   PetscFunctionBegin;
934   /* initialization */
935   /* il and jl record the first nonzero element in each row of the accessing
936      window U(0:k, k:mbs-1).
937      jl:    list of rows to be added to uneliminated rows
938             i>= k: jl(i) is the first row to be added to row i
939             i<  k: jl(i) is the row following row i in some list of rows
940             jl(i) = mbs indicates the end of a list
941      il(i): points to the first nonzero element in columns k,...,mbs-1 of
942             row i of U */
943   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
944   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
945   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
946   jl   = il + mbs;
947   for (i=0; i<mbs; i++) {
948     jl[i] = mbs; il[0] = 0;
949   }
950   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
951   uik  = dk + 4;
952 
953   ai = a->i; aj = a->j; aa = a->a;
954 
955   /* for each row k */
956   for (k = 0; k<mbs; k++){
957 
958     /*initialize k-th row with elements nonzero in row k of A */
959     jmin = ai[k]; jmax = ai[k+1];
960     ap = aa + jmin*4;
961     for (j = jmin; j < jmax; j++){
962       vj = aj[j];         /* block col. index */
963       rtmp_ptr = rtmp + vj*4;
964       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
965     }
966 
967     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
968     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
969     i = jl[k]; /* first row to be added to k_th row  */
970 
971     while (i < k){
972       nexti = jl[i]; /* next row to be added to k_th row */
973 
974       /* compute multiplier */
975       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
976 
977       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
978       diag = ba + i*4;
979       u    = ba + ili*4;
980       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
981       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
982       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
983       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
984 
985       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
986       dk[0] += uik[0]*u[0] + uik[1]*u[1];
987       dk[1] += uik[2]*u[0] + uik[3]*u[1];
988       dk[2] += uik[0]*u[2] + uik[1]*u[3];
989       dk[3] += uik[2]*u[2] + uik[3]*u[3];
990 
991       /* update -U(i,k): ba[ili] = uik */
992       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
993 
994       /* add multiple of row i to k-th row ... */
995       jmin = ili + 1; jmax = bi[i+1];
996       if (jmin < jmax){
997         for (j=jmin; j<jmax; j++) {
998           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
999           rtmp_ptr = rtmp + bj[j]*4;
1000           u = ba + j*4;
1001           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1002           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1003           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1004           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1005         }
1006 
1007         /* ... add i to row list for next nonzero entry */
1008         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1009         j     = bj[jmin];
1010         jl[i] = jl[j]; jl[j] = i; /* update jl */
1011       }
1012       i = nexti;
1013     }
1014 
1015     /* save nonzero entries in k-th row of U ... */
1016 
1017     /* invert diagonal block */
1018     diag = ba+k*4;
1019     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1020     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1021 
1022     jmin = bi[k]; jmax = bi[k+1];
1023     if (jmin < jmax) {
1024       for (j=jmin; j<jmax; j++){
1025          vj = bj[j];           /* block col. index of U */
1026          u   = ba + j*4;
1027          rtmp_ptr = rtmp + vj*4;
1028          for (k1=0; k1<4; k1++){
1029            *u++        = *rtmp_ptr;
1030            *rtmp_ptr++ = 0.0;
1031          }
1032       }
1033 
1034       /* ... add k to row list for first nonzero entry in k-th row */
1035       il[k] = jmin;
1036       i     = bj[jmin];
1037       jl[k] = jl[i]; jl[i] = k;
1038     }
1039   }
1040 
1041   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1042   ierr = PetscFree(il);CHKERRQ(ierr);
1043   ierr = PetscFree(dk);CHKERRQ(ierr);
1044 
1045   C->factor    = FACTOR_CHOLESKY;
1046   C->assembled = PETSC_TRUE;
1047   C->preallocated = PETSC_TRUE;
1048   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1049   PetscFunctionReturn(0);
1050 }
1051 
1052 /*
1053     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
1054     Version for blocks are 1 by 1.
1055 */
1056 #undef __FUNCT__
1057 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1058 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
1059 {
1060   Mat            C = *B;
1061   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1062   IS             ip = b->row;
1063   PetscErrorCode ierr;
1064   PetscInt       *rip,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
1065   PetscInt       *ai,*aj,*r;
1066   PetscInt       k,jmin,jmax,*jl,*il,vj,nexti,ili;
1067   MatScalar      *rtmp;
1068   MatScalar      *ba = b->a,*aa,ak;
1069   MatScalar      dk,uikdi;
1070 
1071   PetscFunctionBegin;
1072   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1073   if (!a->permute){
1074     ai = a->i; aj = a->j; aa = a->a;
1075   } else {
1076     ai = a->inew; aj = a->jnew;
1077     ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1078     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1079     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&r);CHKERRQ(ierr);
1080     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
1081 
1082     jmin = ai[0]; jmax = ai[mbs];
1083     for (j=jmin; j<jmax; j++){
1084       while (r[j] != j){
1085         k = r[j]; r[j] = r[k]; r[k] = k;
1086         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
1087       }
1088     }
1089     ierr = PetscFree(r);CHKERRQ(ierr);
1090   }
1091 
1092   /* initialization */
1093   /* il and jl record the first nonzero element in each row of the accessing
1094      window U(0:k, k:mbs-1).
1095      jl:    list of rows to be added to uneliminated rows
1096             i>= k: jl(i) is the first row to be added to row i
1097             i<  k: jl(i) is the row following row i in some list of rows
1098             jl(i) = mbs indicates the end of a list
1099      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1100             row i of U */
1101   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1102   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1103   jl   = il + mbs;
1104   for (i=0; i<mbs; i++) {
1105     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1106   }
1107 
1108   /* for each row k */
1109   for (k = 0; k<mbs; k++){
1110 
1111     /*initialize k-th row with elements nonzero in row perm(k) of A */
1112     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1113 
1114     for (j = jmin; j < jmax; j++){
1115       vj = rip[aj[j]];
1116       rtmp[vj] = aa[j];
1117     }
1118 
1119     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1120     dk = rtmp[k];
1121     i = jl[k]; /* first row to be added to k_th row  */
1122 
1123     while (i < k){
1124       nexti = jl[i]; /* next row to be added to k_th row */
1125 
1126       /* compute multiplier, update D(k) and U(i,k) */
1127       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1128       uikdi = - ba[ili]*ba[i];
1129       dk += uikdi*ba[ili];
1130       ba[ili] = uikdi; /* -U(i,k) */
1131 
1132       /* add multiple of row i to k-th row ... */
1133       jmin = ili + 1; jmax = bi[i+1];
1134       if (jmin < jmax){
1135         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1136         /* ... add i to row list for next nonzero entry */
1137         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1138         j     = bj[jmin];
1139         jl[i] = jl[j]; jl[j] = i; /* update jl */
1140       }
1141       i = nexti;
1142     }
1143 
1144     /* check for zero pivot and save diagoanl element */
1145     if (dk == 0.0){
1146       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1147       /*
1148     } else if (PetscRealPart(dk) < 0.0){
1149       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk);
1150       */
1151     }
1152 
1153     /* save nonzero entries in k-th row of U ... */
1154     ba[k] = 1.0/dk;
1155     jmin = bi[k]; jmax = bi[k+1];
1156     if (jmin < jmax) {
1157       for (j=jmin; j<jmax; j++){
1158          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1159       }
1160       /* ... add k to row list for first nonzero entry in k-th row */
1161       il[k] = jmin;
1162       i     = bj[jmin];
1163       jl[k] = jl[i]; jl[i] = k;
1164     }
1165   }
1166 
1167   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1168   ierr = PetscFree(il);CHKERRQ(ierr);
1169   if (a->permute){
1170     ierr = PetscFree(aa);CHKERRQ(ierr);
1171   }
1172 
1173   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1174   C->factor    = FACTOR_CHOLESKY;
1175   C->assembled = PETSC_TRUE;
1176   C->preallocated = PETSC_TRUE;
1177   PetscLogFlops(b->mbs);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 /*
1182   Version for when blocks are 1 by 1 Using natural ordering
1183 */
1184 #undef __FUNCT__
1185 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1186 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
1187 {
1188   Mat            C = *B;
1189   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1190   PetscErrorCode ierr;
1191   PetscInt       i,j,mbs = a->mbs;
1192   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1193   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1194   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1195   PetscReal      damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount;
1196   PetscTruth     damp,chshift;
1197   PetscInt       nshift=0;
1198 
1199   PetscFunctionBegin;
1200   /* initialization */
1201   /* il and jl record the first nonzero element in each row of the accessing
1202      window U(0:k, k:mbs-1).
1203      jl:    list of rows to be added to uneliminated rows
1204             i>= k: jl(i) is the first row to be added to row i
1205             i<  k: jl(i) is the row following row i in some list of rows
1206             jl(i) = mbs indicates the end of a list
1207      il(i): points to the first nonzero element in U(i,k:mbs-1)
1208   */
1209   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1210   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1211   jl   = il + mbs;
1212 
1213   shift_amount = 0;
1214   do {
1215     damp = PETSC_FALSE;
1216     chshift = PETSC_FALSE;
1217     for (i=0; i<mbs; i++) {
1218       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1219     }
1220 
1221     for (k = 0; k<mbs; k++){ /* row k */
1222     /*initialize k-th row with elements nonzero in row perm(k) of A */
1223       nz   = ai[k+1] - ai[k];
1224       acol = aj + ai[k];
1225       aval = aa + ai[k];
1226       bval = ba + bi[k];
1227       while (nz -- ){
1228         rtmp[*acol++] = *aval++;
1229         *bval++       = 0.0; /* for in-place factorization */
1230       }
1231       /* damp the diagonal of the matrix */
1232       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
1233 
1234       /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1235       dk = rtmp[k];
1236       i  = jl[k]; /* first row to be added to k_th row  */
1237 
1238       while (i < k){
1239         nexti = jl[i]; /* next row to be added to k_th row */
1240 
1241         /* compute multiplier, update D(k) and U(i,k) */
1242         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1243         uikdi = - ba[ili]*ba[bi[i]];
1244         dk   += uikdi*ba[ili];
1245         ba[ili] = uikdi; /* -U(i,k) */
1246 
1247         /* add multiple of row i to k-th row ... */
1248         jmin = ili + 1;
1249         nz   = bi[i+1] - jmin;
1250         if (nz > 0){
1251           bcol = bj + jmin;
1252           bval = ba + jmin;
1253           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1254           /* ... add i to row list for next nonzero entry */
1255           il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1256           j     = bj[jmin];
1257           jl[i] = jl[j]; jl[j] = i; /* update jl */
1258         }
1259         i = nexti;
1260       }
1261 
1262       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1263 	/* calculate a shift that would make this row diagonally dominant */
1264 	PetscReal rs = PetscAbs(PetscRealPart(dk));
1265 	jmin      = bi[k]+1;
1266 	nz        = bi[k+1] - jmin;
1267 	if (nz){
1268 	  bcol = bj + jmin;
1269 	  bval = ba + jmin;
1270 	  while (nz--){
1271 	    rs += PetscAbsScalar(rtmp[*bcol++]);
1272 	  }
1273 	}
1274 	/* if this shift is less than the previous, just up the previous
1275 	   one by a bit */
1276 	shift_amount = PetscMax(rs,1.1*shift_amount);
1277 	chshift  = PETSC_TRUE;
1278 	/* Unlike in the ILU case there is no exit condition on nshift:
1279 	   we increase the shift until it converges. There is no guarantee that
1280 	   this algorithm converges faster or slower, or is better or worse
1281 	   than the ILU algorithm. */
1282 	nshift++;
1283 	break;
1284       }
1285       if (PetscRealPart(dk) < zeropivot){
1286         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1287         if (damping > 0.0) {
1288           if (ndamp) damping *= 2.0;
1289           damp = PETSC_TRUE;
1290           ndamp++;
1291           break;
1292         } else if (PetscAbsScalar(dk) < zeropivot){
1293           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1294         } else {
1295           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %D of Cholesky factorization\n",PetscRealPart(dk),k);
1296         }
1297       }
1298 
1299       /* save nonzero entries in k-th row of U ... */
1300       /* printf("%D, dk: %g, 1/dk: %g\n",k,dk,1/dk); */
1301       ba[bi[k]] = 1.0/dk;
1302       jmin      = bi[k]+1;
1303       nz        = bi[k+1] - jmin;
1304       if (nz){
1305         bcol = bj + jmin;
1306         bval = ba + jmin;
1307         while (nz--){
1308           *bval++       = rtmp[*bcol];
1309           rtmp[*bcol++] = 0.0;
1310         }
1311         /* ... add k to row list for first nonzero entry in k-th row */
1312         il[k] = jmin;
1313         i     = bj[jmin];
1314         jl[k] = jl[i]; jl[i] = k;
1315       }
1316     } /* end of for (k = 0; k<mbs; k++) */
1317   } while (damp||chshift);
1318   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1319   ierr = PetscFree(il);CHKERRQ(ierr);
1320 
1321   C->factor       = FACTOR_CHOLESKY;
1322   C->assembled    = PETSC_TRUE;
1323   C->preallocated = PETSC_TRUE;
1324   PetscLogFlops(b->mbs);
1325   if (ndamp) {
1326     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %D damping value %g\n",ndamp,damping);
1327   }
1328  if (nshift) {
1329     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %D shifts\n",nshift);
1330   }
1331 
1332   PetscFunctionReturn(0);
1333 }
1334 
1335 #undef __FUNCT__
1336 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1337 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1338 {
1339   PetscErrorCode ierr;
1340   Mat            C;
1341 
1342   PetscFunctionBegin;
1343   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1344   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
1345   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1346   PetscFunctionReturn(0);
1347 }
1348 
1349 
1350