xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 2b8d69ca7ea5fe9190df62c1dce3bbd66fce84dd)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/sbaij/seq/sbaij.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscis.h>
6 
7 /*
8   input:
9    F -- numeric factor
10   output:
11    nneg, nzero, npos: matrix inertia
12 */
13 
14 #undef __FUNCT__
15 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
16 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
17 {
18   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
19   MatScalar    *dd       = fact_ptr->a;
20   PetscInt     mbs       =fact_ptr->mbs,bs=F->rmap->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->diag;
21 
22   PetscFunctionBegin;
23   if (bs != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
24   nneig_tmp = 0; npos_tmp = 0;
25   for (i=0; i<mbs; i++) {
26     if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
27     else if (PetscRealPart(dd[*fi]) < 0.0) nneig_tmp++;
28     fi++;
29   }
30   if (nneig) *nneig = nneig_tmp;
31   if (npos)  *npos  = npos_tmp;
32   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
33   PetscFunctionReturn(0);
34 }
35 
36 /*
37   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
38   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
39 */
40 #undef __FUNCT__
41 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
42 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo *info)
43 {
44   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
45   PetscErrorCode ierr;
46   const PetscInt *rip,*ai,*aj;
47   PetscInt       i,mbs = a->mbs,*jutmp,bs = A->rmap->bs,bs2=a->bs2;
48   PetscInt       m,reallocs = 0,prow;
49   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
50   PetscReal      f = info->fill;
51   PetscBool      perm_identity;
52 
53   PetscFunctionBegin;
54   /* check whether perm is the identity mapping */
55   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
56   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
57 
58   if (perm_identity) { /* without permutation */
59     a->permute = PETSC_FALSE;
60 
61     ai = a->i; aj = a->j;
62   } else {            /* non-trivial permutation */
63     a->permute = PETSC_TRUE;
64 
65     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
66 
67     ai = a->inew; aj = a->jnew;
68   }
69 
70   /* initialization */
71   ierr  = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr);
72   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
73   ierr  = PetscMalloc1(umax,&ju);CHKERRQ(ierr);
74   iu[0] = mbs+1;
75   juidx = mbs + 1; /* index for ju */
76   /* jl linked list for pivot row -- linked list for col index */
77   ierr = PetscMalloc2(mbs,&jl,mbs,&q);CHKERRQ(ierr);
78   for (i=0; i<mbs; i++) {
79     jl[i] = mbs;
80     q[i]  = 0;
81   }
82 
83   /* for each row k */
84   for (k=0; k<mbs; k++) {
85     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
86     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
87     q[k] = mbs;
88     /* initialize nonzero structure of k-th row to row rip[k] of A */
89     jmin = ai[rip[k]] +1; /* exclude diag[k] */
90     jmax = ai[rip[k]+1];
91     for (j=jmin; j<jmax; j++) {
92       vj = rip[aj[j]]; /* col. value */
93       if (vj > k) {
94         qm = k;
95         do {
96           m = qm; qm = q[m];
97         } while (qm < vj);
98         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
99         nzk++;
100         q[m]  = vj;
101         q[vj] = qm;
102       } /* if (vj > k) */
103     } /* for (j=jmin; j<jmax; j++) */
104 
105     /* modify nonzero structure of k-th row by computing fill-in
106        for each row i to be merged in */
107     prow = k;
108     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
109 
110     while (prow < k) {
111       /* merge row prow into k-th row */
112       jmin = iu[prow] + 1; jmax = iu[prow+1];
113       qm   = k;
114       for (j=jmin; j<jmax; j++) {
115         vj = ju[j];
116         do {
117           m = qm; qm = q[m];
118         } while (qm < vj);
119         if (qm != vj) {
120           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
121         }
122       }
123       prow = jl[prow]; /* next pivot row */
124     }
125 
126     /* add k to row list for first nonzero element in k-th row */
127     if (nzk > 0) {
128       i     = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
129       jl[k] = jl[i]; jl[i] = k;
130     }
131     iu[k+1] = iu[k] + nzk;
132 
133     /* allocate more space to ju if needed */
134     if (iu[k+1] > umax) {
135       /* estimate how much additional space we will need */
136       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
137       /* just double the memory each time */
138       maxadd = umax;
139       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
140       umax += maxadd;
141 
142       /* allocate a longer ju */
143       ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
144       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
145       ierr = PetscFree(ju);CHKERRQ(ierr);
146       ju   = jutmp;
147       reallocs++; /* count how many times we realloc */
148     }
149 
150     /* save nonzero structure of k-th row in ju */
151     i=k;
152     while (nzk--) {
153       i           = q[i];
154       ju[juidx++] = i;
155     }
156   }
157 
158 #if defined(PETSC_USE_INFO)
159   if (ai[mbs] != 0) {
160     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
161     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr);
162     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
163     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr);
164     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
165   } else {
166     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
167   }
168 #endif
169 
170   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
171   ierr = PetscFree2(jl,q);CHKERRQ(ierr);
172 
173   /* put together the new matrix */
174   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(F,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
175 
176   /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */
177   b                = (Mat_SeqSBAIJ*)(F)->data;
178   b->singlemalloc  = PETSC_FALSE;
179   b->free_a        = PETSC_TRUE;
180   b->free_ij       = PETSC_TRUE;
181 
182   ierr    = PetscMalloc1((iu[mbs]+1)*bs2,&b->a);CHKERRQ(ierr);
183   b->j    = ju;
184   b->i    = iu;
185   b->diag = 0;
186   b->ilen = 0;
187   b->imax = 0;
188   b->row  = perm;
189 
190   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
191 
192   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
193 
194   b->icol = perm;
195   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
196   ierr    = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr);
197   /* In b structure:  Free imax, ilen, old a, old j.
198      Allocate idnew, solve_work, new a, new j */
199   ierr     = PetscLogObjectMemory((PetscObject)F,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
200   b->maxnz = b->nz = iu[mbs];
201 
202   (F)->info.factor_mallocs   = reallocs;
203   (F)->info.fill_ratio_given = f;
204   if (ai[mbs] != 0) {
205     (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
206   } else {
207     (F)->info.fill_ratio_needed = 0.0;
208   }
209   ierr = MatSeqSBAIJSetNumericFactorization_inplace(F,perm_identity);CHKERRQ(ierr);
210   PetscFunctionReturn(0);
211 }
212 /*
213     Symbolic U^T*D*U factorization for SBAIJ format.
214     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
215 */
216 #include <petscbt.h>
217 #include <../src/mat/utils/freespace.h>
218 #undef __FUNCT__
219 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
220 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
221 {
222   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
223   Mat_SeqSBAIJ       *b;
224   PetscErrorCode     ierr;
225   PetscBool          perm_identity,missing;
226   PetscReal          fill = info->fill;
227   const PetscInt     *rip,*ai=a->i,*aj=a->j;
228   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow;
229   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
230   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr,*udiag;
231   PetscFreeSpaceList free_space=NULL,current_space=NULL;
232   PetscBT            lnkbt;
233 
234   PetscFunctionBegin;
235   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
236   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
237   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
238   if (bs > 1) {
239     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
240     PetscFunctionReturn(0);
241   }
242 
243   /* check whether perm is the identity mapping */
244   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
245   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
246   a->permute = PETSC_FALSE;
247   ierr       = ISGetIndices(perm,&rip);CHKERRQ(ierr);
248 
249   /* initialization */
250   ierr  = PetscMalloc1(mbs+1,&ui);CHKERRQ(ierr);
251   ierr  = PetscMalloc1(mbs+1,&udiag);CHKERRQ(ierr);
252   ui[0] = 0;
253 
254   /* jl: linked list for storing indices of the pivot rows
255      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
256   ierr = PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);CHKERRQ(ierr);
257   for (i=0; i<mbs; i++) {
258     jl[i] = mbs; il[i] = 0;
259   }
260 
261   /* create and initialize a linked list for storing column indices of the active row k */
262   nlnk = mbs + 1;
263   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
264 
265   /* initial FreeSpace size is fill*(ai[mbs]+1) */
266   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);CHKERRQ(ierr);
267   current_space = free_space;
268 
269   for (k=0; k<mbs; k++) {  /* for each active row k */
270     /* initialize lnk by the column indices of row rip[k] of A */
271     nzk   = 0;
272     ncols = ai[k+1] - ai[k];
273     if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
274     for (j=0; j<ncols; j++) {
275       i       = *(aj + ai[k] + j);
276       cols[j] = i;
277     }
278     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
279     nzk += nlnk;
280 
281     /* update lnk by computing fill-in for each pivot row to be merged in */
282     prow = jl[k]; /* 1st pivot row */
283 
284     while (prow < k) {
285       nextprow = jl[prow];
286       /* merge prow into k-th row */
287       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
288       jmax   = ui[prow+1];
289       ncols  = jmax-jmin;
290       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
291       ierr   = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
292       nzk   += nlnk;
293 
294       /* update il and jl for prow */
295       if (jmin < jmax) {
296         il[prow] = jmin;
297         j        = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
298       }
299       prow = nextprow;
300     }
301 
302     /* if free space is not available, make more free space */
303     if (current_space->local_remaining<nzk) {
304       i    = mbs - k + 1; /* num of unfactored rows */
305       i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
306       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
307       reallocs++;
308     }
309 
310     /* copy data into free space, then initialize lnk */
311     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
312 
313     /* add the k-th row into il and jl */
314     if (nzk > 1) {
315       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
316       jl[k] = jl[i]; jl[i] = k;
317       il[k] = ui[k] + 1;
318     }
319     ui_ptr[k] = current_space->array;
320 
321     current_space->array           += nzk;
322     current_space->local_used      += nzk;
323     current_space->local_remaining -= nzk;
324 
325     ui[k+1] = ui[k] + nzk;
326   }
327 
328   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
329   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
330 
331   /* destroy list of free space and other temporary array(s) */
332   ierr = PetscMalloc1(ui[mbs]+1,&uj);CHKERRQ(ierr);
333   ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,mbs,ui,udiag);CHKERRQ(ierr); /* store matrix factor */
334   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
335 
336   /* put together the new matrix in MATSEQSBAIJ format */
337   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
338 
339   b               = (Mat_SeqSBAIJ*)fact->data;
340   b->singlemalloc = PETSC_FALSE;
341   b->free_a       = PETSC_TRUE;
342   b->free_ij      = PETSC_TRUE;
343 
344   ierr = PetscMalloc1(ui[mbs]+1,&b->a);CHKERRQ(ierr);
345 
346   b->j         = uj;
347   b->i         = ui;
348   b->diag      = udiag;
349   b->free_diag = PETSC_TRUE;
350   b->ilen      = 0;
351   b->imax      = 0;
352   b->row       = perm;
353   b->icol      = perm;
354 
355   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
356   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
357 
358   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
359 
360   ierr = PetscMalloc1(mbs+1,&b->solve_work);CHKERRQ(ierr);
361   ierr = PetscLogObjectMemory((PetscObject)fact,ui[mbs]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
362 
363   b->maxnz = b->nz = ui[mbs];
364 
365   fact->info.factor_mallocs   = reallocs;
366   fact->info.fill_ratio_given = fill;
367   if (ai[mbs] != 0) {
368     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
369   } else {
370     fact->info.fill_ratio_needed = 0.0;
371   }
372 #if defined(PETSC_USE_INFO)
373   if (ai[mbs] != 0) {
374     PetscReal af = fact->info.fill_ratio_needed;
375     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
376     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
377     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
378   } else {
379     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
380   }
381 #endif
382   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_inplace"
388 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
389 {
390   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
391   Mat_SeqSBAIJ       *b;
392   PetscErrorCode     ierr;
393   PetscBool          perm_identity,missing;
394   PetscReal          fill = info->fill;
395   const PetscInt     *rip,*ai,*aj;
396   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
397   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
398   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
399   PetscFreeSpaceList free_space=NULL,current_space=NULL;
400   PetscBT            lnkbt;
401 
402   PetscFunctionBegin;
403   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
404   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
405 
406   /*
407    This code originally uses Modified Sparse Row (MSR) storage
408    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
409    Then it is rewritten so the factor B takes seqsbaij format. However the associated
410    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
411    thus the original code in MSR format is still used for these cases.
412    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
413    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
414   */
415   if (bs > 1) {
416     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
417     PetscFunctionReturn(0);
418   }
419 
420   /* check whether perm is the identity mapping */
421   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
422 
423   if (perm_identity) {
424     a->permute = PETSC_FALSE;
425 
426     ai = a->i; aj = a->j;
427   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
428   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
429 
430   /* initialization */
431   ierr  = PetscMalloc1(mbs+1,&ui);CHKERRQ(ierr);
432   ui[0] = 0;
433 
434   /* jl: linked list for storing indices of the pivot rows
435      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
436   ierr = PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);CHKERRQ(ierr);
437   for (i=0; i<mbs; i++) {
438     jl[i] = mbs; il[i] = 0;
439   }
440 
441   /* create and initialize a linked list for storing column indices of the active row k */
442   nlnk = mbs + 1;
443   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
444 
445   /* initial FreeSpace size is fill*(ai[mbs]+1) */
446   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);CHKERRQ(ierr);
447   current_space = free_space;
448 
449   for (k=0; k<mbs; k++) {  /* for each active row k */
450     /* initialize lnk by the column indices of row rip[k] of A */
451     nzk   = 0;
452     ncols = ai[rip[k]+1] - ai[rip[k]];
453     for (j=0; j<ncols; j++) {
454       i       = *(aj + ai[rip[k]] + j);
455       cols[j] = rip[i];
456     }
457     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
458     nzk += nlnk;
459 
460     /* update lnk by computing fill-in for each pivot row to be merged in */
461     prow = jl[k]; /* 1st pivot row */
462 
463     while (prow < k) {
464       nextprow = jl[prow];
465       /* merge prow into k-th row */
466       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
467       jmax   = ui[prow+1];
468       ncols  = jmax-jmin;
469       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
470       ierr   = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
471       nzk   += nlnk;
472 
473       /* update il and jl for prow */
474       if (jmin < jmax) {
475         il[prow] = jmin;
476 
477         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
478       }
479       prow = nextprow;
480     }
481 
482     /* if free space is not available, make more free space */
483     if (current_space->local_remaining<nzk) {
484       i    = mbs - k + 1; /* num of unfactored rows */
485       i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
486       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
487       reallocs++;
488     }
489 
490     /* copy data into free space, then initialize lnk */
491     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
492 
493     /* add the k-th row into il and jl */
494     if (nzk-1 > 0) {
495       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
496       jl[k] = jl[i]; jl[i] = k;
497       il[k] = ui[k] + 1;
498     }
499     ui_ptr[k] = current_space->array;
500 
501     current_space->array           += nzk;
502     current_space->local_used      += nzk;
503     current_space->local_remaining -= nzk;
504 
505     ui[k+1] = ui[k] + nzk;
506   }
507 
508   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
509   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
510 
511   /* destroy list of free space and other temporary array(s) */
512   ierr = PetscMalloc1(ui[mbs]+1,&uj);CHKERRQ(ierr);
513   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
514   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
515 
516   /* put together the new matrix in MATSEQSBAIJ format */
517   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
518 
519   b               = (Mat_SeqSBAIJ*)fact->data;
520   b->singlemalloc = PETSC_FALSE;
521   b->free_a       = PETSC_TRUE;
522   b->free_ij      = PETSC_TRUE;
523 
524   ierr = PetscMalloc1(ui[mbs]+1,&b->a);CHKERRQ(ierr);
525 
526   b->j    = uj;
527   b->i    = ui;
528   b->diag = 0;
529   b->ilen = 0;
530   b->imax = 0;
531   b->row  = perm;
532 
533   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
534 
535   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
536   b->icol  = perm;
537   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
538   ierr     = PetscMalloc1(mbs+1,&b->solve_work);CHKERRQ(ierr);
539   ierr     = PetscLogObjectMemory((PetscObject)fact,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
540   b->maxnz = b->nz = ui[mbs];
541 
542   fact->info.factor_mallocs   = reallocs;
543   fact->info.fill_ratio_given = fill;
544   if (ai[mbs] != 0) {
545     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
546   } else {
547     fact->info.fill_ratio_needed = 0.0;
548   }
549 #if defined(PETSC_USE_INFO)
550   if (ai[mbs] != 0) {
551     PetscReal af = fact->info.fill_ratio_needed;
552     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
553     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
554     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
555   } else {
556     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
557   }
558 #endif
559   ierr = MatSeqSBAIJSetNumericFactorization_inplace(fact,perm_identity);CHKERRQ(ierr);
560   PetscFunctionReturn(0);
561 }
562 
563 #undef __FUNCT__
564 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
565 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
566 {
567   Mat_SeqSBAIJ   *a   = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
568   IS             perm = b->row;
569   PetscErrorCode ierr;
570   const PetscInt *ai,*aj,*perm_ptr,mbs=a->mbs,*bi=b->i,*bj=b->j;
571   PetscInt       i,j;
572   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
573   PetscInt       bs  =A->rmap->bs,bs2 = a->bs2;
574   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
575   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
576   MatScalar      *work;
577   PetscInt       *pivots;
578   PetscBool      allowzeropivot,zeropivotdetected;
579 
580   PetscFunctionBegin;
581   /* initialization */
582   ierr = PetscCalloc1(bs2*mbs,&rtmp);CHKERRQ(ierr);
583   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
584   allowzeropivot = PetscNot(A->erroriffailure);
585 
586   for (i=0; i<mbs; i++) {
587     jl[i] = mbs; il[0] = 0;
588   }
589   ierr = PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);CHKERRQ(ierr);
590   ierr = PetscMalloc1(bs,&pivots);CHKERRQ(ierr);
591 
592   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
593 
594   /* check permutation */
595   if (!a->permute) {
596     ai = a->i; aj = a->j; aa = a->a;
597   } else {
598     ai   = a->inew; aj = a->jnew;
599     ierr = PetscMalloc1(bs2*ai[mbs],&aa);CHKERRQ(ierr);
600     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
601     ierr = PetscMalloc1(ai[mbs],&a2anew);CHKERRQ(ierr);
602     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
603 
604     for (i=0; i<mbs; i++) {
605       jmin = ai[i]; jmax = ai[i+1];
606       for (j=jmin; j<jmax; j++) {
607         while (a2anew[j] != j) {
608           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
609           for (k1=0; k1<bs2; k1++) {
610             dk[k1]       = aa[k*bs2+k1];
611             aa[k*bs2+k1] = aa[j*bs2+k1];
612             aa[j*bs2+k1] = dk[k1];
613           }
614         }
615         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
616         if (i > aj[j]) {
617           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
618           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
619           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
620           for (k=0; k<bs; k++) {               /* j-th block of aa <- dk^T */
621             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
622           }
623         }
624       }
625     }
626     ierr = PetscFree(a2anew);CHKERRQ(ierr);
627   }
628 
629   /* for each row k */
630   for (k = 0; k<mbs; k++) {
631 
632     /*initialize k-th row with elements nonzero in row perm(k) of A */
633     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
634 
635     ap = aa + jmin*bs2;
636     for (j = jmin; j < jmax; j++) {
637       vj       = perm_ptr[aj[j]];   /* block col. index */
638       rtmp_ptr = rtmp + vj*bs2;
639       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
640     }
641 
642     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
643     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
644     i    = jl[k]; /* first row to be added to k_th row  */
645 
646     while (i < k) {
647       nexti = jl[i]; /* next row to be added to k_th row */
648 
649       /* compute multiplier */
650       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
651 
652       /* uik = -inv(Di)*U_bar(i,k) */
653       diag = ba + i*bs2;
654       u    = ba + ili*bs2;
655       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
656       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
657 
658       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
659       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
660       ierr = PetscLogFlops(4.0*bs*bs2);CHKERRQ(ierr);
661 
662       /* update -U(i,k) */
663       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
664 
665       /* add multiple of row i to k-th row ... */
666       jmin = ili + 1; jmax = bi[i+1];
667       if (jmin < jmax) {
668         for (j=jmin; j<jmax; j++) {
669           /* rtmp += -U(i,k)^T * U_bar(i,j) */
670           rtmp_ptr = rtmp + bj[j]*bs2;
671           u        = ba + j*bs2;
672           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
673         }
674         ierr = PetscLogFlops(2.0*bs*bs2*(jmax-jmin));CHKERRQ(ierr);
675 
676         /* ... add i to row list for next nonzero entry */
677         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
678         j     = bj[jmin];
679         jl[i] = jl[j]; jl[j] = i; /* update jl */
680       }
681       i = nexti;
682     }
683 
684     /* save nonzero entries in k-th row of U ... */
685 
686     /* invert diagonal block */
687     diag = ba+k*bs2;
688     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
689 
690     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
691     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
692 
693     jmin = bi[k]; jmax = bi[k+1];
694     if (jmin < jmax) {
695       for (j=jmin; j<jmax; j++) {
696         vj       = bj[j];      /* block col. index of U */
697         u        = ba + j*bs2;
698         rtmp_ptr = rtmp + vj*bs2;
699         for (k1=0; k1<bs2; k1++) {
700           *u++        = *rtmp_ptr;
701           *rtmp_ptr++ = 0.0;
702         }
703       }
704 
705       /* ... add k to row list for first nonzero entry in k-th row */
706       il[k] = jmin;
707       i     = bj[jmin];
708       jl[k] = jl[i]; jl[i] = k;
709     }
710   }
711 
712   ierr = PetscFree(rtmp);CHKERRQ(ierr);
713   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
714   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
715   ierr = PetscFree(pivots);CHKERRQ(ierr);
716   if (a->permute) {
717     ierr = PetscFree(aa);CHKERRQ(ierr);
718   }
719 
720   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
721 
722   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
723   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
724   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
725   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;
726 
727   C->assembled    = PETSC_TRUE;
728   C->preallocated = PETSC_TRUE;
729 
730   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
731   PetscFunctionReturn(0);
732 }
733 
734 #undef __FUNCT__
735 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
736 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
737 {
738   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
739   PetscErrorCode ierr;
740   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
741   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
742   PetscInt       bs  =A->rmap->bs,bs2 = a->bs2;
743   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
744   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
745   MatScalar      *work;
746   PetscInt       *pivots;
747   PetscBool      allowzeropivot,zeropivotdetected;
748 
749   PetscFunctionBegin;
750   ierr = PetscCalloc1(bs2*mbs,&rtmp);CHKERRQ(ierr);
751   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
752   for (i=0; i<mbs; i++) {
753     jl[i] = mbs; il[0] = 0;
754   }
755   ierr = PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);CHKERRQ(ierr);
756   ierr = PetscMalloc1(bs,&pivots);CHKERRQ(ierr);
757   allowzeropivot = PetscNot(A->erroriffailure);
758 
759   ai = a->i; aj = a->j; aa = a->a;
760 
761   /* for each row k */
762   for (k = 0; k<mbs; k++) {
763 
764     /*initialize k-th row with elements nonzero in row k of A */
765     jmin = ai[k]; jmax = ai[k+1];
766     ap   = aa + jmin*bs2;
767     for (j = jmin; j < jmax; j++) {
768       vj       = aj[j];   /* block col. index */
769       rtmp_ptr = rtmp + vj*bs2;
770       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
771     }
772 
773     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
774     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
775     i    = jl[k]; /* first row to be added to k_th row  */
776 
777     while (i < k) {
778       nexti = jl[i]; /* next row to be added to k_th row */
779 
780       /* compute multiplier */
781       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
782 
783       /* uik = -inv(Di)*U_bar(i,k) */
784       diag = ba + i*bs2;
785       u    = ba + ili*bs2;
786       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
787       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
788 
789       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
790       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
791       ierr = PetscLogFlops(2.0*bs*bs2);CHKERRQ(ierr);
792 
793       /* update -U(i,k) */
794       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
795 
796       /* add multiple of row i to k-th row ... */
797       jmin = ili + 1; jmax = bi[i+1];
798       if (jmin < jmax) {
799         for (j=jmin; j<jmax; j++) {
800           /* rtmp += -U(i,k)^T * U_bar(i,j) */
801           rtmp_ptr = rtmp + bj[j]*bs2;
802           u        = ba + j*bs2;
803           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
804         }
805         ierr = PetscLogFlops(2.0*bs*bs2*(jmax-jmin));CHKERRQ(ierr);
806 
807         /* ... add i to row list for next nonzero entry */
808         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
809         j     = bj[jmin];
810         jl[i] = jl[j]; jl[j] = i; /* update jl */
811       }
812       i = nexti;
813     }
814 
815     /* save nonzero entries in k-th row of U ... */
816 
817     /* invert diagonal block */
818     diag = ba+k*bs2;
819     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
820 
821     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
822     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
823 
824     jmin = bi[k]; jmax = bi[k+1];
825     if (jmin < jmax) {
826       for (j=jmin; j<jmax; j++) {
827         vj       = bj[j];      /* block col. index of U */
828         u        = ba + j*bs2;
829         rtmp_ptr = rtmp + vj*bs2;
830         for (k1=0; k1<bs2; k1++) {
831           *u++        = *rtmp_ptr;
832           *rtmp_ptr++ = 0.0;
833         }
834       }
835 
836       /* ... add k to row list for first nonzero entry in k-th row */
837       il[k] = jmin;
838       i     = bj[jmin];
839       jl[k] = jl[i]; jl[i] = k;
840     }
841   }
842 
843   ierr = PetscFree(rtmp);CHKERRQ(ierr);
844   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
845   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
846   ierr = PetscFree(pivots);CHKERRQ(ierr);
847 
848   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
849   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
850   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
851   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
852   C->assembled           = PETSC_TRUE;
853   C->preallocated        = PETSC_TRUE;
854 
855   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
856   PetscFunctionReturn(0);
857 }
858 
859 /*
860     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
861     Version for blocks 2 by 2.
862 */
863 #undef __FUNCT__
864 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
865 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo *info)
866 {
867   Mat_SeqSBAIJ   *a   = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
868   IS             perm = b->row;
869   PetscErrorCode ierr;
870   const PetscInt *ai,*aj,*perm_ptr;
871   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
872   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
873   MatScalar      *ba = b->a,*aa,*ap;
874   MatScalar      *u,*diag,*rtmp,*rtmp_ptr,dk[4],uik[4];
875   PetscReal      shift = info->shiftamount;
876   PetscBool      allowzeropivot,zeropivotdetected;
877 
878   PetscFunctionBegin;
879   allowzeropivot = PetscNot(A->erroriffailure);
880 
881   /* initialization */
882   /* il and jl record the first nonzero element in each row of the accessing
883      window U(0:k, k:mbs-1).
884      jl:    list of rows to be added to uneliminated rows
885             i>= k: jl(i) is the first row to be added to row i
886             i<  k: jl(i) is the row following row i in some list of rows
887             jl(i) = mbs indicates the end of a list
888      il(i): points to the first nonzero element in columns k,...,mbs-1 of
889             row i of U */
890   ierr = PetscCalloc1(4*mbs,&rtmp);CHKERRQ(ierr);
891   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
892   for (i=0; i<mbs; i++) {
893     jl[i] = mbs; il[0] = 0;
894   }
895   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
896 
897   /* check permutation */
898   if (!a->permute) {
899     ai = a->i; aj = a->j; aa = a->a;
900   } else {
901     ai   = a->inew; aj = a->jnew;
902     ierr = PetscMalloc1(4*ai[mbs],&aa);CHKERRQ(ierr);
903     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
904     ierr = PetscMalloc1(ai[mbs],&a2anew);CHKERRQ(ierr);
905     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
906 
907     for (i=0; i<mbs; i++) {
908       jmin = ai[i]; jmax = ai[i+1];
909       for (j=jmin; j<jmax; j++) {
910         while (a2anew[j] != j) {
911           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
912           for (k1=0; k1<4; k1++) {
913             dk[k1]     = aa[k*4+k1];
914             aa[k*4+k1] = aa[j*4+k1];
915             aa[j*4+k1] = dk[k1];
916           }
917         }
918         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
919         if (i > aj[j]) {
920           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
921           ap    = aa + j*4;  /* ptr to the beginning of the block */
922           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
923           ap[1] = ap[2];
924           ap[2] = dk[1];
925         }
926       }
927     }
928     ierr = PetscFree(a2anew);CHKERRQ(ierr);
929   }
930 
931   /* for each row k */
932   for (k = 0; k<mbs; k++) {
933 
934     /*initialize k-th row with elements nonzero in row perm(k) of A */
935     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
936     ap   = aa + jmin*4;
937     for (j = jmin; j < jmax; j++) {
938       vj       = perm_ptr[aj[j]];   /* block col. index */
939       rtmp_ptr = rtmp + vj*4;
940       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
941     }
942 
943     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
944     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
945     i    = jl[k]; /* first row to be added to k_th row  */
946 
947     while (i < k) {
948       nexti = jl[i]; /* next row to be added to k_th row */
949 
950       /* compute multiplier */
951       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
952 
953       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
954       diag   = ba + i*4;
955       u      = ba + ili*4;
956       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
957       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
958       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
959       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
960 
961       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
962       dk[0] += uik[0]*u[0] + uik[1]*u[1];
963       dk[1] += uik[2]*u[0] + uik[3]*u[1];
964       dk[2] += uik[0]*u[2] + uik[1]*u[3];
965       dk[3] += uik[2]*u[2] + uik[3]*u[3];
966 
967       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
968 
969       /* update -U(i,k): ba[ili] = uik */
970       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
971 
972       /* add multiple of row i to k-th row ... */
973       jmin = ili + 1; jmax = bi[i+1];
974       if (jmin < jmax) {
975         for (j=jmin; j<jmax; j++) {
976           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
977           rtmp_ptr     = rtmp + bj[j]*4;
978           u            = ba + j*4;
979           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
980           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
981           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
982           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
983         }
984         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
985 
986         /* ... add i to row list for next nonzero entry */
987         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
988         j     = bj[jmin];
989         jl[i] = jl[j]; jl[j] = i; /* update jl */
990       }
991       i = nexti;
992     }
993 
994     /* save nonzero entries in k-th row of U ... */
995 
996     /* invert diagonal block */
997     diag = ba+k*4;
998     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
999     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1000     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1001 
1002     jmin = bi[k]; jmax = bi[k+1];
1003     if (jmin < jmax) {
1004       for (j=jmin; j<jmax; j++) {
1005         vj       = bj[j];      /* block col. index of U */
1006         u        = ba + j*4;
1007         rtmp_ptr = rtmp + vj*4;
1008         for (k1=0; k1<4; k1++) {
1009           *u++        = *rtmp_ptr;
1010           *rtmp_ptr++ = 0.0;
1011         }
1012       }
1013 
1014       /* ... add k to row list for first nonzero entry in k-th row */
1015       il[k] = jmin;
1016       i     = bj[jmin];
1017       jl[k] = jl[i]; jl[i] = k;
1018     }
1019   }
1020 
1021   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1022   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1023   if (a->permute) {
1024     ierr = PetscFree(aa);CHKERRQ(ierr);
1025   }
1026   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1027 
1028   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1029   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1030   C->assembled           = PETSC_TRUE;
1031   C->preallocated        = PETSC_TRUE;
1032 
1033   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1034   PetscFunctionReturn(0);
1035 }
1036 
1037 /*
1038       Version for when blocks are 2 by 2 Using natural ordering
1039 */
1040 #undef __FUNCT__
1041 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
1042 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
1043 {
1044   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
1045   PetscErrorCode ierr;
1046   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1047   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1048   MatScalar      *ba = b->a,*aa,*ap,dk[8],uik[8];
1049   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
1050   PetscReal      shift = info->shiftamount;
1051   PetscBool      allowzeropivot,zeropivotdetected;
1052 
1053   PetscFunctionBegin;
1054   allowzeropivot = PetscNot(A->erroriffailure);
1055 
1056   /* initialization */
1057   /* il and jl record the first nonzero element in each row of the accessing
1058      window U(0:k, k:mbs-1).
1059      jl:    list of rows to be added to uneliminated rows
1060             i>= k: jl(i) is the first row to be added to row i
1061             i<  k: jl(i) is the row following row i in some list of rows
1062             jl(i) = mbs indicates the end of a list
1063      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1064             row i of U */
1065   ierr = PetscCalloc1(4*mbs,&rtmp);CHKERRQ(ierr);
1066   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
1067   for (i=0; i<mbs; i++) {
1068     jl[i] = mbs; il[0] = 0;
1069   }
1070   ai = a->i; aj = a->j; aa = a->a;
1071 
1072   /* for each row k */
1073   for (k = 0; k<mbs; k++) {
1074 
1075     /*initialize k-th row with elements nonzero in row k of A */
1076     jmin = ai[k]; jmax = ai[k+1];
1077     ap   = aa + jmin*4;
1078     for (j = jmin; j < jmax; j++) {
1079       vj       = aj[j];   /* block col. index */
1080       rtmp_ptr = rtmp + vj*4;
1081       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1082     }
1083 
1084     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1085     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
1086     i    = jl[k]; /* first row to be added to k_th row  */
1087 
1088     while (i < k) {
1089       nexti = jl[i]; /* next row to be added to k_th row */
1090 
1091       /* compute multiplier */
1092       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1093 
1094       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1095       diag   = ba + i*4;
1096       u      = ba + ili*4;
1097       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1098       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1099       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1100       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1101 
1102       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1103       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1104       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1105       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1106       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1107 
1108       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
1109 
1110       /* update -U(i,k): ba[ili] = uik */
1111       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1112 
1113       /* add multiple of row i to k-th row ... */
1114       jmin = ili + 1; jmax = bi[i+1];
1115       if (jmin < jmax) {
1116         for (j=jmin; j<jmax; j++) {
1117           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1118           rtmp_ptr     = rtmp + bj[j]*4;
1119           u            = ba + j*4;
1120           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1121           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1122           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1123           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1124         }
1125         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
1126 
1127         /* ... add i to row list for next nonzero entry */
1128         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1129         j     = bj[jmin];
1130         jl[i] = jl[j]; jl[j] = i; /* update jl */
1131       }
1132       i = nexti;
1133     }
1134 
1135     /* save nonzero entries in k-th row of U ... */
1136 
1137     /* invert diagonal block */
1138     diag = ba+k*4;
1139     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1140     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1141     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1142 
1143     jmin = bi[k]; jmax = bi[k+1];
1144     if (jmin < jmax) {
1145       for (j=jmin; j<jmax; j++) {
1146         vj       = bj[j];      /* block col. index of U */
1147         u        = ba + j*4;
1148         rtmp_ptr = rtmp + vj*4;
1149         for (k1=0; k1<4; k1++) {
1150           *u++        = *rtmp_ptr;
1151           *rtmp_ptr++ = 0.0;
1152         }
1153       }
1154 
1155       /* ... add k to row list for first nonzero entry in k-th row */
1156       il[k] = jmin;
1157       i     = bj[jmin];
1158       jl[k] = jl[i]; jl[i] = k;
1159     }
1160   }
1161 
1162   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1163   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1164 
1165   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1166   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1167   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1168   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1169   C->assembled           = PETSC_TRUE;
1170   C->preallocated        = PETSC_TRUE;
1171 
1172   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 /*
1177     Numeric U^T*D*U factorization for SBAIJ format.
1178     Version for blocks are 1 by 1.
1179 */
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace"
1182 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1185   IS             ip=b->row;
1186   PetscErrorCode ierr;
1187   const PetscInt *ai,*aj,*rip;
1188   PetscInt       *a2anew,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1189   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1190   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1191   PetscReal      rs;
1192   FactorShiftCtx sctx;
1193 
1194   PetscFunctionBegin;
1195   /* MatPivotSetUp(): initialize shift context sctx */
1196   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1197 
1198   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1199   if (!a->permute) {
1200     ai = a->i; aj = a->j; aa = a->a;
1201   } else {
1202     ai     = a->inew; aj = a->jnew;
1203     nz     = ai[mbs];
1204     ierr   = PetscMalloc1(nz,&aa);CHKERRQ(ierr);
1205     a2anew = a->a2anew;
1206     bval   = a->a;
1207     for (j=0; j<nz; j++) {
1208       aa[a2anew[j]] = *(bval++);
1209     }
1210   }
1211 
1212   /* initialization */
1213   /* il and jl record the first nonzero element in each row of the accessing
1214      window U(0:k, k:mbs-1).
1215      jl:    list of rows to be added to uneliminated rows
1216             i>= k: jl(i) is the first row to be added to row i
1217             i<  k: jl(i) is the row following row i in some list of rows
1218             jl(i) = mbs indicates the end of a list
1219      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1220             row i of U */
1221   ierr = PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1222 
1223   do {
1224     sctx.newshift = PETSC_FALSE;
1225     for (i=0; i<mbs; i++) {
1226       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1227     }
1228 
1229     for (k = 0; k<mbs; k++) {
1230       /*initialize k-th row by the perm[k]-th row of A */
1231       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1232       bval = ba + bi[k];
1233       for (j = jmin; j < jmax; j++) {
1234         col       = rip[aj[j]];
1235         rtmp[col] = aa[j];
1236         *bval++   = 0.0; /* for in-place factorization */
1237       }
1238 
1239       /* shift the diagonal of the matrix */
1240       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1241 
1242       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1243       dk = rtmp[k];
1244       i  = jl[k]; /* first row to be added to k_th row  */
1245 
1246       while (i < k) {
1247         nexti = jl[i]; /* next row to be added to k_th row */
1248 
1249         /* compute multiplier, update diag(k) and U(i,k) */
1250         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1251         uikdi   = -ba[ili]*ba[bi[i]]; /* diagonal(k) */
1252         dk     += uikdi*ba[ili];
1253         ba[ili] = uikdi; /* -U(i,k) */
1254 
1255         /* add multiple of row i to k-th row */
1256         jmin = ili + 1; jmax = bi[i+1];
1257         if (jmin < jmax) {
1258           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1259           ierr = PetscLogFlops(2.0*(jmax-jmin));CHKERRQ(ierr);
1260 
1261           /* update il and jl for row i */
1262           il[i] = jmin;
1263           j     = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1264         }
1265         i = nexti;
1266       }
1267 
1268       /* shift the diagonals when zero pivot is detected */
1269       /* compute rs=sum of abs(off-diagonal) */
1270       rs   = 0.0;
1271       jmin = bi[k]+1;
1272       nz   = bi[k+1] - jmin;
1273       if (nz) {
1274         bcol = bj + jmin;
1275         while (nz--) {
1276           rs += PetscAbsScalar(rtmp[*bcol]);
1277           bcol++;
1278         }
1279       }
1280 
1281       sctx.rs = rs;
1282       sctx.pv = dk;
1283       ierr    = MatPivotCheck(C,A,info,&sctx,k);CHKERRQ(ierr);
1284       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1285       dk = sctx.pv;
1286 
1287       /* copy data into U(k,:) */
1288       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1289       jmin      = bi[k]+1; jmax = bi[k+1];
1290       if (jmin < jmax) {
1291         for (j=jmin; j<jmax; j++) {
1292           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1293         }
1294         /* add the k-th row into il and jl */
1295         il[k] = jmin;
1296         i     = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1297       }
1298     }
1299   } while (sctx.newshift);
1300   ierr = PetscFree3(rtmp,il,jl);CHKERRQ(ierr);
1301   if (a->permute) {ierr = PetscFree(aa);CHKERRQ(ierr);}
1302 
1303   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1304 
1305   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1306   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1307   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1308   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1309   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1310   C->assembled           = PETSC_TRUE;
1311   C->preallocated        = PETSC_TRUE;
1312 
1313   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1314   if (sctx.nshift) {
1315     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1316       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1317     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1318       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1319     }
1320   }
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 /*
1325   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1326   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1327 */
1328 #undef __FUNCT__
1329 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1330 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
1331 {
1332   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1333   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)B->data;
1334   PetscErrorCode ierr;
1335   PetscInt       i,j,mbs=A->rmap->n,*bi=b->i,*bj=b->j,*bdiag=b->diag,*bjtmp;
1336   PetscInt       *ai=a->i,*aj=a->j,*ajtmp;
1337   PetscInt       k,jmin,jmax,*c2r,*il,col,nexti,ili,nz;
1338   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
1339   FactorShiftCtx sctx;
1340   PetscReal      rs;
1341   MatScalar      d,*v;
1342 
1343   PetscFunctionBegin;
1344   ierr = PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&c2r);CHKERRQ(ierr);
1345 
1346   /* MatPivotSetUp(): initialize shift context sctx */
1347   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1348 
1349   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1350     sctx.shift_top = info->zeropivot;
1351 
1352     ierr = PetscMemzero(rtmp,mbs*sizeof(MatScalar));CHKERRQ(ierr);
1353 
1354     for (i=0; i<mbs; i++) {
1355       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1356       d        = (aa)[a->diag[i]];
1357       rtmp[i] += -PetscRealPart(d);  /* diagonal entry */
1358       ajtmp    = aj + ai[i] + 1;     /* exclude diagonal */
1359       v        = aa + ai[i] + 1;
1360       nz       = ai[i+1] - ai[i] - 1;
1361       for (j=0; j<nz; j++) {
1362         rtmp[i]        += PetscAbsScalar(v[j]);
1363         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1364       }
1365       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1366     }
1367     sctx.shift_top *= 1.1;
1368     sctx.nshift_max = 5;
1369     sctx.shift_lo   = 0.;
1370     sctx.shift_hi   = 1.;
1371   }
1372 
1373   /* allocate working arrays
1374      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1375      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1376   */
1377   do {
1378     sctx.newshift = PETSC_FALSE;
1379 
1380     for (i=0; i<mbs; i++) c2r[i] = mbs;
1381     if (mbs) il[0] = 0;
1382 
1383     for (k = 0; k<mbs; k++) {
1384       /* zero rtmp */
1385       nz    = bi[k+1] - bi[k];
1386       bjtmp = bj + bi[k];
1387       for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1388 
1389       /* load in initial unfactored row */
1390       bval = ba + bi[k];
1391       jmin = ai[k]; jmax = ai[k+1];
1392       for (j = jmin; j < jmax; j++) {
1393         col       = aj[j];
1394         rtmp[col] = aa[j];
1395         *bval++   = 0.0; /* for in-place factorization */
1396       }
1397       /* shift the diagonal of the matrix: ZeropivotApply() */
1398       rtmp[k] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1399 
1400       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1401       dk = rtmp[k];
1402       i  = c2r[k]; /* first row to be added to k_th row  */
1403 
1404       while (i < k) {
1405         nexti = c2r[i]; /* next row to be added to k_th row */
1406 
1407         /* compute multiplier, update diag(k) and U(i,k) */
1408         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1409         uikdi   = -ba[ili]*ba[bdiag[i]]; /* diagonal(k) */
1410         dk     += uikdi*ba[ili]; /* update diag[k] */
1411         ba[ili] = uikdi; /* -U(i,k) */
1412 
1413         /* add multiple of row i to k-th row */
1414         jmin = ili + 1; jmax = bi[i+1];
1415         if (jmin < jmax) {
1416           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1417           /* update il and c2r for row i */
1418           il[i] = jmin;
1419           j     = bj[jmin]; c2r[i] = c2r[j]; c2r[j] = i;
1420         }
1421         i = nexti;
1422       }
1423 
1424       /* copy data into U(k,:) */
1425       rs   = 0.0;
1426       jmin = bi[k]; jmax = bi[k+1]-1;
1427       if (jmin < jmax) {
1428         for (j=jmin; j<jmax; j++) {
1429           col = bj[j]; ba[j] = rtmp[col]; rs += PetscAbsScalar(ba[j]);
1430         }
1431         /* add the k-th row into il and c2r */
1432         il[k] = jmin;
1433         i     = bj[jmin]; c2r[k] = c2r[i]; c2r[i] = k;
1434       }
1435 
1436       sctx.rs = rs;
1437       sctx.pv = dk;
1438       ierr    = MatPivotCheck(B,A,info,&sctx,k);CHKERRQ(ierr);
1439       if (sctx.newshift) break;
1440       dk = sctx.pv;
1441 
1442       ba[bdiag[k]] = 1.0/dk; /* U(k,k) */
1443     }
1444   } while (sctx.newshift);
1445 
1446   ierr = PetscFree3(rtmp,il,c2r);CHKERRQ(ierr);
1447 
1448   B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1449   B->ops->solves         = MatSolves_SeqSBAIJ_1;
1450   B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1451   B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1452   B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1453 
1454   B->assembled    = PETSC_TRUE;
1455   B->preallocated = PETSC_TRUE;
1456 
1457   ierr = PetscLogFlops(B->rmap->n);CHKERRQ(ierr);
1458 
1459   /* MatPivotView() */
1460   if (sctx.nshift) {
1461     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1462       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1463     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1464       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1465     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1466       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1467     }
1468   }
1469   PetscFunctionReturn(0);
1470 }
1471 
1472 #undef __FUNCT__
1473 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace"
1474 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
1475 {
1476   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1477   PetscErrorCode ierr;
1478   PetscInt       i,j,mbs = a->mbs;
1479   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1480   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1481   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1482   PetscReal      rs;
1483   FactorShiftCtx sctx;
1484 
1485   PetscFunctionBegin;
1486   /* MatPivotSetUp(): initialize shift context sctx */
1487   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1488 
1489   /* initialization */
1490   /* il and jl record the first nonzero element in each row of the accessing
1491      window U(0:k, k:mbs-1).
1492      jl:    list of rows to be added to uneliminated rows
1493             i>= k: jl(i) is the first row to be added to row i
1494             i<  k: jl(i) is the row following row i in some list of rows
1495             jl(i) = mbs indicates the end of a list
1496      il(i): points to the first nonzero element in U(i,k:mbs-1)
1497   */
1498   ierr = PetscMalloc1(mbs,&rtmp);CHKERRQ(ierr);
1499   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
1500 
1501   do {
1502     sctx.newshift = PETSC_FALSE;
1503     for (i=0; i<mbs; i++) {
1504       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1505     }
1506 
1507     for (k = 0; k<mbs; k++) {
1508       /*initialize k-th row with elements nonzero in row perm(k) of A */
1509       nz   = ai[k+1] - ai[k];
1510       acol = aj + ai[k];
1511       aval = aa + ai[k];
1512       bval = ba + bi[k];
1513       while (nz--) {
1514         rtmp[*acol++] = *aval++;
1515         *bval++       = 0.0; /* for in-place factorization */
1516       }
1517 
1518       /* shift the diagonal of the matrix */
1519       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1520 
1521       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1522       dk = rtmp[k];
1523       i  = jl[k]; /* first row to be added to k_th row  */
1524 
1525       while (i < k) {
1526         nexti = jl[i]; /* next row to be added to k_th row */
1527         /* compute multiplier, update D(k) and U(i,k) */
1528         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1529         uikdi   = -ba[ili]*ba[bi[i]];
1530         dk     += uikdi*ba[ili];
1531         ba[ili] = uikdi; /* -U(i,k) */
1532 
1533         /* add multiple of row i to k-th row ... */
1534         jmin = ili + 1;
1535         nz   = bi[i+1] - jmin;
1536         if (nz > 0) {
1537           bcol = bj + jmin;
1538           bval = ba + jmin;
1539           ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1540           while (nz--) rtmp[*bcol++] += uikdi*(*bval++);
1541 
1542           /* update il and jl for i-th row */
1543           il[i] = jmin;
1544           j     = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1545         }
1546         i = nexti;
1547       }
1548 
1549       /* shift the diagonals when zero pivot is detected */
1550       /* compute rs=sum of abs(off-diagonal) */
1551       rs   = 0.0;
1552       jmin = bi[k]+1;
1553       nz   = bi[k+1] - jmin;
1554       if (nz) {
1555         bcol = bj + jmin;
1556         while (nz--) {
1557           rs += PetscAbsScalar(rtmp[*bcol]);
1558           bcol++;
1559         }
1560       }
1561 
1562       sctx.rs = rs;
1563       sctx.pv = dk;
1564       ierr    = MatPivotCheck(C,A,info,&sctx,k);CHKERRQ(ierr);
1565       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1566       dk = sctx.pv;
1567 
1568       /* copy data into U(k,:) */
1569       ba[bi[k]] = 1.0/dk;
1570       jmin      = bi[k]+1;
1571       nz        = bi[k+1] - jmin;
1572       if (nz) {
1573         bcol = bj + jmin;
1574         bval = ba + jmin;
1575         while (nz--) {
1576           *bval++       = rtmp[*bcol];
1577           rtmp[*bcol++] = 0.0;
1578         }
1579         /* add k-th row into il and jl */
1580         il[k] = jmin;
1581         i     = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1582       }
1583     } /* end of for (k = 0; k<mbs; k++) */
1584   } while (sctx.newshift);
1585   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1586   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1587 
1588   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1589   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1590   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1591   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1592   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1593 
1594   C->assembled    = PETSC_TRUE;
1595   C->preallocated = PETSC_TRUE;
1596 
1597   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1598   if (sctx.nshift) {
1599     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1600       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1601     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1602       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1603     }
1604   }
1605   PetscFunctionReturn(0);
1606 }
1607 
1608 #undef __FUNCT__
1609 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1610 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo *info)
1611 {
1612   PetscErrorCode ierr;
1613   Mat            C;
1614 
1615   PetscFunctionBegin;
1616   ierr = MatGetFactor(A,"petsc",MAT_FACTOR_CHOLESKY,&C);CHKERRQ(ierr);
1617   ierr = MatCholeskyFactorSymbolic(C,A,perm,info);CHKERRQ(ierr);
1618   ierr = MatCholeskyFactorNumeric(C,A,info);CHKERRQ(ierr);
1619 
1620   A->ops->solve          = C->ops->solve;
1621   A->ops->solvetranspose = C->ops->solvetranspose;
1622 
1623   ierr = MatHeaderMerge(A,&C);CHKERRQ(ierr);
1624   PetscFunctionReturn(0);
1625 }
1626 
1627 
1628