xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision fdb35ca6b8bfd78facfcbaad2994e8eeeb8f8a19)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/sbaij/seq/sbaij.h>
4 #include <../src/mat/blockinvert.h>
5 #include <petscis.h>
6 
7 /*
8   input:
9    F -- numeric factor
10   output:
11    nneg, nzero, npos: matrix inertia
12 */
13 
14 #undef __FUNCT__
15 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
16 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
17 {
18   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
19   MatScalar    *dd       = fact_ptr->a;
20   PetscInt     mbs       =fact_ptr->mbs,bs=F->rmap->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->diag;
21 
22   PetscFunctionBegin;
23   if (bs != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
24   nneig_tmp = 0; npos_tmp = 0;
25   for (i=0; i<mbs; i++) {
26     if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
27     else if (PetscRealPart(dd[*fi]) < 0.0) nneig_tmp++;
28     fi++;
29   }
30   if (nneig) *nneig = nneig_tmp;
31   if (npos)  *npos  = npos_tmp;
32   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
33   PetscFunctionReturn(0);
34 }
35 
36 /*
37   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
38   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
39 */
40 #undef __FUNCT__
41 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
42 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo *info)
43 {
44   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
45   PetscErrorCode ierr;
46   const PetscInt *rip,*ai,*aj;
47   PetscInt       i,mbs = a->mbs,*jutmp,bs = A->rmap->bs,bs2=a->bs2;
48   PetscInt       m,reallocs = 0,prow;
49   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
50   PetscReal      f = info->fill;
51   PetscBool      perm_identity;
52 
53   PetscFunctionBegin;
54   /* check whether perm is the identity mapping */
55   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
56   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
57 
58   if (perm_identity) { /* without permutation */
59     a->permute = PETSC_FALSE;
60 
61     ai = a->i; aj = a->j;
62   } else {            /* non-trivial permutation */
63     a->permute = PETSC_TRUE;
64 
65     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
66 
67     ai = a->inew; aj = a->jnew;
68   }
69 
70   /* initialization */
71   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
72   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
73   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
74   iu[0] = mbs+1;
75   juidx = mbs + 1; /* index for ju */
76   /* jl linked list for pivot row -- linked list for col index */
77   ierr = PetscMalloc2(mbs,PetscInt,&jl,mbs,PetscInt,&q);CHKERRQ(ierr);
78   for (i=0; i<mbs; i++) {
79     jl[i] = mbs;
80     q[i]  = 0;
81   }
82 
83   /* for each row k */
84   for (k=0; k<mbs; k++) {
85     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
86     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
87     q[k] = mbs;
88     /* initialize nonzero structure of k-th row to row rip[k] of A */
89     jmin = ai[rip[k]] +1; /* exclude diag[k] */
90     jmax = ai[rip[k]+1];
91     for (j=jmin; j<jmax; j++) {
92       vj = rip[aj[j]]; /* col. value */
93       if (vj > k) {
94         qm = k;
95         do {
96           m = qm; qm = q[m];
97         } while (qm < vj);
98         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
99         nzk++;
100         q[m]  = vj;
101         q[vj] = qm;
102       } /* if (vj > k) */
103     } /* for (j=jmin; j<jmax; j++) */
104 
105     /* modify nonzero structure of k-th row by computing fill-in
106        for each row i to be merged in */
107     prow = k;
108     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
109 
110     while (prow < k) {
111       /* merge row prow into k-th row */
112       jmin = iu[prow] + 1; jmax = iu[prow+1];
113       qm   = k;
114       for (j=jmin; j<jmax; j++) {
115         vj = ju[j];
116         do {
117           m = qm; qm = q[m];
118         } while (qm < vj);
119         if (qm != vj) {
120           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
121         }
122       }
123       prow = jl[prow]; /* next pivot row */
124     }
125 
126     /* add k to row list for first nonzero element in k-th row */
127     if (nzk > 0) {
128       i     = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
129       jl[k] = jl[i]; jl[i] = k;
130     }
131     iu[k+1] = iu[k] + nzk;
132 
133     /* allocate more space to ju if needed */
134     if (iu[k+1] > umax) {
135       /* estimate how much additional space we will need */
136       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
137       /* just double the memory each time */
138       maxadd = umax;
139       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
140       umax += maxadd;
141 
142       /* allocate a longer ju */
143       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
144       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
145       ierr = PetscFree(ju);CHKERRQ(ierr);
146       ju   = jutmp;
147       reallocs++; /* count how many times we realloc */
148     }
149 
150     /* save nonzero structure of k-th row in ju */
151     i=k;
152     while (nzk--) {
153       i           = q[i];
154       ju[juidx++] = i;
155     }
156   }
157 
158 #if defined(PETSC_USE_INFO)
159   if (ai[mbs] != 0) {
160     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
161     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
162     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
163     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
164     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
165   } else {
166     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
167   }
168 #endif
169 
170   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
171   ierr = PetscFree2(jl,q);CHKERRQ(ierr);
172 
173   /* put together the new matrix */
174   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(F,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
175 
176   /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */
177   b                = (Mat_SeqSBAIJ*)(F)->data;
178   b->singlemalloc  = PETSC_FALSE;
179   b->free_a        = PETSC_TRUE;
180   b->free_ij       = PETSC_TRUE;
181 
182   ierr    = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
183   b->j    = ju;
184   b->i    = iu;
185   b->diag = 0;
186   b->ilen = 0;
187   b->imax = 0;
188   b->row  = perm;
189 
190   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
191 
192   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
193 
194   b->icol = perm;
195   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
196   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
197   /* In b structure:  Free imax, ilen, old a, old j.
198      Allocate idnew, solve_work, new a, new j */
199   ierr     = PetscLogObjectMemory(F,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
200   b->maxnz = b->nz = iu[mbs];
201 
202   (F)->info.factor_mallocs   = reallocs;
203   (F)->info.fill_ratio_given = f;
204   if (ai[mbs] != 0) {
205     (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
206   } else {
207     (F)->info.fill_ratio_needed = 0.0;
208   }
209   ierr = MatSeqSBAIJSetNumericFactorization_inplace(F,perm_identity);CHKERRQ(ierr);
210   PetscFunctionReturn(0);
211 }
212 /*
213     Symbolic U^T*D*U factorization for SBAIJ format.
214     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
215 */
216 #include <petscbt.h>
217 #include <../src/mat/utils/freespace.h>
218 #undef __FUNCT__
219 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
220 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
221 {
222   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
223   Mat_SeqSBAIJ       *b;
224   PetscErrorCode     ierr;
225   PetscBool          perm_identity,missing;
226   PetscReal          fill = info->fill;
227   const PetscInt     *rip,*ai=a->i,*aj=a->j;
228   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
229   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
230   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr,*udiag;
231   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
232   PetscBT            lnkbt;
233 
234   PetscFunctionBegin;
235   if (bs > 1) {
236     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
237     PetscFunctionReturn(0);
238   }
239   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
240   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
241   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
242 
243   /* check whether perm is the identity mapping */
244   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
245   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
246   a->permute = PETSC_FALSE;
247   ierr       = ISGetIndices(perm,&rip);CHKERRQ(ierr);
248 
249   /* initialization */
250   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
251   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&udiag);CHKERRQ(ierr);
252   ui[0] = 0;
253 
254   /* jl: linked list for storing indices of the pivot rows
255      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
256   ierr = PetscMalloc4(mbs,PetscInt*,&ui_ptr,mbs,PetscInt,&il,mbs,PetscInt,&jl,mbs,PetscInt,&cols);CHKERRQ(ierr);
257   for (i=0; i<mbs; i++) {
258     jl[i] = mbs; il[i] = 0;
259   }
260 
261   /* create and initialize a linked list for storing column indices of the active row k */
262   nlnk = mbs + 1;
263   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
264 
265   /* initial FreeSpace size is fill*(ai[mbs]+1) */
266   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
267   current_space = free_space;
268 
269   for (k=0; k<mbs; k++) {  /* for each active row k */
270     /* initialize lnk by the column indices of row rip[k] of A */
271     nzk   = 0;
272     ncols = ai[k+1] - ai[k];
273     if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
274     for (j=0; j<ncols; j++) {
275       i       = *(aj + ai[k] + j);
276       cols[j] = i;
277     }
278     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
279     nzk += nlnk;
280 
281     /* update lnk by computing fill-in for each pivot row to be merged in */
282     prow = jl[k]; /* 1st pivot row */
283 
284     while (prow < k) {
285       nextprow = jl[prow];
286       /* merge prow into k-th row */
287       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
288       jmax   = ui[prow+1];
289       ncols  = jmax-jmin;
290       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
291       ierr   = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
292       nzk   += nlnk;
293 
294       /* update il and jl for prow */
295       if (jmin < jmax) {
296         il[prow] = jmin;
297         j        = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
298       }
299       prow = nextprow;
300     }
301 
302     /* if free space is not available, make more free space */
303     if (current_space->local_remaining<nzk) {
304       i    = mbs - k + 1; /* num of unfactored rows */
305       i   *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
306       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
307       reallocs++;
308     }
309 
310     /* copy data into free space, then initialize lnk */
311     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
312 
313     /* add the k-th row into il and jl */
314     if (nzk > 1) {
315       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
316       jl[k] = jl[i]; jl[i] = k;
317       il[k] = ui[k] + 1;
318     }
319     ui_ptr[k] = current_space->array;
320 
321     current_space->array           += nzk;
322     current_space->local_used      += nzk;
323     current_space->local_remaining -= nzk;
324 
325     ui[k+1] = ui[k] + nzk;
326   }
327 
328   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
329   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
330 
331   /* destroy list of free space and other temporary array(s) */
332   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
333   ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,mbs,ui,udiag);CHKERRQ(ierr); /* store matrix factor */
334   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
335 
336   /* put together the new matrix in MATSEQSBAIJ format */
337   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
338 
339   b               = (Mat_SeqSBAIJ*)fact->data;
340   b->singlemalloc = PETSC_FALSE;
341   b->free_a       = PETSC_TRUE;
342   b->free_ij      = PETSC_TRUE;
343 
344   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
345 
346   b->j         = uj;
347   b->i         = ui;
348   b->diag      = udiag;
349   b->free_diag = PETSC_TRUE;
350   b->ilen      = 0;
351   b->imax      = 0;
352   b->row       = perm;
353   b->icol      = perm;
354 
355   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
356   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
357 
358   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
359 
360   ierr = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
361   ierr = PetscLogObjectMemory(fact,ui[mbs]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
362 
363   b->maxnz = b->nz = ui[mbs];
364 
365   fact->info.factor_mallocs   = reallocs;
366   fact->info.fill_ratio_given = fill;
367   if (ai[mbs] != 0) {
368     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
369   } else {
370     fact->info.fill_ratio_needed = 0.0;
371   }
372 #if defined(PETSC_USE_INFO)
373   if (ai[mbs] != 0) {
374     PetscReal af = fact->info.fill_ratio_needed;
375     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
376     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
377     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
378   } else {
379     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
380   }
381 #endif
382   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_inplace"
388 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
389 {
390   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
391   Mat_SeqSBAIJ       *b;
392   PetscErrorCode     ierr;
393   PetscBool          perm_identity,missing;
394   PetscReal          fill = info->fill;
395   const PetscInt     *rip,*ai,*aj;
396   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
397   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
398   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
399   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
400   PetscBT            lnkbt;
401 
402   PetscFunctionBegin;
403   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
404   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
405 
406   /*
407    This code originally uses Modified Sparse Row (MSR) storage
408    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
409    Then it is rewritten so the factor B takes seqsbaij format. However the associated
410    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
411    thus the original code in MSR format is still used for these cases.
412    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
413    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
414   */
415   if (bs > 1) {
416     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
417     PetscFunctionReturn(0);
418   }
419 
420   /* check whether perm is the identity mapping */
421   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
422 
423   if (perm_identity) {
424     a->permute = PETSC_FALSE;
425 
426     ai = a->i; aj = a->j;
427   } else {
428     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
429     /* There are bugs for reordeing. Needs further work.
430        MatReordering for sbaij cannot be efficient. User should use aij formt! */
431     a->permute = PETSC_TRUE;
432 
433     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
434     ai   = a->inew; aj = a->jnew;
435   }
436   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
437 
438   /* initialization */
439   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
440   ui[0] = 0;
441 
442   /* jl: linked list for storing indices of the pivot rows
443      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
444   ierr = PetscMalloc4(mbs,PetscInt*,&ui_ptr,mbs,PetscInt,&il,mbs,PetscInt,&jl,mbs,PetscInt,&cols);CHKERRQ(ierr);
445   for (i=0; i<mbs; i++) {
446     jl[i] = mbs; il[i] = 0;
447   }
448 
449   /* create and initialize a linked list for storing column indices of the active row k */
450   nlnk = mbs + 1;
451   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
452 
453   /* initial FreeSpace size is fill*(ai[mbs]+1) */
454   ierr          = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
455   current_space = free_space;
456 
457   for (k=0; k<mbs; k++) {  /* for each active row k */
458     /* initialize lnk by the column indices of row rip[k] of A */
459     nzk   = 0;
460     ncols = ai[rip[k]+1] - ai[rip[k]];
461     for (j=0; j<ncols; j++) {
462       i       = *(aj + ai[rip[k]] + j);
463       cols[j] = rip[i];
464     }
465     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
466     nzk += nlnk;
467 
468     /* update lnk by computing fill-in for each pivot row to be merged in */
469     prow = jl[k]; /* 1st pivot row */
470 
471     while (prow < k) {
472       nextprow = jl[prow];
473       /* merge prow into k-th row */
474       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
475       jmax   = ui[prow+1];
476       ncols  = jmax-jmin;
477       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
478       ierr   = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
479       nzk   += nlnk;
480 
481       /* update il and jl for prow */
482       if (jmin < jmax) {
483         il[prow] = jmin;
484 
485         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
486       }
487       prow = nextprow;
488     }
489 
490     /* if free space is not available, make more free space */
491     if (current_space->local_remaining<nzk) {
492       i    = mbs - k + 1; /* num of unfactored rows */
493       i    = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
494       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
495       reallocs++;
496     }
497 
498     /* copy data into free space, then initialize lnk */
499     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
500 
501     /* add the k-th row into il and jl */
502     if (nzk-1 > 0) {
503       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
504       jl[k] = jl[i]; jl[i] = k;
505       il[k] = ui[k] + 1;
506     }
507     ui_ptr[k] = current_space->array;
508 
509     current_space->array           += nzk;
510     current_space->local_used      += nzk;
511     current_space->local_remaining -= nzk;
512 
513     ui[k+1] = ui[k] + nzk;
514   }
515 
516   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
517   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
518 
519   /* destroy list of free space and other temporary array(s) */
520   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
521   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
522   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
523 
524   /* put together the new matrix in MATSEQSBAIJ format */
525   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
526 
527   b               = (Mat_SeqSBAIJ*)fact->data;
528   b->singlemalloc = PETSC_FALSE;
529   b->free_a       = PETSC_TRUE;
530   b->free_ij      = PETSC_TRUE;
531 
532   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
533 
534   b->j    = uj;
535   b->i    = ui;
536   b->diag = 0;
537   b->ilen = 0;
538   b->imax = 0;
539   b->row  = perm;
540 
541   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
542 
543   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
544   b->icol  = perm;
545   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
546   ierr     = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
547   ierr     = PetscLogObjectMemory(fact,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
548   b->maxnz = b->nz = ui[mbs];
549 
550   fact->info.factor_mallocs   = reallocs;
551   fact->info.fill_ratio_given = fill;
552   if (ai[mbs] != 0) {
553     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
554   } else {
555     fact->info.fill_ratio_needed = 0.0;
556   }
557 #if defined(PETSC_USE_INFO)
558   if (ai[mbs] != 0) {
559     PetscReal af = fact->info.fill_ratio_needed;
560     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
561     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
562     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
563   } else {
564     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
565   }
566 #endif
567   ierr = MatSeqSBAIJSetNumericFactorization_inplace(fact,perm_identity);CHKERRQ(ierr);
568   PetscFunctionReturn(0);
569 }
570 
571 #undef __FUNCT__
572 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
573 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
574 {
575   Mat_SeqSBAIJ   *a   = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
576   IS             perm = b->row;
577   PetscErrorCode ierr;
578   const PetscInt *ai,*aj,*perm_ptr,mbs=a->mbs,*bi=b->i,*bj=b->j;
579   PetscInt       i,j;
580   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
581   PetscInt       bs  =A->rmap->bs,bs2 = a->bs2,bslog = 0;
582   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
583   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
584   MatScalar      *work;
585   PetscInt       *pivots;
586 
587   PetscFunctionBegin;
588   /* initialization */
589   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
590   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
591   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
592   for (i=0; i<mbs; i++) {
593     jl[i] = mbs; il[0] = 0;
594   }
595   ierr = PetscMalloc3(bs2,MatScalar,&dk,bs2,MatScalar,&uik,bs,MatScalar,&work);CHKERRQ(ierr);
596   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
597 
598   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
599 
600   /* check permutation */
601   if (!a->permute) {
602     ai = a->i; aj = a->j; aa = a->a;
603   } else {
604     ai   = a->inew; aj = a->jnew;
605     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
606     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
607     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
608     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
609 
610     /* flops in while loop */
611     bslog = 2*bs*bs2;
612 
613     for (i=0; i<mbs; i++) {
614       jmin = ai[i]; jmax = ai[i+1];
615       for (j=jmin; j<jmax; j++) {
616         while (a2anew[j] != j) {
617           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
618           for (k1=0; k1<bs2; k1++) {
619             dk[k1]       = aa[k*bs2+k1];
620             aa[k*bs2+k1] = aa[j*bs2+k1];
621             aa[j*bs2+k1] = dk[k1];
622           }
623         }
624         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
625         if (i > aj[j]) {
626           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
627           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
628           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
629           for (k=0; k<bs; k++) {               /* j-th block of aa <- dk^T */
630             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
631           }
632         }
633       }
634     }
635     ierr = PetscFree(a2anew);CHKERRQ(ierr);
636   }
637 
638   /* for each row k */
639   for (k = 0; k<mbs; k++) {
640 
641     /*initialize k-th row with elements nonzero in row perm(k) of A */
642     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
643 
644     ap = aa + jmin*bs2;
645     for (j = jmin; j < jmax; j++) {
646       vj       = perm_ptr[aj[j]];   /* block col. index */
647       rtmp_ptr = rtmp + vj*bs2;
648       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
649     }
650 
651     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
652     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
653     i    = jl[k]; /* first row to be added to k_th row  */
654 
655     while (i < k) {
656       nexti = jl[i]; /* next row to be added to k_th row */
657 
658       /* compute multiplier */
659       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
660 
661       /* uik = -inv(Di)*U_bar(i,k) */
662       diag = ba + i*bs2;
663       u    = ba + ili*bs2;
664       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
665       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
666 
667       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
668       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
669       ierr = PetscLogFlops(bslog*2.0);CHKERRQ(ierr);
670 
671       /* update -U(i,k) */
672       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
673 
674       /* add multiple of row i to k-th row ... */
675       jmin = ili + 1; jmax = bi[i+1];
676       if (jmin < jmax) {
677         for (j=jmin; j<jmax; j++) {
678           /* rtmp += -U(i,k)^T * U_bar(i,j) */
679           rtmp_ptr = rtmp + bj[j]*bs2;
680           u        = ba + j*bs2;
681           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
682         }
683         ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr);
684 
685         /* ... add i to row list for next nonzero entry */
686         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
687         j     = bj[jmin];
688         jl[i] = jl[j]; jl[j] = i; /* update jl */
689       }
690       i = nexti;
691     }
692 
693     /* save nonzero entries in k-th row of U ... */
694 
695     /* invert diagonal block */
696     diag = ba+k*bs2;
697     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
698     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
699 
700     jmin = bi[k]; jmax = bi[k+1];
701     if (jmin < jmax) {
702       for (j=jmin; j<jmax; j++) {
703         vj       = bj[j];      /* block col. index of U */
704         u        = ba + j*bs2;
705         rtmp_ptr = rtmp + vj*bs2;
706         for (k1=0; k1<bs2; k1++) {
707           *u++        = *rtmp_ptr;
708           *rtmp_ptr++ = 0.0;
709         }
710       }
711 
712       /* ... add k to row list for first nonzero entry in k-th row */
713       il[k] = jmin;
714       i     = bj[jmin];
715       jl[k] = jl[i]; jl[i] = k;
716     }
717   }
718 
719   ierr = PetscFree(rtmp);CHKERRQ(ierr);
720   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
721   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
722   ierr = PetscFree(pivots);CHKERRQ(ierr);
723   if (a->permute) {
724     ierr = PetscFree(aa);CHKERRQ(ierr);
725   }
726 
727   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
728 
729   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
730   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
731   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
732   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;
733 
734   C->assembled    = PETSC_TRUE;
735   C->preallocated = PETSC_TRUE;
736 
737   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
738   PetscFunctionReturn(0);
739 }
740 
741 #undef __FUNCT__
742 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
743 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
744 {
745   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
746   PetscErrorCode ierr;
747   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
748   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
749   PetscInt       bs  =A->rmap->bs,bs2 = a->bs2,bslog;
750   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
751   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
752   MatScalar      *work;
753   PetscInt       *pivots;
754 
755   PetscFunctionBegin;
756   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
757   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
758   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
759   for (i=0; i<mbs; i++) {
760     jl[i] = mbs; il[0] = 0;
761   }
762   ierr = PetscMalloc3(bs2,MatScalar,&dk,bs2,MatScalar,&uik,bs,MatScalar,&work);CHKERRQ(ierr);
763   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
764 
765   ai = a->i; aj = a->j; aa = a->a;
766 
767   /* flops in while loop */
768   bslog = 2*bs*bs2;
769 
770   /* for each row k */
771   for (k = 0; k<mbs; k++) {
772 
773     /*initialize k-th row with elements nonzero in row k of A */
774     jmin = ai[k]; jmax = ai[k+1];
775     ap   = aa + jmin*bs2;
776     for (j = jmin; j < jmax; j++) {
777       vj       = aj[j];   /* block col. index */
778       rtmp_ptr = rtmp + vj*bs2;
779       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
780     }
781 
782     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
783     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
784     i    = jl[k]; /* first row to be added to k_th row  */
785 
786     while (i < k) {
787       nexti = jl[i]; /* next row to be added to k_th row */
788 
789       /* compute multiplier */
790       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
791 
792       /* uik = -inv(Di)*U_bar(i,k) */
793       diag = ba + i*bs2;
794       u    = ba + ili*bs2;
795       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
796       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
797 
798       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
799       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
800       ierr = PetscLogFlops(bslog*2.0);CHKERRQ(ierr);
801 
802       /* update -U(i,k) */
803       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
804 
805       /* add multiple of row i to k-th row ... */
806       jmin = ili + 1; jmax = bi[i+1];
807       if (jmin < jmax) {
808         for (j=jmin; j<jmax; j++) {
809           /* rtmp += -U(i,k)^T * U_bar(i,j) */
810           rtmp_ptr = rtmp + bj[j]*bs2;
811           u        = ba + j*bs2;
812           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
813         }
814         ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr);
815 
816         /* ... add i to row list for next nonzero entry */
817         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
818         j     = bj[jmin];
819         jl[i] = jl[j]; jl[j] = i; /* update jl */
820       }
821       i = nexti;
822     }
823 
824     /* save nonzero entries in k-th row of U ... */
825 
826     /* invert diagonal block */
827     diag = ba+k*bs2;
828     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
829     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
830 
831     jmin = bi[k]; jmax = bi[k+1];
832     if (jmin < jmax) {
833       for (j=jmin; j<jmax; j++) {
834         vj       = bj[j];      /* block col. index of U */
835         u        = ba + j*bs2;
836         rtmp_ptr = rtmp + vj*bs2;
837         for (k1=0; k1<bs2; k1++) {
838           *u++        = *rtmp_ptr;
839           *rtmp_ptr++ = 0.0;
840         }
841       }
842 
843       /* ... add k to row list for first nonzero entry in k-th row */
844       il[k] = jmin;
845       i     = bj[jmin];
846       jl[k] = jl[i]; jl[i] = k;
847     }
848   }
849 
850   ierr = PetscFree(rtmp);CHKERRQ(ierr);
851   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
852   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
853   ierr = PetscFree(pivots);CHKERRQ(ierr);
854 
855   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
856   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
857   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
858   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
859   C->assembled           = PETSC_TRUE;
860   C->preallocated        = PETSC_TRUE;
861 
862   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
863   PetscFunctionReturn(0);
864 }
865 
866 /*
867     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
868     Version for blocks 2 by 2.
869 */
870 #undef __FUNCT__
871 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
872 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo *info)
873 {
874   Mat_SeqSBAIJ   *a   = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
875   IS             perm = b->row;
876   PetscErrorCode ierr;
877   const PetscInt *ai,*aj,*perm_ptr;
878   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
879   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
880   MatScalar      *ba = b->a,*aa,*ap;
881   MatScalar      *u,*diag,*rtmp,*rtmp_ptr,dk[4],uik[4];
882   PetscReal      shift = info->shiftamount;
883 
884   PetscFunctionBegin;
885   /* initialization */
886   /* il and jl record the first nonzero element in each row of the accessing
887      window U(0:k, k:mbs-1).
888      jl:    list of rows to be added to uneliminated rows
889             i>= k: jl(i) is the first row to be added to row i
890             i<  k: jl(i) is the row following row i in some list of rows
891             jl(i) = mbs indicates the end of a list
892      il(i): points to the first nonzero element in columns k,...,mbs-1 of
893             row i of U */
894   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
895   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
896   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
897   for (i=0; i<mbs; i++) {
898     jl[i] = mbs; il[0] = 0;
899   }
900   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
901 
902   /* check permutation */
903   if (!a->permute) {
904     ai = a->i; aj = a->j; aa = a->a;
905   } else {
906     ai   = a->inew; aj = a->jnew;
907     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
908     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
909     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
910     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
911 
912     for (i=0; i<mbs; i++) {
913       jmin = ai[i]; jmax = ai[i+1];
914       for (j=jmin; j<jmax; j++) {
915         while (a2anew[j] != j) {
916           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
917           for (k1=0; k1<4; k1++) {
918             dk[k1]     = aa[k*4+k1];
919             aa[k*4+k1] = aa[j*4+k1];
920             aa[j*4+k1] = dk[k1];
921           }
922         }
923         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
924         if (i > aj[j]) {
925           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
926           ap    = aa + j*4;  /* ptr to the beginning of the block */
927           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
928           ap[1] = ap[2];
929           ap[2] = dk[1];
930         }
931       }
932     }
933     ierr = PetscFree(a2anew);CHKERRQ(ierr);
934   }
935 
936   /* for each row k */
937   for (k = 0; k<mbs; k++) {
938 
939     /*initialize k-th row with elements nonzero in row perm(k) of A */
940     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
941     ap   = aa + jmin*4;
942     for (j = jmin; j < jmax; j++) {
943       vj       = perm_ptr[aj[j]];   /* block col. index */
944       rtmp_ptr = rtmp + vj*4;
945       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
946     }
947 
948     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
949     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
950     i    = jl[k]; /* first row to be added to k_th row  */
951 
952     while (i < k) {
953       nexti = jl[i]; /* next row to be added to k_th row */
954 
955       /* compute multiplier */
956       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
957 
958       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
959       diag   = ba + i*4;
960       u      = ba + ili*4;
961       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
962       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
963       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
964       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
965 
966       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
967       dk[0] += uik[0]*u[0] + uik[1]*u[1];
968       dk[1] += uik[2]*u[0] + uik[3]*u[1];
969       dk[2] += uik[0]*u[2] + uik[1]*u[3];
970       dk[3] += uik[2]*u[2] + uik[3]*u[3];
971 
972       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
973 
974       /* update -U(i,k): ba[ili] = uik */
975       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
976 
977       /* add multiple of row i to k-th row ... */
978       jmin = ili + 1; jmax = bi[i+1];
979       if (jmin < jmax) {
980         for (j=jmin; j<jmax; j++) {
981           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
982           rtmp_ptr     = rtmp + bj[j]*4;
983           u            = ba + j*4;
984           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
985           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
986           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
987           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
988         }
989         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
990 
991         /* ... add i to row list for next nonzero entry */
992         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
993         j     = bj[jmin];
994         jl[i] = jl[j]; jl[j] = i; /* update jl */
995       }
996       i = nexti;
997     }
998 
999     /* save nonzero entries in k-th row of U ... */
1000 
1001     /* invert diagonal block */
1002     diag = ba+k*4;
1003     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1004     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift);CHKERRQ(ierr);
1005 
1006     jmin = bi[k]; jmax = bi[k+1];
1007     if (jmin < jmax) {
1008       for (j=jmin; j<jmax; j++) {
1009         vj       = bj[j];      /* block col. index of U */
1010         u        = ba + j*4;
1011         rtmp_ptr = rtmp + vj*4;
1012         for (k1=0; k1<4; k1++) {
1013           *u++        = *rtmp_ptr;
1014           *rtmp_ptr++ = 0.0;
1015         }
1016       }
1017 
1018       /* ... add k to row list for first nonzero entry in k-th row */
1019       il[k] = jmin;
1020       i     = bj[jmin];
1021       jl[k] = jl[i]; jl[i] = k;
1022     }
1023   }
1024 
1025   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1026   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1027   if (a->permute) {
1028     ierr = PetscFree(aa);CHKERRQ(ierr);
1029   }
1030   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1031 
1032   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1033   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1034   C->assembled           = PETSC_TRUE;
1035   C->preallocated        = PETSC_TRUE;
1036 
1037   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 /*
1042       Version for when blocks are 2 by 2 Using natural ordering
1043 */
1044 #undef __FUNCT__
1045 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
1046 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
1047 {
1048   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
1049   PetscErrorCode ierr;
1050   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1051   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1052   MatScalar      *ba = b->a,*aa,*ap,dk[8],uik[8];
1053   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
1054   PetscReal      shift = info->shiftamount;
1055 
1056   PetscFunctionBegin;
1057   /* initialization */
1058   /* il and jl record the first nonzero element in each row of the accessing
1059      window U(0:k, k:mbs-1).
1060      jl:    list of rows to be added to uneliminated rows
1061             i>= k: jl(i) is the first row to be added to row i
1062             i<  k: jl(i) is the row following row i in some list of rows
1063             jl(i) = mbs indicates the end of a list
1064      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1065             row i of U */
1066   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1067   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
1068   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1069   for (i=0; i<mbs; i++) {
1070     jl[i] = mbs; il[0] = 0;
1071   }
1072   ai = a->i; aj = a->j; aa = a->a;
1073 
1074   /* for each row k */
1075   for (k = 0; k<mbs; k++) {
1076 
1077     /*initialize k-th row with elements nonzero in row k of A */
1078     jmin = ai[k]; jmax = ai[k+1];
1079     ap   = aa + jmin*4;
1080     for (j = jmin; j < jmax; j++) {
1081       vj       = aj[j];   /* block col. index */
1082       rtmp_ptr = rtmp + vj*4;
1083       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1084     }
1085 
1086     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1087     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
1088     i    = jl[k]; /* first row to be added to k_th row  */
1089 
1090     while (i < k) {
1091       nexti = jl[i]; /* next row to be added to k_th row */
1092 
1093       /* compute multiplier */
1094       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1095 
1096       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1097       diag   = ba + i*4;
1098       u      = ba + ili*4;
1099       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1100       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1101       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1102       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1103 
1104       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1105       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1106       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1107       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1108       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1109 
1110       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
1111 
1112       /* update -U(i,k): ba[ili] = uik */
1113       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1114 
1115       /* add multiple of row i to k-th row ... */
1116       jmin = ili + 1; jmax = bi[i+1];
1117       if (jmin < jmax) {
1118         for (j=jmin; j<jmax; j++) {
1119           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1120           rtmp_ptr     = rtmp + bj[j]*4;
1121           u            = ba + j*4;
1122           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1123           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1124           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1125           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1126         }
1127         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
1128 
1129         /* ... add i to row list for next nonzero entry */
1130         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1131         j     = bj[jmin];
1132         jl[i] = jl[j]; jl[j] = i; /* update jl */
1133       }
1134       i = nexti;
1135     }
1136 
1137     /* save nonzero entries in k-th row of U ... */
1138 
1139     /* invert diagonal block */
1140     diag = ba+k*4;
1141     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1142     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift);CHKERRQ(ierr);
1143 
1144     jmin = bi[k]; jmax = bi[k+1];
1145     if (jmin < jmax) {
1146       for (j=jmin; j<jmax; j++) {
1147         vj       = bj[j];      /* block col. index of U */
1148         u        = ba + j*4;
1149         rtmp_ptr = rtmp + vj*4;
1150         for (k1=0; k1<4; k1++) {
1151           *u++        = *rtmp_ptr;
1152           *rtmp_ptr++ = 0.0;
1153         }
1154       }
1155 
1156       /* ... add k to row list for first nonzero entry in k-th row */
1157       il[k] = jmin;
1158       i     = bj[jmin];
1159       jl[k] = jl[i]; jl[i] = k;
1160     }
1161   }
1162 
1163   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1164   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1165 
1166   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1167   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1168   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1169   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1170   C->assembled           = PETSC_TRUE;
1171   C->preallocated        = PETSC_TRUE;
1172 
1173   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 /*
1178     Numeric U^T*D*U factorization for SBAIJ format.
1179     Version for blocks are 1 by 1.
1180 */
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace"
1183 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1186   IS             ip=b->row;
1187   PetscErrorCode ierr;
1188   const PetscInt *ai,*aj,*rip;
1189   PetscInt       *a2anew,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1190   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1191   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1192   PetscReal      rs;
1193   FactorShiftCtx sctx;
1194 
1195   PetscFunctionBegin;
1196   /* MatPivotSetUp(): initialize shift context sctx */
1197   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1198 
1199   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1200   if (!a->permute) {
1201     ai = a->i; aj = a->j; aa = a->a;
1202   } else {
1203     ai     = a->inew; aj = a->jnew;
1204     nz     = ai[mbs];
1205     ierr   = PetscMalloc(nz*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1206     a2anew = a->a2anew;
1207     bval   = a->a;
1208     for (j=0; j<nz; j++) {
1209       aa[a2anew[j]] = *(bval++);
1210     }
1211   }
1212 
1213   /* initialization */
1214   /* il and jl record the first nonzero element in each row of the accessing
1215      window U(0:k, k:mbs-1).
1216      jl:    list of rows to be added to uneliminated rows
1217             i>= k: jl(i) is the first row to be added to row i
1218             i<  k: jl(i) is the row following row i in some list of rows
1219             jl(i) = mbs indicates the end of a list
1220      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1221             row i of U */
1222   ierr = PetscMalloc3(mbs,MatScalar,&rtmp,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1223 
1224   do {
1225     sctx.newshift = PETSC_FALSE;
1226     for (i=0; i<mbs; i++) {
1227       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1228     }
1229 
1230     for (k = 0; k<mbs; k++) {
1231       /*initialize k-th row by the perm[k]-th row of A */
1232       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1233       bval = ba + bi[k];
1234       for (j = jmin; j < jmax; j++) {
1235         col       = rip[aj[j]];
1236         rtmp[col] = aa[j];
1237         *bval++   = 0.0; /* for in-place factorization */
1238       }
1239 
1240       /* shift the diagonal of the matrix */
1241       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1242 
1243       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1244       dk = rtmp[k];
1245       i  = jl[k]; /* first row to be added to k_th row  */
1246 
1247       while (i < k) {
1248         nexti = jl[i]; /* next row to be added to k_th row */
1249 
1250         /* compute multiplier, update diag(k) and U(i,k) */
1251         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1252         uikdi   = -ba[ili]*ba[bi[i]]; /* diagonal(k) */
1253         dk     += uikdi*ba[ili];
1254         ba[ili] = uikdi; /* -U(i,k) */
1255 
1256         /* add multiple of row i to k-th row */
1257         jmin = ili + 1; jmax = bi[i+1];
1258         if (jmin < jmax) {
1259           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1260           ierr = PetscLogFlops(2.0*(jmax-jmin));CHKERRQ(ierr);
1261 
1262           /* update il and jl for row i */
1263           il[i] = jmin;
1264           j     = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1265         }
1266         i = nexti;
1267       }
1268 
1269       /* shift the diagonals when zero pivot is detected */
1270       /* compute rs=sum of abs(off-diagonal) */
1271       rs   = 0.0;
1272       jmin = bi[k]+1;
1273       nz   = bi[k+1] - jmin;
1274       if (nz) {
1275         bcol = bj + jmin;
1276         while (nz--) {
1277           rs += PetscAbsScalar(rtmp[*bcol]);
1278           bcol++;
1279         }
1280       }
1281 
1282       sctx.rs = rs;
1283       sctx.pv = dk;
1284       ierr    = MatPivotCheck(A,info,&sctx,k);CHKERRQ(ierr);
1285       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1286       dk = sctx.pv;
1287 
1288       /* copy data into U(k,:) */
1289       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1290       jmin      = bi[k]+1; jmax = bi[k+1];
1291       if (jmin < jmax) {
1292         for (j=jmin; j<jmax; j++) {
1293           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1294         }
1295         /* add the k-th row into il and jl */
1296         il[k] = jmin;
1297         i     = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1298       }
1299     }
1300   } while (sctx.newshift);
1301   ierr = PetscFree3(rtmp,il,jl);CHKERRQ(ierr);
1302   if (a->permute) {ierr = PetscFree(aa);CHKERRQ(ierr);}
1303 
1304   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1305 
1306   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1307   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1308   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1309   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1310   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1311   C->assembled           = PETSC_TRUE;
1312   C->preallocated        = PETSC_TRUE;
1313 
1314   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1315   if (sctx.nshift) {
1316     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1317       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1318     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1319       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1320     }
1321   }
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 /*
1326   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1327   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1328 */
1329 #undef __FUNCT__
1330 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1331 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
1332 {
1333   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1334   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)B->data;
1335   PetscErrorCode ierr;
1336   PetscInt       i,j,mbs=A->rmap->n,*bi=b->i,*bj=b->j,*bdiag=b->diag,*bjtmp;
1337   PetscInt       *ai=a->i,*aj=a->j,*ajtmp;
1338   PetscInt       k,jmin,jmax,*c2r,*il,col,nexti,ili,nz;
1339   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
1340   FactorShiftCtx sctx;
1341   PetscReal      rs;
1342   MatScalar      d,*v;
1343 
1344   PetscFunctionBegin;
1345   ierr = PetscMalloc3(mbs,MatScalar,&rtmp,mbs,PetscInt,&il,mbs,PetscInt,&c2r);CHKERRQ(ierr);
1346 
1347   /* MatPivotSetUp(): initialize shift context sctx */
1348   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1349 
1350   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1351     sctx.shift_top = info->zeropivot;
1352 
1353     ierr = PetscMemzero(rtmp,mbs*sizeof(MatScalar));CHKERRQ(ierr);
1354 
1355     for (i=0; i<mbs; i++) {
1356       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1357       d        = (aa)[a->diag[i]];
1358       rtmp[i] += -PetscRealPart(d);  /* diagonal entry */
1359       ajtmp    = aj + ai[i] + 1;     /* exclude diagonal */
1360       v        = aa + ai[i] + 1;
1361       nz       = ai[i+1] - ai[i] - 1;
1362       for (j=0; j<nz; j++) {
1363         rtmp[i]        += PetscAbsScalar(v[j]);
1364         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1365       }
1366       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1367     }
1368     sctx.shift_top *= 1.1;
1369     sctx.nshift_max = 5;
1370     sctx.shift_lo   = 0.;
1371     sctx.shift_hi   = 1.;
1372   }
1373 
1374   /* allocate working arrays
1375      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1376      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1377   */
1378   do {
1379     sctx.newshift = PETSC_FALSE;
1380 
1381     for (i=0; i<mbs; i++) c2r[i] = mbs;
1382     if (mbs) il[0] = 0;
1383 
1384     for (k = 0; k<mbs; k++) {
1385       /* zero rtmp */
1386       nz    = bi[k+1] - bi[k];
1387       bjtmp = bj + bi[k];
1388       for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1389 
1390       /* load in initial unfactored row */
1391       bval = ba + bi[k];
1392       jmin = ai[k]; jmax = ai[k+1];
1393       for (j = jmin; j < jmax; j++) {
1394         col       = aj[j];
1395         rtmp[col] = aa[j];
1396         *bval++   = 0.0; /* for in-place factorization */
1397       }
1398       /* shift the diagonal of the matrix: ZeropivotApply() */
1399       rtmp[k] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1400 
1401       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1402       dk = rtmp[k];
1403       i  = c2r[k]; /* first row to be added to k_th row  */
1404 
1405       while (i < k) {
1406         nexti = c2r[i]; /* next row to be added to k_th row */
1407 
1408         /* compute multiplier, update diag(k) and U(i,k) */
1409         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1410         uikdi   = -ba[ili]*ba[bdiag[i]]; /* diagonal(k) */
1411         dk     += uikdi*ba[ili]; /* update diag[k] */
1412         ba[ili] = uikdi; /* -U(i,k) */
1413 
1414         /* add multiple of row i to k-th row */
1415         jmin = ili + 1; jmax = bi[i+1];
1416         if (jmin < jmax) {
1417           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1418           /* update il and c2r for row i */
1419           il[i] = jmin;
1420           j     = bj[jmin]; c2r[i] = c2r[j]; c2r[j] = i;
1421         }
1422         i = nexti;
1423       }
1424 
1425       /* copy data into U(k,:) */
1426       rs   = 0.0;
1427       jmin = bi[k]; jmax = bi[k+1]-1;
1428       if (jmin < jmax) {
1429         for (j=jmin; j<jmax; j++) {
1430           col = bj[j]; ba[j] = rtmp[col]; rs += PetscAbsScalar(ba[j]);
1431         }
1432         /* add the k-th row into il and c2r */
1433         il[k] = jmin;
1434         i     = bj[jmin]; c2r[k] = c2r[i]; c2r[i] = k;
1435       }
1436 
1437       sctx.rs = rs;
1438       sctx.pv = dk;
1439       ierr    = MatPivotCheck(A,info,&sctx,k);CHKERRQ(ierr);
1440       if (sctx.newshift) break;
1441       dk = sctx.pv;
1442 
1443       ba[bdiag[k]] = 1.0/dk; /* U(k,k) */
1444     }
1445   } while (sctx.newshift);
1446 
1447   ierr = PetscFree3(rtmp,il,c2r);CHKERRQ(ierr);
1448 
1449   B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1450   B->ops->solves         = MatSolves_SeqSBAIJ_1;
1451   B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1452   B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1453   B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1454 
1455   B->assembled    = PETSC_TRUE;
1456   B->preallocated = PETSC_TRUE;
1457 
1458   ierr = PetscLogFlops(B->rmap->n);CHKERRQ(ierr);
1459 
1460   /* MatPivotView() */
1461   if (sctx.nshift) {
1462     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1463       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1464     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1465       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1466     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1467       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1468     }
1469   }
1470   PetscFunctionReturn(0);
1471 }
1472 
1473 #undef __FUNCT__
1474 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace"
1475 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
1476 {
1477   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1478   PetscErrorCode ierr;
1479   PetscInt       i,j,mbs = a->mbs;
1480   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1481   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1482   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1483   PetscReal      rs;
1484   FactorShiftCtx sctx;
1485 
1486   PetscFunctionBegin;
1487   /* MatPivotSetUp(): initialize shift context sctx */
1488   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1489 
1490   /* initialization */
1491   /* il and jl record the first nonzero element in each row of the accessing
1492      window U(0:k, k:mbs-1).
1493      jl:    list of rows to be added to uneliminated rows
1494             i>= k: jl(i) is the first row to be added to row i
1495             i<  k: jl(i) is the row following row i in some list of rows
1496             jl(i) = mbs indicates the end of a list
1497      il(i): points to the first nonzero element in U(i,k:mbs-1)
1498   */
1499   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1500   ierr = PetscMalloc2(mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1501 
1502   do {
1503     sctx.newshift = PETSC_FALSE;
1504     for (i=0; i<mbs; i++) {
1505       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1506     }
1507 
1508     for (k = 0; k<mbs; k++) {
1509       /*initialize k-th row with elements nonzero in row perm(k) of A */
1510       nz   = ai[k+1] - ai[k];
1511       acol = aj + ai[k];
1512       aval = aa + ai[k];
1513       bval = ba + bi[k];
1514       while (nz--) {
1515         rtmp[*acol++] = *aval++;
1516         *bval++       = 0.0; /* for in-place factorization */
1517       }
1518 
1519       /* shift the diagonal of the matrix */
1520       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1521 
1522       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1523       dk = rtmp[k];
1524       i  = jl[k]; /* first row to be added to k_th row  */
1525 
1526       while (i < k) {
1527         nexti = jl[i]; /* next row to be added to k_th row */
1528         /* compute multiplier, update D(k) and U(i,k) */
1529         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1530         uikdi   = -ba[ili]*ba[bi[i]];
1531         dk     += uikdi*ba[ili];
1532         ba[ili] = uikdi; /* -U(i,k) */
1533 
1534         /* add multiple of row i to k-th row ... */
1535         jmin = ili + 1;
1536         nz   = bi[i+1] - jmin;
1537         if (nz > 0) {
1538           bcol = bj + jmin;
1539           bval = ba + jmin;
1540           ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1541           while (nz--) rtmp[*bcol++] += uikdi*(*bval++);
1542 
1543           /* update il and jl for i-th row */
1544           il[i] = jmin;
1545           j     = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1546         }
1547         i = nexti;
1548       }
1549 
1550       /* shift the diagonals when zero pivot is detected */
1551       /* compute rs=sum of abs(off-diagonal) */
1552       rs   = 0.0;
1553       jmin = bi[k]+1;
1554       nz   = bi[k+1] - jmin;
1555       if (nz) {
1556         bcol = bj + jmin;
1557         while (nz--) {
1558           rs += PetscAbsScalar(rtmp[*bcol]);
1559           bcol++;
1560         }
1561       }
1562 
1563       sctx.rs = rs;
1564       sctx.pv = dk;
1565       ierr    = MatPivotCheck(A,info,&sctx,k);CHKERRQ(ierr);
1566       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1567       dk = sctx.pv;
1568 
1569       /* copy data into U(k,:) */
1570       ba[bi[k]] = 1.0/dk;
1571       jmin      = bi[k]+1;
1572       nz        = bi[k+1] - jmin;
1573       if (nz) {
1574         bcol = bj + jmin;
1575         bval = ba + jmin;
1576         while (nz--) {
1577           *bval++       = rtmp[*bcol];
1578           rtmp[*bcol++] = 0.0;
1579         }
1580         /* add k-th row into il and jl */
1581         il[k] = jmin;
1582         i     = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1583       }
1584     } /* end of for (k = 0; k<mbs; k++) */
1585   } while (sctx.newshift);
1586   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1587   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1588 
1589   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1590   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1591   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1592   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1593   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1594 
1595   C->assembled    = PETSC_TRUE;
1596   C->preallocated = PETSC_TRUE;
1597 
1598   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1599   if (sctx.nshift) {
1600     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1601       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1602     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1603       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1604     }
1605   }
1606   PetscFunctionReturn(0);
1607 }
1608 
1609 #undef __FUNCT__
1610 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1611 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo *info)
1612 {
1613   PetscErrorCode ierr;
1614   Mat            C;
1615 
1616   PetscFunctionBegin;
1617   ierr = MatGetFactor(A,"petsc",MAT_FACTOR_CHOLESKY,&C);CHKERRQ(ierr);
1618   ierr = MatCholeskyFactorSymbolic(C,A,perm,info);CHKERRQ(ierr);
1619   ierr = MatCholeskyFactorNumeric(C,A,info);CHKERRQ(ierr);
1620 
1621   A->ops->solve          = C->ops->solve;
1622   A->ops->solvetranspose = C->ops->solvetranspose;
1623 
1624   ierr = MatHeaderMerge(A,C);CHKERRQ(ierr);
1625   PetscFunctionReturn(0);
1626 }
1627 
1628 
1629