xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision c688d0420b4e513ff34944d1e1ad7d4e50aafa8d)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/sbaij/seq/sbaij.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscis.h>
6 
7 #undef __FUNCT__
8 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
9 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneg,PetscInt *nzero,PetscInt *npos)
10 {
11   Mat_SeqSBAIJ *fact=(Mat_SeqSBAIJ*)F->data;
12   MatScalar    *dd=fact->a;
13   PetscInt     mbs=fact->mbs,bs=F->rmap->bs,i,nneg_tmp,npos_tmp,*fi=fact->diag;
14 
15   PetscFunctionBegin;
16   if (bs != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
17   if (F->errortype==MAT_FACTOR_NUMERIC_ZEROPIVOT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactor fails with numeric zeropivot");
18 
19   nneg_tmp = 0; npos_tmp = 0;
20   for (i=0; i<mbs; i++) {
21     if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
22     else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
23     fi++;
24   }
25   if (nneg)  *nneg  = nneg_tmp;
26   if (npos)  *npos  = npos_tmp;
27   if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
28   PetscFunctionReturn(0);
29 }
30 
31 /*
32   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
33   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
34 */
35 #undef __FUNCT__
36 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
37 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo *info)
38 {
39   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
40   PetscErrorCode ierr;
41   const PetscInt *rip,*ai,*aj;
42   PetscInt       i,mbs = a->mbs,*jutmp,bs = A->rmap->bs,bs2=a->bs2;
43   PetscInt       m,reallocs = 0,prow;
44   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
45   PetscReal      f = info->fill;
46   PetscBool      perm_identity;
47 
48   PetscFunctionBegin;
49   /* check whether perm is the identity mapping */
50   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
51   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
52 
53   if (perm_identity) { /* without permutation */
54     a->permute = PETSC_FALSE;
55 
56     ai = a->i; aj = a->j;
57   } else {            /* non-trivial permutation */
58     a->permute = PETSC_TRUE;
59 
60     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
61 
62     ai = a->inew; aj = a->jnew;
63   }
64 
65   /* initialization */
66   ierr  = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr);
67   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
68   ierr  = PetscMalloc1(umax,&ju);CHKERRQ(ierr);
69   iu[0] = mbs+1;
70   juidx = mbs + 1; /* index for ju */
71   /* jl linked list for pivot row -- linked list for col index */
72   ierr = PetscMalloc2(mbs,&jl,mbs,&q);CHKERRQ(ierr);
73   for (i=0; i<mbs; i++) {
74     jl[i] = mbs;
75     q[i]  = 0;
76   }
77 
78   /* for each row k */
79   for (k=0; k<mbs; k++) {
80     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
81     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
82     q[k] = mbs;
83     /* initialize nonzero structure of k-th row to row rip[k] of A */
84     jmin = ai[rip[k]] +1; /* exclude diag[k] */
85     jmax = ai[rip[k]+1];
86     for (j=jmin; j<jmax; j++) {
87       vj = rip[aj[j]]; /* col. value */
88       if (vj > k) {
89         qm = k;
90         do {
91           m = qm; qm = q[m];
92         } while (qm < vj);
93         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
94         nzk++;
95         q[m]  = vj;
96         q[vj] = qm;
97       } /* if (vj > k) */
98     } /* for (j=jmin; j<jmax; j++) */
99 
100     /* modify nonzero structure of k-th row by computing fill-in
101        for each row i to be merged in */
102     prow = k;
103     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
104 
105     while (prow < k) {
106       /* merge row prow into k-th row */
107       jmin = iu[prow] + 1; jmax = iu[prow+1];
108       qm   = k;
109       for (j=jmin; j<jmax; j++) {
110         vj = ju[j];
111         do {
112           m = qm; qm = q[m];
113         } while (qm < vj);
114         if (qm != vj) {
115           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
116         }
117       }
118       prow = jl[prow]; /* next pivot row */
119     }
120 
121     /* add k to row list for first nonzero element in k-th row */
122     if (nzk > 0) {
123       i     = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
124       jl[k] = jl[i]; jl[i] = k;
125     }
126     iu[k+1] = iu[k] + nzk;
127 
128     /* allocate more space to ju if needed */
129     if (iu[k+1] > umax) {
130       /* estimate how much additional space we will need */
131       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
132       /* just double the memory each time */
133       maxadd = umax;
134       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
135       umax += maxadd;
136 
137       /* allocate a longer ju */
138       ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
139       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
140       ierr = PetscFree(ju);CHKERRQ(ierr);
141       ju   = jutmp;
142       reallocs++; /* count how many times we realloc */
143     }
144 
145     /* save nonzero structure of k-th row in ju */
146     i=k;
147     while (nzk--) {
148       i           = q[i];
149       ju[juidx++] = i;
150     }
151   }
152 
153 #if defined(PETSC_USE_INFO)
154   if (ai[mbs] != 0) {
155     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
156     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr);
157     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
158     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr);
159     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
160   } else {
161     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
162   }
163 #endif
164 
165   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
166   ierr = PetscFree2(jl,q);CHKERRQ(ierr);
167 
168   /* put together the new matrix */
169   ierr = MatSeqSBAIJSetPreallocation(F,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
170 
171   /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */
172   b                = (Mat_SeqSBAIJ*)(F)->data;
173   b->singlemalloc  = PETSC_FALSE;
174   b->free_a        = PETSC_TRUE;
175   b->free_ij       = PETSC_TRUE;
176 
177   ierr    = PetscMalloc1((iu[mbs]+1)*bs2,&b->a);CHKERRQ(ierr);
178   b->j    = ju;
179   b->i    = iu;
180   b->diag = 0;
181   b->ilen = 0;
182   b->imax = 0;
183   b->row  = perm;
184 
185   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
186 
187   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
188 
189   b->icol = perm;
190   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
191   ierr    = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr);
192   /* In b structure:  Free imax, ilen, old a, old j.
193      Allocate idnew, solve_work, new a, new j */
194   ierr     = PetscLogObjectMemory((PetscObject)F,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
195   b->maxnz = b->nz = iu[mbs];
196 
197   (F)->info.factor_mallocs   = reallocs;
198   (F)->info.fill_ratio_given = f;
199   if (ai[mbs] != 0) {
200     (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
201   } else {
202     (F)->info.fill_ratio_needed = 0.0;
203   }
204   ierr = MatSeqSBAIJSetNumericFactorization_inplace(F,perm_identity);CHKERRQ(ierr);
205   PetscFunctionReturn(0);
206 }
207 /*
208     Symbolic U^T*D*U factorization for SBAIJ format.
209     See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
210 */
211 #include <petscbt.h>
212 #include <../src/mat/utils/freespace.h>
213 #undef __FUNCT__
214 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
215 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
216 {
217   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
218   Mat_SeqSBAIJ       *b;
219   PetscErrorCode     ierr;
220   PetscBool          perm_identity,missing;
221   PetscReal          fill = info->fill;
222   const PetscInt     *rip,*ai=a->i,*aj=a->j;
223   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow;
224   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
225   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr,*udiag;
226   PetscFreeSpaceList free_space=NULL,current_space=NULL;
227   PetscBT            lnkbt;
228 
229   PetscFunctionBegin;
230   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
231   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
232   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
233   if (bs > 1) {
234     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
235     PetscFunctionReturn(0);
236   }
237 
238   /* check whether perm is the identity mapping */
239   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
240   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
241   a->permute = PETSC_FALSE;
242   ierr       = ISGetIndices(perm,&rip);CHKERRQ(ierr);
243 
244   /* initialization */
245   ierr  = PetscMalloc1(mbs+1,&ui);CHKERRQ(ierr);
246   ierr  = PetscMalloc1(mbs+1,&udiag);CHKERRQ(ierr);
247   ui[0] = 0;
248 
249   /* jl: linked list for storing indices of the pivot rows
250      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
251   ierr = PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);CHKERRQ(ierr);
252   for (i=0; i<mbs; i++) {
253     jl[i] = mbs; il[i] = 0;
254   }
255 
256   /* create and initialize a linked list for storing column indices of the active row k */
257   nlnk = mbs + 1;
258   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
259 
260   /* initial FreeSpace size is fill*(ai[mbs]+1) */
261   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);CHKERRQ(ierr);
262   current_space = free_space;
263 
264   for (k=0; k<mbs; k++) {  /* for each active row k */
265     /* initialize lnk by the column indices of row rip[k] of A */
266     nzk   = 0;
267     ncols = ai[k+1] - ai[k];
268     if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
269     for (j=0; j<ncols; j++) {
270       i       = *(aj + ai[k] + j);
271       cols[j] = i;
272     }
273     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
274     nzk += nlnk;
275 
276     /* update lnk by computing fill-in for each pivot row to be merged in */
277     prow = jl[k]; /* 1st pivot row */
278 
279     while (prow < k) {
280       nextprow = jl[prow];
281       /* merge prow into k-th row */
282       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
283       jmax   = ui[prow+1];
284       ncols  = jmax-jmin;
285       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
286       ierr   = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
287       nzk   += nlnk;
288 
289       /* update il and jl for prow */
290       if (jmin < jmax) {
291         il[prow] = jmin;
292         j        = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
293       }
294       prow = nextprow;
295     }
296 
297     /* if free space is not available, make more free space */
298     if (current_space->local_remaining<nzk) {
299       i    = mbs - k + 1; /* num of unfactored rows */
300       i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
301       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
302       reallocs++;
303     }
304 
305     /* copy data into free space, then initialize lnk */
306     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
307 
308     /* add the k-th row into il and jl */
309     if (nzk > 1) {
310       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
311       jl[k] = jl[i]; jl[i] = k;
312       il[k] = ui[k] + 1;
313     }
314     ui_ptr[k] = current_space->array;
315 
316     current_space->array           += nzk;
317     current_space->local_used      += nzk;
318     current_space->local_remaining -= nzk;
319 
320     ui[k+1] = ui[k] + nzk;
321   }
322 
323   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
324   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
325 
326   /* destroy list of free space and other temporary array(s) */
327   ierr = PetscMalloc1(ui[mbs]+1,&uj);CHKERRQ(ierr);
328   ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,mbs,ui,udiag);CHKERRQ(ierr); /* store matrix factor */
329   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
330 
331   /* put together the new matrix in MATSEQSBAIJ format */
332   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
333 
334   b               = (Mat_SeqSBAIJ*)fact->data;
335   b->singlemalloc = PETSC_FALSE;
336   b->free_a       = PETSC_TRUE;
337   b->free_ij      = PETSC_TRUE;
338 
339   ierr = PetscMalloc1(ui[mbs]+1,&b->a);CHKERRQ(ierr);
340 
341   b->j         = uj;
342   b->i         = ui;
343   b->diag      = udiag;
344   b->free_diag = PETSC_TRUE;
345   b->ilen      = 0;
346   b->imax      = 0;
347   b->row       = perm;
348   b->icol      = perm;
349 
350   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
351   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
352 
353   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
354 
355   ierr = PetscMalloc1(mbs+1,&b->solve_work);CHKERRQ(ierr);
356   ierr = PetscLogObjectMemory((PetscObject)fact,ui[mbs]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
357 
358   b->maxnz = b->nz = ui[mbs];
359 
360   fact->info.factor_mallocs   = reallocs;
361   fact->info.fill_ratio_given = fill;
362   if (ai[mbs] != 0) {
363     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
364   } else {
365     fact->info.fill_ratio_needed = 0.0;
366   }
367 #if defined(PETSC_USE_INFO)
368   if (ai[mbs] != 0) {
369     PetscReal af = fact->info.fill_ratio_needed;
370     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
371     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
372     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
373   } else {
374     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
375   }
376 #endif
377   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
378   PetscFunctionReturn(0);
379 }
380 
381 #undef __FUNCT__
382 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_inplace"
383 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
384 {
385   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
386   Mat_SeqSBAIJ       *b;
387   PetscErrorCode     ierr;
388   PetscBool          perm_identity,missing;
389   PetscReal          fill = info->fill;
390   const PetscInt     *rip,*ai,*aj;
391   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
392   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
393   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
394   PetscFreeSpaceList free_space=NULL,current_space=NULL;
395   PetscBT            lnkbt;
396 
397   PetscFunctionBegin;
398   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
399   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
400 
401   /*
402    This code originally uses Modified Sparse Row (MSR) storage
403    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
404    Then it is rewritten so the factor B takes seqsbaij format. However the associated
405    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
406    thus the original code in MSR format is still used for these cases.
407    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
408    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
409   */
410   if (bs > 1) {
411     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
412     PetscFunctionReturn(0);
413   }
414 
415   /* check whether perm is the identity mapping */
416   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
417 
418   if (perm_identity) {
419     a->permute = PETSC_FALSE;
420 
421     ai = a->i; aj = a->j;
422   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
423   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
424 
425   /* initialization */
426   ierr  = PetscMalloc1(mbs+1,&ui);CHKERRQ(ierr);
427   ui[0] = 0;
428 
429   /* jl: linked list for storing indices of the pivot rows
430      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
431   ierr = PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);CHKERRQ(ierr);
432   for (i=0; i<mbs; i++) {
433     jl[i] = mbs; il[i] = 0;
434   }
435 
436   /* create and initialize a linked list for storing column indices of the active row k */
437   nlnk = mbs + 1;
438   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
439 
440   /* initial FreeSpace size is fill*(ai[mbs]+1) */
441   ierr          = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);CHKERRQ(ierr);
442   current_space = free_space;
443 
444   for (k=0; k<mbs; k++) {  /* for each active row k */
445     /* initialize lnk by the column indices of row rip[k] of A */
446     nzk   = 0;
447     ncols = ai[rip[k]+1] - ai[rip[k]];
448     for (j=0; j<ncols; j++) {
449       i       = *(aj + ai[rip[k]] + j);
450       cols[j] = rip[i];
451     }
452     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
453     nzk += nlnk;
454 
455     /* update lnk by computing fill-in for each pivot row to be merged in */
456     prow = jl[k]; /* 1st pivot row */
457 
458     while (prow < k) {
459       nextprow = jl[prow];
460       /* merge prow into k-th row */
461       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
462       jmax   = ui[prow+1];
463       ncols  = jmax-jmin;
464       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
465       ierr   = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
466       nzk   += nlnk;
467 
468       /* update il and jl for prow */
469       if (jmin < jmax) {
470         il[prow] = jmin;
471 
472         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
473       }
474       prow = nextprow;
475     }
476 
477     /* if free space is not available, make more free space */
478     if (current_space->local_remaining<nzk) {
479       i    = mbs - k + 1; /* num of unfactored rows */
480       i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
481       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
482       reallocs++;
483     }
484 
485     /* copy data into free space, then initialize lnk */
486     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
487 
488     /* add the k-th row into il and jl */
489     if (nzk-1 > 0) {
490       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
491       jl[k] = jl[i]; jl[i] = k;
492       il[k] = ui[k] + 1;
493     }
494     ui_ptr[k] = current_space->array;
495 
496     current_space->array           += nzk;
497     current_space->local_used      += nzk;
498     current_space->local_remaining -= nzk;
499 
500     ui[k+1] = ui[k] + nzk;
501   }
502 
503   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
504   ierr = PetscFree4(ui_ptr,il,jl,cols);CHKERRQ(ierr);
505 
506   /* destroy list of free space and other temporary array(s) */
507   ierr = PetscMalloc1(ui[mbs]+1,&uj);CHKERRQ(ierr);
508   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
509   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
510 
511   /* put together the new matrix in MATSEQSBAIJ format */
512   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
513 
514   b               = (Mat_SeqSBAIJ*)fact->data;
515   b->singlemalloc = PETSC_FALSE;
516   b->free_a       = PETSC_TRUE;
517   b->free_ij      = PETSC_TRUE;
518 
519   ierr = PetscMalloc1(ui[mbs]+1,&b->a);CHKERRQ(ierr);
520 
521   b->j    = uj;
522   b->i    = ui;
523   b->diag = 0;
524   b->ilen = 0;
525   b->imax = 0;
526   b->row  = perm;
527 
528   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
529 
530   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
531   b->icol  = perm;
532   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
533   ierr     = PetscMalloc1(mbs+1,&b->solve_work);CHKERRQ(ierr);
534   ierr     = PetscLogObjectMemory((PetscObject)fact,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
535   b->maxnz = b->nz = ui[mbs];
536 
537   fact->info.factor_mallocs   = reallocs;
538   fact->info.fill_ratio_given = fill;
539   if (ai[mbs] != 0) {
540     fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
541   } else {
542     fact->info.fill_ratio_needed = 0.0;
543   }
544 #if defined(PETSC_USE_INFO)
545   if (ai[mbs] != 0) {
546     PetscReal af = fact->info.fill_ratio_needed;
547     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
548     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
549     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
550   } else {
551     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
552   }
553 #endif
554   ierr = MatSeqSBAIJSetNumericFactorization_inplace(fact,perm_identity);CHKERRQ(ierr);
555   PetscFunctionReturn(0);
556 }
557 
558 #undef __FUNCT__
559 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
560 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
561 {
562   Mat_SeqSBAIJ   *a   = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
563   IS             perm = b->row;
564   PetscErrorCode ierr;
565   const PetscInt *ai,*aj,*perm_ptr,mbs=a->mbs,*bi=b->i,*bj=b->j;
566   PetscInt       i,j;
567   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
568   PetscInt       bs  =A->rmap->bs,bs2 = a->bs2;
569   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
570   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
571   MatScalar      *work;
572   PetscInt       *pivots;
573   PetscBool      allowzeropivot,zeropivotdetected;
574 
575   PetscFunctionBegin;
576   /* initialization */
577   ierr = PetscCalloc1(bs2*mbs,&rtmp);CHKERRQ(ierr);
578   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
579   allowzeropivot = PetscNot(A->erroriffailure);
580 
581   il[0] = 0;
582   for (i=0; i<mbs; i++) jl[i] = mbs;
583 
584   ierr = PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);CHKERRQ(ierr);
585   ierr = PetscMalloc1(bs,&pivots);CHKERRQ(ierr);
586 
587   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
588 
589   /* check permutation */
590   if (!a->permute) {
591     ai = a->i; aj = a->j; aa = a->a;
592   } else {
593     ai   = a->inew; aj = a->jnew;
594     ierr = PetscMalloc1(bs2*ai[mbs],&aa);CHKERRQ(ierr);
595     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
596     ierr = PetscMalloc1(ai[mbs],&a2anew);CHKERRQ(ierr);
597     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
598 
599     for (i=0; i<mbs; i++) {
600       jmin = ai[i]; jmax = ai[i+1];
601       for (j=jmin; j<jmax; j++) {
602         while (a2anew[j] != j) {
603           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
604           for (k1=0; k1<bs2; k1++) {
605             dk[k1]       = aa[k*bs2+k1];
606             aa[k*bs2+k1] = aa[j*bs2+k1];
607             aa[j*bs2+k1] = dk[k1];
608           }
609         }
610         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
611         if (i > aj[j]) {
612           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
613           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
614           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
615           for (k=0; k<bs; k++) {               /* j-th block of aa <- dk^T */
616             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
617           }
618         }
619       }
620     }
621     ierr = PetscFree(a2anew);CHKERRQ(ierr);
622   }
623 
624   /* for each row k */
625   for (k = 0; k<mbs; k++) {
626 
627     /*initialize k-th row with elements nonzero in row perm(k) of A */
628     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
629 
630     ap = aa + jmin*bs2;
631     for (j = jmin; j < jmax; j++) {
632       vj       = perm_ptr[aj[j]];   /* block col. index */
633       rtmp_ptr = rtmp + vj*bs2;
634       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
635     }
636 
637     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
638     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
639     i    = jl[k]; /* first row to be added to k_th row  */
640 
641     while (i < k) {
642       nexti = jl[i]; /* next row to be added to k_th row */
643 
644       /* compute multiplier */
645       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
646 
647       /* uik = -inv(Di)*U_bar(i,k) */
648       diag = ba + i*bs2;
649       u    = ba + ili*bs2;
650       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
651       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
652 
653       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
654       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
655       ierr = PetscLogFlops(4.0*bs*bs2);CHKERRQ(ierr);
656 
657       /* update -U(i,k) */
658       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
659 
660       /* add multiple of row i to k-th row ... */
661       jmin = ili + 1; jmax = bi[i+1];
662       if (jmin < jmax) {
663         for (j=jmin; j<jmax; j++) {
664           /* rtmp += -U(i,k)^T * U_bar(i,j) */
665           rtmp_ptr = rtmp + bj[j]*bs2;
666           u        = ba + j*bs2;
667           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
668         }
669         ierr = PetscLogFlops(2.0*bs*bs2*(jmax-jmin));CHKERRQ(ierr);
670 
671         /* ... add i to row list for next nonzero entry */
672         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
673         j     = bj[jmin];
674         jl[i] = jl[j]; jl[j] = i; /* update jl */
675       }
676       i = nexti;
677     }
678 
679     /* save nonzero entries in k-th row of U ... */
680 
681     /* invert diagonal block */
682     diag = ba+k*bs2;
683     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
684 
685     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
686     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
687 
688     jmin = bi[k]; jmax = bi[k+1];
689     if (jmin < jmax) {
690       for (j=jmin; j<jmax; j++) {
691         vj       = bj[j];      /* block col. index of U */
692         u        = ba + j*bs2;
693         rtmp_ptr = rtmp + vj*bs2;
694         for (k1=0; k1<bs2; k1++) {
695           *u++        = *rtmp_ptr;
696           *rtmp_ptr++ = 0.0;
697         }
698       }
699 
700       /* ... add k to row list for first nonzero entry in k-th row */
701       il[k] = jmin;
702       i     = bj[jmin];
703       jl[k] = jl[i]; jl[i] = k;
704     }
705   }
706 
707   ierr = PetscFree(rtmp);CHKERRQ(ierr);
708   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
709   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
710   ierr = PetscFree(pivots);CHKERRQ(ierr);
711   if (a->permute) {
712     ierr = PetscFree(aa);CHKERRQ(ierr);
713   }
714 
715   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
716 
717   C->ops->solve          = MatSolve_SeqSBAIJ_N_inplace;
718   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
719   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_inplace;
720   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_inplace;
721 
722   C->assembled    = PETSC_TRUE;
723   C->preallocated = PETSC_TRUE;
724 
725   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
726   PetscFunctionReturn(0);
727 }
728 
729 #undef __FUNCT__
730 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
731 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
732 {
733   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
734   PetscErrorCode ierr;
735   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
736   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
737   PetscInt       bs  =A->rmap->bs,bs2 = a->bs2;
738   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
739   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
740   MatScalar      *work;
741   PetscInt       *pivots;
742   PetscBool      allowzeropivot,zeropivotdetected;
743 
744   PetscFunctionBegin;
745   ierr = PetscCalloc1(bs2*mbs,&rtmp);CHKERRQ(ierr);
746   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
747   il[0] = 0;
748   for (i=0; i<mbs; i++) jl[i] = mbs;
749 
750   ierr = PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);CHKERRQ(ierr);
751   ierr = PetscMalloc1(bs,&pivots);CHKERRQ(ierr);
752   allowzeropivot = PetscNot(A->erroriffailure);
753 
754   ai = a->i; aj = a->j; aa = a->a;
755 
756   /* for each row k */
757   for (k = 0; k<mbs; k++) {
758 
759     /*initialize k-th row with elements nonzero in row k of A */
760     jmin = ai[k]; jmax = ai[k+1];
761     ap   = aa + jmin*bs2;
762     for (j = jmin; j < jmax; j++) {
763       vj       = aj[j];   /* block col. index */
764       rtmp_ptr = rtmp + vj*bs2;
765       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
766     }
767 
768     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
769     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
770     i    = jl[k]; /* first row to be added to k_th row  */
771 
772     while (i < k) {
773       nexti = jl[i]; /* next row to be added to k_th row */
774 
775       /* compute multiplier */
776       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
777 
778       /* uik = -inv(Di)*U_bar(i,k) */
779       diag = ba + i*bs2;
780       u    = ba + ili*bs2;
781       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
782       PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
783 
784       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
785       PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
786       ierr = PetscLogFlops(2.0*bs*bs2);CHKERRQ(ierr);
787 
788       /* update -U(i,k) */
789       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
790 
791       /* add multiple of row i to k-th row ... */
792       jmin = ili + 1; jmax = bi[i+1];
793       if (jmin < jmax) {
794         for (j=jmin; j<jmax; j++) {
795           /* rtmp += -U(i,k)^T * U_bar(i,j) */
796           rtmp_ptr = rtmp + bj[j]*bs2;
797           u        = ba + j*bs2;
798           PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
799         }
800         ierr = PetscLogFlops(2.0*bs*bs2*(jmax-jmin));CHKERRQ(ierr);
801 
802         /* ... add i to row list for next nonzero entry */
803         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
804         j     = bj[jmin];
805         jl[i] = jl[j]; jl[j] = i; /* update jl */
806       }
807       i = nexti;
808     }
809 
810     /* save nonzero entries in k-th row of U ... */
811 
812     /* invert diagonal block */
813     diag = ba+k*bs2;
814     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
815 
816     ierr = PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
817     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
818 
819     jmin = bi[k]; jmax = bi[k+1];
820     if (jmin < jmax) {
821       for (j=jmin; j<jmax; j++) {
822         vj       = bj[j];      /* block col. index of U */
823         u        = ba + j*bs2;
824         rtmp_ptr = rtmp + vj*bs2;
825         for (k1=0; k1<bs2; k1++) {
826           *u++        = *rtmp_ptr;
827           *rtmp_ptr++ = 0.0;
828         }
829       }
830 
831       /* ... add k to row list for first nonzero entry in k-th row */
832       il[k] = jmin;
833       i     = bj[jmin];
834       jl[k] = jl[i]; jl[i] = k;
835     }
836   }
837 
838   ierr = PetscFree(rtmp);CHKERRQ(ierr);
839   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
840   ierr = PetscFree3(dk,uik,work);CHKERRQ(ierr);
841   ierr = PetscFree(pivots);CHKERRQ(ierr);
842 
843   C->ops->solve          = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
844   C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
845   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
846   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
847   C->assembled           = PETSC_TRUE;
848   C->preallocated        = PETSC_TRUE;
849 
850   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
851   PetscFunctionReturn(0);
852 }
853 
854 /*
855     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
856     Version for blocks 2 by 2.
857 */
858 #undef __FUNCT__
859 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
860 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo *info)
861 {
862   Mat_SeqSBAIJ   *a   = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
863   IS             perm = b->row;
864   PetscErrorCode ierr;
865   const PetscInt *ai,*aj,*perm_ptr;
866   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
867   PetscInt       *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
868   MatScalar      *ba = b->a,*aa,*ap;
869   MatScalar      *u,*diag,*rtmp,*rtmp_ptr,dk[4],uik[4];
870   PetscReal      shift = info->shiftamount;
871   PetscBool      allowzeropivot,zeropivotdetected;
872 
873   PetscFunctionBegin;
874   allowzeropivot = PetscNot(A->erroriffailure);
875 
876   /* initialization */
877   /* il and jl record the first nonzero element in each row of the accessing
878      window U(0:k, k:mbs-1).
879      jl:    list of rows to be added to uneliminated rows
880             i>= k: jl(i) is the first row to be added to row i
881             i<  k: jl(i) is the row following row i in some list of rows
882             jl(i) = mbs indicates the end of a list
883      il(i): points to the first nonzero element in columns k,...,mbs-1 of
884             row i of U */
885   ierr = PetscCalloc1(4*mbs,&rtmp);CHKERRQ(ierr);
886   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
887   il[0] = 0;
888   for (i=0; i<mbs; i++) jl[i] = mbs;
889 
890   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
891 
892   /* check permutation */
893   if (!a->permute) {
894     ai = a->i; aj = a->j; aa = a->a;
895   } else {
896     ai   = a->inew; aj = a->jnew;
897     ierr = PetscMalloc1(4*ai[mbs],&aa);CHKERRQ(ierr);
898     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
899     ierr = PetscMalloc1(ai[mbs],&a2anew);CHKERRQ(ierr);
900     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
901 
902     for (i=0; i<mbs; i++) {
903       jmin = ai[i]; jmax = ai[i+1];
904       for (j=jmin; j<jmax; j++) {
905         while (a2anew[j] != j) {
906           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
907           for (k1=0; k1<4; k1++) {
908             dk[k1]     = aa[k*4+k1];
909             aa[k*4+k1] = aa[j*4+k1];
910             aa[j*4+k1] = dk[k1];
911           }
912         }
913         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
914         if (i > aj[j]) {
915           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
916           ap    = aa + j*4;  /* ptr to the beginning of the block */
917           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
918           ap[1] = ap[2];
919           ap[2] = dk[1];
920         }
921       }
922     }
923     ierr = PetscFree(a2anew);CHKERRQ(ierr);
924   }
925 
926   /* for each row k */
927   for (k = 0; k<mbs; k++) {
928 
929     /*initialize k-th row with elements nonzero in row perm(k) of A */
930     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
931     ap   = aa + jmin*4;
932     for (j = jmin; j < jmax; j++) {
933       vj       = perm_ptr[aj[j]];   /* block col. index */
934       rtmp_ptr = rtmp + vj*4;
935       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
936     }
937 
938     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
939     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
940     i    = jl[k]; /* first row to be added to k_th row  */
941 
942     while (i < k) {
943       nexti = jl[i]; /* next row to be added to k_th row */
944 
945       /* compute multiplier */
946       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
947 
948       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
949       diag   = ba + i*4;
950       u      = ba + ili*4;
951       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
952       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
953       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
954       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
955 
956       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
957       dk[0] += uik[0]*u[0] + uik[1]*u[1];
958       dk[1] += uik[2]*u[0] + uik[3]*u[1];
959       dk[2] += uik[0]*u[2] + uik[1]*u[3];
960       dk[3] += uik[2]*u[2] + uik[3]*u[3];
961 
962       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
963 
964       /* update -U(i,k): ba[ili] = uik */
965       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
966 
967       /* add multiple of row i to k-th row ... */
968       jmin = ili + 1; jmax = bi[i+1];
969       if (jmin < jmax) {
970         for (j=jmin; j<jmax; j++) {
971           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
972           rtmp_ptr     = rtmp + bj[j]*4;
973           u            = ba + j*4;
974           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
975           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
976           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
977           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
978         }
979         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
980 
981         /* ... add i to row list for next nonzero entry */
982         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
983         j     = bj[jmin];
984         jl[i] = jl[j]; jl[j] = i; /* update jl */
985       }
986       i = nexti;
987     }
988 
989     /* save nonzero entries in k-th row of U ... */
990 
991     /* invert diagonal block */
992     diag = ba+k*4;
993     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
994     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
995     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
996 
997     jmin = bi[k]; jmax = bi[k+1];
998     if (jmin < jmax) {
999       for (j=jmin; j<jmax; j++) {
1000         vj       = bj[j];      /* block col. index of U */
1001         u        = ba + j*4;
1002         rtmp_ptr = rtmp + vj*4;
1003         for (k1=0; k1<4; k1++) {
1004           *u++        = *rtmp_ptr;
1005           *rtmp_ptr++ = 0.0;
1006         }
1007       }
1008 
1009       /* ... add k to row list for first nonzero entry in k-th row */
1010       il[k] = jmin;
1011       i     = bj[jmin];
1012       jl[k] = jl[i]; jl[i] = k;
1013     }
1014   }
1015 
1016   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1017   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1018   if (a->permute) {
1019     ierr = PetscFree(aa);CHKERRQ(ierr);
1020   }
1021   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
1022 
1023   C->ops->solve          = MatSolve_SeqSBAIJ_2_inplace;
1024   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
1025   C->assembled           = PETSC_TRUE;
1026   C->preallocated        = PETSC_TRUE;
1027 
1028   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1029   PetscFunctionReturn(0);
1030 }
1031 
1032 /*
1033       Version for when blocks are 2 by 2 Using natural ordering
1034 */
1035 #undef __FUNCT__
1036 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
1037 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
1038 {
1039   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
1040   PetscErrorCode ierr;
1041   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1042   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1043   MatScalar      *ba = b->a,*aa,*ap,dk[8],uik[8];
1044   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
1045   PetscReal      shift = info->shiftamount;
1046   PetscBool      allowzeropivot,zeropivotdetected;
1047 
1048   PetscFunctionBegin;
1049   allowzeropivot = PetscNot(A->erroriffailure);
1050 
1051   /* initialization */
1052   /* il and jl record the first nonzero element in each row of the accessing
1053      window U(0:k, k:mbs-1).
1054      jl:    list of rows to be added to uneliminated rows
1055             i>= k: jl(i) is the first row to be added to row i
1056             i<  k: jl(i) is the row following row i in some list of rows
1057             jl(i) = mbs indicates the end of a list
1058      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1059             row i of U */
1060   ierr = PetscCalloc1(4*mbs,&rtmp);CHKERRQ(ierr);
1061   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
1062   il[0] = 0;
1063   for (i=0; i<mbs; i++) jl[i] = mbs;
1064 
1065   ai = a->i; aj = a->j; aa = a->a;
1066 
1067   /* for each row k */
1068   for (k = 0; k<mbs; k++) {
1069 
1070     /*initialize k-th row with elements nonzero in row k of A */
1071     jmin = ai[k]; jmax = ai[k+1];
1072     ap   = aa + jmin*4;
1073     for (j = jmin; j < jmax; j++) {
1074       vj       = aj[j];   /* block col. index */
1075       rtmp_ptr = rtmp + vj*4;
1076       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1077     }
1078 
1079     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1080     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
1081     i    = jl[k]; /* first row to be added to k_th row  */
1082 
1083     while (i < k) {
1084       nexti = jl[i]; /* next row to be added to k_th row */
1085 
1086       /* compute multiplier */
1087       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1088 
1089       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1090       diag   = ba + i*4;
1091       u      = ba + ili*4;
1092       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1093       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1094       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1095       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1096 
1097       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1098       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1099       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1100       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1101       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1102 
1103       ierr = PetscLogFlops(16.0*2.0);CHKERRQ(ierr);
1104 
1105       /* update -U(i,k): ba[ili] = uik */
1106       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1107 
1108       /* add multiple of row i to k-th row ... */
1109       jmin = ili + 1; jmax = bi[i+1];
1110       if (jmin < jmax) {
1111         for (j=jmin; j<jmax; j++) {
1112           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1113           rtmp_ptr     = rtmp + bj[j]*4;
1114           u            = ba + j*4;
1115           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1116           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1117           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1118           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1119         }
1120         ierr = PetscLogFlops(16.0*(jmax-jmin));CHKERRQ(ierr);
1121 
1122         /* ... add i to row list for next nonzero entry */
1123         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1124         j     = bj[jmin];
1125         jl[i] = jl[j]; jl[j] = i; /* update jl */
1126       }
1127       i = nexti;
1128     }
1129 
1130     /* save nonzero entries in k-th row of U ... */
1131 
1132     /* invert diagonal block */
1133     diag = ba+k*4;
1134     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1135     ierr = PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1136     if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1137 
1138     jmin = bi[k]; jmax = bi[k+1];
1139     if (jmin < jmax) {
1140       for (j=jmin; j<jmax; j++) {
1141         vj       = bj[j];      /* block col. index of U */
1142         u        = ba + j*4;
1143         rtmp_ptr = rtmp + vj*4;
1144         for (k1=0; k1<4; k1++) {
1145           *u++        = *rtmp_ptr;
1146           *rtmp_ptr++ = 0.0;
1147         }
1148       }
1149 
1150       /* ... add k to row list for first nonzero entry in k-th row */
1151       il[k] = jmin;
1152       i     = bj[jmin];
1153       jl[k] = jl[i]; jl[i] = k;
1154     }
1155   }
1156 
1157   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1158   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1159 
1160   C->ops->solve          = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1161   C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1162   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1163   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1164   C->assembled           = PETSC_TRUE;
1165   C->preallocated        = PETSC_TRUE;
1166 
1167   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1168   PetscFunctionReturn(0);
1169 }
1170 
1171 /*
1172     Numeric U^T*D*U factorization for SBAIJ format.
1173     Version for blocks are 1 by 1.
1174 */
1175 #undef __FUNCT__
1176 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace"
1177 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo *info)
1178 {
1179   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1180   IS             ip=b->row;
1181   PetscErrorCode ierr;
1182   const PetscInt *ai,*aj,*rip;
1183   PetscInt       *a2anew,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1184   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1185   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1186   PetscReal      rs;
1187   FactorShiftCtx sctx;
1188 
1189   PetscFunctionBegin;
1190   /* MatPivotSetUp(): initialize shift context sctx */
1191   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1192 
1193   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1194   if (!a->permute) {
1195     ai = a->i; aj = a->j; aa = a->a;
1196   } else {
1197     ai     = a->inew; aj = a->jnew;
1198     nz     = ai[mbs];
1199     ierr   = PetscMalloc1(nz,&aa);CHKERRQ(ierr);
1200     a2anew = a->a2anew;
1201     bval   = a->a;
1202     for (j=0; j<nz; j++) {
1203       aa[a2anew[j]] = *(bval++);
1204     }
1205   }
1206 
1207   /* initialization */
1208   /* il and jl record the first nonzero element in each row of the accessing
1209      window U(0:k, k:mbs-1).
1210      jl:    list of rows to be added to uneliminated rows
1211             i>= k: jl(i) is the first row to be added to row i
1212             i<  k: jl(i) is the row following row i in some list of rows
1213             jl(i) = mbs indicates the end of a list
1214      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1215             row i of U */
1216   ierr = PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1217 
1218   do {
1219     sctx.newshift = PETSC_FALSE;
1220     il[0] = 0;
1221     for (i=0; i<mbs; i++) {
1222       rtmp[i] = 0.0; jl[i] = mbs;
1223     }
1224 
1225     for (k = 0; k<mbs; k++) {
1226       /*initialize k-th row by the perm[k]-th row of A */
1227       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1228       bval = ba + bi[k];
1229       for (j = jmin; j < jmax; j++) {
1230         col       = rip[aj[j]];
1231         rtmp[col] = aa[j];
1232         *bval++   = 0.0; /* for in-place factorization */
1233       }
1234 
1235       /* shift the diagonal of the matrix */
1236       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1237 
1238       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1239       dk = rtmp[k];
1240       i  = jl[k]; /* first row to be added to k_th row  */
1241 
1242       while (i < k) {
1243         nexti = jl[i]; /* next row to be added to k_th row */
1244 
1245         /* compute multiplier, update diag(k) and U(i,k) */
1246         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1247         uikdi   = -ba[ili]*ba[bi[i]]; /* diagonal(k) */
1248         dk     += uikdi*ba[ili];
1249         ba[ili] = uikdi; /* -U(i,k) */
1250 
1251         /* add multiple of row i to k-th row */
1252         jmin = ili + 1; jmax = bi[i+1];
1253         if (jmin < jmax) {
1254           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1255           ierr = PetscLogFlops(2.0*(jmax-jmin));CHKERRQ(ierr);
1256 
1257           /* update il and jl for row i */
1258           il[i] = jmin;
1259           j     = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1260         }
1261         i = nexti;
1262       }
1263 
1264       /* shift the diagonals when zero pivot is detected */
1265       /* compute rs=sum of abs(off-diagonal) */
1266       rs   = 0.0;
1267       jmin = bi[k]+1;
1268       nz   = bi[k+1] - jmin;
1269       if (nz) {
1270         bcol = bj + jmin;
1271         while (nz--) {
1272           rs += PetscAbsScalar(rtmp[*bcol]);
1273           bcol++;
1274         }
1275       }
1276 
1277       sctx.rs = rs;
1278       sctx.pv = dk;
1279       ierr    = MatPivotCheck(C,A,info,&sctx,k);CHKERRQ(ierr);
1280       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1281       dk = sctx.pv;
1282 
1283       /* copy data into U(k,:) */
1284       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1285       jmin      = bi[k]+1; jmax = bi[k+1];
1286       if (jmin < jmax) {
1287         for (j=jmin; j<jmax; j++) {
1288           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1289         }
1290         /* add the k-th row into il and jl */
1291         il[k] = jmin;
1292         i     = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1293       }
1294     }
1295   } while (sctx.newshift);
1296   ierr = PetscFree3(rtmp,il,jl);CHKERRQ(ierr);
1297   if (a->permute) {ierr = PetscFree(aa);CHKERRQ(ierr);}
1298 
1299   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1300 
1301   C->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
1302   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1303   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1304   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
1305   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
1306   C->assembled           = PETSC_TRUE;
1307   C->preallocated        = PETSC_TRUE;
1308 
1309   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1310   if (sctx.nshift) {
1311     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1312       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1313     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1314       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1315     }
1316   }
1317   PetscFunctionReturn(0);
1318 }
1319 
1320 /*
1321   Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1322   Modified from MatCholeskyFactorNumeric_SeqAIJ()
1323 */
1324 #undef __FUNCT__
1325 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1326 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
1327 {
1328   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1329   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)B->data;
1330   PetscErrorCode ierr;
1331   PetscInt       i,j,mbs=A->rmap->n,*bi=b->i,*bj=b->j,*bdiag=b->diag,*bjtmp;
1332   PetscInt       *ai=a->i,*aj=a->j,*ajtmp;
1333   PetscInt       k,jmin,jmax,*c2r,*il,col,nexti,ili,nz;
1334   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
1335   FactorShiftCtx sctx;
1336   PetscReal      rs;
1337   MatScalar      d,*v;
1338 
1339   PetscFunctionBegin;
1340   ierr = PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&c2r);CHKERRQ(ierr);
1341 
1342   /* MatPivotSetUp(): initialize shift context sctx */
1343   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1344 
1345   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1346     sctx.shift_top = info->zeropivot;
1347 
1348     ierr = PetscMemzero(rtmp,mbs*sizeof(MatScalar));CHKERRQ(ierr);
1349 
1350     for (i=0; i<mbs; i++) {
1351       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1352       d        = (aa)[a->diag[i]];
1353       rtmp[i] += -PetscRealPart(d);  /* diagonal entry */
1354       ajtmp    = aj + ai[i] + 1;     /* exclude diagonal */
1355       v        = aa + ai[i] + 1;
1356       nz       = ai[i+1] - ai[i] - 1;
1357       for (j=0; j<nz; j++) {
1358         rtmp[i]        += PetscAbsScalar(v[j]);
1359         rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1360       }
1361       if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1362     }
1363     sctx.shift_top *= 1.1;
1364     sctx.nshift_max = 5;
1365     sctx.shift_lo   = 0.;
1366     sctx.shift_hi   = 1.;
1367   }
1368 
1369   /* allocate working arrays
1370      c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1371      il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1372   */
1373   do {
1374     sctx.newshift = PETSC_FALSE;
1375 
1376     for (i=0; i<mbs; i++) c2r[i] = mbs;
1377     if (mbs) il[0] = 0;
1378 
1379     for (k = 0; k<mbs; k++) {
1380       /* zero rtmp */
1381       nz    = bi[k+1] - bi[k];
1382       bjtmp = bj + bi[k];
1383       for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1384 
1385       /* load in initial unfactored row */
1386       bval = ba + bi[k];
1387       jmin = ai[k]; jmax = ai[k+1];
1388       for (j = jmin; j < jmax; j++) {
1389         col       = aj[j];
1390         rtmp[col] = aa[j];
1391         *bval++   = 0.0; /* for in-place factorization */
1392       }
1393       /* shift the diagonal of the matrix: ZeropivotApply() */
1394       rtmp[k] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1395 
1396       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1397       dk = rtmp[k];
1398       i  = c2r[k]; /* first row to be added to k_th row  */
1399 
1400       while (i < k) {
1401         nexti = c2r[i]; /* next row to be added to k_th row */
1402 
1403         /* compute multiplier, update diag(k) and U(i,k) */
1404         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1405         uikdi   = -ba[ili]*ba[bdiag[i]]; /* diagonal(k) */
1406         dk     += uikdi*ba[ili]; /* update diag[k] */
1407         ba[ili] = uikdi; /* -U(i,k) */
1408 
1409         /* add multiple of row i to k-th row */
1410         jmin = ili + 1; jmax = bi[i+1];
1411         if (jmin < jmax) {
1412           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1413           /* update il and c2r for row i */
1414           il[i] = jmin;
1415           j     = bj[jmin]; c2r[i] = c2r[j]; c2r[j] = i;
1416         }
1417         i = nexti;
1418       }
1419 
1420       /* copy data into U(k,:) */
1421       rs   = 0.0;
1422       jmin = bi[k]; jmax = bi[k+1]-1;
1423       if (jmin < jmax) {
1424         for (j=jmin; j<jmax; j++) {
1425           col = bj[j]; ba[j] = rtmp[col]; rs += PetscAbsScalar(ba[j]);
1426         }
1427         /* add the k-th row into il and c2r */
1428         il[k] = jmin;
1429         i     = bj[jmin]; c2r[k] = c2r[i]; c2r[i] = k;
1430       }
1431 
1432       sctx.rs = rs;
1433       sctx.pv = dk;
1434       ierr    = MatPivotCheck(B,A,info,&sctx,k);CHKERRQ(ierr);
1435       if (sctx.newshift) break;
1436       dk = sctx.pv;
1437 
1438       ba[bdiag[k]] = 1.0/dk; /* U(k,k) */
1439     }
1440   } while (sctx.newshift);
1441 
1442   ierr = PetscFree3(rtmp,il,c2r);CHKERRQ(ierr);
1443 
1444   B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1445   B->ops->solves         = MatSolves_SeqSBAIJ_1;
1446   B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1447   B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1448   B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1449 
1450   B->assembled    = PETSC_TRUE;
1451   B->preallocated = PETSC_TRUE;
1452 
1453   ierr = PetscLogFlops(B->rmap->n);CHKERRQ(ierr);
1454 
1455   /* MatPivotView() */
1456   if (sctx.nshift) {
1457     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1458       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1459     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1460       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1461     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1462       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1463     }
1464   }
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 #undef __FUNCT__
1469 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace"
1470 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
1471 {
1472   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1473   PetscErrorCode ierr;
1474   PetscInt       i,j,mbs = a->mbs;
1475   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1476   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1477   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1478   PetscReal      rs;
1479   FactorShiftCtx sctx;
1480 
1481   PetscFunctionBegin;
1482   /* MatPivotSetUp(): initialize shift context sctx */
1483   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1484 
1485   /* initialization */
1486   /* il and jl record the first nonzero element in each row of the accessing
1487      window U(0:k, k:mbs-1).
1488      jl:    list of rows to be added to uneliminated rows
1489             i>= k: jl(i) is the first row to be added to row i
1490             i<  k: jl(i) is the row following row i in some list of rows
1491             jl(i) = mbs indicates the end of a list
1492      il(i): points to the first nonzero element in U(i,k:mbs-1)
1493   */
1494   ierr = PetscMalloc1(mbs,&rtmp);CHKERRQ(ierr);
1495   ierr = PetscMalloc2(mbs,&il,mbs,&jl);CHKERRQ(ierr);
1496 
1497   do {
1498     sctx.newshift = PETSC_FALSE;
1499     il[0] = 0;
1500     for (i=0; i<mbs; i++) {
1501       rtmp[i] = 0.0; jl[i] = mbs;
1502     }
1503 
1504     for (k = 0; k<mbs; k++) {
1505       /*initialize k-th row with elements nonzero in row perm(k) of A */
1506       nz   = ai[k+1] - ai[k];
1507       acol = aj + ai[k];
1508       aval = aa + ai[k];
1509       bval = ba + bi[k];
1510       while (nz--) {
1511         rtmp[*acol++] = *aval++;
1512         *bval++       = 0.0; /* for in-place factorization */
1513       }
1514 
1515       /* shift the diagonal of the matrix */
1516       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1517 
1518       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1519       dk = rtmp[k];
1520       i  = jl[k]; /* first row to be added to k_th row  */
1521 
1522       while (i < k) {
1523         nexti = jl[i]; /* next row to be added to k_th row */
1524         /* compute multiplier, update D(k) and U(i,k) */
1525         ili     = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1526         uikdi   = -ba[ili]*ba[bi[i]];
1527         dk     += uikdi*ba[ili];
1528         ba[ili] = uikdi; /* -U(i,k) */
1529 
1530         /* add multiple of row i to k-th row ... */
1531         jmin = ili + 1;
1532         nz   = bi[i+1] - jmin;
1533         if (nz > 0) {
1534           bcol = bj + jmin;
1535           bval = ba + jmin;
1536           ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1537           while (nz--) rtmp[*bcol++] += uikdi*(*bval++);
1538 
1539           /* update il and jl for i-th row */
1540           il[i] = jmin;
1541           j     = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1542         }
1543         i = nexti;
1544       }
1545 
1546       /* shift the diagonals when zero pivot is detected */
1547       /* compute rs=sum of abs(off-diagonal) */
1548       rs   = 0.0;
1549       jmin = bi[k]+1;
1550       nz   = bi[k+1] - jmin;
1551       if (nz) {
1552         bcol = bj + jmin;
1553         while (nz--) {
1554           rs += PetscAbsScalar(rtmp[*bcol]);
1555           bcol++;
1556         }
1557       }
1558 
1559       sctx.rs = rs;
1560       sctx.pv = dk;
1561       ierr    = MatPivotCheck(C,A,info,&sctx,k);CHKERRQ(ierr);
1562       if (sctx.newshift) break;    /* sctx.shift_amount is updated */
1563       dk = sctx.pv;
1564 
1565       /* copy data into U(k,:) */
1566       ba[bi[k]] = 1.0/dk;
1567       jmin      = bi[k]+1;
1568       nz        = bi[k+1] - jmin;
1569       if (nz) {
1570         bcol = bj + jmin;
1571         bval = ba + jmin;
1572         while (nz--) {
1573           *bval++       = rtmp[*bcol];
1574           rtmp[*bcol++] = 0.0;
1575         }
1576         /* add k-th row into il and jl */
1577         il[k] = jmin;
1578         i     = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1579       }
1580     } /* end of for (k = 0; k<mbs; k++) */
1581   } while (sctx.newshift);
1582   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1583   ierr = PetscFree2(il,jl);CHKERRQ(ierr);
1584 
1585   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1586   C->ops->solves         = MatSolves_SeqSBAIJ_1_inplace;
1587   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1588   C->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1589   C->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1590 
1591   C->assembled    = PETSC_TRUE;
1592   C->preallocated = PETSC_TRUE;
1593 
1594   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1595   if (sctx.nshift) {
1596     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1597       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1598     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1599       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1600     }
1601   }
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 #undef __FUNCT__
1606 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1607 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo *info)
1608 {
1609   PetscErrorCode ierr;
1610   Mat            C;
1611 
1612   PetscFunctionBegin;
1613   ierr = MatGetFactor(A,"petsc",MAT_FACTOR_CHOLESKY,&C);CHKERRQ(ierr);
1614   ierr = MatCholeskyFactorSymbolic(C,A,perm,info);CHKERRQ(ierr);
1615   ierr = MatCholeskyFactorNumeric(C,A,info);CHKERRQ(ierr);
1616 
1617   A->ops->solve          = C->ops->solve;
1618   A->ops->solvetranspose = C->ops->solvetranspose;
1619 
1620   ierr = MatHeaderMerge(A,&C);CHKERRQ(ierr);
1621   PetscFunctionReturn(0);
1622 }
1623 
1624 
1625