xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 4078b935d4042e0373a7c2ff3f840da843ff0e0f)
1 #define PETSCMAT_DLL
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/mat/impls/sbaij/seq/sbaij.h"
5 #include "src/inline/ilu.h"
6 #include "include/petscis.h"
7 
8 #if !defined(PETSC_USE_COMPLEX)
9 /*
10   input:
11    F -- numeric factor
12   output:
13    nneg, nzero, npos: matrix inertia
14 */
15 
16 #undef __FUNCT__
17 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
18 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
19 {
20   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
21   PetscScalar  *dd = fact_ptr->a;
22   PetscInt     mbs=fact_ptr->mbs,bs=F->rmap.bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
23 
24   PetscFunctionBegin;
25   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
26   nneig_tmp = 0; npos_tmp = 0;
27   for (i=0; i<mbs; i++){
28     if (PetscRealPart(dd[*fi]) > 0.0){
29       npos_tmp++;
30     } else if (PetscRealPart(dd[*fi]) < 0.0){
31       nneig_tmp++;
32     }
33     fi++;
34   }
35   if (nneig) *nneig = nneig_tmp;
36   if (npos)  *npos  = npos_tmp;
37   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
38 
39   PetscFunctionReturn(0);
40 }
41 #endif /* !defined(PETSC_USE_COMPLEX) */
42 
43 /*
44   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
45   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
46 */
47 #undef __FUNCT__
48 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
49 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
50 {
51   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
52   PetscErrorCode ierr;
53   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
54   PetscInt       *jutmp,bs = A->rmap.bs,bs2=a->bs2;
55   PetscInt       m,reallocs = 0,prow;
56   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
57   PetscReal      f = info->fill;
58   PetscTruth     perm_identity;
59 
60   PetscFunctionBegin;
61   /* check whether perm is the identity mapping */
62   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
63   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
64 
65   if (perm_identity){ /* without permutation */
66     a->permute = PETSC_FALSE;
67     ai = a->i; aj = a->j;
68   } else {            /* non-trivial permutation */
69     a->permute = PETSC_TRUE;
70     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
71     ai = a->inew; aj = a->jnew;
72   }
73 
74   /* initialization */
75   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
76   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
77   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
78   iu[0] = mbs+1;
79   juidx = mbs + 1; /* index for ju */
80   ierr  = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */
81   q     = jl + mbs;   /* linked list for col index */
82   for (i=0; i<mbs; i++){
83     jl[i] = mbs;
84     q[i] = 0;
85   }
86 
87   /* for each row k */
88   for (k=0; k<mbs; k++){
89     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
90     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
91     q[k] = mbs;
92     /* initialize nonzero structure of k-th row to row rip[k] of A */
93     jmin = ai[rip[k]] +1; /* exclude diag[k] */
94     jmax = ai[rip[k]+1];
95     for (j=jmin; j<jmax; j++){
96       vj = rip[aj[j]]; /* col. value */
97       if(vj > k){
98         qm = k;
99         do {
100           m  = qm; qm = q[m];
101         } while(qm < vj);
102         if (qm == vj) {
103           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
104         }
105         nzk++;
106         q[m]  = vj;
107         q[vj] = qm;
108       } /* if(vj > k) */
109     } /* for (j=jmin; j<jmax; j++) */
110 
111     /* modify nonzero structure of k-th row by computing fill-in
112        for each row i to be merged in */
113     prow = k;
114     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
115 
116     while (prow < k){
117       /* merge row prow into k-th row */
118       jmin = iu[prow] + 1; jmax = iu[prow+1];
119       qm = k;
120       for (j=jmin; j<jmax; j++){
121         vj = ju[j];
122         do {
123           m = qm; qm = q[m];
124         } while (qm < vj);
125         if (qm != vj){
126          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
127         }
128       }
129       prow = jl[prow]; /* next pivot row */
130     }
131 
132     /* add k to row list for first nonzero element in k-th row */
133     if (nzk > 0){
134       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
135       jl[k] = jl[i]; jl[i] = k;
136     }
137     iu[k+1] = iu[k] + nzk;
138 
139     /* allocate more space to ju if needed */
140     if (iu[k+1] > umax) {
141       /* estimate how much additional space we will need */
142       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143       /* just double the memory each time */
144       maxadd = umax;
145       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
146       umax += maxadd;
147 
148       /* allocate a longer ju */
149       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
150       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
151       ierr = PetscFree(ju);CHKERRQ(ierr);
152       ju   = jutmp;
153       reallocs++; /* count how many times we realloc */
154     }
155 
156     /* save nonzero structure of k-th row in ju */
157     i=k;
158     while (nzk --) {
159       i           = q[i];
160       ju[juidx++] = i;
161     }
162   }
163 
164 #if defined(PETSC_USE_INFO)
165   if (ai[mbs] != 0) {
166     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
167     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
168     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
169     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
170     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
171   } else {
172     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
173   }
174 #endif
175 
176   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
177   ierr = PetscFree(jl);CHKERRQ(ierr);
178 
179   /* put together the new matrix */
180   ierr = MatCreate(A->comm,B);CHKERRQ(ierr);
181   ierr = MatSetSizes(*B,bs*mbs,bs*mbs,bs*mbs,bs*mbs);CHKERRQ(ierr);
182   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
183   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(*B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
184 
185   /* ierr = PetscLogObjectParent(*B,iperm);CHKERRQ(ierr); */
186   b = (Mat_SeqSBAIJ*)(*B)->data;
187   b->singlemalloc = PETSC_FALSE;
188   b->free_a       = PETSC_TRUE;
189   b->free_ij       = PETSC_TRUE;
190   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
191   b->j    = ju;
192   b->i    = iu;
193   b->diag = 0;
194   b->ilen = 0;
195   b->imax = 0;
196   b->row  = perm;
197   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
198   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
199   b->icol = perm;
200   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
201   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
202   /* In b structure:  Free imax, ilen, old a, old j.
203      Allocate idnew, solve_work, new a, new j */
204   ierr = PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
205   b->maxnz = b->nz = iu[mbs];
206 
207   (*B)->factor                 = FACTOR_CHOLESKY;
208   (*B)->info.factor_mallocs    = reallocs;
209   (*B)->info.fill_ratio_given  = f;
210   if (ai[mbs] != 0) {
211     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
212   } else {
213     (*B)->info.fill_ratio_needed = 0.0;
214   }
215 
216   if (perm_identity){
217     switch (bs) {
218       case 1:
219         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
220         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
221         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");CHKERRQ(ierr);
222         break;
223       case 2:
224         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
225         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
226         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");CHKERRQ(ierr);
227         break;
228       case 3:
229         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
230         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
231         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");CHKERRQ(ierr);
232         break;
233       case 4:
234         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
235         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
236         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");CHKERRQ(ierr);
237         break;
238       case 5:
239         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
240         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
241         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");CHKERRQ(ierr);
242         break;
243       case 6:
244         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
245         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
246         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");CHKERRQ(ierr);
247         break;
248       case 7:
249         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
250         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
251         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");CHKERRQ(ierr);
252       break;
253       default:
254         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
255         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
256         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");CHKERRQ(ierr);
257       break;
258     }
259   }
260   PetscFunctionReturn(0);
261 }
262 /*
263     Symbolic U^T*D*U factorization for SBAIJ format.
264 */
265 #include "petscbt.h"
266 #include "src/mat/utils/freespace.h"
267 #undef __FUNCT__
268 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
269 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
270 {
271   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
272   Mat_SeqSBAIJ       *b;
273   Mat                B;
274   PetscErrorCode     ierr;
275   PetscTruth         perm_identity;
276   PetscReal          fill = info->fill;
277   PetscInt           *rip,i,mbs=a->mbs,bs=A->rmap.bs,*ai,*aj,reallocs=0,prow;
278   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
279   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
280   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
281   PetscBT            lnkbt;
282 
283   PetscFunctionBegin;
284   /*
285    This code originally uses Modified Sparse Row (MSR) storage
286    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
287    Then it is rewritten so the factor B takes seqsbaij format. However the associated
288    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
289    thus the original code in MSR format is still used for these cases.
290    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
291    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
292   */
293   if (bs > 1){
294     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);CHKERRQ(ierr);
295     PetscFunctionReturn(0);
296   }
297 
298   /* check whether perm is the identity mapping */
299   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
300 
301   if (perm_identity){
302     a->permute = PETSC_FALSE;
303     ai = a->i; aj = a->j;
304   } else {
305     SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
306     /* There are bugs for reordeing. Needs further work.
307        MatReordering for sbaij cannot be efficient. User should use aij formt! */
308     a->permute = PETSC_TRUE;
309     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
310     ai = a->inew; aj = a->jnew;
311   }
312   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
313 
314   /* initialization */
315   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
316   ui[0] = 0;
317 
318   /* jl: linked list for storing indices of the pivot rows
319      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
320   ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
321   il     = jl + mbs;
322   cols   = il + mbs;
323   ui_ptr = (PetscInt**)(cols + mbs);
324 
325   for (i=0; i<mbs; i++){
326     jl[i] = mbs; il[i] = 0;
327   }
328 
329   /* create and initialize a linked list for storing column indices of the active row k */
330   nlnk = mbs + 1;
331   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
332 
333   /* initial FreeSpace size is fill*(ai[mbs]+1) */
334   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
335   current_space = free_space;
336 
337   for (k=0; k<mbs; k++){  /* for each active row k */
338     /* initialize lnk by the column indices of row rip[k] of A */
339     nzk   = 0;
340     ncols = ai[rip[k]+1] - ai[rip[k]];
341     for (j=0; j<ncols; j++){
342       i = *(aj + ai[rip[k]] + j);
343       cols[j] = rip[i];
344     }
345     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
346     nzk += nlnk;
347 
348     /* update lnk by computing fill-in for each pivot row to be merged in */
349     prow = jl[k]; /* 1st pivot row */
350 
351     while (prow < k){
352       nextprow = jl[prow];
353       /* merge prow into k-th row */
354       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
355       jmax = ui[prow+1];
356       ncols = jmax-jmin;
357       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
358       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
359       nzk += nlnk;
360 
361       /* update il and jl for prow */
362       if (jmin < jmax){
363         il[prow] = jmin;
364         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
365       }
366       prow = nextprow;
367     }
368 
369     /* if free space is not available, make more free space */
370     if (current_space->local_remaining<nzk) {
371       i = mbs - k + 1; /* num of unfactored rows */
372       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
373       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
374       reallocs++;
375     }
376 
377     /* copy data into free space, then initialize lnk */
378     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
379 
380     /* add the k-th row into il and jl */
381     if (nzk-1 > 0){
382       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
383       jl[k] = jl[i]; jl[i] = k;
384       il[k] = ui[k] + 1;
385     }
386     ui_ptr[k] = current_space->array;
387     current_space->array           += nzk;
388     current_space->local_used      += nzk;
389     current_space->local_remaining -= nzk;
390 
391     ui[k+1] = ui[k] + nzk;
392   }
393 
394 #if defined(PETSC_USE_INFO)
395   if (ai[mbs] != 0) {
396     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
397     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
398     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
399     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
400   } else {
401     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
402   }
403 #endif
404 
405   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
406   ierr = PetscFree(jl);CHKERRQ(ierr);
407 
408   /* destroy list of free space and other temporary array(s) */
409   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
410   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
411   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
412 
413   /* put together the new matrix in MATSEQSBAIJ format */
414   ierr = MatCreate(PETSC_COMM_SELF,fact);CHKERRQ(ierr);
415   ierr = MatSetSizes(*fact,mbs,mbs,mbs,mbs);CHKERRQ(ierr);
416   B    = *fact;
417   ierr = MatSetType(B,MATSEQSBAIJ);CHKERRQ(ierr);
418   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
419 
420   b = (Mat_SeqSBAIJ*)B->data;
421   b->singlemalloc = PETSC_FALSE;
422   b->free_a       = PETSC_TRUE;
423   b->free_ij      = PETSC_TRUE;
424   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
425   b->j    = uj;
426   b->i    = ui;
427   b->diag = 0;
428   b->ilen = 0;
429   b->imax = 0;
430   b->row  = perm;
431   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
432   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
433   b->icol = perm;
434   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
435   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
436   ierr    = PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
437   b->maxnz = b->nz = ui[mbs];
438 
439   B->factor                 = FACTOR_CHOLESKY;
440   B->info.factor_mallocs    = reallocs;
441   B->info.fill_ratio_given  = fill;
442   if (ai[mbs] != 0) {
443     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
444   } else {
445     B->info.fill_ratio_needed = 0.0;
446   }
447 
448   if (perm_identity){
449     switch (bs) {
450       case 1:
451         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
452         B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
453         B->ops->forwardsolve    = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
454         B->ops->backwardsolve   = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
455         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");CHKERRQ(ierr);
456         break;
457       case 2:
458         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
459         B->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
460         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");CHKERRQ(ierr);
461         break;
462       case 3:
463         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
464         B->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
465         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");CHKERRQ(ierr);
466         break;
467       case 4:
468         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
469         B->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
470         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");CHKERRQ(ierr);
471         break;
472       case 5:
473         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
474         B->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
475         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");CHKERRQ(ierr);
476         break;
477       case 6:
478         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
479         B->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
480         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");CHKERRQ(ierr);
481         break;
482       case 7:
483         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
484         B->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
485         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");CHKERRQ(ierr);
486       break;
487       default:
488         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
489         B->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
490         ierr = PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");CHKERRQ(ierr);
491       break;
492     }
493   }
494   PetscFunctionReturn(0);
495 }
496 #undef __FUNCT__
497 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
498 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,MatFactorInfo *info,Mat *B)
499 {
500   Mat            C = *B;
501   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
502   IS             perm = b->row;
503   PetscErrorCode ierr;
504   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
505   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
506   PetscInt       bs=A->rmap.bs,bs2 = a->bs2,bslog = 0;
507   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
508   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
509   MatScalar      *work;
510   PetscInt       *pivots;
511 
512   PetscFunctionBegin;
513   /* initialization */
514   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
515   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
516   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
517   jl   = il + mbs;
518   for (i=0; i<mbs; i++) {
519     jl[i] = mbs; il[0] = 0;
520   }
521   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
522   uik  = dk + bs2;
523   work = uik + bs2;
524   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
525 
526   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
527 
528   /* check permutation */
529   if (!a->permute){
530     ai = a->i; aj = a->j; aa = a->a;
531   } else {
532     ai   = a->inew; aj = a->jnew;
533     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
534     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
535     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
536     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
537 
538     /* flops in while loop */
539     bslog = 2*bs*bs2;
540 
541     for (i=0; i<mbs; i++){
542       jmin = ai[i]; jmax = ai[i+1];
543       for (j=jmin; j<jmax; j++){
544         while (a2anew[j] != j){
545           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
546           for (k1=0; k1<bs2; k1++){
547             dk[k1]       = aa[k*bs2+k1];
548             aa[k*bs2+k1] = aa[j*bs2+k1];
549             aa[j*bs2+k1] = dk[k1];
550           }
551         }
552         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
553         if (i > aj[j]){
554           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
555           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
556           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
557           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
558             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
559           }
560         }
561       }
562     }
563     ierr = PetscFree(a2anew);CHKERRQ(ierr);
564   }
565 
566   /* for each row k */
567   for (k = 0; k<mbs; k++){
568 
569     /*initialize k-th row with elements nonzero in row perm(k) of A */
570     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
571 
572     ap = aa + jmin*bs2;
573     for (j = jmin; j < jmax; j++){
574       vj = perm_ptr[aj[j]];         /* block col. index */
575       rtmp_ptr = rtmp + vj*bs2;
576       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
577     }
578 
579     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
580     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
581     i = jl[k]; /* first row to be added to k_th row  */
582 
583     while (i < k){
584       nexti = jl[i]; /* next row to be added to k_th row */
585 
586       /* compute multiplier */
587       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
588 
589       /* uik = -inv(Di)*U_bar(i,k) */
590       diag = ba + i*bs2;
591       u    = ba + ili*bs2;
592       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
593       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
594 
595       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
596       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
597       ierr = PetscLogFlops(bslog*2);CHKERRQ(ierr);
598 
599       /* update -U(i,k) */
600       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
601 
602       /* add multiple of row i to k-th row ... */
603       jmin = ili + 1; jmax = bi[i+1];
604       if (jmin < jmax){
605         for (j=jmin; j<jmax; j++) {
606           /* rtmp += -U(i,k)^T * U_bar(i,j) */
607           rtmp_ptr = rtmp + bj[j]*bs2;
608           u = ba + j*bs2;
609           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
610         }
611         ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr);
612 
613         /* ... add i to row list for next nonzero entry */
614         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
615         j     = bj[jmin];
616         jl[i] = jl[j]; jl[j] = i; /* update jl */
617       }
618       i = nexti;
619     }
620 
621     /* save nonzero entries in k-th row of U ... */
622 
623     /* invert diagonal block */
624     diag = ba+k*bs2;
625     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
626     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
627 
628     jmin = bi[k]; jmax = bi[k+1];
629     if (jmin < jmax) {
630       for (j=jmin; j<jmax; j++){
631          vj = bj[j];           /* block col. index of U */
632          u   = ba + j*bs2;
633          rtmp_ptr = rtmp + vj*bs2;
634          for (k1=0; k1<bs2; k1++){
635            *u++        = *rtmp_ptr;
636            *rtmp_ptr++ = 0.0;
637          }
638       }
639 
640       /* ... add k to row list for first nonzero entry in k-th row */
641       il[k] = jmin;
642       i     = bj[jmin];
643       jl[k] = jl[i]; jl[i] = k;
644     }
645   }
646 
647   ierr = PetscFree(rtmp);CHKERRQ(ierr);
648   ierr = PetscFree(il);CHKERRQ(ierr);
649   ierr = PetscFree(dk);CHKERRQ(ierr);
650   ierr = PetscFree(pivots);CHKERRQ(ierr);
651   if (a->permute){
652     ierr = PetscFree(aa);CHKERRQ(ierr);
653   }
654 
655   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
656   C->factor       = FACTOR_CHOLESKY;
657   C->assembled    = PETSC_TRUE;
658   C->preallocated = PETSC_TRUE;
659   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
660   PetscFunctionReturn(0);
661 }
662 
663 #undef __FUNCT__
664 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
665 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
666 {
667   Mat            C = *B;
668   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
669   PetscErrorCode ierr;
670   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
671   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
672   PetscInt       bs=A->rmap.bs,bs2 = a->bs2,bslog;
673   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
674   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
675   MatScalar      *work;
676   PetscInt       *pivots;
677 
678   PetscFunctionBegin;
679   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
680   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
681   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
682   jl   = il + mbs;
683   for (i=0; i<mbs; i++) {
684     jl[i] = mbs; il[0] = 0;
685   }
686   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
687   uik  = dk + bs2;
688   work = uik + bs2;
689   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
690 
691   ai = a->i; aj = a->j; aa = a->a;
692 
693   /* flops in while loop */
694   bslog = 2*bs*bs2;
695 
696   /* for each row k */
697   for (k = 0; k<mbs; k++){
698 
699     /*initialize k-th row with elements nonzero in row k of A */
700     jmin = ai[k]; jmax = ai[k+1];
701     ap = aa + jmin*bs2;
702     for (j = jmin; j < jmax; j++){
703       vj = aj[j];         /* block col. index */
704       rtmp_ptr = rtmp + vj*bs2;
705       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
706     }
707 
708     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
709     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
710     i = jl[k]; /* first row to be added to k_th row  */
711 
712     while (i < k){
713       nexti = jl[i]; /* next row to be added to k_th row */
714 
715       /* compute multiplier */
716       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
717 
718       /* uik = -inv(Di)*U_bar(i,k) */
719       diag = ba + i*bs2;
720       u    = ba + ili*bs2;
721       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
722       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
723 
724       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
725       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
726       ierr = PetscLogFlops(bslog*2);CHKERRQ(ierr);
727 
728       /* update -U(i,k) */
729       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
730 
731       /* add multiple of row i to k-th row ... */
732       jmin = ili + 1; jmax = bi[i+1];
733       if (jmin < jmax){
734         for (j=jmin; j<jmax; j++) {
735           /* rtmp += -U(i,k)^T * U_bar(i,j) */
736           rtmp_ptr = rtmp + bj[j]*bs2;
737           u = ba + j*bs2;
738           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
739         }
740         ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr);
741 
742         /* ... add i to row list for next nonzero entry */
743         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
744         j     = bj[jmin];
745         jl[i] = jl[j]; jl[j] = i; /* update jl */
746       }
747       i = nexti;
748     }
749 
750     /* save nonzero entries in k-th row of U ... */
751 
752     /* invert diagonal block */
753     diag = ba+k*bs2;
754     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
755     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
756 
757     jmin = bi[k]; jmax = bi[k+1];
758     if (jmin < jmax) {
759       for (j=jmin; j<jmax; j++){
760          vj = bj[j];           /* block col. index of U */
761          u   = ba + j*bs2;
762          rtmp_ptr = rtmp + vj*bs2;
763          for (k1=0; k1<bs2; k1++){
764            *u++        = *rtmp_ptr;
765            *rtmp_ptr++ = 0.0;
766          }
767       }
768 
769       /* ... add k to row list for first nonzero entry in k-th row */
770       il[k] = jmin;
771       i     = bj[jmin];
772       jl[k] = jl[i]; jl[i] = k;
773     }
774   }
775 
776   ierr = PetscFree(rtmp);CHKERRQ(ierr);
777   ierr = PetscFree(il);CHKERRQ(ierr);
778   ierr = PetscFree(dk);CHKERRQ(ierr);
779   ierr = PetscFree(pivots);CHKERRQ(ierr);
780 
781   C->factor    = FACTOR_CHOLESKY;
782   C->assembled = PETSC_TRUE;
783   C->preallocated = PETSC_TRUE;
784   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
785   PetscFunctionReturn(0);
786 }
787 
788 /*
789     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
790     Version for blocks 2 by 2.
791 */
792 #undef __FUNCT__
793 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
794 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,MatFactorInfo *info,Mat *B)
795 {
796   Mat            C = *B;
797   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
798   IS             perm = b->row;
799   PetscErrorCode ierr;
800   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
801   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
802   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
803   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
804 
805   PetscFunctionBegin;
806   /* initialization */
807   /* il and jl record the first nonzero element in each row of the accessing
808      window U(0:k, k:mbs-1).
809      jl:    list of rows to be added to uneliminated rows
810             i>= k: jl(i) is the first row to be added to row i
811             i<  k: jl(i) is the row following row i in some list of rows
812             jl(i) = mbs indicates the end of a list
813      il(i): points to the first nonzero element in columns k,...,mbs-1 of
814             row i of U */
815   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
816   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
817   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
818   jl   = il + mbs;
819   for (i=0; i<mbs; i++) {
820     jl[i] = mbs; il[0] = 0;
821   }
822   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
823   uik  = dk + 4;
824   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
825 
826   /* check permutation */
827   if (!a->permute){
828     ai = a->i; aj = a->j; aa = a->a;
829   } else {
830     ai   = a->inew; aj = a->jnew;
831     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
832     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
833     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
834     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
835 
836     for (i=0; i<mbs; i++){
837       jmin = ai[i]; jmax = ai[i+1];
838       for (j=jmin; j<jmax; j++){
839         while (a2anew[j] != j){
840           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
841           for (k1=0; k1<4; k1++){
842             dk[k1]       = aa[k*4+k1];
843             aa[k*4+k1] = aa[j*4+k1];
844             aa[j*4+k1] = dk[k1];
845           }
846         }
847         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
848         if (i > aj[j]){
849           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
850           ap = aa + j*4;     /* ptr to the beginning of the block */
851           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
852           ap[1] = ap[2];
853           ap[2] = dk[1];
854         }
855       }
856     }
857     ierr = PetscFree(a2anew);CHKERRQ(ierr);
858   }
859 
860   /* for each row k */
861   for (k = 0; k<mbs; k++){
862 
863     /*initialize k-th row with elements nonzero in row perm(k) of A */
864     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
865     ap = aa + jmin*4;
866     for (j = jmin; j < jmax; j++){
867       vj = perm_ptr[aj[j]];         /* block col. index */
868       rtmp_ptr = rtmp + vj*4;
869       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
870     }
871 
872     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
873     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
874     i = jl[k]; /* first row to be added to k_th row  */
875 
876     while (i < k){
877       nexti = jl[i]; /* next row to be added to k_th row */
878 
879       /* compute multiplier */
880       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
881 
882       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
883       diag = ba + i*4;
884       u    = ba + ili*4;
885       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
886       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
887       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
888       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
889 
890       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
891       dk[0] += uik[0]*u[0] + uik[1]*u[1];
892       dk[1] += uik[2]*u[0] + uik[3]*u[1];
893       dk[2] += uik[0]*u[2] + uik[1]*u[3];
894       dk[3] += uik[2]*u[2] + uik[3]*u[3];
895 
896       ierr = PetscLogFlops(16*2);CHKERRQ(ierr);
897 
898       /* update -U(i,k): ba[ili] = uik */
899       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
900 
901       /* add multiple of row i to k-th row ... */
902       jmin = ili + 1; jmax = bi[i+1];
903       if (jmin < jmax){
904         for (j=jmin; j<jmax; j++) {
905           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
906           rtmp_ptr = rtmp + bj[j]*4;
907           u = ba + j*4;
908           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
909           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
910           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
911           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
912         }
913         ierr = PetscLogFlops(16*(jmax-jmin));CHKERRQ(ierr);
914 
915         /* ... add i to row list for next nonzero entry */
916         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
917         j     = bj[jmin];
918         jl[i] = jl[j]; jl[j] = i; /* update jl */
919       }
920       i = nexti;
921     }
922 
923     /* save nonzero entries in k-th row of U ... */
924 
925     /* invert diagonal block */
926     diag = ba+k*4;
927     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
928     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
929 
930     jmin = bi[k]; jmax = bi[k+1];
931     if (jmin < jmax) {
932       for (j=jmin; j<jmax; j++){
933          vj = bj[j];           /* block col. index of U */
934          u   = ba + j*4;
935          rtmp_ptr = rtmp + vj*4;
936          for (k1=0; k1<4; k1++){
937            *u++        = *rtmp_ptr;
938            *rtmp_ptr++ = 0.0;
939          }
940       }
941 
942       /* ... add k to row list for first nonzero entry in k-th row */
943       il[k] = jmin;
944       i     = bj[jmin];
945       jl[k] = jl[i]; jl[i] = k;
946     }
947   }
948 
949   ierr = PetscFree(rtmp);CHKERRQ(ierr);
950   ierr = PetscFree(il);CHKERRQ(ierr);
951   ierr = PetscFree(dk);CHKERRQ(ierr);
952   if (a->permute) {
953     ierr = PetscFree(aa);CHKERRQ(ierr);
954   }
955   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
956   C->factor    = FACTOR_CHOLESKY;
957   C->assembled = PETSC_TRUE;
958   C->preallocated = PETSC_TRUE;
959   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
960   PetscFunctionReturn(0);
961 }
962 
963 /*
964       Version for when blocks are 2 by 2 Using natural ordering
965 */
966 #undef __FUNCT__
967 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
968 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
969 {
970   Mat            C = *B;
971   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
972   PetscErrorCode ierr;
973   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
974   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
975   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
976   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
977 
978   PetscFunctionBegin;
979   /* initialization */
980   /* il and jl record the first nonzero element in each row of the accessing
981      window U(0:k, k:mbs-1).
982      jl:    list of rows to be added to uneliminated rows
983             i>= k: jl(i) is the first row to be added to row i
984             i<  k: jl(i) is the row following row i in some list of rows
985             jl(i) = mbs indicates the end of a list
986      il(i): points to the first nonzero element in columns k,...,mbs-1 of
987             row i of U */
988   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
989   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
990   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
991   jl   = il + mbs;
992   for (i=0; i<mbs; i++) {
993     jl[i] = mbs; il[0] = 0;
994   }
995   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
996   uik  = dk + 4;
997 
998   ai = a->i; aj = a->j; aa = a->a;
999 
1000   /* for each row k */
1001   for (k = 0; k<mbs; k++){
1002 
1003     /*initialize k-th row with elements nonzero in row k of A */
1004     jmin = ai[k]; jmax = ai[k+1];
1005     ap = aa + jmin*4;
1006     for (j = jmin; j < jmax; j++){
1007       vj = aj[j];         /* block col. index */
1008       rtmp_ptr = rtmp + vj*4;
1009       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1010     }
1011 
1012     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1013     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
1014     i = jl[k]; /* first row to be added to k_th row  */
1015 
1016     while (i < k){
1017       nexti = jl[i]; /* next row to be added to k_th row */
1018 
1019       /* compute multiplier */
1020       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1021 
1022       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1023       diag = ba + i*4;
1024       u    = ba + ili*4;
1025       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1026       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1027       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1028       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1029 
1030       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1031       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1032       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1033       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1034       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1035 
1036       ierr = PetscLogFlops(16*2);CHKERRQ(ierr);
1037 
1038       /* update -U(i,k): ba[ili] = uik */
1039       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1040 
1041       /* add multiple of row i to k-th row ... */
1042       jmin = ili + 1; jmax = bi[i+1];
1043       if (jmin < jmax){
1044         for (j=jmin; j<jmax; j++) {
1045           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1046           rtmp_ptr = rtmp + bj[j]*4;
1047           u = ba + j*4;
1048           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1049           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1050           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1051           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1052         }
1053         ierr = PetscLogFlops(16*(jmax-jmin));CHKERRQ(ierr);
1054 
1055         /* ... add i to row list for next nonzero entry */
1056         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1057         j     = bj[jmin];
1058         jl[i] = jl[j]; jl[j] = i; /* update jl */
1059       }
1060       i = nexti;
1061     }
1062 
1063     /* save nonzero entries in k-th row of U ... */
1064 
1065     /* invert diagonal block */
1066     diag = ba+k*4;
1067     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1068     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1069 
1070     jmin = bi[k]; jmax = bi[k+1];
1071     if (jmin < jmax) {
1072       for (j=jmin; j<jmax; j++){
1073          vj = bj[j];           /* block col. index of U */
1074          u   = ba + j*4;
1075          rtmp_ptr = rtmp + vj*4;
1076          for (k1=0; k1<4; k1++){
1077            *u++        = *rtmp_ptr;
1078            *rtmp_ptr++ = 0.0;
1079          }
1080       }
1081 
1082       /* ... add k to row list for first nonzero entry in k-th row */
1083       il[k] = jmin;
1084       i     = bj[jmin];
1085       jl[k] = jl[i]; jl[i] = k;
1086     }
1087   }
1088 
1089   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1090   ierr = PetscFree(il);CHKERRQ(ierr);
1091   ierr = PetscFree(dk);CHKERRQ(ierr);
1092 
1093   C->factor    = FACTOR_CHOLESKY;
1094   C->assembled = PETSC_TRUE;
1095   C->preallocated = PETSC_TRUE;
1096   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 /*
1101     Numeric U^T*D*U factorization for SBAIJ format.
1102     Version for blocks are 1 by 1.
1103 */
1104 #undef __FUNCT__
1105 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1106 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,MatFactorInfo *info,Mat *B)
1107 {
1108   Mat            C = *B;
1109   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1110   IS             ip=b->row;
1111   PetscErrorCode ierr;
1112   PetscInt       *rip,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1113   PetscInt       *ai,*aj,*a2anew;
1114   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1115   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1116   PetscReal      zeropivot,rs,shiftnz;
1117   PetscReal      shiftpd;
1118   ChShift_Ctx    sctx;
1119   PetscInt       newshift;
1120 
1121   PetscFunctionBegin;
1122   /* initialization */
1123   shiftnz   = info->shiftnz;
1124   shiftpd   = info->shiftpd;
1125   zeropivot = info->zeropivot;
1126 
1127   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1128   if (!a->permute){
1129     ai = a->i; aj = a->j; aa = a->a;
1130   } else {
1131     ai = a->inew; aj = a->jnew;
1132     nz = ai[mbs];
1133     ierr = PetscMalloc(nz*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1134     a2anew = a->a2anew;
1135     bval   = a->a;
1136     for (j=0; j<nz; j++){
1137       aa[a2anew[j]] = *(bval++);
1138     }
1139   }
1140 
1141   /* initialization */
1142   /* il and jl record the first nonzero element in each row of the accessing
1143      window U(0:k, k:mbs-1).
1144      jl:    list of rows to be added to uneliminated rows
1145             i>= k: jl(i) is the first row to be added to row i
1146             i<  k: jl(i) is the row following row i in some list of rows
1147             jl(i) = mbs indicates the end of a list
1148      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1149             row i of U */
1150   nz   = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
1151   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
1152   jl   = il + mbs;
1153   rtmp = (MatScalar*)(jl + mbs);
1154 
1155   sctx.shift_amount = 0;
1156   sctx.nshift       = 0;
1157   do {
1158     sctx.chshift = PETSC_FALSE;
1159     for (i=0; i<mbs; i++) {
1160       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1161     }
1162 
1163     for (k = 0; k<mbs; k++){
1164       /*initialize k-th row by the perm[k]-th row of A */
1165       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1166       bval = ba + bi[k];
1167       for (j = jmin; j < jmax; j++){
1168         col = rip[aj[j]];
1169         rtmp[col] = aa[j];
1170         *bval++  = 0.0; /* for in-place factorization */
1171       }
1172 
1173       /* shift the diagonal of the matrix */
1174       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1175 
1176       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1177       dk = rtmp[k];
1178       i = jl[k]; /* first row to be added to k_th row  */
1179 
1180       while (i < k){
1181         nexti = jl[i]; /* next row to be added to k_th row */
1182 
1183         /* compute multiplier, update diag(k) and U(i,k) */
1184         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1185         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
1186         dk += uikdi*ba[ili];
1187         ba[ili] = uikdi; /* -U(i,k) */
1188 
1189         /* add multiple of row i to k-th row */
1190         jmin = ili + 1; jmax = bi[i+1];
1191         if (jmin < jmax){
1192           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1193           ierr = PetscLogFlops(2*(jmax-jmin));CHKERRQ(ierr);
1194 
1195           /* update il and jl for row i */
1196           il[i] = jmin;
1197           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1198         }
1199         i = nexti;
1200       }
1201 
1202       /* shift the diagonals when zero pivot is detected */
1203       /* compute rs=sum of abs(off-diagonal) */
1204       rs   = 0.0;
1205       jmin = bi[k]+1;
1206       nz   = bi[k+1] - jmin;
1207       if (nz){
1208         bcol = bj + jmin;
1209         while (nz--){
1210           rs += PetscAbsScalar(rtmp[*bcol]);
1211           bcol++;
1212         }
1213       }
1214 
1215       sctx.rs = rs;
1216       sctx.pv = dk;
1217       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
1218       if (newshift == 1) break;    /* sctx.shift_amount is updated */
1219 
1220       /* copy data into U(k,:) */
1221       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1222       jmin = bi[k]+1; jmax = bi[k+1];
1223       if (jmin < jmax) {
1224         for (j=jmin; j<jmax; j++){
1225           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1226         }
1227         /* add the k-th row into il and jl */
1228         il[k] = jmin;
1229         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1230       }
1231     }
1232   } while (sctx.chshift);
1233   ierr = PetscFree(il);CHKERRQ(ierr);
1234   if (a->permute){ierr = PetscFree(aa);CHKERRQ(ierr);}
1235 
1236   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1237   C->factor       = FACTOR_CHOLESKY;
1238   C->assembled    = PETSC_TRUE;
1239   C->preallocated = PETSC_TRUE;
1240   ierr = PetscLogFlops(C->rmap.N);CHKERRQ(ierr);
1241   if (sctx.nshift){
1242     if (shiftnz) {
1243       ierr = PetscInfo2(0,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1244     } else if (shiftpd) {
1245       ierr = PetscInfo2(0,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1246     }
1247   }
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 /*
1252   Version for when blocks are 1 by 1 Using natural ordering
1253 */
1254 #undef __FUNCT__
1255 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1256 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
1257 {
1258   Mat            C = *B;
1259   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1260   PetscErrorCode ierr;
1261   PetscInt       i,j,mbs = a->mbs;
1262   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1263   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1264   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1265   PetscReal      zeropivot,rs,shiftnz;
1266   PetscReal      shiftpd;
1267   ChShift_Ctx    sctx;
1268   PetscInt       newshift;
1269 
1270   PetscFunctionBegin;
1271   /* initialization */
1272   shiftnz   = info->shiftnz;
1273   shiftpd   = info->shiftpd;
1274   zeropivot = info->zeropivot;
1275 
1276   /* il and jl record the first nonzero element in each row of the accessing
1277      window U(0:k, k:mbs-1).
1278      jl:    list of rows to be added to uneliminated rows
1279             i>= k: jl(i) is the first row to be added to row i
1280             i<  k: jl(i) is the row following row i in some list of rows
1281             jl(i) = mbs indicates the end of a list
1282      il(i): points to the first nonzero element in U(i,k:mbs-1)
1283   */
1284   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1285   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1286   jl   = il + mbs;
1287 
1288   sctx.shift_amount = 0;
1289   sctx.nshift       = 0;
1290   do {
1291     sctx.chshift = PETSC_FALSE;
1292     for (i=0; i<mbs; i++) {
1293       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1294     }
1295 
1296     for (k = 0; k<mbs; k++){
1297       /*initialize k-th row with elements nonzero in row perm(k) of A */
1298       nz   = ai[k+1] - ai[k];
1299       acol = aj + ai[k];
1300       aval = aa + ai[k];
1301       bval = ba + bi[k];
1302       while (nz -- ){
1303         rtmp[*acol++] = *aval++;
1304         *bval++       = 0.0; /* for in-place factorization */
1305       }
1306 
1307       /* shift the diagonal of the matrix */
1308       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1309 
1310       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1311       dk = rtmp[k];
1312       i  = jl[k]; /* first row to be added to k_th row  */
1313 
1314       while (i < k){
1315         nexti = jl[i]; /* next row to be added to k_th row */
1316         /* compute multiplier, update D(k) and U(i,k) */
1317         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1318         uikdi = - ba[ili]*ba[bi[i]];
1319         dk   += uikdi*ba[ili];
1320         ba[ili] = uikdi; /* -U(i,k) */
1321 
1322         /* add multiple of row i to k-th row ... */
1323         jmin = ili + 1;
1324         nz   = bi[i+1] - jmin;
1325         if (nz > 0){
1326           bcol = bj + jmin;
1327           bval = ba + jmin;
1328           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1329           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1330 
1331           /* update il and jl for i-th row */
1332           il[i] = jmin;
1333           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1334         }
1335         i = nexti;
1336       }
1337 
1338       /* shift the diagonals when zero pivot is detected */
1339       /* compute rs=sum of abs(off-diagonal) */
1340       rs   = 0.0;
1341       jmin = bi[k]+1;
1342       nz   = bi[k+1] - jmin;
1343       if (nz){
1344         bcol = bj + jmin;
1345         while (nz--){
1346           rs += PetscAbsScalar(rtmp[*bcol]);
1347           bcol++;
1348         }
1349       }
1350 
1351       sctx.rs = rs;
1352       sctx.pv = dk;
1353       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
1354       if (newshift == 1) break;    /* sctx.shift_amount is updated */
1355 
1356       /* copy data into U(k,:) */
1357       ba[bi[k]] = 1.0/dk;
1358       jmin      = bi[k]+1;
1359       nz        = bi[k+1] - jmin;
1360       if (nz){
1361         bcol = bj + jmin;
1362         bval = ba + jmin;
1363         while (nz--){
1364           *bval++       = rtmp[*bcol];
1365           rtmp[*bcol++] = 0.0;
1366         }
1367         /* add k-th row into il and jl */
1368         il[k] = jmin;
1369         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1370       }
1371     } /* end of for (k = 0; k<mbs; k++) */
1372   } while (sctx.chshift);
1373   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1374   ierr = PetscFree(il);CHKERRQ(ierr);
1375 
1376   C->factor       = FACTOR_CHOLESKY;
1377   C->assembled    = PETSC_TRUE;
1378   C->preallocated = PETSC_TRUE;
1379   ierr = PetscLogFlops(C->rmap.N);CHKERRQ(ierr);
1380   if (sctx.nshift){
1381     if (shiftnz) {
1382       ierr = PetscInfo2(0,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1383     } else if (shiftpd) {
1384       ierr = PetscInfo2(0,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1385     }
1386   }
1387   PetscFunctionReturn(0);
1388 }
1389 
1390 #undef __FUNCT__
1391 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1392 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1393 {
1394   PetscErrorCode ierr;
1395   Mat            C;
1396 
1397   PetscFunctionBegin;
1398   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1399   ierr = MatCholeskyFactorNumeric(A,info,&C);CHKERRQ(ierr);
1400   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1401   PetscFunctionReturn(0);
1402 }
1403 
1404 
1405