xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 52eef5da67c97db1092c8b8396e40f88d178effc)
1 #define PETSCMAT_DLL
2 
3 #include "src/mat/impls/baij/seq/baij.h"
4 #include "src/mat/impls/sbaij/seq/sbaij.h"
5 #include "src/inline/ilu.h"
6 #include "include/petscis.h"
7 
8 #if !defined(PETSC_USE_COMPLEX)
9 /*
10   input:
11    F -- numeric factor
12   output:
13    nneg, nzero, npos: matrix inertia
14 */
15 
16 #undef __FUNCT__
17 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
18 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
19 {
20   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
21   PetscScalar  *dd = fact_ptr->a;
22   PetscInt     mbs=fact_ptr->mbs,bs=F->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
23 
24   PetscFunctionBegin;
25   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
26   nneig_tmp = 0; npos_tmp = 0;
27   for (i=0; i<mbs; i++){
28     if (PetscRealPart(dd[*fi]) > 0.0){
29       npos_tmp++;
30     } else if (PetscRealPart(dd[*fi]) < 0.0){
31       nneig_tmp++;
32     }
33     fi++;
34   }
35   if (nneig) *nneig = nneig_tmp;
36   if (npos)  *npos  = npos_tmp;
37   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
38 
39   PetscFunctionReturn(0);
40 }
41 #endif /* !defined(PETSC_USE_COMPLEX) */
42 
43 /*
44   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
45   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
46 */
47 #undef __FUNCT__
48 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
49 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
50 {
51   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
52   PetscErrorCode ierr;
53   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
54   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
55   PetscInt       m,reallocs = 0,prow;
56   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
57   PetscReal      f = info->fill;
58   PetscTruth     perm_identity;
59 
60   PetscFunctionBegin;
61   /* check whether perm is the identity mapping */
62   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
63   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
64 
65   if (perm_identity){ /* without permutation */
66     a->permute = PETSC_FALSE;
67     ai = a->i; aj = a->j;
68   } else {            /* non-trivial permutation */
69     a->permute = PETSC_TRUE;
70     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
71     ai = a->inew; aj = a->jnew;
72   }
73 
74   /* initialization */
75   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
76   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
77   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
78   iu[0] = mbs+1;
79   juidx = mbs + 1; /* index for ju */
80   ierr  = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */
81   q     = jl + mbs;   /* linked list for col index */
82   for (i=0; i<mbs; i++){
83     jl[i] = mbs;
84     q[i] = 0;
85   }
86 
87   /* for each row k */
88   for (k=0; k<mbs; k++){
89     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
90     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
91     q[k] = mbs;
92     /* initialize nonzero structure of k-th row to row rip[k] of A */
93     jmin = ai[rip[k]] +1; /* exclude diag[k] */
94     jmax = ai[rip[k]+1];
95     for (j=jmin; j<jmax; j++){
96       vj = rip[aj[j]]; /* col. value */
97       if(vj > k){
98         qm = k;
99         do {
100           m  = qm; qm = q[m];
101         } while(qm < vj);
102         if (qm == vj) {
103           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
104         }
105         nzk++;
106         q[m]  = vj;
107         q[vj] = qm;
108       } /* if(vj > k) */
109     } /* for (j=jmin; j<jmax; j++) */
110 
111     /* modify nonzero structure of k-th row by computing fill-in
112        for each row i to be merged in */
113     prow = k;
114     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
115 
116     while (prow < k){
117       /* merge row prow into k-th row */
118       jmin = iu[prow] + 1; jmax = iu[prow+1];
119       qm = k;
120       for (j=jmin; j<jmax; j++){
121         vj = ju[j];
122         do {
123           m = qm; qm = q[m];
124         } while (qm < vj);
125         if (qm != vj){
126          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
127         }
128       }
129       prow = jl[prow]; /* next pivot row */
130     }
131 
132     /* add k to row list for first nonzero element in k-th row */
133     if (nzk > 0){
134       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
135       jl[k] = jl[i]; jl[i] = k;
136     }
137     iu[k+1] = iu[k] + nzk;
138 
139     /* allocate more space to ju if needed */
140     if (iu[k+1] > umax) {
141       /* estimate how much additional space we will need */
142       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143       /* just double the memory each time */
144       maxadd = umax;
145       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
146       umax += maxadd;
147 
148       /* allocate a longer ju */
149       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
150       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
151       ierr = PetscFree(ju);CHKERRQ(ierr);
152       ju   = jutmp;
153       reallocs++; /* count how many times we realloc */
154     }
155 
156     /* save nonzero structure of k-th row in ju */
157     i=k;
158     while (nzk --) {
159       i           = q[i];
160       ju[juidx++] = i;
161     }
162   }
163 
164 #if defined(PETSC_USE_DEBUG)
165   if (ai[mbs] != 0) {
166     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
167     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af));CHKERRQ(ierr);
168     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af));CHKERRQ(ierr);
169     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af));CHKERRQ(ierr);
170     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"));CHKERRQ(ierr);
171   } else {
172     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"));CHKERRQ(ierr);
173   }
174 #endif
175 
176   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
177   ierr = PetscFree(jl);CHKERRQ(ierr);
178 
179   /* put together the new matrix */
180   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
181   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
182   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(*B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
183 
184   /* ierr = PetscLogObjectParent(*B,iperm);CHKERRQ(ierr); */
185   b = (Mat_SeqSBAIJ*)(*B)->data;
186   b->singlemalloc = PETSC_FALSE;
187   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
188   b->j    = ju;
189   b->i    = iu;
190   b->diag = 0;
191   b->ilen = 0;
192   b->imax = 0;
193   b->row  = perm;
194   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
195   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
196   b->icol = perm;
197   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
198   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
199   /* In b structure:  Free imax, ilen, old a, old j.
200      Allocate idnew, solve_work, new a, new j */
201   ierr = PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
202   b->maxnz = b->nz = iu[mbs];
203 
204   (*B)->factor                 = FACTOR_CHOLESKY;
205   (*B)->info.factor_mallocs    = reallocs;
206   (*B)->info.fill_ratio_given  = f;
207   if (ai[mbs] != 0) {
208     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
209   } else {
210     (*B)->info.fill_ratio_needed = 0.0;
211   }
212 
213   if (perm_identity){
214     switch (bs) {
215       case 1:
216         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
217         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
218         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"));CHKERRQ(ierr);
219         break;
220       case 2:
221         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
222         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
223         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n"));CHKERRQ(ierr);
224         break;
225       case 3:
226         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
227         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
228         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n"));CHKERRQ(ierr);
229         break;
230       case 4:
231         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
232         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
233         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n"));CHKERRQ(ierr);
234         break;
235       case 5:
236         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
237         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
238         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n"));CHKERRQ(ierr);
239         break;
240       case 6:
241         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
242         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
243         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n"));CHKERRQ(ierr);
244         break;
245       case 7:
246         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
247         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
248         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n"));CHKERRQ(ierr);
249       break;
250       default:
251         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
252         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
253         ierr = PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n"));CHKERRQ(ierr);
254       break;
255     }
256   }
257   PetscFunctionReturn(0);
258 }
259 /*
260     Symbolic U^T*D*U factorization for SBAIJ format.
261 */
262 #include "petscbt.h"
263 #include "src/mat/utils/freespace.h"
264 #undef __FUNCT__
265 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
266 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
267 {
268   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
269   Mat_SeqSBAIJ   *b;
270   Mat            B;
271   PetscErrorCode ierr;
272   PetscTruth     perm_identity;
273   PetscReal      fill = info->fill;
274   PetscInt       *rip,i,mbs=a->mbs,bs=A->bs,*ai,*aj,reallocs=0,prow;
275   PetscInt       *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
276   PetscInt       nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
277   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
278   PetscBT        lnkbt;
279 
280   PetscFunctionBegin;
281   /*
282    This code originally uses Modified Sparse Row (MSR) storage
283    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
284    Then it is rewritten so the factor B takes seqsbaij format. However the associated
285    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
286    thus the original code in MSR format is still used for these cases.
287    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
288    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
289   */
290   if (bs > 1){
291     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);CHKERRQ(ierr);
292     PetscFunctionReturn(0);
293   }
294 
295   /* check whether perm is the identity mapping */
296   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
297 
298   if (perm_identity){
299     a->permute = PETSC_FALSE;
300     ai = a->i; aj = a->j;
301   } else {
302     a->permute = PETSC_TRUE;
303     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
304     ai = a->inew; aj = a->jnew;
305   }
306   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
307 
308   /* initialization */
309   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
310   ui[0] = 0;
311 
312   /* jl: linked list for storing indices of the pivot rows
313      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
314   ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
315   il     = jl + mbs;
316   cols   = il + mbs;
317   ui_ptr = (PetscInt**)(cols + mbs);
318 
319   for (i=0; i<mbs; i++){
320     jl[i] = mbs; il[i] = 0;
321   }
322 
323   /* create and initialize a linked list for storing column indices of the active row k */
324   nlnk = mbs + 1;
325   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
326 
327   /* initial FreeSpace size is fill*(ai[mbs]+1) */
328   ierr = GetMoreSpace((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
329   current_space = free_space;
330 
331   for (k=0; k<mbs; k++){  /* for each active row k */
332     /* initialize lnk by the column indices of row rip[k] of A */
333     nzk   = 0;
334     ncols = ai[rip[k]+1] - ai[rip[k]];
335     for (j=0; j<ncols; j++){
336       i = *(aj + ai[rip[k]] + j);
337       cols[j] = rip[i];
338     }
339     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
340     nzk += nlnk;
341 
342     /* update lnk by computing fill-in for each pivot row to be merged in */
343     prow = jl[k]; /* 1st pivot row */
344 
345     while (prow < k){
346       nextprow = jl[prow];
347       /* merge prow into k-th row */
348       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
349       jmax = ui[prow+1];
350       ncols = jmax-jmin;
351       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
352       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
353       nzk += nlnk;
354 
355       /* update il and jl for prow */
356       if (jmin < jmax){
357         il[prow] = jmin;
358         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
359       }
360       prow = nextprow;
361     }
362 
363     /* if free space is not available, make more free space */
364     if (current_space->local_remaining<nzk) {
365       i = mbs - k + 1; /* num of unfactored rows */
366       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
367       ierr = GetMoreSpace(i,&current_space);CHKERRQ(ierr);
368       reallocs++;
369     }
370 
371     /* copy data into free space, then initialize lnk */
372     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
373 
374     /* add the k-th row into il and jl */
375     if (nzk-1 > 0){
376       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
377       jl[k] = jl[i]; jl[i] = k;
378       il[k] = ui[k] + 1;
379     }
380     ui_ptr[k] = current_space->array;
381     current_space->array           += nzk;
382     current_space->local_used      += nzk;
383     current_space->local_remaining -= nzk;
384 
385     ui[k+1] = ui[k] + nzk;
386   }
387 
388 #if defined(PETSC_USE_DEBUG)
389   if (ai[mbs] != 0) {
390     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
391     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,fill,af));CHKERRQ(ierr);
392     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af));CHKERRQ(ierr);
393     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g) for best performance.\n",af));CHKERRQ(ierr);
394   } else {
395     ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"));CHKERRQ(ierr);
396   }
397 #endif
398 
399   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
400   ierr = PetscFree(jl);CHKERRQ(ierr);
401 
402   /* destroy list of free space and other temporary array(s) */
403   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
404   ierr = MakeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
405   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
406 
407   /* put together the new matrix in MATSEQSBAIJ format */
408   ierr = MatCreate(PETSC_COMM_SELF,mbs,mbs,mbs,mbs,fact);CHKERRQ(ierr);
409   B    = *fact;
410   ierr = MatSetType(B,MATSEQSBAIJ);CHKERRQ(ierr);
411   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
412 
413   b = (Mat_SeqSBAIJ*)B->data;
414   b->singlemalloc = PETSC_FALSE;
415   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
416   b->j    = uj;
417   b->i    = ui;
418   b->diag = 0;
419   b->ilen = 0;
420   b->imax = 0;
421   b->row  = perm;
422   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
423   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
424   b->icol = perm;
425   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
426   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
427   ierr    = PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
428   b->maxnz = b->nz = ui[mbs];
429 
430   B->factor                 = FACTOR_CHOLESKY;
431   B->info.factor_mallocs    = reallocs;
432   B->info.fill_ratio_given  = fill;
433   if (ai[mbs] != 0) {
434     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
435   } else {
436     B->info.fill_ratio_needed = 0.0;
437   }
438 
439   if (perm_identity){
440     switch (bs) {
441       case 1:
442         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
443         B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
444         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"));CHKERRQ(ierr);
445         break;
446       case 2:
447         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
448         B->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
449         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n"));CHKERRQ(ierr);
450         break;
451       case 3:
452         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
453         B->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
454         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n"));CHKERRQ(ierr);
455         break;
456       case 4:
457         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
458         B->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
459         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n"));CHKERRQ(ierr);
460         break;
461       case 5:
462         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
463         B->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
464         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n"));CHKERRQ(ierr);
465         break;
466       case 6:
467         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
468         B->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
469         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n"));CHKERRQ(ierr);
470         break;
471       case 7:
472         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
473         B->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
474         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n"));CHKERRQ(ierr);
475       break;
476       default:
477         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
478         B->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
479         ierr = PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n"));CHKERRQ(ierr);
480       break;
481     }
482   }
483   PetscFunctionReturn(0);
484 }
485 #undef __FUNCT__
486 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
487 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,MatFactorInfo *info,Mat *B)
488 {
489   Mat            C = *B;
490   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
491   IS             perm = b->row;
492   PetscErrorCode ierr;
493   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
494   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
495   PetscInt       bs=A->bs,bs2 = a->bs2;
496   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
497   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
498   MatScalar      *work;
499   PetscInt       *pivots;
500 
501   PetscFunctionBegin;
502   /* initialization */
503   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
504   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
505   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
506   jl   = il + mbs;
507   for (i=0; i<mbs; i++) {
508     jl[i] = mbs; il[0] = 0;
509   }
510   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
511   uik  = dk + bs2;
512   work = uik + bs2;
513   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
514 
515   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
516 
517   /* check permutation */
518   if (!a->permute){
519     ai = a->i; aj = a->j; aa = a->a;
520   } else {
521     ai   = a->inew; aj = a->jnew;
522     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
523     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
524     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
525     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
526 
527     for (i=0; i<mbs; i++){
528       jmin = ai[i]; jmax = ai[i+1];
529       for (j=jmin; j<jmax; j++){
530         while (a2anew[j] != j){
531           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
532           for (k1=0; k1<bs2; k1++){
533             dk[k1]       = aa[k*bs2+k1];
534             aa[k*bs2+k1] = aa[j*bs2+k1];
535             aa[j*bs2+k1] = dk[k1];
536           }
537         }
538         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
539         if (i > aj[j]){
540           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
541           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
542           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
543           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
544             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
545           }
546         }
547       }
548     }
549     ierr = PetscFree(a2anew);CHKERRQ(ierr);
550   }
551 
552   /* for each row k */
553   for (k = 0; k<mbs; k++){
554 
555     /*initialize k-th row with elements nonzero in row perm(k) of A */
556     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
557 
558     ap = aa + jmin*bs2;
559     for (j = jmin; j < jmax; j++){
560       vj = perm_ptr[aj[j]];         /* block col. index */
561       rtmp_ptr = rtmp + vj*bs2;
562       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
563     }
564 
565     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
566     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
567     i = jl[k]; /* first row to be added to k_th row  */
568 
569     while (i < k){
570       nexti = jl[i]; /* next row to be added to k_th row */
571 
572       /* compute multiplier */
573       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
574 
575       /* uik = -inv(Di)*U_bar(i,k) */
576       diag = ba + i*bs2;
577       u    = ba + ili*bs2;
578       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
579       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
580 
581       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
582       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
583 
584       /* update -U(i,k) */
585       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
586 
587       /* add multiple of row i to k-th row ... */
588       jmin = ili + 1; jmax = bi[i+1];
589       if (jmin < jmax){
590         for (j=jmin; j<jmax; j++) {
591           /* rtmp += -U(i,k)^T * U_bar(i,j) */
592           rtmp_ptr = rtmp + bj[j]*bs2;
593           u = ba + j*bs2;
594           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
595         }
596 
597         /* ... add i to row list for next nonzero entry */
598         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
599         j     = bj[jmin];
600         jl[i] = jl[j]; jl[j] = i; /* update jl */
601       }
602       i = nexti;
603     }
604 
605     /* save nonzero entries in k-th row of U ... */
606 
607     /* invert diagonal block */
608     diag = ba+k*bs2;
609     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
610     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
611 
612     jmin = bi[k]; jmax = bi[k+1];
613     if (jmin < jmax) {
614       for (j=jmin; j<jmax; j++){
615          vj = bj[j];           /* block col. index of U */
616          u   = ba + j*bs2;
617          rtmp_ptr = rtmp + vj*bs2;
618          for (k1=0; k1<bs2; k1++){
619            *u++        = *rtmp_ptr;
620            *rtmp_ptr++ = 0.0;
621          }
622       }
623 
624       /* ... add k to row list for first nonzero entry in k-th row */
625       il[k] = jmin;
626       i     = bj[jmin];
627       jl[k] = jl[i]; jl[i] = k;
628     }
629   }
630 
631   ierr = PetscFree(rtmp);CHKERRQ(ierr);
632   ierr = PetscFree(il);CHKERRQ(ierr);
633   ierr = PetscFree(dk);CHKERRQ(ierr);
634   ierr = PetscFree(pivots);CHKERRQ(ierr);
635   if (a->permute){
636     ierr = PetscFree(aa);CHKERRQ(ierr);
637   }
638 
639   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
640   C->factor       = FACTOR_CHOLESKY;
641   C->assembled    = PETSC_TRUE;
642   C->preallocated = PETSC_TRUE;
643   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
644   PetscFunctionReturn(0);
645 }
646 
647 #undef __FUNCT__
648 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
649 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
650 {
651   Mat            C = *B;
652   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
653   PetscErrorCode ierr;
654   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
655   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
656   PetscInt       bs=A->bs,bs2 = a->bs2;
657   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
658   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
659   MatScalar      *work;
660   PetscInt       *pivots;
661 
662   PetscFunctionBegin;
663   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
664   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
665   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
666   jl   = il + mbs;
667   for (i=0; i<mbs; i++) {
668     jl[i] = mbs; il[0] = 0;
669   }
670   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
671   uik  = dk + bs2;
672   work = uik + bs2;
673   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
674 
675   ai = a->i; aj = a->j; aa = a->a;
676 
677   /* for each row k */
678   for (k = 0; k<mbs; k++){
679 
680     /*initialize k-th row with elements nonzero in row k of A */
681     jmin = ai[k]; jmax = ai[k+1];
682     ap = aa + jmin*bs2;
683     for (j = jmin; j < jmax; j++){
684       vj = aj[j];         /* block col. index */
685       rtmp_ptr = rtmp + vj*bs2;
686       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
687     }
688 
689     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
690     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
691     i = jl[k]; /* first row to be added to k_th row  */
692 
693     while (i < k){
694       nexti = jl[i]; /* next row to be added to k_th row */
695 
696       /* compute multiplier */
697       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
698 
699       /* uik = -inv(Di)*U_bar(i,k) */
700       diag = ba + i*bs2;
701       u    = ba + ili*bs2;
702       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
703       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
704 
705       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
706       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
707 
708       /* update -U(i,k) */
709       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
710 
711       /* add multiple of row i to k-th row ... */
712       jmin = ili + 1; jmax = bi[i+1];
713       if (jmin < jmax){
714         for (j=jmin; j<jmax; j++) {
715           /* rtmp += -U(i,k)^T * U_bar(i,j) */
716           rtmp_ptr = rtmp + bj[j]*bs2;
717           u = ba + j*bs2;
718           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
719         }
720 
721         /* ... add i to row list for next nonzero entry */
722         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
723         j     = bj[jmin];
724         jl[i] = jl[j]; jl[j] = i; /* update jl */
725       }
726       i = nexti;
727     }
728 
729     /* save nonzero entries in k-th row of U ... */
730 
731     /* invert diagonal block */
732     diag = ba+k*bs2;
733     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
734     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
735 
736     jmin = bi[k]; jmax = bi[k+1];
737     if (jmin < jmax) {
738       for (j=jmin; j<jmax; j++){
739          vj = bj[j];           /* block col. index of U */
740          u   = ba + j*bs2;
741          rtmp_ptr = rtmp + vj*bs2;
742          for (k1=0; k1<bs2; k1++){
743            *u++        = *rtmp_ptr;
744            *rtmp_ptr++ = 0.0;
745          }
746       }
747 
748       /* ... add k to row list for first nonzero entry in k-th row */
749       il[k] = jmin;
750       i     = bj[jmin];
751       jl[k] = jl[i]; jl[i] = k;
752     }
753   }
754 
755   ierr = PetscFree(rtmp);CHKERRQ(ierr);
756   ierr = PetscFree(il);CHKERRQ(ierr);
757   ierr = PetscFree(dk);CHKERRQ(ierr);
758   ierr = PetscFree(pivots);CHKERRQ(ierr);
759 
760   C->factor    = FACTOR_CHOLESKY;
761   C->assembled = PETSC_TRUE;
762   C->preallocated = PETSC_TRUE;
763   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
764   PetscFunctionReturn(0);
765 }
766 
767 /*
768     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
769     Version for blocks 2 by 2.
770 */
771 #undef __FUNCT__
772 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
773 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,MatFactorInfo *info,Mat *B)
774 {
775   Mat            C = *B;
776   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
777   IS             perm = b->row;
778   PetscErrorCode ierr;
779   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
780   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
781   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
782   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
783 
784   PetscFunctionBegin;
785   /* initialization */
786   /* il and jl record the first nonzero element in each row of the accessing
787      window U(0:k, k:mbs-1).
788      jl:    list of rows to be added to uneliminated rows
789             i>= k: jl(i) is the first row to be added to row i
790             i<  k: jl(i) is the row following row i in some list of rows
791             jl(i) = mbs indicates the end of a list
792      il(i): points to the first nonzero element in columns k,...,mbs-1 of
793             row i of U */
794   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
795   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
796   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
797   jl   = il + mbs;
798   for (i=0; i<mbs; i++) {
799     jl[i] = mbs; il[0] = 0;
800   }
801   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
802   uik  = dk + 4;
803   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
804 
805   /* check permutation */
806   if (!a->permute){
807     ai = a->i; aj = a->j; aa = a->a;
808   } else {
809     ai   = a->inew; aj = a->jnew;
810     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
811     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
812     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
813     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
814 
815     for (i=0; i<mbs; i++){
816       jmin = ai[i]; jmax = ai[i+1];
817       for (j=jmin; j<jmax; j++){
818         while (a2anew[j] != j){
819           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
820           for (k1=0; k1<4; k1++){
821             dk[k1]       = aa[k*4+k1];
822             aa[k*4+k1] = aa[j*4+k1];
823             aa[j*4+k1] = dk[k1];
824           }
825         }
826         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
827         if (i > aj[j]){
828           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
829           ap = aa + j*4;     /* ptr to the beginning of the block */
830           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
831           ap[1] = ap[2];
832           ap[2] = dk[1];
833         }
834       }
835     }
836     ierr = PetscFree(a2anew);CHKERRQ(ierr);
837   }
838 
839   /* for each row k */
840   for (k = 0; k<mbs; k++){
841 
842     /*initialize k-th row with elements nonzero in row perm(k) of A */
843     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
844     ap = aa + jmin*4;
845     for (j = jmin; j < jmax; j++){
846       vj = perm_ptr[aj[j]];         /* block col. index */
847       rtmp_ptr = rtmp + vj*4;
848       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
849     }
850 
851     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
852     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
853     i = jl[k]; /* first row to be added to k_th row  */
854 
855     while (i < k){
856       nexti = jl[i]; /* next row to be added to k_th row */
857 
858       /* compute multiplier */
859       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
860 
861       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
862       diag = ba + i*4;
863       u    = ba + ili*4;
864       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
865       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
866       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
867       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
868 
869       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
870       dk[0] += uik[0]*u[0] + uik[1]*u[1];
871       dk[1] += uik[2]*u[0] + uik[3]*u[1];
872       dk[2] += uik[0]*u[2] + uik[1]*u[3];
873       dk[3] += uik[2]*u[2] + uik[3]*u[3];
874 
875       /* update -U(i,k): ba[ili] = uik */
876       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
877 
878       /* add multiple of row i to k-th row ... */
879       jmin = ili + 1; jmax = bi[i+1];
880       if (jmin < jmax){
881         for (j=jmin; j<jmax; j++) {
882           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
883           rtmp_ptr = rtmp + bj[j]*4;
884           u = ba + j*4;
885           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
886           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
887           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
888           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
889         }
890 
891         /* ... add i to row list for next nonzero entry */
892         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
893         j     = bj[jmin];
894         jl[i] = jl[j]; jl[j] = i; /* update jl */
895       }
896       i = nexti;
897     }
898 
899     /* save nonzero entries in k-th row of U ... */
900 
901     /* invert diagonal block */
902     diag = ba+k*4;
903     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
904     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
905 
906     jmin = bi[k]; jmax = bi[k+1];
907     if (jmin < jmax) {
908       for (j=jmin; j<jmax; j++){
909          vj = bj[j];           /* block col. index of U */
910          u   = ba + j*4;
911          rtmp_ptr = rtmp + vj*4;
912          for (k1=0; k1<4; k1++){
913            *u++        = *rtmp_ptr;
914            *rtmp_ptr++ = 0.0;
915          }
916       }
917 
918       /* ... add k to row list for first nonzero entry in k-th row */
919       il[k] = jmin;
920       i     = bj[jmin];
921       jl[k] = jl[i]; jl[i] = k;
922     }
923   }
924 
925   ierr = PetscFree(rtmp);CHKERRQ(ierr);
926   ierr = PetscFree(il);CHKERRQ(ierr);
927   ierr = PetscFree(dk);CHKERRQ(ierr);
928   if (a->permute) {
929     ierr = PetscFree(aa);CHKERRQ(ierr);
930   }
931   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
932   C->factor    = FACTOR_CHOLESKY;
933   C->assembled = PETSC_TRUE;
934   C->preallocated = PETSC_TRUE;
935   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
936   PetscFunctionReturn(0);
937 }
938 
939 /*
940       Version for when blocks are 2 by 2 Using natural ordering
941 */
942 #undef __FUNCT__
943 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
944 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
945 {
946   Mat            C = *B;
947   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
948   PetscErrorCode ierr;
949   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
950   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
951   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
952   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
953 
954   PetscFunctionBegin;
955   /* initialization */
956   /* il and jl record the first nonzero element in each row of the accessing
957      window U(0:k, k:mbs-1).
958      jl:    list of rows to be added to uneliminated rows
959             i>= k: jl(i) is the first row to be added to row i
960             i<  k: jl(i) is the row following row i in some list of rows
961             jl(i) = mbs indicates the end of a list
962      il(i): points to the first nonzero element in columns k,...,mbs-1 of
963             row i of U */
964   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
965   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
966   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
967   jl   = il + mbs;
968   for (i=0; i<mbs; i++) {
969     jl[i] = mbs; il[0] = 0;
970   }
971   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
972   uik  = dk + 4;
973 
974   ai = a->i; aj = a->j; aa = a->a;
975 
976   /* for each row k */
977   for (k = 0; k<mbs; k++){
978 
979     /*initialize k-th row with elements nonzero in row k of A */
980     jmin = ai[k]; jmax = ai[k+1];
981     ap = aa + jmin*4;
982     for (j = jmin; j < jmax; j++){
983       vj = aj[j];         /* block col. index */
984       rtmp_ptr = rtmp + vj*4;
985       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
986     }
987 
988     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
989     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
990     i = jl[k]; /* first row to be added to k_th row  */
991 
992     while (i < k){
993       nexti = jl[i]; /* next row to be added to k_th row */
994 
995       /* compute multiplier */
996       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
997 
998       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
999       diag = ba + i*4;
1000       u    = ba + ili*4;
1001       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1002       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1003       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1004       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1005 
1006       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1007       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1008       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1009       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1010       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1011 
1012       /* update -U(i,k): ba[ili] = uik */
1013       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1014 
1015       /* add multiple of row i to k-th row ... */
1016       jmin = ili + 1; jmax = bi[i+1];
1017       if (jmin < jmax){
1018         for (j=jmin; j<jmax; j++) {
1019           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1020           rtmp_ptr = rtmp + bj[j]*4;
1021           u = ba + j*4;
1022           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1023           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1024           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1025           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1026         }
1027 
1028         /* ... add i to row list for next nonzero entry */
1029         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1030         j     = bj[jmin];
1031         jl[i] = jl[j]; jl[j] = i; /* update jl */
1032       }
1033       i = nexti;
1034     }
1035 
1036     /* save nonzero entries in k-th row of U ... */
1037 
1038     /* invert diagonal block */
1039     diag = ba+k*4;
1040     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1041     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1042 
1043     jmin = bi[k]; jmax = bi[k+1];
1044     if (jmin < jmax) {
1045       for (j=jmin; j<jmax; j++){
1046          vj = bj[j];           /* block col. index of U */
1047          u   = ba + j*4;
1048          rtmp_ptr = rtmp + vj*4;
1049          for (k1=0; k1<4; k1++){
1050            *u++        = *rtmp_ptr;
1051            *rtmp_ptr++ = 0.0;
1052          }
1053       }
1054 
1055       /* ... add k to row list for first nonzero entry in k-th row */
1056       il[k] = jmin;
1057       i     = bj[jmin];
1058       jl[k] = jl[i]; jl[i] = k;
1059     }
1060   }
1061 
1062   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1063   ierr = PetscFree(il);CHKERRQ(ierr);
1064   ierr = PetscFree(dk);CHKERRQ(ierr);
1065 
1066   C->factor    = FACTOR_CHOLESKY;
1067   C->assembled = PETSC_TRUE;
1068   C->preallocated = PETSC_TRUE;
1069   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1070   PetscFunctionReturn(0);
1071 }
1072 
1073 /*
1074     Numeric U^T*D*U factorization for SBAIJ format.
1075     Version for blocks are 1 by 1.
1076 */
1077 #undef __FUNCT__
1078 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1079 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,MatFactorInfo *info,Mat *B)
1080 {
1081   Mat            C = *B;
1082   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1083   IS             ip=b->row;
1084   PetscErrorCode ierr;
1085   PetscInt       *rip,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1086   PetscInt       *ai,*aj,*a2anew;
1087   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1088   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1089   PetscReal      zeropivot,rs,shiftnz;
1090   PetscTruth     shiftpd;
1091   ChShift_Ctx    sctx;
1092   PetscInt       newshift;
1093 
1094   PetscFunctionBegin;
1095   /* initialization */
1096   shiftnz   = info->shiftnz;
1097   shiftpd   = info->shiftpd;
1098   zeropivot = info->zeropivot;
1099 
1100   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1101   if (!a->permute){
1102     ai = a->i; aj = a->j; aa = a->a;
1103   } else {
1104     ai = a->inew; aj = a->jnew;
1105     nz = ai[mbs];
1106     ierr = PetscMalloc(nz*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1107     a2anew = a->a2anew;
1108     bval   = a->a;
1109     for (j=0; j<nz; j++){
1110       aa[a2anew[j]] = *(bval++);
1111     }
1112   }
1113 
1114   /* initialization */
1115   /* il and jl record the first nonzero element in each row of the accessing
1116      window U(0:k, k:mbs-1).
1117      jl:    list of rows to be added to uneliminated rows
1118             i>= k: jl(i) is the first row to be added to row i
1119             i<  k: jl(i) is the row following row i in some list of rows
1120             jl(i) = mbs indicates the end of a list
1121      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1122             row i of U */
1123   nz   = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
1124   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
1125   jl   = il + mbs;
1126   rtmp = (MatScalar*)(jl + mbs);
1127 
1128   sctx.shift_amount = 0;
1129   sctx.nshift       = 0;
1130   do {
1131     sctx.chshift = PETSC_FALSE;
1132     for (i=0; i<mbs; i++) {
1133       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1134     }
1135 
1136     for (k = 0; k<mbs; k++){
1137       /*initialize k-th row by the perm[k]-th row of A */
1138       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1139       bval = ba + bi[k];
1140       for (j = jmin; j < jmax; j++){
1141         col = rip[aj[j]];
1142         rtmp[col] = aa[j];
1143         *bval++  = 0.0; /* for in-place factorization */
1144       }
1145 
1146       /* shift the diagonal of the matrix */
1147       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1148 
1149       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1150       dk = rtmp[k];
1151       i = jl[k]; /* first row to be added to k_th row  */
1152 
1153       while (i < k){
1154         nexti = jl[i]; /* next row to be added to k_th row */
1155 
1156         /* compute multiplier, update diag(k) and U(i,k) */
1157         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1158         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
1159         dk += uikdi*ba[ili];
1160         ba[ili] = uikdi; /* -U(i,k) */
1161 
1162         /* add multiple of row i to k-th row */
1163         jmin = ili + 1; jmax = bi[i+1];
1164         if (jmin < jmax){
1165           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1166           /* update il and jl for row i */
1167           il[i] = jmin;
1168           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1169         }
1170         i = nexti;
1171       }
1172 
1173       /* shift the diagonals when zero pivot is detected */
1174       /* compute rs=sum of abs(off-diagonal) */
1175       rs   = 0.0;
1176       jmin = bi[k]+1;
1177       nz   = bi[k+1] - jmin;
1178       if (nz){
1179         bcol = bj + jmin;
1180         while (nz--){
1181           rs += PetscAbsScalar(rtmp[*bcol]);
1182           bcol++;
1183         }
1184       }
1185 
1186       sctx.rs = rs;
1187       sctx.pv = dk;
1188       ierr = MatCholeskyCheckShift_inline(info,sctx,newshift);CHKERRQ(ierr);
1189       if (newshift == 1){
1190         break;    /* sctx.shift_amount is updated */
1191       } else if (newshift == -1){
1192         SETERRQ4(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g",k,PetscAbsScalar(dk),zeropivot,rs);
1193       }
1194 
1195       /* copy data into U(k,:) */
1196       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1197       jmin = bi[k]+1; jmax = bi[k+1];
1198       if (jmin < jmax) {
1199         for (j=jmin; j<jmax; j++){
1200           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1201         }
1202         /* add the k-th row into il and jl */
1203         il[k] = jmin;
1204         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1205       }
1206     }
1207   } while (sctx.chshift);
1208   ierr = PetscFree(il);CHKERRQ(ierr);
1209   if (a->permute){ierr = PetscFree(aa);CHKERRQ(ierr);}
1210 
1211   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1212   C->factor       = FACTOR_CHOLESKY;
1213   C->assembled    = PETSC_TRUE;
1214   C->preallocated = PETSC_TRUE;
1215   ierr = PetscLogFlops(C->m);CHKERRQ(ierr);
1216     if (sctx.nshift){
1217     if (shiftnz) {
1218       ierr = PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1: number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));CHKERRQ(ierr);
1219     } else if (shiftpd) {
1220       ierr = PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1: number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));CHKERRQ(ierr);
1221     }
1222   }
1223   PetscFunctionReturn(0);
1224 }
1225 
1226 /*
1227   Version for when blocks are 1 by 1 Using natural ordering
1228 */
1229 #undef __FUNCT__
1230 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1231 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
1232 {
1233   Mat            C = *B;
1234   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1235   PetscErrorCode ierr;
1236   PetscInt       i,j,mbs = a->mbs;
1237   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1238   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1239   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1240   PetscReal      zeropivot,rs,shiftnz;
1241   PetscTruth     shiftpd;
1242   ChShift_Ctx    sctx;
1243   PetscInt       newshift;
1244 
1245   PetscFunctionBegin;
1246   /* initialization */
1247   shiftnz   = info->shiftnz;
1248   shiftpd   = info->shiftpd;
1249   zeropivot = info->zeropivot;
1250 
1251   /* il and jl record the first nonzero element in each row of the accessing
1252      window U(0:k, k:mbs-1).
1253      jl:    list of rows to be added to uneliminated rows
1254             i>= k: jl(i) is the first row to be added to row i
1255             i<  k: jl(i) is the row following row i in some list of rows
1256             jl(i) = mbs indicates the end of a list
1257      il(i): points to the first nonzero element in U(i,k:mbs-1)
1258   */
1259   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1260   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1261   jl   = il + mbs;
1262 
1263   sctx.shift_amount = 0;
1264   sctx.nshift       = 0;
1265   do {
1266     sctx.chshift = PETSC_FALSE;
1267     for (i=0; i<mbs; i++) {
1268       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1269     }
1270 
1271     for (k = 0; k<mbs; k++){
1272       /*initialize k-th row with elements nonzero in row perm(k) of A */
1273       nz   = ai[k+1] - ai[k];
1274       acol = aj + ai[k];
1275       aval = aa + ai[k];
1276       bval = ba + bi[k];
1277       while (nz -- ){
1278         rtmp[*acol++] = *aval++;
1279         *bval++       = 0.0; /* for in-place factorization */
1280       }
1281 
1282       /* shift the diagonal of the matrix */
1283       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1284 
1285       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1286       dk = rtmp[k];
1287       i  = jl[k]; /* first row to be added to k_th row  */
1288 
1289       while (i < k){
1290         nexti = jl[i]; /* next row to be added to k_th row */
1291         /* compute multiplier, update D(k) and U(i,k) */
1292         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1293         uikdi = - ba[ili]*ba[bi[i]];
1294         dk   += uikdi*ba[ili];
1295         ba[ili] = uikdi; /* -U(i,k) */
1296 
1297         /* add multiple of row i to k-th row ... */
1298         jmin = ili + 1;
1299         nz   = bi[i+1] - jmin;
1300         if (nz > 0){
1301           bcol = bj + jmin;
1302           bval = ba + jmin;
1303           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1304           /* update il and jl for i-th row */
1305           il[i] = jmin;
1306           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1307         }
1308         i = nexti;
1309       }
1310 
1311       /* shift the diagonals when zero pivot is detected */
1312       /* compute rs=sum of abs(off-diagonal) */
1313       rs   = 0.0;
1314       jmin = bi[k]+1;
1315       nz   = bi[k+1] - jmin;
1316       if (nz){
1317         bcol = bj + jmin;
1318         while (nz--){
1319           rs += PetscAbsScalar(rtmp[*bcol]);
1320           bcol++;
1321         }
1322       }
1323 
1324       sctx.rs = rs;
1325       sctx.pv = dk;
1326       ierr = MatCholeskyCheckShift_inline(info,sctx,newshift);CHKERRQ(ierr);
1327       if (newshift == 1){
1328         break;    /* sctx.shift_amount is updated */
1329       } else if (newshift == -1){
1330         SETERRQ4(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g",k,PetscAbsScalar(dk),zeropivot,rs);
1331       }
1332 
1333       /* copy data into U(k,:) */
1334       ba[bi[k]] = 1.0/dk;
1335       jmin      = bi[k]+1;
1336       nz        = bi[k+1] - jmin;
1337       if (nz){
1338         bcol = bj + jmin;
1339         bval = ba + jmin;
1340         while (nz--){
1341           *bval++       = rtmp[*bcol];
1342           rtmp[*bcol++] = 0.0;
1343         }
1344         /* add k-th row into il and jl */
1345         il[k] = jmin;
1346         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1347       }
1348     } /* end of for (k = 0; k<mbs; k++) */
1349   } while (sctx.chshift);
1350   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1351   ierr = PetscFree(il);CHKERRQ(ierr);
1352 
1353   C->factor       = FACTOR_CHOLESKY;
1354   C->assembled    = PETSC_TRUE;
1355   C->preallocated = PETSC_TRUE;
1356   ierr = PetscLogFlops(C->m);CHKERRQ(ierr);
1357   if (sctx.nshift){
1358     if (shiftnz) {
1359       ierr = PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering: number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));CHKERRQ(ierr);
1360     } else if (shiftpd) {
1361       ierr = PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering: number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));CHKERRQ(ierr);
1362     }
1363   }
1364   PetscFunctionReturn(0);
1365 }
1366 
1367 #undef __FUNCT__
1368 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1369 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1370 {
1371   PetscErrorCode ierr;
1372   Mat            C;
1373 
1374   PetscFunctionBegin;
1375   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1376   ierr = MatCholeskyFactorNumeric(A,info,&C);CHKERRQ(ierr);
1377   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 
1382