xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 98a8e78ddbb85805b39c4372d217e786d117606d)
1 
2 #include "src/mat/impls/baij/seq/baij.h"
3 #include "src/mat/impls/sbaij/seq/sbaij.h"
4 #include "src/inline/ilu.h"
5 #include "include/petscis.h"
6 
7 #if !defined(PETSC_USE_COMPLEX)
8 /*
9   input:
10    F -- numeric factor
11   output:
12    nneg, nzero, npos: matrix inertia
13 */
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
17 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
18 {
19   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
20   PetscScalar  *dd = fact_ptr->a;
21   PetscInt     mbs=fact_ptr->mbs,bs=F->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
22 
23   PetscFunctionBegin;
24   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
25   nneig_tmp = 0; npos_tmp = 0;
26   for (i=0; i<mbs; i++){
27     if (PetscRealPart(dd[*fi]) > 0.0){
28       npos_tmp++;
29     } else if (PetscRealPart(dd[*fi]) < 0.0){
30       nneig_tmp++;
31     }
32     fi++;
33   }
34   if (nneig) *nneig = nneig_tmp;
35   if (npos)  *npos  = npos_tmp;
36   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
37 
38   PetscFunctionReturn(0);
39 }
40 #endif /* !defined(PETSC_USE_COMPLEX) */
41 
42 /*
43   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
44   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
45 */
46 #undef __FUNCT__
47 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
48 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
49 {
50   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
51   PetscErrorCode ierr;
52   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
53   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
54   PetscInt       m,reallocs = 0,prow;
55   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
56   PetscReal      f = info->fill;
57   PetscTruth     perm_identity;
58 
59   PetscFunctionBegin;
60   /* check whether perm is the identity mapping */
61   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
62   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
63 
64   if (perm_identity){ /* without permutation */
65     a->permute = PETSC_FALSE;
66     ai = a->i; aj = a->j;
67   } else {            /* non-trivial permutation */
68     a->permute = PETSC_TRUE;
69     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
70     ai = a->inew; aj = a->jnew;
71   }
72 
73   /* initialization */
74   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
75   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
76   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
77   iu[0] = mbs+1;
78   juidx = mbs + 1; /* index for ju */
79   ierr  = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */
80   q     = jl + mbs;   /* linked list for col index */
81   for (i=0; i<mbs; i++){
82     jl[i] = mbs;
83     q[i] = 0;
84   }
85 
86   /* for each row k */
87   for (k=0; k<mbs; k++){
88     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
89     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
90     q[k] = mbs;
91     /* initialize nonzero structure of k-th row to row rip[k] of A */
92     jmin = ai[rip[k]] +1; /* exclude diag[k] */
93     jmax = ai[rip[k]+1];
94     for (j=jmin; j<jmax; j++){
95       vj = rip[aj[j]]; /* col. value */
96       if(vj > k){
97         qm = k;
98         do {
99           m  = qm; qm = q[m];
100         } while(qm < vj);
101         if (qm == vj) {
102           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
103         }
104         nzk++;
105         q[m]  = vj;
106         q[vj] = qm;
107       } /* if(vj > k) */
108     } /* for (j=jmin; j<jmax; j++) */
109 
110     /* modify nonzero structure of k-th row by computing fill-in
111        for each row i to be merged in */
112     prow = k;
113     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
114 
115     while (prow < k){
116       /* merge row prow into k-th row */
117       jmin = iu[prow] + 1; jmax = iu[prow+1];
118       qm = k;
119       for (j=jmin; j<jmax; j++){
120         vj = ju[j];
121         do {
122           m = qm; qm = q[m];
123         } while (qm < vj);
124         if (qm != vj){
125          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
126         }
127       }
128       prow = jl[prow]; /* next pivot row */
129     }
130 
131     /* add k to row list for first nonzero element in k-th row */
132     if (nzk > 0){
133       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134       jl[k] = jl[i]; jl[i] = k;
135     }
136     iu[k+1] = iu[k] + nzk;
137 
138     /* allocate more space to ju if needed */
139     if (iu[k+1] > umax) {
140       /* estimate how much additional space we will need */
141       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
142       /* just double the memory each time */
143       maxadd = umax;
144       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
145       umax += maxadd;
146 
147       /* allocate a longer ju */
148       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
149       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
150       ierr = PetscFree(ju);CHKERRQ(ierr);
151       ju   = jutmp;
152       reallocs++; /* count how many times we realloc */
153     }
154 
155     /* save nonzero structure of k-th row in ju */
156     i=k;
157     while (nzk --) {
158       i           = q[i];
159       ju[juidx++] = i;
160     }
161   }
162 
163   if (ai[mbs] != 0) {
164     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
165     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
166     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
167     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
168     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
169   } else {
170      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
171   }
172 
173   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
174   ierr = PetscFree(jl);CHKERRQ(ierr);
175 
176   /* put together the new matrix */
177   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
178   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
179   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
180 
181   /* PetscLogObjectParent(*B,iperm); */
182   b = (Mat_SeqSBAIJ*)(*B)->data;
183   ierr = PetscFree(b->imax);CHKERRQ(ierr);
184   b->singlemalloc = PETSC_FALSE;
185   /* the next line frees the default space generated by the Create() */
186   ierr = PetscFree(b->a);CHKERRQ(ierr);
187   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
188   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
189   b->j    = ju;
190   b->i    = iu;
191   b->diag = 0;
192   b->ilen = 0;
193   b->imax = 0;
194   b->row  = perm;
195   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
196   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
197   b->icol = perm;
198   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
199   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
200   /* In b structure:  Free imax, ilen, old a, old j.
201      Allocate idnew, solve_work, new a, new j */
202   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
203   b->maxnz = b->nz = iu[mbs];
204 
205   (*B)->factor                 = FACTOR_CHOLESKY;
206   (*B)->info.factor_mallocs    = reallocs;
207   (*B)->info.fill_ratio_given  = f;
208   if (ai[mbs] != 0) {
209     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
210   } else {
211     (*B)->info.fill_ratio_needed = 0.0;
212   }
213 
214   if (perm_identity){
215     switch (bs) {
216       case 1:
217         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
218         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
219         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
220         break;
221       case 2:
222         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
223         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
224         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
225         break;
226       case 3:
227         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
228         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
229         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
230         break;
231       case 4:
232         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
233         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
234         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
235         break;
236       case 5:
237         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
238         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
239         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
240         break;
241       case 6:
242         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
243         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
244         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
245         break;
246       case 7:
247         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
248         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
249         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
250       break;
251       default:
252         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
253         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
254         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
255       break;
256     }
257   }
258   PetscFunctionReturn(0);
259 }
260 /*
261     Symbolic U^T*D*U factorization for SBAIJ format.
262 */
263 #include "petscbt.h"
264 #include "src/mat/utils/freespace.h"
265 #undef __FUNCT__
266 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
267 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
268 {
269   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
270   Mat_SeqSBAIJ   *b;
271   Mat            B;
272   PetscErrorCode ierr;
273   PetscTruth     perm_identity;
274   PetscReal      fill = info->fill;
275   PetscInt       *rip,i,am=A->m,bs=A->bs,*ai,*aj,reallocs=0,prow;
276   PetscInt       *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
277   PetscInt       nlnk,*lnk,ncols,*cols,*uj,**uj_ptr;
278   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
279   PetscBT        lnkbt;
280 
281   PetscFunctionBegin;
282   /*
283    This code originally uses Modified Sparse Row (MSR) storage
284    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
285    Then it is rewritten so the factor B takes seqsbaij format. However the associated
286    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
287    thus the original code in MSR format is still used for these cases.
288    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
289    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
290   */
291   if (bs > 1){
292     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);CHKERRQ(ierr);
293     PetscFunctionReturn(0);
294   }
295 
296   /* check whether perm is the identity mapping */
297   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
298 
299   if (perm_identity){
300     a->permute = PETSC_FALSE;
301     ai = a->i; aj = a->j;
302   } else {
303     a->permute = PETSC_TRUE;
304     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
305     ai = a->inew; aj = a->jnew;
306   }
307   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
308 
309   /* initialization */
310   ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
311   ui[0] = 0;
312 
313   /* jl: linked list for storing indices of the pivot rows
314      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
315   ierr = PetscMalloc((2*am+1)*sizeof(PetscInt)+am*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
316   il     = jl + am;
317   uj_ptr = (PetscInt**)(il + am);
318   for (i=0; i<am; i++){
319     jl[i] = am; il[i] = 0;
320   }
321 
322   /* create and initialize a linked list for storing column indices of the active row k */
323   nlnk = am + 1;
324   ierr = PetscLLCreate(am,am,nlnk,lnk,lnkbt);CHKERRQ(ierr);
325 
326   /* initial FreeSpace size is fill*(ai[am]+1) */
327   ierr = GetMoreSpace((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
328   current_space = free_space;
329 
330   for (k=0; k<am; k++){  /* for each active row k */
331     /* initialize lnk by the column indices of row rip[k] of A */
332     nzk     = 0;
333     ncols    = ai[rip[k]+1] - ai[rip[k]];
334     cols = aj + ai[rip[k]];
335     ierr = PetscLLAdd(ncols,cols,am,nlnk,lnk,lnkbt);CHKERRQ(ierr);
336     nzk += nlnk;
337 
338     /* update lnk by computing fill-in for each pivot row to be merged in */
339     prow = jl[k]; /* 1st pivot row */
340 
341     while (prow < k){
342       nextprow = jl[prow];
343       /* merge prow into k-th row */
344       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
345       jmax = ui[prow+1];
346       ncols = jmax-jmin;
347       cols = uj_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:am-1) */
348       ierr = PetscLLAdd(ncols,cols,am,nlnk,lnk,lnkbt);CHKERRQ(ierr);
349       nzk += nlnk;
350 
351       /* update il and jl for prow */
352       if (jmin < jmax){
353         il[prow] = jmin;
354         j = *cols; jl[prow] = jl[j]; jl[j] = prow;
355       }
356       prow = nextprow;
357     }
358 
359     /* if free space is not available, make more free space */
360     if (current_space->local_remaining<nzk) {
361       i = am - k + 1; /* num of unfactored rows */
362       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
363       ierr = GetMoreSpace(i,&current_space);CHKERRQ(ierr);
364       reallocs++;
365     }
366 
367     /* copy data into free space, then initialize lnk */
368     ierr = PetscLLClean(am,am,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
369 
370     /* add the k-th row into il and jl */
371     if (nzk-1 > 0){
372       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
373       jl[k] = jl[i]; jl[i] = k;
374       il[k] = ui[k] + 1;
375     }
376     uj_ptr[k] = current_space->array;
377     current_space->array           += nzk;
378     current_space->local_used      += nzk;
379     current_space->local_remaining -= nzk;
380 
381     ui[k+1] = ui[k] + nzk;
382   }
383 
384   if (ai[am] != 0) {
385     PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
386     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,fill,af);
387     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
388     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g) for best performance.\n",af);
389   } else {
390      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
391   }
392 
393   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
394   ierr = PetscFree(jl);CHKERRQ(ierr);
395 
396   /* destroy list of free space and other temporary array(s) */
397   ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
398   ierr = MakeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
399   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
400 
401   /* put together the new matrix in MATSEQSBAIJ format */
402   ierr = MatCreate(PETSC_COMM_SELF,am,am,am,am,fact);CHKERRQ(ierr);
403   B    = *fact;
404   ierr = MatSetType(B,MATSEQSBAIJ);CHKERRQ(ierr);
405   ierr = MatSeqSBAIJSetPreallocation(B,bs,0,PETSC_NULL);CHKERRQ(ierr);
406 
407   b = (Mat_SeqSBAIJ*)B->data;
408   ierr = PetscFree(b->imax);CHKERRQ(ierr);
409   b->singlemalloc = PETSC_FALSE;
410   /* the next line frees the default space generated by the Create() */
411   ierr = PetscFree(b->a);CHKERRQ(ierr);
412   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
413   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
414   b->j    = uj;
415   b->i    = ui;
416   b->diag = 0;
417   b->ilen = 0;
418   b->imax = 0;
419   b->row  = perm;
420   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
421   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
422   b->icol = perm;
423   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
424   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
425   PetscLogObjectMemory(B,(ui[am]-am)*(sizeof(PetscInt)+sizeof(MatScalar)));
426   b->maxnz = b->nz = ui[am];
427 
428   B->factor                 = FACTOR_CHOLESKY;
429   B->info.factor_mallocs    = reallocs;
430   B->info.fill_ratio_given  = fill;
431   if (ai[am] != 0) {
432     B->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
433   } else {
434     B->info.fill_ratio_needed = 0.0;
435   }
436 
437   if (perm_identity){
438     switch (bs) {
439       case 1:
440         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
441         B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
442         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
443         break;
444       case 2:
445         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
446         B->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
447         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
448         break;
449       case 3:
450         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
451         B->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
452         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
453         break;
454       case 4:
455         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
456         B->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
457         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
458         break;
459       case 5:
460         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
461         B->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
462         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
463         break;
464       case 6:
465         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
466         B->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
467         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
468         break;
469       case 7:
470         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
471         B->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
472         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
473       break;
474       default:
475         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
476         B->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
477         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
478       break;
479     }
480   }
481   PetscFunctionReturn(0);
482 }
483 #undef __FUNCT__
484 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
485 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
486 {
487   Mat            C = *B;
488   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
489   IS             perm = b->row;
490   PetscErrorCode ierr;
491   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
492   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
493   PetscInt       bs=A->bs,bs2 = a->bs2;
494   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
495   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
496   MatScalar      *work;
497   PetscInt       *pivots;
498 
499   PetscFunctionBegin;
500   /* initialization */
501   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
502   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
503   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
504   jl   = il + mbs;
505   for (i=0; i<mbs; i++) {
506     jl[i] = mbs; il[0] = 0;
507   }
508   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
509   uik  = dk + bs2;
510   work = uik + bs2;
511   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
512 
513   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
514 
515   /* check permutation */
516   if (!a->permute){
517     ai = a->i; aj = a->j; aa = a->a;
518   } else {
519     ai   = a->inew; aj = a->jnew;
520     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
521     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
522     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
523     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
524 
525     for (i=0; i<mbs; i++){
526       jmin = ai[i]; jmax = ai[i+1];
527       for (j=jmin; j<jmax; j++){
528         while (a2anew[j] != j){
529           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
530           for (k1=0; k1<bs2; k1++){
531             dk[k1]       = aa[k*bs2+k1];
532             aa[k*bs2+k1] = aa[j*bs2+k1];
533             aa[j*bs2+k1] = dk[k1];
534           }
535         }
536         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
537         if (i > aj[j]){
538           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
539           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
540           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
541           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
542             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
543           }
544         }
545       }
546     }
547     ierr = PetscFree(a2anew);CHKERRQ(ierr);
548   }
549 
550   /* for each row k */
551   for (k = 0; k<mbs; k++){
552 
553     /*initialize k-th row with elements nonzero in row perm(k) of A */
554     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
555 
556     ap = aa + jmin*bs2;
557     for (j = jmin; j < jmax; j++){
558       vj = perm_ptr[aj[j]];         /* block col. index */
559       rtmp_ptr = rtmp + vj*bs2;
560       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
561     }
562 
563     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
564     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
565     i = jl[k]; /* first row to be added to k_th row  */
566 
567     while (i < k){
568       nexti = jl[i]; /* next row to be added to k_th row */
569 
570       /* compute multiplier */
571       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
572 
573       /* uik = -inv(Di)*U_bar(i,k) */
574       diag = ba + i*bs2;
575       u    = ba + ili*bs2;
576       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
577       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
578 
579       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
580       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
581 
582       /* update -U(i,k) */
583       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
584 
585       /* add multiple of row i to k-th row ... */
586       jmin = ili + 1; jmax = bi[i+1];
587       if (jmin < jmax){
588         for (j=jmin; j<jmax; j++) {
589           /* rtmp += -U(i,k)^T * U_bar(i,j) */
590           rtmp_ptr = rtmp + bj[j]*bs2;
591           u = ba + j*bs2;
592           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
593         }
594 
595         /* ... add i to row list for next nonzero entry */
596         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
597         j     = bj[jmin];
598         jl[i] = jl[j]; jl[j] = i; /* update jl */
599       }
600       i = nexti;
601     }
602 
603     /* save nonzero entries in k-th row of U ... */
604 
605     /* invert diagonal block */
606     diag = ba+k*bs2;
607     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
608     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
609 
610     jmin = bi[k]; jmax = bi[k+1];
611     if (jmin < jmax) {
612       for (j=jmin; j<jmax; j++){
613          vj = bj[j];           /* block col. index of U */
614          u   = ba + j*bs2;
615          rtmp_ptr = rtmp + vj*bs2;
616          for (k1=0; k1<bs2; k1++){
617            *u++        = *rtmp_ptr;
618            *rtmp_ptr++ = 0.0;
619          }
620       }
621 
622       /* ... add k to row list for first nonzero entry in k-th row */
623       il[k] = jmin;
624       i     = bj[jmin];
625       jl[k] = jl[i]; jl[i] = k;
626     }
627   }
628 
629   ierr = PetscFree(rtmp);CHKERRQ(ierr);
630   ierr = PetscFree(il);CHKERRQ(ierr);
631   ierr = PetscFree(dk);CHKERRQ(ierr);
632   ierr = PetscFree(pivots);CHKERRQ(ierr);
633   if (a->permute){
634     ierr = PetscFree(aa);CHKERRQ(ierr);
635   }
636 
637   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
638   C->factor       = FACTOR_CHOLESKY;
639   C->assembled    = PETSC_TRUE;
640   C->preallocated = PETSC_TRUE;
641   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
642   PetscFunctionReturn(0);
643 }
644 
645 #undef __FUNCT__
646 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
647 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
648 {
649   Mat            C = *B;
650   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
651   PetscErrorCode ierr;
652   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
653   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
654   PetscInt       bs=A->bs,bs2 = a->bs2;
655   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
656   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
657   MatScalar      *work;
658   PetscInt       *pivots;
659 
660   PetscFunctionBegin;
661   /* initialization */
662 
663   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
664   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
665   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
666   jl   = il + mbs;
667   for (i=0; i<mbs; i++) {
668     jl[i] = mbs; il[0] = 0;
669   }
670   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
671   uik  = dk + bs2;
672   work = uik + bs2;
673   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
674 
675   ai = a->i; aj = a->j; aa = a->a;
676 
677   /* for each row k */
678   for (k = 0; k<mbs; k++){
679 
680     /*initialize k-th row with elements nonzero in row k of A */
681     jmin = ai[k]; jmax = ai[k+1];
682     ap = aa + jmin*bs2;
683     for (j = jmin; j < jmax; j++){
684       vj = aj[j];         /* block col. index */
685       rtmp_ptr = rtmp + vj*bs2;
686       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
687     }
688 
689     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
690     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
691     i = jl[k]; /* first row to be added to k_th row  */
692 
693     while (i < k){
694       nexti = jl[i]; /* next row to be added to k_th row */
695 
696       /* compute multiplier */
697       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
698 
699       /* uik = -inv(Di)*U_bar(i,k) */
700       diag = ba + i*bs2;
701       u    = ba + ili*bs2;
702       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
703       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
704 
705       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
706       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
707 
708       /* update -U(i,k) */
709       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
710 
711       /* add multiple of row i to k-th row ... */
712       jmin = ili + 1; jmax = bi[i+1];
713       if (jmin < jmax){
714         for (j=jmin; j<jmax; j++) {
715           /* rtmp += -U(i,k)^T * U_bar(i,j) */
716           rtmp_ptr = rtmp + bj[j]*bs2;
717           u = ba + j*bs2;
718           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
719         }
720 
721         /* ... add i to row list for next nonzero entry */
722         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
723         j     = bj[jmin];
724         jl[i] = jl[j]; jl[j] = i; /* update jl */
725       }
726       i = nexti;
727     }
728 
729     /* save nonzero entries in k-th row of U ... */
730 
731     /* invert diagonal block */
732     diag = ba+k*bs2;
733     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
734     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
735 
736     jmin = bi[k]; jmax = bi[k+1];
737     if (jmin < jmax) {
738       for (j=jmin; j<jmax; j++){
739          vj = bj[j];           /* block col. index of U */
740          u   = ba + j*bs2;
741          rtmp_ptr = rtmp + vj*bs2;
742          for (k1=0; k1<bs2; k1++){
743            *u++        = *rtmp_ptr;
744            *rtmp_ptr++ = 0.0;
745          }
746       }
747 
748       /* ... add k to row list for first nonzero entry in k-th row */
749       il[k] = jmin;
750       i     = bj[jmin];
751       jl[k] = jl[i]; jl[i] = k;
752     }
753   }
754 
755   ierr = PetscFree(rtmp);CHKERRQ(ierr);
756   ierr = PetscFree(il);CHKERRQ(ierr);
757   ierr = PetscFree(dk);CHKERRQ(ierr);
758   ierr = PetscFree(pivots);CHKERRQ(ierr);
759 
760   C->factor    = FACTOR_CHOLESKY;
761   C->assembled = PETSC_TRUE;
762   C->preallocated = PETSC_TRUE;
763   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
764   PetscFunctionReturn(0);
765 }
766 
767 /*
768     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
769     Version for blocks 2 by 2.
770 */
771 #undef __FUNCT__
772 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
773 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
774 {
775   Mat            C = *B;
776   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
777   IS             perm = b->row;
778   PetscErrorCode ierr;
779   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
780   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
781   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
782   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
783 
784   PetscFunctionBegin;
785   /* initialization */
786   /* il and jl record the first nonzero element in each row of the accessing
787      window U(0:k, k:mbs-1).
788      jl:    list of rows to be added to uneliminated rows
789             i>= k: jl(i) is the first row to be added to row i
790             i<  k: jl(i) is the row following row i in some list of rows
791             jl(i) = mbs indicates the end of a list
792      il(i): points to the first nonzero element in columns k,...,mbs-1 of
793             row i of U */
794   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
795   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
796   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
797   jl   = il + mbs;
798   for (i=0; i<mbs; i++) {
799     jl[i] = mbs; il[0] = 0;
800   }
801   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
802   uik  = dk + 4;
803   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
804 
805   /* check permutation */
806   if (!a->permute){
807     ai = a->i; aj = a->j; aa = a->a;
808   } else {
809     ai   = a->inew; aj = a->jnew;
810     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
811     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
812     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
813     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
814 
815     for (i=0; i<mbs; i++){
816       jmin = ai[i]; jmax = ai[i+1];
817       for (j=jmin; j<jmax; j++){
818         while (a2anew[j] != j){
819           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
820           for (k1=0; k1<4; k1++){
821             dk[k1]       = aa[k*4+k1];
822             aa[k*4+k1] = aa[j*4+k1];
823             aa[j*4+k1] = dk[k1];
824           }
825         }
826         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
827         if (i > aj[j]){
828           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
829           ap = aa + j*4;     /* ptr to the beginning of the block */
830           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
831           ap[1] = ap[2];
832           ap[2] = dk[1];
833         }
834       }
835     }
836     ierr = PetscFree(a2anew);CHKERRQ(ierr);
837   }
838 
839   /* for each row k */
840   for (k = 0; k<mbs; k++){
841 
842     /*initialize k-th row with elements nonzero in row perm(k) of A */
843     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
844     ap = aa + jmin*4;
845     for (j = jmin; j < jmax; j++){
846       vj = perm_ptr[aj[j]];         /* block col. index */
847       rtmp_ptr = rtmp + vj*4;
848       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
849     }
850 
851     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
852     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
853     i = jl[k]; /* first row to be added to k_th row  */
854 
855     while (i < k){
856       nexti = jl[i]; /* next row to be added to k_th row */
857 
858       /* compute multiplier */
859       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
860 
861       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
862       diag = ba + i*4;
863       u    = ba + ili*4;
864       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
865       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
866       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
867       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
868 
869       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
870       dk[0] += uik[0]*u[0] + uik[1]*u[1];
871       dk[1] += uik[2]*u[0] + uik[3]*u[1];
872       dk[2] += uik[0]*u[2] + uik[1]*u[3];
873       dk[3] += uik[2]*u[2] + uik[3]*u[3];
874 
875       /* update -U(i,k): ba[ili] = uik */
876       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
877 
878       /* add multiple of row i to k-th row ... */
879       jmin = ili + 1; jmax = bi[i+1];
880       if (jmin < jmax){
881         for (j=jmin; j<jmax; j++) {
882           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
883           rtmp_ptr = rtmp + bj[j]*4;
884           u = ba + j*4;
885           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
886           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
887           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
888           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
889         }
890 
891         /* ... add i to row list for next nonzero entry */
892         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
893         j     = bj[jmin];
894         jl[i] = jl[j]; jl[j] = i; /* update jl */
895       }
896       i = nexti;
897     }
898 
899     /* save nonzero entries in k-th row of U ... */
900 
901     /* invert diagonal block */
902     diag = ba+k*4;
903     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
904     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
905 
906     jmin = bi[k]; jmax = bi[k+1];
907     if (jmin < jmax) {
908       for (j=jmin; j<jmax; j++){
909          vj = bj[j];           /* block col. index of U */
910          u   = ba + j*4;
911          rtmp_ptr = rtmp + vj*4;
912          for (k1=0; k1<4; k1++){
913            *u++        = *rtmp_ptr;
914            *rtmp_ptr++ = 0.0;
915          }
916       }
917 
918       /* ... add k to row list for first nonzero entry in k-th row */
919       il[k] = jmin;
920       i     = bj[jmin];
921       jl[k] = jl[i]; jl[i] = k;
922     }
923   }
924 
925   ierr = PetscFree(rtmp);CHKERRQ(ierr);
926   ierr = PetscFree(il);CHKERRQ(ierr);
927   ierr = PetscFree(dk);CHKERRQ(ierr);
928   if (a->permute) {
929     ierr = PetscFree(aa);CHKERRQ(ierr);
930   }
931   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
932   C->factor    = FACTOR_CHOLESKY;
933   C->assembled = PETSC_TRUE;
934   C->preallocated = PETSC_TRUE;
935   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
936   PetscFunctionReturn(0);
937 }
938 
939 /*
940       Version for when blocks are 2 by 2 Using natural ordering
941 */
942 #undef __FUNCT__
943 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
944 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
945 {
946   Mat            C = *B;
947   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
948   PetscErrorCode ierr;
949   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
950   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
951   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
952   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
953 
954   PetscFunctionBegin;
955   /* initialization */
956   /* il and jl record the first nonzero element in each row of the accessing
957      window U(0:k, k:mbs-1).
958      jl:    list of rows to be added to uneliminated rows
959             i>= k: jl(i) is the first row to be added to row i
960             i<  k: jl(i) is the row following row i in some list of rows
961             jl(i) = mbs indicates the end of a list
962      il(i): points to the first nonzero element in columns k,...,mbs-1 of
963             row i of U */
964   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
965   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
966   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
967   jl   = il + mbs;
968   for (i=0; i<mbs; i++) {
969     jl[i] = mbs; il[0] = 0;
970   }
971   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
972   uik  = dk + 4;
973 
974   ai = a->i; aj = a->j; aa = a->a;
975 
976   /* for each row k */
977   for (k = 0; k<mbs; k++){
978 
979     /*initialize k-th row with elements nonzero in row k of A */
980     jmin = ai[k]; jmax = ai[k+1];
981     ap = aa + jmin*4;
982     for (j = jmin; j < jmax; j++){
983       vj = aj[j];         /* block col. index */
984       rtmp_ptr = rtmp + vj*4;
985       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
986     }
987 
988     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
989     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
990     i = jl[k]; /* first row to be added to k_th row  */
991 
992     while (i < k){
993       nexti = jl[i]; /* next row to be added to k_th row */
994 
995       /* compute multiplier */
996       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
997 
998       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
999       diag = ba + i*4;
1000       u    = ba + ili*4;
1001       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1002       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1003       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1004       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1005 
1006       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1007       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1008       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1009       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1010       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1011 
1012       /* update -U(i,k): ba[ili] = uik */
1013       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1014 
1015       /* add multiple of row i to k-th row ... */
1016       jmin = ili + 1; jmax = bi[i+1];
1017       if (jmin < jmax){
1018         for (j=jmin; j<jmax; j++) {
1019           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1020           rtmp_ptr = rtmp + bj[j]*4;
1021           u = ba + j*4;
1022           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1023           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1024           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1025           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1026         }
1027 
1028         /* ... add i to row list for next nonzero entry */
1029         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1030         j     = bj[jmin];
1031         jl[i] = jl[j]; jl[j] = i; /* update jl */
1032       }
1033       i = nexti;
1034     }
1035 
1036     /* save nonzero entries in k-th row of U ... */
1037 
1038     /* invert diagonal block */
1039     diag = ba+k*4;
1040     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1041     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1042 
1043     jmin = bi[k]; jmax = bi[k+1];
1044     if (jmin < jmax) {
1045       for (j=jmin; j<jmax; j++){
1046          vj = bj[j];           /* block col. index of U */
1047          u   = ba + j*4;
1048          rtmp_ptr = rtmp + vj*4;
1049          for (k1=0; k1<4; k1++){
1050            *u++        = *rtmp_ptr;
1051            *rtmp_ptr++ = 0.0;
1052          }
1053       }
1054 
1055       /* ... add k to row list for first nonzero entry in k-th row */
1056       il[k] = jmin;
1057       i     = bj[jmin];
1058       jl[k] = jl[i]; jl[i] = k;
1059     }
1060   }
1061 
1062   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1063   ierr = PetscFree(il);CHKERRQ(ierr);
1064   ierr = PetscFree(dk);CHKERRQ(ierr);
1065 
1066   C->factor    = FACTOR_CHOLESKY;
1067   C->assembled = PETSC_TRUE;
1068   C->preallocated = PETSC_TRUE;
1069   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1070   PetscFunctionReturn(0);
1071 }
1072 
1073 /*
1074     Numeric U^T*D*U factorization for SBAIJ format.
1075     Version for blocks are 1 by 1.
1076 */
1077 #undef __FUNCT__
1078 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1079 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
1080 {
1081   Mat            C = *B;
1082   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1083   IS             ip=b->row;
1084   PetscErrorCode ierr;
1085   PetscInt       *rip,i,j,mbs = a->mbs,*bi=b->i,*bj=b->j;
1086   PetscInt       *ai,*aj,*a2anew;
1087   PetscInt       k,jmin,jmax,*jl,*il,vj,nexti,ili,nz;
1088   MatScalar      *rtmp,*ba,*aa,dk,uikdi;
1089 
1090   PetscFunctionBegin;
1091   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1092   if (!a->permute){
1093     ai = a->i; aj = a->j; aa = a->a;
1094   } else {
1095     ai = a->inew; aj = a->jnew;
1096     nz = ai[mbs];
1097     ierr = PetscMalloc(nz*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1098     a2anew = a->a2anew;
1099     ba     = a->a;
1100     for (j=0; j<nz; j++){
1101       aa[a2anew[j]] = *(ba++);
1102     }
1103   }
1104 
1105   /* initialization */
1106   /* il and jl record the first nonzero element in each row of the accessing
1107      window U(0:k, k:mbs-1).
1108      jl:    list of rows to be added to uneliminated rows
1109             i>= k: jl(i) is the first row to be added to row i
1110             i<  k: jl(i) is the row following row i in some list of rows
1111             jl(i) = mbs indicates the end of a list
1112      il(i): points to the first nonzero element in columns k,...,mbs-1 of
1113             row i of U */
1114   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1115   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1116   jl   = il + mbs;
1117   for (i=0; i<mbs; i++) {
1118     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1119   }
1120   ba = b->a;
1121   for (k = 0; k<mbs; k++){
1122     /*initialize k-th row with elements nonzero in row perm(k) of A */
1123     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1124     for (j = jmin; j < jmax; j++){
1125       vj = rip[aj[j]];
1126       rtmp[vj] = aa[j];
1127     }
1128 
1129     /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1130     dk = rtmp[k];
1131     i = jl[k]; /* first row to be added to k_th row  */
1132 
1133     while (i < k){
1134       nexti = jl[i]; /* next row to be added to k_th row */
1135 
1136       /* compute multiplier, update diag(k) and U(i,k) */
1137       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1138       uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */ /* uikdi = - ba[ili]*ba[i]; */
1139       dk += uikdi*ba[ili];
1140       ba[ili] = uikdi; /* -U(i,k) */
1141 
1142       /* add multiple of row i to k-th row ... */
1143       jmin = ili + 1; jmax = bi[i+1];
1144       if (jmin < jmax){
1145         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1146         /* update il and jl for row i */
1147         il[i] = jmin;
1148         j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1149       }
1150       i = nexti;
1151     }
1152 
1153     /* check for zero pivot and save diagoanl element */
1154     if (dk == 0.0){
1155       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1156     }
1157 
1158     /* copy data ino U(k,:) */
1159     ba[bi[k]] = 1.0/dk; /* U(k,k) */
1160     jmin = bi[k]+1; jmax = bi[k+1];
1161     if (jmin < jmax) {
1162       for (j=jmin; j<jmax; j++){
1163          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1164       }
1165       /* add the k-th row into il and jl */
1166       il[k] = jmin;
1167       i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1168     }
1169   }
1170 
1171   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1172   ierr = PetscFree(il);CHKERRQ(ierr);
1173   if (a->permute){ierr = PetscFree(aa);CHKERRQ(ierr);}
1174 
1175   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
1176   C->factor    = FACTOR_CHOLESKY;
1177   C->assembled = PETSC_TRUE;
1178   C->preallocated = PETSC_TRUE;
1179   PetscLogFlops(C->m);
1180   PetscFunctionReturn(0);
1181 }
1182 
1183 /*
1184   Version for when blocks are 1 by 1 Using natural ordering
1185 */
1186 #undef __FUNCT__
1187 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1188 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
1189 {
1190   Mat            C = *B;
1191   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1192   PetscErrorCode ierr;
1193   PetscInt       i,j,mbs = a->mbs;
1194   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1195   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1196   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1197   PetscReal      damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount;
1198   PetscTruth     damp,chshift;
1199   PetscInt       nshift=0;
1200 
1201   PetscFunctionBegin;
1202   /* initialization */
1203   /* il and jl record the first nonzero element in each row of the accessing
1204      window U(0:k, k:mbs-1).
1205      jl:    list of rows to be added to uneliminated rows
1206             i>= k: jl(i) is the first row to be added to row i
1207             i<  k: jl(i) is the row following row i in some list of rows
1208             jl(i) = mbs indicates the end of a list
1209      il(i): points to the first nonzero element in U(i,k:mbs-1)
1210   */
1211   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1212   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1213   jl   = il + mbs;
1214 
1215   shift_amount = 0;
1216   do {
1217     damp = PETSC_FALSE;
1218     chshift = PETSC_FALSE;
1219     for (i=0; i<mbs; i++) {
1220       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1221     }
1222 
1223     for (k = 0; k<mbs; k++){ /* row k */
1224     /*initialize k-th row with elements nonzero in row perm(k) of A */
1225       nz   = ai[k+1] - ai[k];
1226       acol = aj + ai[k];
1227       aval = aa + ai[k];
1228       bval = ba + bi[k];
1229       while (nz -- ){
1230         rtmp[*acol++] = *aval++;
1231         *bval++       = 0.0; /* for in-place factorization */
1232       }
1233       /* damp the diagonal of the matrix */
1234       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
1235 
1236       /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1237       dk = rtmp[k];
1238       i  = jl[k]; /* first row to be added to k_th row  */
1239 
1240       while (i < k){
1241         nexti = jl[i]; /* next row to be added to k_th row */
1242 
1243         /* compute multiplier, update D(k) and U(i,k) */
1244         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1245         uikdi = - ba[ili]*ba[bi[i]];
1246         dk   += uikdi*ba[ili];
1247         ba[ili] = uikdi; /* -U(i,k) */
1248 
1249         /* add multiple of row i to k-th row ... */
1250         jmin = ili + 1;
1251         nz   = bi[i+1] - jmin;
1252         if (nz > 0){
1253           bcol = bj + jmin;
1254           bval = ba + jmin;
1255           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1256           /* ... add i to row list for next nonzero entry */
1257           il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1258           j     = bj[jmin];
1259           jl[i] = jl[j]; jl[j] = i; /* update jl */
1260         }
1261         i = nexti;
1262       }
1263 
1264       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1265 	/* calculate a shift that would make this row diagonally dominant */
1266 	PetscReal rs = PetscAbs(PetscRealPart(dk));
1267 	jmin      = bi[k]+1;
1268 	nz        = bi[k+1] - jmin;
1269 	if (nz){
1270 	  bcol = bj + jmin;
1271 	  bval = ba + jmin;
1272 	  while (nz--){
1273 	    rs += PetscAbsScalar(rtmp[*bcol++]);
1274 	  }
1275 	}
1276 	/* if this shift is less than the previous, just up the previous
1277 	   one by a bit */
1278 	shift_amount = PetscMax(rs,1.1*shift_amount);
1279 	chshift  = PETSC_TRUE;
1280 	/* Unlike in the ILU case there is no exit condition on nshift:
1281 	   we increase the shift until it converges. There is no guarantee that
1282 	   this algorithm converges faster or slower, or is better or worse
1283 	   than the ILU algorithm. */
1284 	nshift++;
1285 	break;
1286       }
1287       if (PetscRealPart(dk) < zeropivot){
1288         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1289         if (damping > 0.0) {
1290           if (ndamp) damping *= 2.0;
1291           damp = PETSC_TRUE;
1292           ndamp++;
1293           break;
1294         } else if (PetscAbsScalar(dk) < zeropivot){
1295           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1296         } else {
1297           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %D of Cholesky factorization\n",PetscRealPart(dk),k);
1298         }
1299       }
1300 
1301       /* save nonzero entries in k-th row of U ... */
1302       /* printf("%D, dk: %g, 1/dk: %g\n",k,dk,1/dk); */
1303       ba[bi[k]] = 1.0/dk;
1304       jmin      = bi[k]+1;
1305       nz        = bi[k+1] - jmin;
1306       if (nz){
1307         bcol = bj + jmin;
1308         bval = ba + jmin;
1309         while (nz--){
1310           *bval++       = rtmp[*bcol];
1311           rtmp[*bcol++] = 0.0;
1312         }
1313         /* ... add k to row list for first nonzero entry in k-th row */
1314         il[k] = jmin;
1315         i     = bj[jmin];
1316         jl[k] = jl[i]; jl[i] = k;
1317       }
1318     } /* end of for (k = 0; k<mbs; k++) */
1319   } while (damp||chshift);
1320   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1321   ierr = PetscFree(il);CHKERRQ(ierr);
1322 
1323   C->factor       = FACTOR_CHOLESKY;
1324   C->assembled    = PETSC_TRUE;
1325   C->preallocated = PETSC_TRUE;
1326   PetscLogFlops(b->mbs);
1327   if (ndamp) {
1328     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %D damping value %g\n",ndamp,damping);
1329   }
1330  if (nshift) {
1331     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %D shifts\n",nshift);
1332   }
1333 
1334   PetscFunctionReturn(0);
1335 }
1336 
1337 #undef __FUNCT__
1338 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1339 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1340 {
1341   PetscErrorCode ierr;
1342   Mat            C;
1343 
1344   PetscFunctionBegin;
1345   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1346   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
1347   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
1348   PetscFunctionReturn(0);
1349 }
1350 
1351 
1352