xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 10c27e3d218d7de20f78fbda3e70c7640c689bc0)
149b5e25fSSatish Balay 
249b5e25fSSatish Balay #include "src/mat/impls/baij/seq/baij.h"
33a7fca6bSBarry Smith #include "src/mat/impls/sbaij/seq/sbaij.h"
449b5e25fSSatish Balay #include "src/inline/ilu.h"
549a6740bSHong Zhang #include "include/petscis.h"
649b5e25fSSatish Balay 
78dc9108eSHong Zhang #if !defined(PETSC_USE_COMPLEX)
85f9f512dSHong Zhang /*
95f9f512dSHong Zhang   input:
10c037c3f7SHong Zhang    F -- numeric factor
115f9f512dSHong Zhang   output:
12c037c3f7SHong Zhang    nneg, nzero, npos: matrix inertia
135f9f512dSHong Zhang */
145f9f512dSHong Zhang 
155f9f512dSHong Zhang #undef __FUNCT__
165f9f512dSHong Zhang #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
1713f74950SBarry Smith PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
185f9f512dSHong Zhang {
19638f5ce0SDinesh Kaushik   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
203e0d88b5SBarry Smith   PetscScalar  *dd = fact_ptr->a;
2113f74950SBarry Smith   PetscInt     mbs=fact_ptr->mbs,bs=F->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
225f9f512dSHong Zhang 
235f9f512dSHong Zhang   PetscFunctionBegin;
2477431f27SBarry Smith   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
25eeeff2ecSHong Zhang   nneig_tmp = 0; npos_tmp = 0;
26eeeff2ecSHong Zhang   for (i=0; i<mbs; i++){
27eeeff2ecSHong Zhang     if (PetscRealPart(dd[*fi]) > 0.0){
28eeeff2ecSHong Zhang       npos_tmp++;
29eeeff2ecSHong Zhang     } else if (PetscRealPart(dd[*fi]) < 0.0){
30eeeff2ecSHong Zhang       nneig_tmp++;
315f9f512dSHong Zhang     }
32eeeff2ecSHong Zhang     fi++;
333e0d88b5SBarry Smith   }
34eeeff2ecSHong Zhang   if (nneig) *nneig = nneig_tmp;
35eeeff2ecSHong Zhang   if (npos)  *npos  = npos_tmp;
36eeeff2ecSHong Zhang   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
37eeeff2ecSHong Zhang 
385f9f512dSHong Zhang   PetscFunctionReturn(0);
395f9f512dSHong Zhang }
408dc9108eSHong Zhang #endif /* !defined(PETSC_USE_COMPLEX) */
415f9f512dSHong Zhang 
425f9f512dSHong Zhang /*
435f9f512dSHong Zhang   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
44*10c27e3dSHong Zhang   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
455f9f512dSHong Zhang */
46*10c27e3dSHong Zhang #undef __FUNCT__
47*10c27e3dSHong Zhang #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
48*10c27e3dSHong Zhang PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
49*10c27e3dSHong Zhang {
50*10c27e3dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
51*10c27e3dSHong Zhang   PetscErrorCode ierr;
52*10c27e3dSHong Zhang   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
53*10c27e3dSHong Zhang   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
54*10c27e3dSHong Zhang   PetscInt       m,reallocs = 0,prow;
55*10c27e3dSHong Zhang   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
56*10c27e3dSHong Zhang   PetscReal      f = info->fill;
57*10c27e3dSHong Zhang   PetscTruth     perm_identity;
58*10c27e3dSHong Zhang 
59*10c27e3dSHong Zhang   PetscFunctionBegin;
60*10c27e3dSHong Zhang   /* check whether perm is the identity mapping */
61*10c27e3dSHong Zhang   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
62*10c27e3dSHong Zhang   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
63*10c27e3dSHong Zhang 
64*10c27e3dSHong Zhang   if (perm_identity){ /* without permutation */
65*10c27e3dSHong Zhang     a->permute = PETSC_FALSE;
66*10c27e3dSHong Zhang     ai = a->i; aj = a->j;
67*10c27e3dSHong Zhang   } else {            /* non-trivial permutation */
68*10c27e3dSHong Zhang     a->permute = PETSC_TRUE;
69*10c27e3dSHong Zhang     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
70*10c27e3dSHong Zhang     ai = a->inew; aj = a->jnew;
71*10c27e3dSHong Zhang   }
72*10c27e3dSHong Zhang 
73*10c27e3dSHong Zhang   /* initialization */
74*10c27e3dSHong Zhang   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
75*10c27e3dSHong Zhang   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
76*10c27e3dSHong Zhang   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
77*10c27e3dSHong Zhang   iu[0] = mbs+1;
78*10c27e3dSHong Zhang   juidx = mbs + 1; /* index for ju */
79*10c27e3dSHong Zhang   ierr  = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */
80*10c27e3dSHong Zhang   q     = jl + mbs;   /* linked list for col index */
81*10c27e3dSHong Zhang   for (i=0; i<mbs; i++){
82*10c27e3dSHong Zhang     jl[i] = mbs;
83*10c27e3dSHong Zhang     q[i] = 0;
84*10c27e3dSHong Zhang   }
85*10c27e3dSHong Zhang 
86*10c27e3dSHong Zhang   /* for each row k */
87*10c27e3dSHong Zhang   for (k=0; k<mbs; k++){
88*10c27e3dSHong Zhang     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
89*10c27e3dSHong Zhang     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
90*10c27e3dSHong Zhang     q[k] = mbs;
91*10c27e3dSHong Zhang     /* initialize nonzero structure of k-th row to row rip[k] of A */
92*10c27e3dSHong Zhang     jmin = ai[rip[k]] +1; /* exclude diag[k] */
93*10c27e3dSHong Zhang     jmax = ai[rip[k]+1];
94*10c27e3dSHong Zhang     for (j=jmin; j<jmax; j++){
95*10c27e3dSHong Zhang       vj = rip[aj[j]]; /* col. value */
96*10c27e3dSHong Zhang       if(vj > k){
97*10c27e3dSHong Zhang         qm = k;
98*10c27e3dSHong Zhang         do {
99*10c27e3dSHong Zhang           m  = qm; qm = q[m];
100*10c27e3dSHong Zhang         } while(qm < vj);
101*10c27e3dSHong Zhang         if (qm == vj) {
102*10c27e3dSHong Zhang           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
103*10c27e3dSHong Zhang         }
104*10c27e3dSHong Zhang         nzk++;
105*10c27e3dSHong Zhang         q[m]  = vj;
106*10c27e3dSHong Zhang         q[vj] = qm;
107*10c27e3dSHong Zhang       } /* if(vj > k) */
108*10c27e3dSHong Zhang     } /* for (j=jmin; j<jmax; j++) */
109*10c27e3dSHong Zhang 
110*10c27e3dSHong Zhang     /* modify nonzero structure of k-th row by computing fill-in
111*10c27e3dSHong Zhang        for each row i to be merged in */
112*10c27e3dSHong Zhang     prow = k;
113*10c27e3dSHong Zhang     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
114*10c27e3dSHong Zhang 
115*10c27e3dSHong Zhang     while (prow < k){
116*10c27e3dSHong Zhang       /* merge row prow into k-th row */
117*10c27e3dSHong Zhang       jmin = iu[prow] + 1; jmax = iu[prow+1];
118*10c27e3dSHong Zhang       qm = k;
119*10c27e3dSHong Zhang       for (j=jmin; j<jmax; j++){
120*10c27e3dSHong Zhang         vj = ju[j];
121*10c27e3dSHong Zhang         do {
122*10c27e3dSHong Zhang           m = qm; qm = q[m];
123*10c27e3dSHong Zhang         } while (qm < vj);
124*10c27e3dSHong Zhang         if (qm != vj){
125*10c27e3dSHong Zhang          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
126*10c27e3dSHong Zhang         }
127*10c27e3dSHong Zhang       }
128*10c27e3dSHong Zhang       prow = jl[prow]; /* next pivot row */
129*10c27e3dSHong Zhang     }
130*10c27e3dSHong Zhang 
131*10c27e3dSHong Zhang     /* add k to row list for first nonzero element in k-th row */
132*10c27e3dSHong Zhang     if (nzk > 0){
133*10c27e3dSHong Zhang       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
134*10c27e3dSHong Zhang       jl[k] = jl[i]; jl[i] = k;
135*10c27e3dSHong Zhang     }
136*10c27e3dSHong Zhang     iu[k+1] = iu[k] + nzk;
137*10c27e3dSHong Zhang 
138*10c27e3dSHong Zhang     /* allocate more space to ju if needed */
139*10c27e3dSHong Zhang     if (iu[k+1] > umax) {
140*10c27e3dSHong Zhang       /* estimate how much additional space we will need */
141*10c27e3dSHong Zhang       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
142*10c27e3dSHong Zhang       /* just double the memory each time */
143*10c27e3dSHong Zhang       maxadd = umax;
144*10c27e3dSHong Zhang       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
145*10c27e3dSHong Zhang       umax += maxadd;
146*10c27e3dSHong Zhang 
147*10c27e3dSHong Zhang       /* allocate a longer ju */
148*10c27e3dSHong Zhang       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
149*10c27e3dSHong Zhang       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
150*10c27e3dSHong Zhang       ierr = PetscFree(ju);CHKERRQ(ierr);
151*10c27e3dSHong Zhang       ju   = jutmp;
152*10c27e3dSHong Zhang       reallocs++; /* count how many times we realloc */
153*10c27e3dSHong Zhang     }
154*10c27e3dSHong Zhang 
155*10c27e3dSHong Zhang     /* save nonzero structure of k-th row in ju */
156*10c27e3dSHong Zhang     i=k;
157*10c27e3dSHong Zhang     while (nzk --) {
158*10c27e3dSHong Zhang       i           = q[i];
159*10c27e3dSHong Zhang       ju[juidx++] = i;
160*10c27e3dSHong Zhang     }
161*10c27e3dSHong Zhang   }
162*10c27e3dSHong Zhang 
163*10c27e3dSHong Zhang   if (ai[mbs] != 0) {
164*10c27e3dSHong Zhang     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
165*10c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
166*10c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
167*10c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
168*10c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
169*10c27e3dSHong Zhang   } else {
170*10c27e3dSHong Zhang      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
171*10c27e3dSHong Zhang   }
172*10c27e3dSHong Zhang 
173*10c27e3dSHong Zhang   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
174*10c27e3dSHong Zhang   ierr = PetscFree(jl);CHKERRQ(ierr);
175*10c27e3dSHong Zhang 
176*10c27e3dSHong Zhang   /* put together the new matrix */
177*10c27e3dSHong Zhang   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
178*10c27e3dSHong Zhang   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
179*10c27e3dSHong Zhang   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
180*10c27e3dSHong Zhang 
181*10c27e3dSHong Zhang   /* PetscLogObjectParent(*B,iperm); */
182*10c27e3dSHong Zhang   b = (Mat_SeqSBAIJ*)(*B)->data;
183*10c27e3dSHong Zhang   ierr = PetscFree(b->imax);CHKERRQ(ierr);
184*10c27e3dSHong Zhang   b->singlemalloc = PETSC_FALSE;
185*10c27e3dSHong Zhang   /* the next line frees the default space generated by the Create() */
186*10c27e3dSHong Zhang   ierr = PetscFree(b->a);CHKERRQ(ierr);
187*10c27e3dSHong Zhang   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
188*10c27e3dSHong Zhang   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
189*10c27e3dSHong Zhang   b->j    = ju;
190*10c27e3dSHong Zhang   b->i    = iu;
191*10c27e3dSHong Zhang   b->diag = 0;
192*10c27e3dSHong Zhang   b->ilen = 0;
193*10c27e3dSHong Zhang   b->imax = 0;
194*10c27e3dSHong Zhang   b->row  = perm;
195*10c27e3dSHong Zhang   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
196*10c27e3dSHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
197*10c27e3dSHong Zhang   b->icol = perm;
198*10c27e3dSHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
199*10c27e3dSHong Zhang   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
200*10c27e3dSHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
201*10c27e3dSHong Zhang      Allocate idnew, solve_work, new a, new j */
202*10c27e3dSHong Zhang   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
203*10c27e3dSHong Zhang   b->maxnz = b->nz = iu[mbs];
204*10c27e3dSHong Zhang 
205*10c27e3dSHong Zhang   (*B)->factor                 = FACTOR_CHOLESKY;
206*10c27e3dSHong Zhang   (*B)->info.factor_mallocs    = reallocs;
207*10c27e3dSHong Zhang   (*B)->info.fill_ratio_given  = f;
208*10c27e3dSHong Zhang   if (ai[mbs] != 0) {
209*10c27e3dSHong Zhang     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
210*10c27e3dSHong Zhang   } else {
211*10c27e3dSHong Zhang     (*B)->info.fill_ratio_needed = 0.0;
212*10c27e3dSHong Zhang   }
213*10c27e3dSHong Zhang 
214*10c27e3dSHong Zhang   if (perm_identity){
215*10c27e3dSHong Zhang     switch (bs) {
216*10c27e3dSHong Zhang       case 1:
217*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
218*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
219*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
220*10c27e3dSHong Zhang         break;
221*10c27e3dSHong Zhang       case 2:
222*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
223*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
224*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
225*10c27e3dSHong Zhang         break;
226*10c27e3dSHong Zhang       case 3:
227*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
228*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
229*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
230*10c27e3dSHong Zhang         break;
231*10c27e3dSHong Zhang       case 4:
232*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
233*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
234*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
235*10c27e3dSHong Zhang         break;
236*10c27e3dSHong Zhang       case 5:
237*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
238*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
239*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
240*10c27e3dSHong Zhang         break;
241*10c27e3dSHong Zhang       case 6:
242*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
243*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
244*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
245*10c27e3dSHong Zhang         break;
246*10c27e3dSHong Zhang       case 7:
247*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
248*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
249*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
250*10c27e3dSHong Zhang       break;
251*10c27e3dSHong Zhang       default:
252*10c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
253*10c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
254*10c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
255*10c27e3dSHong Zhang       break;
256*10c27e3dSHong Zhang     }
257*10c27e3dSHong Zhang   }
258*10c27e3dSHong Zhang   PetscFunctionReturn(0);
259*10c27e3dSHong Zhang }
260*10c27e3dSHong Zhang /*
261*10c27e3dSHong Zhang     Symbolic U^T*D*U factorization for SBAIJ format.
262*10c27e3dSHong Zhang */
2634a2ae208SSatish Balay #undef __FUNCT__
2644a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
265dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
26649b5e25fSSatish Balay {
26749b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2686849ba73SBarry Smith   PetscErrorCode ierr;
26913f74950SBarry Smith   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
27013f74950SBarry Smith   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
27113f74950SBarry Smith   PetscInt       m,reallocs = 0,prow;
27213f74950SBarry Smith   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
27313f74950SBarry Smith   PetscInt       *il,ili,nextprow;
27415e8a5b3SHong Zhang   PetscReal      f = info->fill;
275671cb588SHong Zhang   PetscTruth     perm_identity;
27649b5e25fSSatish Balay 
27749b5e25fSSatish Balay   PetscFunctionBegin;
278*10c27e3dSHong Zhang   /*
279*10c27e3dSHong Zhang    This code originally uses Modified Sparse Row (MSR) storage
280*10c27e3dSHong Zhang    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
281*10c27e3dSHong Zhang    Then it is rewritten so the factor B takes seqsbaij format. However the associated
282*10c27e3dSHong Zhang    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
283*10c27e3dSHong Zhang    thus the original code in MSR format is still used for these cases.
284*10c27e3dSHong Zhang    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
285*10c27e3dSHong Zhang    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
286*10c27e3dSHong Zhang   */
287*10c27e3dSHong Zhang 
288cb718733SHong Zhang   /* check whether perm is the identity mapping */
289671cb588SHong Zhang   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
290*10c27e3dSHong Zhang   if (bs>1 || !perm_identity){
291*10c27e3dSHong Zhang     a->permute = PETSC_TRUE;
292*10c27e3dSHong Zhang     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,B);CHKERRQ(ierr);
293*10c27e3dSHong Zhang     PetscFunctionReturn(0);
294*10c27e3dSHong Zhang   }
295064503c5SHong Zhang 
296*10c27e3dSHong Zhang   /* At present, the code below works only for perm_identity=PETSC_TRUE */
297*10c27e3dSHong Zhang   printf(" MatCholeskyFactorSymbolic_SeqSBAIJ\n");
298*10c27e3dSHong Zhang   a->permute = PETSC_FALSE;
299064503c5SHong Zhang 
300064503c5SHong Zhang   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
301064503c5SHong Zhang   ai = a->i; aj = a->j;
302064503c5SHong Zhang 
303064503c5SHong Zhang   /* initialization */
30413f74950SBarry Smith   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
30513f74950SBarry Smith   umax  = (PetscInt)(f*ai[mbs] + 1);
30613f74950SBarry Smith   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
307064503c5SHong Zhang   iu[0] = 0;
308064503c5SHong Zhang   juidx = 0; /* index for ju */
30913f74950SBarry Smith   ierr  = PetscMalloc((3*mbs+1)*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for getting pivot row */
310064503c5SHong Zhang   q     = jl + mbs;   /* linked list for col index of active row */
311064503c5SHong Zhang   il    = q  + mbs;
312064503c5SHong Zhang   for (i=0; i<mbs; i++){
313064503c5SHong Zhang     jl[i] = mbs;
314064503c5SHong Zhang     q[i]  = 0;
315064503c5SHong Zhang     il[i] = 0;
316064503c5SHong Zhang   }
317064503c5SHong Zhang 
318064503c5SHong Zhang   /* for each row k */
319064503c5SHong Zhang   for (k=0; k<mbs; k++){
320064503c5SHong Zhang     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
321064503c5SHong Zhang     q[k] = mbs;
322064503c5SHong Zhang     /* initialize nonzero structure of k-th row to row rip[k] of A */
323064503c5SHong Zhang     jmin = ai[rip[k]] +1; /* exclude diag[k] */
324064503c5SHong Zhang     jmax = ai[rip[k]+1];
325064503c5SHong Zhang     for (j=jmin; j<jmax; j++){
326064503c5SHong Zhang       vj = rip[aj[j]]; /* col. value */
327064503c5SHong Zhang       if(vj > k){
328064503c5SHong Zhang         qm = k;
329064503c5SHong Zhang         do {
330064503c5SHong Zhang           m  = qm; qm = q[m];
331064503c5SHong Zhang         } while(qm < vj);
332064503c5SHong Zhang         if (qm == vj) {
333e005ede5SBarry Smith           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
334064503c5SHong Zhang         }
335064503c5SHong Zhang         nzk++;
336064503c5SHong Zhang         q[m]  = vj;
337064503c5SHong Zhang         q[vj] = qm;
338064503c5SHong Zhang       } /* if(vj > k) */
339064503c5SHong Zhang     } /* for (j=jmin; j<jmax; j++) */
340064503c5SHong Zhang 
341064503c5SHong Zhang     /* modify nonzero structure of k-th row by computing fill-in
342064503c5SHong Zhang        for each row i to be merged in */
343064503c5SHong Zhang     prow = k;
344064503c5SHong Zhang     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
345064503c5SHong Zhang 
346064503c5SHong Zhang     while (prow < k){
347064503c5SHong Zhang       nextprow = jl[prow];
348064503c5SHong Zhang 
349064503c5SHong Zhang       /* merge row prow into k-th row */
350064503c5SHong Zhang       ili = il[prow];
351064503c5SHong Zhang       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
352064503c5SHong Zhang       jmax = iu[prow+1];
353064503c5SHong Zhang       qm = k;
354064503c5SHong Zhang       for (j=jmin; j<jmax; j++){
355064503c5SHong Zhang         vj = ju[j];
356064503c5SHong Zhang         do {
357064503c5SHong Zhang           m = qm; qm = q[m];
358064503c5SHong Zhang         } while (qm < vj);
359064503c5SHong Zhang         if (qm != vj){  /* a fill */
360064503c5SHong Zhang           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
361064503c5SHong Zhang         }
362064503c5SHong Zhang       } /* end of for (j=jmin; j<jmax; j++) */
363064503c5SHong Zhang       if (jmin < jmax){
364064503c5SHong Zhang         il[prow] = jmin;
365064503c5SHong Zhang         j = ju[jmin];
366064503c5SHong Zhang         jl[prow] = jl[j]; jl[j] = prow;  /* update jl */
367064503c5SHong Zhang       }
368064503c5SHong Zhang       prow = nextprow;
369064503c5SHong Zhang     }
370064503c5SHong Zhang 
371064503c5SHong Zhang     /* update il and jl */
372064503c5SHong Zhang     if (nzk > 0){
373064503c5SHong Zhang       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
374064503c5SHong Zhang       jl[k] = jl[i]; jl[i] = k;
375064503c5SHong Zhang       il[k] = iu[k] + 1;
376064503c5SHong Zhang     }
377064503c5SHong Zhang     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */
378064503c5SHong Zhang 
379064503c5SHong Zhang     /* allocate more space to ju if needed */
380064503c5SHong Zhang     if (iu[k+1] > umax) {
381064503c5SHong Zhang       /* estimate how much additional space we will need */
382064503c5SHong Zhang       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
383064503c5SHong Zhang       /* just double the memory each time */
384064503c5SHong Zhang       maxadd = umax;
385064503c5SHong Zhang       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
386064503c5SHong Zhang       umax += maxadd;
387064503c5SHong Zhang 
388064503c5SHong Zhang       /* allocate a longer ju */
38913f74950SBarry Smith       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
39013f74950SBarry Smith       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
391064503c5SHong Zhang       ierr = PetscFree(ju);CHKERRQ(ierr);
392064503c5SHong Zhang       ju   = jutmp;
393418422e8SSatish Balay       reallocs++; /* count how many times we realloc */
394064503c5SHong Zhang     }
395064503c5SHong Zhang 
396064503c5SHong Zhang     /* save nonzero structure of k-th row in ju */
397064503c5SHong Zhang     ju[juidx++] = k; /* diag[k] */
398064503c5SHong Zhang     i = k;
399064503c5SHong Zhang     while (nzk --) {
400064503c5SHong Zhang       i           = q[i];
401064503c5SHong Zhang       ju[juidx++] = i;
402064503c5SHong Zhang     }
403064503c5SHong Zhang   }
404064503c5SHong Zhang 
405064503c5SHong Zhang   if (ai[mbs] != 0) {
406064503c5SHong Zhang     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
407418422e8SSatish Balay     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
408064503c5SHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
409064503c5SHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
410064503c5SHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
411064503c5SHong Zhang   } else {
412064503c5SHong Zhang      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
413064503c5SHong Zhang   }
414064503c5SHong Zhang 
415064503c5SHong Zhang   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
416064503c5SHong Zhang   /* ierr = PetscFree(q);CHKERRQ(ierr); */
417064503c5SHong Zhang   ierr = PetscFree(jl);CHKERRQ(ierr);
418064503c5SHong Zhang 
419064503c5SHong Zhang   /* put together the new matrix */
420be5d1d56SKris Buschelman   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
421be5d1d56SKris Buschelman   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
422be5d1d56SKris Buschelman   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
423be5d1d56SKris Buschelman 
424064503c5SHong Zhang   /* PetscLogObjectParent(*B,iperm); */
425064503c5SHong Zhang   b = (Mat_SeqSBAIJ*)(*B)->data;
426064503c5SHong Zhang   ierr = PetscFree(b->imax);CHKERRQ(ierr);
427064503c5SHong Zhang   b->singlemalloc = PETSC_FALSE;
428064503c5SHong Zhang   /* the next line frees the default space generated by the Create() */
429064503c5SHong Zhang   ierr = PetscFree(b->a);CHKERRQ(ierr);
430064503c5SHong Zhang   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
431064503c5SHong Zhang   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
432064503c5SHong Zhang   b->j    = ju;
433064503c5SHong Zhang   b->i    = iu;
434064503c5SHong Zhang   b->diag = 0;
435064503c5SHong Zhang   b->ilen = 0;
436064503c5SHong Zhang   b->imax = 0;
437064503c5SHong Zhang   b->row  = perm;
43815e8a5b3SHong Zhang   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
439064503c5SHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
440064503c5SHong Zhang   b->icol = perm;
441064503c5SHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
442064503c5SHong Zhang   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
443064503c5SHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
444064503c5SHong Zhang      Allocate idnew, solve_work, new a, new j */
44513f74950SBarry Smith   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
4466c6c5352SBarry Smith   b->maxnz = b->nz = iu[mbs];
447064503c5SHong Zhang 
448064503c5SHong Zhang   (*B)->factor                 = FACTOR_CHOLESKY;
449418422e8SSatish Balay   (*B)->info.factor_mallocs    = reallocs;
450064503c5SHong Zhang   (*B)->info.fill_ratio_given  = f;
451064503c5SHong Zhang   if (ai[mbs] != 0) {
452064503c5SHong Zhang     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
453064503c5SHong Zhang   } else {
454064503c5SHong Zhang     (*B)->info.fill_ratio_needed = 0.0;
455064503c5SHong Zhang   }
456064503c5SHong Zhang 
457b45a75daSHong Zhang   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
458b45a75daSHong Zhang   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
459e005ede5SBarry Smith   (*B)->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
460064503c5SHong Zhang   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
461064503c5SHong Zhang   PetscFunctionReturn(0);
462064503c5SHong Zhang }
4634a2ae208SSatish Balay #undef __FUNCT__
4644a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
465dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
46649b5e25fSSatish Balay {
46749b5e25fSSatish Balay   Mat            C = *B;
4684c16a6a6SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
4694c16a6a6SHong Zhang   IS             perm = b->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
47113f74950SBarry Smith   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
47213f74950SBarry Smith   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
47313f74950SBarry Smith   PetscInt       bs=A->bs,bs2 = a->bs2;
4744c16a6a6SHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
4754c16a6a6SHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
47628de702eSHong Zhang   MatScalar      *work;
47713f74950SBarry Smith   PetscInt       *pivots;
4784c16a6a6SHong Zhang 
4794c16a6a6SHong Zhang   PetscFunctionBegin;
4804c16a6a6SHong Zhang   /* initialization */
48182502324SSatish Balay   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4824c16a6a6SHong Zhang   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
48313f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
48428de702eSHong Zhang   jl   = il + mbs;
4854c16a6a6SHong Zhang   for (i=0; i<mbs; i++) {
4864c16a6a6SHong Zhang     jl[i] = mbs; il[0] = 0;
4874c16a6a6SHong Zhang   }
488b0a32e0cSBarry Smith   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
48928de702eSHong Zhang   uik  = dk + bs2;
49028de702eSHong Zhang   work = uik + bs2;
49113f74950SBarry Smith   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
4924c16a6a6SHong Zhang 
4934c16a6a6SHong Zhang   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
4944c16a6a6SHong Zhang 
4954c16a6a6SHong Zhang   /* check permutation */
4964c16a6a6SHong Zhang   if (!a->permute){
4974c16a6a6SHong Zhang     ai = a->i; aj = a->j; aa = a->a;
4984c16a6a6SHong Zhang   } else {
4994c16a6a6SHong Zhang     ai   = a->inew; aj = a->jnew;
50082502324SSatish Balay     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
5014c16a6a6SHong Zhang     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
50213f74950SBarry Smith     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
50313f74950SBarry Smith     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
5044c16a6a6SHong Zhang 
5054c16a6a6SHong Zhang     for (i=0; i<mbs; i++){
5064c16a6a6SHong Zhang       jmin = ai[i]; jmax = ai[i+1];
5074c16a6a6SHong Zhang       for (j=jmin; j<jmax; j++){
5084c16a6a6SHong Zhang         while (a2anew[j] != j){
5094c16a6a6SHong Zhang           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
5104c16a6a6SHong Zhang           for (k1=0; k1<bs2; k1++){
5114c16a6a6SHong Zhang             dk[k1]       = aa[k*bs2+k1];
5124c16a6a6SHong Zhang             aa[k*bs2+k1] = aa[j*bs2+k1];
5134c16a6a6SHong Zhang             aa[j*bs2+k1] = dk[k1];
5144c16a6a6SHong Zhang           }
5154c16a6a6SHong Zhang         }
5164c16a6a6SHong Zhang         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
5174c16a6a6SHong Zhang         if (i > aj[j]){
5184c16a6a6SHong Zhang           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
5194c16a6a6SHong Zhang           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
5204c16a6a6SHong Zhang           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
5214c16a6a6SHong Zhang           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
5224c16a6a6SHong Zhang             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
5234c16a6a6SHong Zhang           }
5244c16a6a6SHong Zhang         }
5254c16a6a6SHong Zhang       }
5264c16a6a6SHong Zhang     }
527323b833fSBarry Smith     ierr = PetscFree(a2anew);CHKERRQ(ierr);
5284c16a6a6SHong Zhang   }
5294c16a6a6SHong Zhang 
5304c16a6a6SHong Zhang   /* for each row k */
5314c16a6a6SHong Zhang   for (k = 0; k<mbs; k++){
5324c16a6a6SHong Zhang 
5334c16a6a6SHong Zhang     /*initialize k-th row with elements nonzero in row perm(k) of A */
5344c16a6a6SHong Zhang     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
535057f5ba7SHong Zhang 
5364c16a6a6SHong Zhang     ap = aa + jmin*bs2;
5374c16a6a6SHong Zhang     for (j = jmin; j < jmax; j++){
5384c16a6a6SHong Zhang       vj = perm_ptr[aj[j]];         /* block col. index */
5394c16a6a6SHong Zhang       rtmp_ptr = rtmp + vj*bs2;
5404c16a6a6SHong Zhang       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
5414c16a6a6SHong Zhang     }
5424c16a6a6SHong Zhang 
5434c16a6a6SHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
5444c16a6a6SHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5454c16a6a6SHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
5464c16a6a6SHong Zhang 
547057f5ba7SHong Zhang     while (i < k){
5484c16a6a6SHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
5494c16a6a6SHong Zhang 
5504c16a6a6SHong Zhang       /* compute multiplier */
5514c16a6a6SHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
5524c16a6a6SHong Zhang 
5534c16a6a6SHong Zhang       /* uik = -inv(Di)*U_bar(i,k) */
5544c16a6a6SHong Zhang       diag = ba + i*bs2;
5554c16a6a6SHong Zhang       u    = ba + ili*bs2;
5564c16a6a6SHong Zhang       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5574c16a6a6SHong Zhang       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
5584c16a6a6SHong Zhang 
5594c16a6a6SHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
5604c16a6a6SHong Zhang       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
5614c16a6a6SHong Zhang 
5624c16a6a6SHong Zhang       /* update -U(i,k) */
5634c16a6a6SHong Zhang       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5644c16a6a6SHong Zhang 
5654c16a6a6SHong Zhang       /* add multiple of row i to k-th row ... */
5664c16a6a6SHong Zhang       jmin = ili + 1; jmax = bi[i+1];
5674c16a6a6SHong Zhang       if (jmin < jmax){
5684c16a6a6SHong Zhang         for (j=jmin; j<jmax; j++) {
5694c16a6a6SHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j) */
5704c16a6a6SHong Zhang           rtmp_ptr = rtmp + bj[j]*bs2;
5714c16a6a6SHong Zhang           u = ba + j*bs2;
5724c16a6a6SHong Zhang           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
5734c16a6a6SHong Zhang         }
5744c16a6a6SHong Zhang 
5754c16a6a6SHong Zhang         /* ... add i to row list for next nonzero entry */
5764c16a6a6SHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
5774c16a6a6SHong Zhang         j     = bj[jmin];
5784c16a6a6SHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
5794c16a6a6SHong Zhang       }
5804c16a6a6SHong Zhang       i = nexti;
5814c16a6a6SHong Zhang     }
5824c16a6a6SHong Zhang 
5834c16a6a6SHong Zhang     /* save nonzero entries in k-th row of U ... */
5844c16a6a6SHong Zhang 
5854c16a6a6SHong Zhang     /* invert diagonal block */
5864c16a6a6SHong Zhang     diag = ba+k*bs2;
5874c16a6a6SHong Zhang     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
588d230e6fdSBarry Smith     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
5894c16a6a6SHong Zhang 
5904c16a6a6SHong Zhang     jmin = bi[k]; jmax = bi[k+1];
5914c16a6a6SHong Zhang     if (jmin < jmax) {
5924c16a6a6SHong Zhang       for (j=jmin; j<jmax; j++){
5934c16a6a6SHong Zhang          vj = bj[j];           /* block col. index of U */
5944c16a6a6SHong Zhang          u   = ba + j*bs2;
5954c16a6a6SHong Zhang          rtmp_ptr = rtmp + vj*bs2;
5964c16a6a6SHong Zhang          for (k1=0; k1<bs2; k1++){
5974c16a6a6SHong Zhang            *u++        = *rtmp_ptr;
5984c16a6a6SHong Zhang            *rtmp_ptr++ = 0.0;
5994c16a6a6SHong Zhang          }
6004c16a6a6SHong Zhang       }
6014c16a6a6SHong Zhang 
6024c16a6a6SHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
6034c16a6a6SHong Zhang       il[k] = jmin;
6044c16a6a6SHong Zhang       i     = bj[jmin];
6054c16a6a6SHong Zhang       jl[k] = jl[i]; jl[i] = k;
6064c16a6a6SHong Zhang     }
6074c16a6a6SHong Zhang   }
6084c16a6a6SHong Zhang 
6094c16a6a6SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
6104c16a6a6SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
6114c16a6a6SHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
6124c16a6a6SHong Zhang   ierr = PetscFree(pivots);CHKERRQ(ierr);
6134c16a6a6SHong Zhang   if (a->permute){
6144c16a6a6SHong Zhang     ierr = PetscFree(aa);CHKERRQ(ierr);
6154c16a6a6SHong Zhang   }
6164c16a6a6SHong Zhang 
6174c16a6a6SHong Zhang   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
6184c16a6a6SHong Zhang   C->factor       = FACTOR_CHOLESKY;
6194c16a6a6SHong Zhang   C->assembled    = PETSC_TRUE;
6204c16a6a6SHong Zhang   C->preallocated = PETSC_TRUE;
621b0a32e0cSBarry Smith   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
6224c16a6a6SHong Zhang   PetscFunctionReturn(0);
6234c16a6a6SHong Zhang }
624d4edadd4SHong Zhang 
6254a2ae208SSatish Balay #undef __FUNCT__
6264a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
627dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
628671cb588SHong Zhang {
629671cb588SHong Zhang   Mat            C = *B;
630671cb588SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
631dfbe8321SBarry Smith   PetscErrorCode ierr;
63213f74950SBarry Smith   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
63313f74950SBarry Smith   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
63413f74950SBarry Smith   PetscInt       bs=A->bs,bs2 = a->bs2;
635671cb588SHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
636671cb588SHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
63728de702eSHong Zhang   MatScalar      *work;
63813f74950SBarry Smith   PetscInt       *pivots;
639671cb588SHong Zhang 
640671cb588SHong Zhang   PetscFunctionBegin;
641671cb588SHong Zhang   /* initialization */
642671cb588SHong Zhang 
64382502324SSatish Balay   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
644671cb588SHong Zhang   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
64513f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
64628de702eSHong Zhang   jl   = il + mbs;
647671cb588SHong Zhang   for (i=0; i<mbs; i++) {
648671cb588SHong Zhang     jl[i] = mbs; il[0] = 0;
649671cb588SHong Zhang   }
650b0a32e0cSBarry Smith   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
65128de702eSHong Zhang   uik  = dk + bs2;
65228de702eSHong Zhang   work = uik + bs2;
65313f74950SBarry Smith   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
654671cb588SHong Zhang 
655671cb588SHong Zhang   ai = a->i; aj = a->j; aa = a->a;
656671cb588SHong Zhang 
657671cb588SHong Zhang   /* for each row k */
658671cb588SHong Zhang   for (k = 0; k<mbs; k++){
659671cb588SHong Zhang 
660671cb588SHong Zhang     /*initialize k-th row with elements nonzero in row k of A */
661671cb588SHong Zhang     jmin = ai[k]; jmax = ai[k+1];
662671cb588SHong Zhang     ap = aa + jmin*bs2;
663671cb588SHong Zhang     for (j = jmin; j < jmax; j++){
664671cb588SHong Zhang       vj = aj[j];         /* block col. index */
665671cb588SHong Zhang       rtmp_ptr = rtmp + vj*bs2;
666671cb588SHong Zhang       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
667671cb588SHong Zhang     }
668671cb588SHong Zhang 
669671cb588SHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
670671cb588SHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
671671cb588SHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
672671cb588SHong Zhang 
673057f5ba7SHong Zhang     while (i < k){
674671cb588SHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
675671cb588SHong Zhang 
676671cb588SHong Zhang       /* compute multiplier */
677671cb588SHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
678671cb588SHong Zhang 
679671cb588SHong Zhang       /* uik = -inv(Di)*U_bar(i,k) */
680671cb588SHong Zhang       diag = ba + i*bs2;
681671cb588SHong Zhang       u    = ba + ili*bs2;
682671cb588SHong Zhang       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
683671cb588SHong Zhang       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
684671cb588SHong Zhang 
685671cb588SHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
686671cb588SHong Zhang       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
687671cb588SHong Zhang 
688671cb588SHong Zhang       /* update -U(i,k) */
689671cb588SHong Zhang       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
690671cb588SHong Zhang 
691671cb588SHong Zhang       /* add multiple of row i to k-th row ... */
692671cb588SHong Zhang       jmin = ili + 1; jmax = bi[i+1];
693671cb588SHong Zhang       if (jmin < jmax){
694671cb588SHong Zhang         for (j=jmin; j<jmax; j++) {
695671cb588SHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j) */
696671cb588SHong Zhang           rtmp_ptr = rtmp + bj[j]*bs2;
697671cb588SHong Zhang           u = ba + j*bs2;
698671cb588SHong Zhang           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
699671cb588SHong Zhang         }
700671cb588SHong Zhang 
701671cb588SHong Zhang         /* ... add i to row list for next nonzero entry */
702671cb588SHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
703671cb588SHong Zhang         j     = bj[jmin];
704671cb588SHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
705671cb588SHong Zhang       }
706671cb588SHong Zhang       i = nexti;
707671cb588SHong Zhang     }
708671cb588SHong Zhang 
709671cb588SHong Zhang     /* save nonzero entries in k-th row of U ... */
710671cb588SHong Zhang 
711671cb588SHong Zhang     /* invert diagonal block */
712671cb588SHong Zhang     diag = ba+k*bs2;
713671cb588SHong Zhang     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
714d230e6fdSBarry Smith     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
715671cb588SHong Zhang 
716671cb588SHong Zhang     jmin = bi[k]; jmax = bi[k+1];
717671cb588SHong Zhang     if (jmin < jmax) {
718671cb588SHong Zhang       for (j=jmin; j<jmax; j++){
719671cb588SHong Zhang          vj = bj[j];           /* block col. index of U */
720671cb588SHong Zhang          u   = ba + j*bs2;
721671cb588SHong Zhang          rtmp_ptr = rtmp + vj*bs2;
722671cb588SHong Zhang          for (k1=0; k1<bs2; k1++){
723671cb588SHong Zhang            *u++        = *rtmp_ptr;
724671cb588SHong Zhang            *rtmp_ptr++ = 0.0;
725671cb588SHong Zhang          }
726671cb588SHong Zhang       }
727671cb588SHong Zhang 
728671cb588SHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
729671cb588SHong Zhang       il[k] = jmin;
730671cb588SHong Zhang       i     = bj[jmin];
731671cb588SHong Zhang       jl[k] = jl[i]; jl[i] = k;
732671cb588SHong Zhang     }
733671cb588SHong Zhang   }
734671cb588SHong Zhang 
735671cb588SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
736671cb588SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
737671cb588SHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
738671cb588SHong Zhang   ierr = PetscFree(pivots);CHKERRQ(ierr);
739671cb588SHong Zhang 
740671cb588SHong Zhang   C->factor    = FACTOR_CHOLESKY;
741671cb588SHong Zhang   C->assembled = PETSC_TRUE;
742671cb588SHong Zhang   C->preallocated = PETSC_TRUE;
743b0a32e0cSBarry Smith   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
744671cb588SHong Zhang   PetscFunctionReturn(0);
745671cb588SHong Zhang }
746671cb588SHong Zhang 
74749b5e25fSSatish Balay /*
748fcf159c0SHong Zhang     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
749cc0c071aSHong Zhang     Version for blocks 2 by 2.
75049b5e25fSSatish Balay */
7514a2ae208SSatish Balay #undef __FUNCT__
7524a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
753dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
75449b5e25fSSatish Balay {
75549b5e25fSSatish Balay   Mat            C = *B;
75691602c66SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
757cc0c071aSHong Zhang   IS             perm = b->row;
7586849ba73SBarry Smith   PetscErrorCode ierr;
75913f74950SBarry Smith   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
76013f74950SBarry Smith   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
761a1723e09SHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
762cc0c071aSHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
76349b5e25fSSatish Balay 
76449b5e25fSSatish Balay   PetscFunctionBegin;
76591602c66SHong Zhang   /* initialization */
76691602c66SHong Zhang   /* il and jl record the first nonzero element in each row of the accessing
76791602c66SHong Zhang      window U(0:k, k:mbs-1).
76891602c66SHong Zhang      jl:    list of rows to be added to uneliminated rows
76991602c66SHong Zhang             i>= k: jl(i) is the first row to be added to row i
77091602c66SHong Zhang             i<  k: jl(i) is the row following row i in some list of rows
77191602c66SHong Zhang             jl(i) = mbs indicates the end of a list
77291602c66SHong Zhang      il(i): points to the first nonzero element in columns k,...,mbs-1 of
77391602c66SHong Zhang             row i of U */
77482502324SSatish Balay   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
775cc0c071aSHong Zhang   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
77613f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
77728de702eSHong Zhang   jl   = il + mbs;
77891602c66SHong Zhang   for (i=0; i<mbs; i++) {
7793845f261SHong Zhang     jl[i] = mbs; il[0] = 0;
78091602c66SHong Zhang   }
78182502324SSatish Balay   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
78228de702eSHong Zhang   uik  = dk + 4;
783cc0c071aSHong Zhang   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
784cc0c071aSHong Zhang 
785cc0c071aSHong Zhang   /* check permutation */
786cc0c071aSHong Zhang   if (!a->permute){
787cc0c071aSHong Zhang     ai = a->i; aj = a->j; aa = a->a;
788cc0c071aSHong Zhang   } else {
789cc0c071aSHong Zhang     ai   = a->inew; aj = a->jnew;
79082502324SSatish Balay     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
791cc0c071aSHong Zhang     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
79213f74950SBarry Smith     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
79313f74950SBarry Smith     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
794cc0c071aSHong Zhang 
795cc0c071aSHong Zhang     for (i=0; i<mbs; i++){
796cc0c071aSHong Zhang       jmin = ai[i]; jmax = ai[i+1];
797cc0c071aSHong Zhang       for (j=jmin; j<jmax; j++){
798cc0c071aSHong Zhang         while (a2anew[j] != j){
799cc0c071aSHong Zhang           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
800cc0c071aSHong Zhang           for (k1=0; k1<4; k1++){
801cc0c071aSHong Zhang             dk[k1]       = aa[k*4+k1];
802cc0c071aSHong Zhang             aa[k*4+k1] = aa[j*4+k1];
803cc0c071aSHong Zhang             aa[j*4+k1] = dk[k1];
804cc0c071aSHong Zhang           }
805cc0c071aSHong Zhang         }
806cc0c071aSHong Zhang         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
807cc0c071aSHong Zhang         if (i > aj[j]){
808a1723e09SHong Zhang           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
809cc0c071aSHong Zhang           ap = aa + j*4;     /* ptr to the beginning of the block */
810cc0c071aSHong Zhang           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
811cc0c071aSHong Zhang           ap[1] = ap[2];
812cc0c071aSHong Zhang           ap[2] = dk[1];
813cc0c071aSHong Zhang         }
814cc0c071aSHong Zhang       }
815cc0c071aSHong Zhang     }
816ac355199SBarry Smith     ierr = PetscFree(a2anew);CHKERRQ(ierr);
817cc0c071aSHong Zhang   }
8183845f261SHong Zhang 
81991602c66SHong Zhang   /* for each row k */
82091602c66SHong Zhang   for (k = 0; k<mbs; k++){
82191602c66SHong Zhang 
82291602c66SHong Zhang     /*initialize k-th row with elements nonzero in row perm(k) of A */
823cc0c071aSHong Zhang     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
824cc0c071aSHong Zhang     ap = aa + jmin*4;
82591602c66SHong Zhang     for (j = jmin; j < jmax; j++){
826cc0c071aSHong Zhang       vj = perm_ptr[aj[j]];         /* block col. index */
827cc0c071aSHong Zhang       rtmp_ptr = rtmp + vj*4;
828cc0c071aSHong Zhang       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
82991602c66SHong Zhang     }
83091602c66SHong Zhang 
83191602c66SHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
832cc0c071aSHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
83391602c66SHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
83491602c66SHong Zhang 
835057f5ba7SHong Zhang     while (i < k){
83691602c66SHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
83791602c66SHong Zhang 
8383845f261SHong Zhang       /* compute multiplier */
83991602c66SHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
8403845f261SHong Zhang 
8413845f261SHong Zhang       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
842cc0c071aSHong Zhang       diag = ba + i*4;
843cc0c071aSHong Zhang       u    = ba + ili*4;
844cc0c071aSHong Zhang       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
845cc0c071aSHong Zhang       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
846cc0c071aSHong Zhang       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
847cc0c071aSHong Zhang       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
8483845f261SHong Zhang 
8493845f261SHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
850cc0c071aSHong Zhang       dk[0] += uik[0]*u[0] + uik[1]*u[1];
851cc0c071aSHong Zhang       dk[1] += uik[2]*u[0] + uik[3]*u[1];
852cc0c071aSHong Zhang       dk[2] += uik[0]*u[2] + uik[1]*u[3];
853cc0c071aSHong Zhang       dk[3] += uik[2]*u[2] + uik[3]*u[3];
8543845f261SHong Zhang 
8553845f261SHong Zhang       /* update -U(i,k): ba[ili] = uik */
856cc0c071aSHong Zhang       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
85791602c66SHong Zhang 
85891602c66SHong Zhang       /* add multiple of row i to k-th row ... */
85991602c66SHong Zhang       jmin = ili + 1; jmax = bi[i+1];
86091602c66SHong Zhang       if (jmin < jmax){
8613845f261SHong Zhang         for (j=jmin; j<jmax; j++) {
8623845f261SHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
863cc0c071aSHong Zhang           rtmp_ptr = rtmp + bj[j]*4;
864cc0c071aSHong Zhang           u = ba + j*4;
865cc0c071aSHong Zhang           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
866cc0c071aSHong Zhang           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
867cc0c071aSHong Zhang           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
868cc0c071aSHong Zhang           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
8693845f261SHong Zhang         }
8703845f261SHong Zhang 
87191602c66SHong Zhang         /* ... add i to row list for next nonzero entry */
87291602c66SHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
87391602c66SHong Zhang         j     = bj[jmin];
87491602c66SHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
87591602c66SHong Zhang       }
876a1723e09SHong Zhang       i = nexti;
87791602c66SHong Zhang     }
878cc0c071aSHong Zhang 
87991602c66SHong Zhang     /* save nonzero entries in k-th row of U ... */
8803845f261SHong Zhang 
881cc0c071aSHong Zhang     /* invert diagonal block */
882cc0c071aSHong Zhang     diag = ba+k*4;
883cc0c071aSHong Zhang     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
8843845f261SHong Zhang     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
8853845f261SHong Zhang 
88691602c66SHong Zhang     jmin = bi[k]; jmax = bi[k+1];
88791602c66SHong Zhang     if (jmin < jmax) {
88891602c66SHong Zhang       for (j=jmin; j<jmax; j++){
889cc0c071aSHong Zhang          vj = bj[j];           /* block col. index of U */
890cc0c071aSHong Zhang          u   = ba + j*4;
891cc0c071aSHong Zhang          rtmp_ptr = rtmp + vj*4;
892cc0c071aSHong Zhang          for (k1=0; k1<4; k1++){
893cc0c071aSHong Zhang            *u++        = *rtmp_ptr;
894cc0c071aSHong Zhang            *rtmp_ptr++ = 0.0;
895cc0c071aSHong Zhang          }
896cc0c071aSHong Zhang       }
8973845f261SHong Zhang 
89891602c66SHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
89991602c66SHong Zhang       il[k] = jmin;
90091602c66SHong Zhang       i     = bj[jmin];
90191602c66SHong Zhang       jl[k] = jl[i]; jl[i] = k;
90291602c66SHong Zhang     }
90391602c66SHong Zhang   }
9043845f261SHong Zhang 
90549b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
90691602c66SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
9073845f261SHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
90891602c66SHong Zhang   if (a->permute) {
90991602c66SHong Zhang     ierr = PetscFree(aa);CHKERRQ(ierr);
91091602c66SHong Zhang   }
911cc0c071aSHong Zhang   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
91291602c66SHong Zhang   C->factor    = FACTOR_CHOLESKY;
91349b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
9145c0bcdfcSHong Zhang   C->preallocated = PETSC_TRUE;
915b0a32e0cSBarry Smith   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
91649b5e25fSSatish Balay   PetscFunctionReturn(0);
91749b5e25fSSatish Balay }
91891602c66SHong Zhang 
91949b5e25fSSatish Balay /*
92049b5e25fSSatish Balay       Version for when blocks are 2 by 2 Using natural ordering
92149b5e25fSSatish Balay */
9224a2ae208SSatish Balay #undef __FUNCT__
9234a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
924dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
92549b5e25fSSatish Balay {
92649b5e25fSSatish Balay   Mat            C = *B;
927ab27746eSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
928dfbe8321SBarry Smith   PetscErrorCode ierr;
92913f74950SBarry Smith   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
93013f74950SBarry Smith   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
931ab27746eSHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
932ab27746eSHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
93349b5e25fSSatish Balay 
93449b5e25fSSatish Balay   PetscFunctionBegin;
935ab27746eSHong Zhang   /* initialization */
936ab27746eSHong Zhang   /* il and jl record the first nonzero element in each row of the accessing
937ab27746eSHong Zhang      window U(0:k, k:mbs-1).
938ab27746eSHong Zhang      jl:    list of rows to be added to uneliminated rows
939ab27746eSHong Zhang             i>= k: jl(i) is the first row to be added to row i
940ab27746eSHong Zhang             i<  k: jl(i) is the row following row i in some list of rows
941ab27746eSHong Zhang             jl(i) = mbs indicates the end of a list
942ab27746eSHong Zhang      il(i): points to the first nonzero element in columns k,...,mbs-1 of
943ab27746eSHong Zhang             row i of U */
94482502324SSatish Balay   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
945ab27746eSHong Zhang   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
94613f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
94728de702eSHong Zhang   jl   = il + mbs;
948ab27746eSHong Zhang   for (i=0; i<mbs; i++) {
949ab27746eSHong Zhang     jl[i] = mbs; il[0] = 0;
95049b5e25fSSatish Balay   }
95182502324SSatish Balay   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
95228de702eSHong Zhang   uik  = dk + 4;
953ab27746eSHong Zhang 
954ab27746eSHong Zhang   ai = a->i; aj = a->j; aa = a->a;
955ab27746eSHong Zhang 
956ab27746eSHong Zhang   /* for each row k */
957ab27746eSHong Zhang   for (k = 0; k<mbs; k++){
958ab27746eSHong Zhang 
959ab27746eSHong Zhang     /*initialize k-th row with elements nonzero in row k of A */
960ab27746eSHong Zhang     jmin = ai[k]; jmax = ai[k+1];
961ab27746eSHong Zhang     ap = aa + jmin*4;
962ab27746eSHong Zhang     for (j = jmin; j < jmax; j++){
963ab27746eSHong Zhang       vj = aj[j];         /* block col. index */
964ab27746eSHong Zhang       rtmp_ptr = rtmp + vj*4;
965ab27746eSHong Zhang       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
96649b5e25fSSatish Balay     }
967ab27746eSHong Zhang 
968ab27746eSHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
969ab27746eSHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
970ab27746eSHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
971ab27746eSHong Zhang 
972057f5ba7SHong Zhang     while (i < k){
973ab27746eSHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
974ab27746eSHong Zhang 
975ab27746eSHong Zhang       /* compute multiplier */
976ab27746eSHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
977ab27746eSHong Zhang 
978ab27746eSHong Zhang       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
979ab27746eSHong Zhang       diag = ba + i*4;
980ab27746eSHong Zhang       u    = ba + ili*4;
981ab27746eSHong Zhang       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
982ab27746eSHong Zhang       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
983ab27746eSHong Zhang       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
984ab27746eSHong Zhang       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
985ab27746eSHong Zhang 
986ab27746eSHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
987ab27746eSHong Zhang       dk[0] += uik[0]*u[0] + uik[1]*u[1];
988ab27746eSHong Zhang       dk[1] += uik[2]*u[0] + uik[3]*u[1];
989ab27746eSHong Zhang       dk[2] += uik[0]*u[2] + uik[1]*u[3];
990ab27746eSHong Zhang       dk[3] += uik[2]*u[2] + uik[3]*u[3];
991ab27746eSHong Zhang 
992ab27746eSHong Zhang       /* update -U(i,k): ba[ili] = uik */
993ab27746eSHong Zhang       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
994ab27746eSHong Zhang 
995ab27746eSHong Zhang       /* add multiple of row i to k-th row ... */
996ab27746eSHong Zhang       jmin = ili + 1; jmax = bi[i+1];
997ab27746eSHong Zhang       if (jmin < jmax){
998ab27746eSHong Zhang         for (j=jmin; j<jmax; j++) {
999ab27746eSHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1000ab27746eSHong Zhang           rtmp_ptr = rtmp + bj[j]*4;
1001ab27746eSHong Zhang           u = ba + j*4;
1002ab27746eSHong Zhang           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1003ab27746eSHong Zhang           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1004ab27746eSHong Zhang           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1005ab27746eSHong Zhang           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
100649b5e25fSSatish Balay         }
1007ab27746eSHong Zhang 
1008ab27746eSHong Zhang         /* ... add i to row list for next nonzero entry */
1009ab27746eSHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1010ab27746eSHong Zhang         j     = bj[jmin];
1011ab27746eSHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
101249b5e25fSSatish Balay       }
1013ab27746eSHong Zhang       i = nexti;
101449b5e25fSSatish Balay     }
1015ab27746eSHong Zhang 
1016ab27746eSHong Zhang     /* save nonzero entries in k-th row of U ... */
1017ab27746eSHong Zhang 
101849b5e25fSSatish Balay     /* invert diagonal block */
1019ab27746eSHong Zhang     diag = ba+k*4;
1020ab27746eSHong Zhang     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1021ab27746eSHong Zhang     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1022ab27746eSHong Zhang 
1023ab27746eSHong Zhang     jmin = bi[k]; jmax = bi[k+1];
1024ab27746eSHong Zhang     if (jmin < jmax) {
1025ab27746eSHong Zhang       for (j=jmin; j<jmax; j++){
1026ab27746eSHong Zhang          vj = bj[j];           /* block col. index of U */
1027ab27746eSHong Zhang          u   = ba + j*4;
1028ab27746eSHong Zhang          rtmp_ptr = rtmp + vj*4;
1029ab27746eSHong Zhang          for (k1=0; k1<4; k1++){
1030ab27746eSHong Zhang            *u++        = *rtmp_ptr;
1031ab27746eSHong Zhang            *rtmp_ptr++ = 0.0;
1032ab27746eSHong Zhang          }
1033ab27746eSHong Zhang       }
1034ab27746eSHong Zhang 
1035ab27746eSHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
1036ab27746eSHong Zhang       il[k] = jmin;
1037ab27746eSHong Zhang       i     = bj[jmin];
1038ab27746eSHong Zhang       jl[k] = jl[i]; jl[i] = k;
1039ab27746eSHong Zhang     }
104049b5e25fSSatish Balay   }
104149b5e25fSSatish Balay 
104249b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1043ab27746eSHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
1044ab27746eSHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
1045ab27746eSHong Zhang 
1046ab27746eSHong Zhang   C->factor    = FACTOR_CHOLESKY;
104749b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
10485c0bcdfcSHong Zhang   C->preallocated = PETSC_TRUE;
1049b0a32e0cSBarry Smith   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
105049b5e25fSSatish Balay   PetscFunctionReturn(0);
105149b5e25fSSatish Balay }
105249b5e25fSSatish Balay 
105349b5e25fSSatish Balay /*
10545c0bcdfcSHong Zhang     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
105591602c66SHong Zhang     Version for blocks are 1 by 1.
105649b5e25fSSatish Balay */
10574a2ae208SSatish Balay #undef __FUNCT__
10584a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1059dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
106049b5e25fSSatish Balay {
106149b5e25fSSatish Balay   Mat            C = *B;
106249b5e25fSSatish Balay   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
106349b5e25fSSatish Balay   IS             ip = b->row;
10646849ba73SBarry Smith   PetscErrorCode ierr;
106513f74950SBarry Smith   PetscInt       *rip,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
106613f74950SBarry Smith   PetscInt       *ai,*aj,*r;
106713f74950SBarry Smith   PetscInt       k,jmin,jmax,*jl,*il,vj,nexti,ili;
1068066653e3SSatish Balay   MatScalar      *rtmp;
10692d007305SHong Zhang   MatScalar      *ba = b->a,*aa,ak;
107049b5e25fSSatish Balay   MatScalar      dk,uikdi;
107149b5e25fSSatish Balay 
107249b5e25fSSatish Balay   PetscFunctionBegin;
107349b5e25fSSatish Balay   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1074cb718733SHong Zhang   if (!a->permute){
10752d007305SHong Zhang     ai = a->i; aj = a->j; aa = a->a;
10762d007305SHong Zhang   } else {
10772d007305SHong Zhang     ai = a->inew; aj = a->jnew;
107882502324SSatish Balay     ierr = PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1079cb718733SHong Zhang     ierr = PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
108013f74950SBarry Smith     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&r);CHKERRQ(ierr);
108113f74950SBarry Smith     ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
10822d007305SHong Zhang 
10832d007305SHong Zhang     jmin = ai[0]; jmax = ai[mbs];
10842d007305SHong Zhang     for (j=jmin; j<jmax; j++){
1085cb718733SHong Zhang       while (r[j] != j){
10862d007305SHong Zhang         k = r[j]; r[j] = r[k]; r[k] = k;
10872d007305SHong Zhang         ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
10882d007305SHong Zhang       }
10892d007305SHong Zhang     }
1090ac355199SBarry Smith     ierr = PetscFree(r);CHKERRQ(ierr);
10912d007305SHong Zhang   }
10922d007305SHong Zhang 
109391602c66SHong Zhang   /* initialization */
109449b5e25fSSatish Balay   /* il and jl record the first nonzero element in each row of the accessing
109549b5e25fSSatish Balay      window U(0:k, k:mbs-1).
109649b5e25fSSatish Balay      jl:    list of rows to be added to uneliminated rows
109749b5e25fSSatish Balay             i>= k: jl(i) is the first row to be added to row i
109849b5e25fSSatish Balay             i<  k: jl(i) is the row following row i in some list of rows
109949b5e25fSSatish Balay             jl(i) = mbs indicates the end of a list
110049b5e25fSSatish Balay      il(i): points to the first nonzero element in columns k,...,mbs-1 of
110149b5e25fSSatish Balay             row i of U */
110282502324SSatish Balay   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
110313f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
110428de702eSHong Zhang   jl   = il + mbs;
110549b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
110649b5e25fSSatish Balay     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
110749b5e25fSSatish Balay   }
110849b5e25fSSatish Balay 
110991602c66SHong Zhang   /* for each row k */
111049b5e25fSSatish Balay   for (k = 0; k<mbs; k++){
111149b5e25fSSatish Balay 
111291602c66SHong Zhang     /*initialize k-th row with elements nonzero in row perm(k) of A */
111349b5e25fSSatish Balay     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1114057f5ba7SHong Zhang 
111549b5e25fSSatish Balay     for (j = jmin; j < jmax; j++){
1116351d0355SHong Zhang       vj = rip[aj[j]];
1117ab27746eSHong Zhang       rtmp[vj] = aa[j];
111849b5e25fSSatish Balay     }
111949b5e25fSSatish Balay 
112091602c66SHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
112149b5e25fSSatish Balay     dk = rtmp[k];
112249b5e25fSSatish Balay     i = jl[k]; /* first row to be added to k_th row  */
112349b5e25fSSatish Balay 
1124057f5ba7SHong Zhang     while (i < k){
112549b5e25fSSatish Balay       nexti = jl[i]; /* next row to be added to k_th row */
112649b5e25fSSatish Balay 
112791602c66SHong Zhang       /* compute multiplier, update D(k) and U(i,k) */
112849b5e25fSSatish Balay       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
112949b5e25fSSatish Balay       uikdi = - ba[ili]*ba[i];
113049b5e25fSSatish Balay       dk += uikdi*ba[ili];
1131658e7b3eSHong Zhang       ba[ili] = uikdi; /* -U(i,k) */
113249b5e25fSSatish Balay 
113391602c66SHong Zhang       /* add multiple of row i to k-th row ... */
113449b5e25fSSatish Balay       jmin = ili + 1; jmax = bi[i+1];
113549b5e25fSSatish Balay       if (jmin < jmax){
113649b5e25fSSatish Balay         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
113791602c66SHong Zhang         /* ... add i to row list for next nonzero entry */
113849b5e25fSSatish Balay         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
113949b5e25fSSatish Balay         j     = bj[jmin];
114049b5e25fSSatish Balay         jl[i] = jl[j]; jl[j] = i; /* update jl */
114149b5e25fSSatish Balay       }
1142ab27746eSHong Zhang       i = nexti;
114349b5e25fSSatish Balay     }
114449b5e25fSSatish Balay 
114591602c66SHong Zhang     /* check for zero pivot and save diagoanl element */
11465b8514ebSBarry Smith     if (dk == 0.0){
114729bbc08cSBarry Smith       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1148671cb588SHong Zhang       /*
11491223c01bSHong Zhang     } else if (PetscRealPart(dk) < 0.0){
11501223c01bSHong Zhang       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Negative pivot: d[%d] = %g\n",k,dk);
1151671cb588SHong Zhang       */
115249b5e25fSSatish Balay     }
115349b5e25fSSatish Balay 
115491602c66SHong Zhang     /* save nonzero entries in k-th row of U ... */
1155f3dd7b2dSKris Buschelman     ba[k] = 1.0/dk;
115649b5e25fSSatish Balay     jmin = bi[k]; jmax = bi[k+1];
115749b5e25fSSatish Balay     if (jmin < jmax) {
115849b5e25fSSatish Balay       for (j=jmin; j<jmax; j++){
1159cc0c071aSHong Zhang          vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
116049b5e25fSSatish Balay       }
116191602c66SHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
116249b5e25fSSatish Balay       il[k] = jmin;
116349b5e25fSSatish Balay       i     = bj[jmin];
116449b5e25fSSatish Balay       jl[k] = jl[i]; jl[i] = k;
116549b5e25fSSatish Balay     }
116649b5e25fSSatish Balay   }
116749b5e25fSSatish Balay 
116849b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
116949b5e25fSSatish Balay   ierr = PetscFree(il);CHKERRQ(ierr);
1170cb718733SHong Zhang   if (a->permute){
1171cb718733SHong Zhang     ierr = PetscFree(aa);CHKERRQ(ierr);
1172cb718733SHong Zhang   }
117349b5e25fSSatish Balay 
117449b5e25fSSatish Balay   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
11758be8c0c7SHong Zhang   C->factor    = FACTOR_CHOLESKY;
117649b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
11775c0bcdfcSHong Zhang   C->preallocated = PETSC_TRUE;
1178b0a32e0cSBarry Smith   PetscLogFlops(b->mbs);
117949b5e25fSSatish Balay   PetscFunctionReturn(0);
118049b5e25fSSatish Balay }
118149b5e25fSSatish Balay 
1182671cb588SHong Zhang /*
1183671cb588SHong Zhang   Version for when blocks are 1 by 1 Using natural ordering
1184671cb588SHong Zhang */
11854a2ae208SSatish Balay #undef __FUNCT__
11864a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1187dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
1188671cb588SHong Zhang {
1189671cb588SHong Zhang   Mat            C = *B;
1190671cb588SHong Zhang   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1191dfbe8321SBarry Smith   PetscErrorCode ierr;
119213f74950SBarry Smith   PetscInt       i,j,mbs = a->mbs;
119313f74950SBarry Smith   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
119413f74950SBarry Smith   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1195653a6975SHong Zhang   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
119621c26570Svictorle   PetscReal      damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount;
119721c26570Svictorle   PetscTruth     damp,chshift;
119813f74950SBarry Smith   PetscInt       nshift=0;
1199653a6975SHong Zhang 
1200653a6975SHong Zhang   PetscFunctionBegin;
1201653a6975SHong Zhang   /* initialization */
1202653a6975SHong Zhang   /* il and jl record the first nonzero element in each row of the accessing
1203653a6975SHong Zhang      window U(0:k, k:mbs-1).
1204653a6975SHong Zhang      jl:    list of rows to be added to uneliminated rows
1205653a6975SHong Zhang             i>= k: jl(i) is the first row to be added to row i
1206653a6975SHong Zhang             i<  k: jl(i) is the row following row i in some list of rows
1207653a6975SHong Zhang             jl(i) = mbs indicates the end of a list
1208653a6975SHong Zhang      il(i): points to the first nonzero element in U(i,k:mbs-1)
1209653a6975SHong Zhang   */
1210653a6975SHong Zhang   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
121113f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1212653a6975SHong Zhang   jl   = il + mbs;
1213b00f7748SHong Zhang 
121421c26570Svictorle   shift_amount = 0;
1215b00f7748SHong Zhang   do {
1216b00f7748SHong Zhang     damp = PETSC_FALSE;
121721c26570Svictorle     chshift = PETSC_FALSE;
1218653a6975SHong Zhang     for (i=0; i<mbs; i++) {
1219653a6975SHong Zhang       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1220653a6975SHong Zhang     }
1221653a6975SHong Zhang 
1222b00f7748SHong Zhang     for (k = 0; k<mbs; k++){ /* row k */
1223653a6975SHong Zhang     /*initialize k-th row with elements nonzero in row perm(k) of A */
1224653a6975SHong Zhang       nz   = ai[k+1] - ai[k];
1225653a6975SHong Zhang       acol = aj + ai[k];
1226653a6975SHong Zhang       aval = aa + ai[k];
1227653a6975SHong Zhang       bval = ba + bi[k];
1228653a6975SHong Zhang       while (nz -- ){
1229653a6975SHong Zhang         rtmp[*acol++] = *aval++;
1230653a6975SHong Zhang         *bval++       = 0.0; /* for in-place factorization */
1231653a6975SHong Zhang       }
1232b00f7748SHong Zhang       /* damp the diagonal of the matrix */
123321c26570Svictorle       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
1234653a6975SHong Zhang 
1235653a6975SHong Zhang       /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1236653a6975SHong Zhang       dk = rtmp[k];
1237653a6975SHong Zhang       i  = jl[k]; /* first row to be added to k_th row  */
1238653a6975SHong Zhang 
1239653a6975SHong Zhang       while (i < k){
1240653a6975SHong Zhang         nexti = jl[i]; /* next row to be added to k_th row */
1241653a6975SHong Zhang 
1242653a6975SHong Zhang         /* compute multiplier, update D(k) and U(i,k) */
1243653a6975SHong Zhang         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1244653a6975SHong Zhang         uikdi = - ba[ili]*ba[bi[i]];
1245653a6975SHong Zhang         dk   += uikdi*ba[ili];
1246653a6975SHong Zhang         ba[ili] = uikdi; /* -U(i,k) */
1247653a6975SHong Zhang 
1248653a6975SHong Zhang         /* add multiple of row i to k-th row ... */
1249653a6975SHong Zhang         jmin = ili + 1;
1250653a6975SHong Zhang         nz   = bi[i+1] - jmin;
1251653a6975SHong Zhang         if (nz > 0){
1252653a6975SHong Zhang           bcol = bj + jmin;
1253653a6975SHong Zhang           bval = ba + jmin;
1254653a6975SHong Zhang           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1255653a6975SHong Zhang           /* ... add i to row list for next nonzero entry */
1256653a6975SHong Zhang           il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1257653a6975SHong Zhang           j     = bj[jmin];
1258653a6975SHong Zhang           jl[i] = jl[j]; jl[j] = i; /* update jl */
1259653a6975SHong Zhang         }
1260653a6975SHong Zhang         i = nexti;
1261653a6975SHong Zhang       }
1262653a6975SHong Zhang 
126321c26570Svictorle       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1264d530a6e7Svictorle 	/* calculate a shift that would make this row diagonally dominant */
126529f69fd4SBarry Smith 	PetscReal rs = PetscAbs(PetscRealPart(dk));
126621c26570Svictorle 	jmin      = bi[k]+1;
126721c26570Svictorle 	nz        = bi[k+1] - jmin;
126821c26570Svictorle 	if (nz){
126921c26570Svictorle 	  bcol = bj + jmin;
127021c26570Svictorle 	  bval = ba + jmin;
127121c26570Svictorle 	  while (nz--){
127221c26570Svictorle 	    rs += PetscAbsScalar(rtmp[*bcol++]);
127321c26570Svictorle 	  }
127421c26570Svictorle 	}
1275d530a6e7Svictorle 	/* if this shift is less than the previous, just up the previous
1276d530a6e7Svictorle 	   one by a bit */
1277d530a6e7Svictorle 	shift_amount = PetscMax(rs,1.1*shift_amount);
127821c26570Svictorle 	chshift  = PETSC_TRUE;
1279d530a6e7Svictorle 	/* Unlike in the ILU case there is no exit condition on nshift:
1280d530a6e7Svictorle 	   we increase the shift until it converges. There is no guarantee that
1281d530a6e7Svictorle 	   this algorithm converges faster or slower, or is better or worse
1282d530a6e7Svictorle 	   than the ILU algorithm. */
128321c26570Svictorle 	nshift++;
128421c26570Svictorle 	break;
128521c26570Svictorle       }
1286ffa0b812SHong Zhang       if (PetscRealPart(dk) < zeropivot){
1287ffa0b812SHong Zhang         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1288b00f7748SHong Zhang         if (damping > 0.0) {
1289b00f7748SHong Zhang           if (ndamp) damping *= 2.0;
1290b00f7748SHong Zhang           damp = PETSC_TRUE;
1291b00f7748SHong Zhang           ndamp++;
1292b00f7748SHong Zhang           break;
1293481c2c92SHong Zhang         } else if (PetscAbsScalar(dk) < zeropivot){
129477431f27SBarry Smith           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1295481c2c92SHong Zhang         } else {
129677431f27SBarry Smith           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %D of Cholesky factorization\n",PetscRealPart(dk),k);
1297b00f7748SHong Zhang         }
1298064503c5SHong Zhang       }
1299653a6975SHong Zhang 
1300653a6975SHong Zhang       /* save nonzero entries in k-th row of U ... */
130177431f27SBarry Smith       /* printf("%D, dk: %g, 1/dk: %g\n",k,dk,1/dk); */
1302653a6975SHong Zhang       ba[bi[k]] = 1.0/dk;
1303653a6975SHong Zhang       jmin      = bi[k]+1;
1304653a6975SHong Zhang       nz        = bi[k+1] - jmin;
1305653a6975SHong Zhang       if (nz){
1306653a6975SHong Zhang         bcol = bj + jmin;
1307653a6975SHong Zhang         bval = ba + jmin;
1308653a6975SHong Zhang         while (nz--){
1309653a6975SHong Zhang           *bval++       = rtmp[*bcol];
1310653a6975SHong Zhang           rtmp[*bcol++] = 0.0;
1311653a6975SHong Zhang         }
1312653a6975SHong Zhang         /* ... add k to row list for first nonzero entry in k-th row */
1313653a6975SHong Zhang         il[k] = jmin;
1314653a6975SHong Zhang         i     = bj[jmin];
1315653a6975SHong Zhang         jl[k] = jl[i]; jl[i] = k;
1316653a6975SHong Zhang       }
1317b00f7748SHong Zhang     } /* end of for (k = 0; k<mbs; k++) */
131821c26570Svictorle   } while (damp||chshift);
1319653a6975SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1320653a6975SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
1321653a6975SHong Zhang 
1322653a6975SHong Zhang   C->factor       = FACTOR_CHOLESKY;
1323653a6975SHong Zhang   C->assembled    = PETSC_TRUE;
1324653a6975SHong Zhang   C->preallocated = PETSC_TRUE;
1325653a6975SHong Zhang   PetscLogFlops(b->mbs);
1326b00f7748SHong Zhang   if (ndamp) {
132777431f27SBarry Smith     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1_NaturalOrdering: number of damping tries %D damping value %g\n",ndamp,damping);
1328b00f7748SHong Zhang   }
1329fb10cecfSBarry Smith  if (nshift) {
133077431f27SBarry Smith     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %D shifts\n",nshift);
1331fb10cecfSBarry Smith   }
1332fb10cecfSBarry Smith 
1333653a6975SHong Zhang   PetscFunctionReturn(0);
1334653a6975SHong Zhang }
1335653a6975SHong Zhang 
1336653a6975SHong Zhang #undef __FUNCT__
13374a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1338dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
133949b5e25fSSatish Balay {
1340dfbe8321SBarry Smith   PetscErrorCode ierr;
134149b5e25fSSatish Balay   Mat            C;
134249b5e25fSSatish Balay 
134349b5e25fSSatish Balay   PetscFunctionBegin;
134415e8a5b3SHong Zhang   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1345a4ada70bSHong Zhang   ierr = MatCholeskyFactorNumeric(A,&C);CHKERRQ(ierr);
13464445ddedSHong Zhang   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
134749b5e25fSSatish Balay   PetscFunctionReturn(0);
134849b5e25fSSatish Balay }
134949b5e25fSSatish Balay 
135049b5e25fSSatish Balay 
1351