xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision af281ebde57afa1efdf97417df29819a0120af3b)
149b5e25fSSatish Balay 
249b5e25fSSatish Balay #include "src/mat/impls/baij/seq/baij.h"
33a7fca6bSBarry Smith #include "src/mat/impls/sbaij/seq/sbaij.h"
449b5e25fSSatish Balay #include "src/inline/ilu.h"
549a6740bSHong Zhang #include "include/petscis.h"
649b5e25fSSatish Balay 
78dc9108eSHong Zhang #if !defined(PETSC_USE_COMPLEX)
85f9f512dSHong Zhang /*
95f9f512dSHong Zhang   input:
10c037c3f7SHong Zhang    F -- numeric factor
115f9f512dSHong Zhang   output:
12c037c3f7SHong Zhang    nneg, nzero, npos: matrix inertia
135f9f512dSHong Zhang */
145f9f512dSHong Zhang 
155f9f512dSHong Zhang #undef __FUNCT__
165f9f512dSHong Zhang #define __FUNCT__ "MatGetInertia_SeqSBAIJ"
1713f74950SBarry Smith PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
185f9f512dSHong Zhang {
19638f5ce0SDinesh Kaushik   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
203e0d88b5SBarry Smith   PetscScalar  *dd = fact_ptr->a;
2113f74950SBarry Smith   PetscInt     mbs=fact_ptr->mbs,bs=F->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
225f9f512dSHong Zhang 
235f9f512dSHong Zhang   PetscFunctionBegin;
2477431f27SBarry Smith   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
25eeeff2ecSHong Zhang   nneig_tmp = 0; npos_tmp = 0;
26eeeff2ecSHong Zhang   for (i=0; i<mbs; i++){
27eeeff2ecSHong Zhang     if (PetscRealPart(dd[*fi]) > 0.0){
28eeeff2ecSHong Zhang       npos_tmp++;
29eeeff2ecSHong Zhang     } else if (PetscRealPart(dd[*fi]) < 0.0){
30eeeff2ecSHong Zhang       nneig_tmp++;
315f9f512dSHong Zhang     }
32eeeff2ecSHong Zhang     fi++;
333e0d88b5SBarry Smith   }
34eeeff2ecSHong Zhang   if (nneig) *nneig = nneig_tmp;
35eeeff2ecSHong Zhang   if (npos)  *npos  = npos_tmp;
36eeeff2ecSHong Zhang   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
37eeeff2ecSHong Zhang 
385f9f512dSHong Zhang   PetscFunctionReturn(0);
395f9f512dSHong Zhang }
408dc9108eSHong Zhang #endif /* !defined(PETSC_USE_COMPLEX) */
415f9f512dSHong Zhang 
425f9f512dSHong Zhang /*
435f9f512dSHong Zhang   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
4410c27e3dSHong Zhang   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
455f9f512dSHong Zhang */
4610c27e3dSHong Zhang #undef __FUNCT__
4710c27e3dSHong Zhang #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR"
4810c27e3dSHong Zhang PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
4910c27e3dSHong Zhang {
5010c27e3dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
5110c27e3dSHong Zhang   PetscErrorCode ierr;
5210c27e3dSHong Zhang   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
5310c27e3dSHong Zhang   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
5410c27e3dSHong Zhang   PetscInt       m,reallocs = 0,prow;
5510c27e3dSHong Zhang   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
5610c27e3dSHong Zhang   PetscReal      f = info->fill;
5710c27e3dSHong Zhang   PetscTruth     perm_identity;
5810c27e3dSHong Zhang 
5910c27e3dSHong Zhang   PetscFunctionBegin;
6010c27e3dSHong Zhang   /* check whether perm is the identity mapping */
6110c27e3dSHong Zhang   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
6210c27e3dSHong Zhang   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
6310c27e3dSHong Zhang 
6410c27e3dSHong Zhang   if (perm_identity){ /* without permutation */
6510c27e3dSHong Zhang     a->permute = PETSC_FALSE;
6610c27e3dSHong Zhang     ai = a->i; aj = a->j;
6710c27e3dSHong Zhang   } else {            /* non-trivial permutation */
6810c27e3dSHong Zhang     a->permute = PETSC_TRUE;
6910c27e3dSHong Zhang     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
7010c27e3dSHong Zhang     ai = a->inew; aj = a->jnew;
7110c27e3dSHong Zhang   }
7210c27e3dSHong Zhang 
7310c27e3dSHong Zhang   /* initialization */
7410c27e3dSHong Zhang   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
7510c27e3dSHong Zhang   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
7610c27e3dSHong Zhang   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
7710c27e3dSHong Zhang   iu[0] = mbs+1;
7810c27e3dSHong Zhang   juidx = mbs + 1; /* index for ju */
7910c27e3dSHong Zhang   ierr  = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */
8010c27e3dSHong Zhang   q     = jl + mbs;   /* linked list for col index */
8110c27e3dSHong Zhang   for (i=0; i<mbs; i++){
8210c27e3dSHong Zhang     jl[i] = mbs;
8310c27e3dSHong Zhang     q[i] = 0;
8410c27e3dSHong Zhang   }
8510c27e3dSHong Zhang 
8610c27e3dSHong Zhang   /* for each row k */
8710c27e3dSHong Zhang   for (k=0; k<mbs; k++){
8810c27e3dSHong Zhang     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
8910c27e3dSHong Zhang     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
9010c27e3dSHong Zhang     q[k] = mbs;
9110c27e3dSHong Zhang     /* initialize nonzero structure of k-th row to row rip[k] of A */
9210c27e3dSHong Zhang     jmin = ai[rip[k]] +1; /* exclude diag[k] */
9310c27e3dSHong Zhang     jmax = ai[rip[k]+1];
9410c27e3dSHong Zhang     for (j=jmin; j<jmax; j++){
9510c27e3dSHong Zhang       vj = rip[aj[j]]; /* col. value */
9610c27e3dSHong Zhang       if(vj > k){
9710c27e3dSHong Zhang         qm = k;
9810c27e3dSHong Zhang         do {
9910c27e3dSHong Zhang           m  = qm; qm = q[m];
10010c27e3dSHong Zhang         } while(qm < vj);
10110c27e3dSHong Zhang         if (qm == vj) {
10210c27e3dSHong Zhang           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
10310c27e3dSHong Zhang         }
10410c27e3dSHong Zhang         nzk++;
10510c27e3dSHong Zhang         q[m]  = vj;
10610c27e3dSHong Zhang         q[vj] = qm;
10710c27e3dSHong Zhang       } /* if(vj > k) */
10810c27e3dSHong Zhang     } /* for (j=jmin; j<jmax; j++) */
10910c27e3dSHong Zhang 
11010c27e3dSHong Zhang     /* modify nonzero structure of k-th row by computing fill-in
11110c27e3dSHong Zhang        for each row i to be merged in */
11210c27e3dSHong Zhang     prow = k;
11310c27e3dSHong Zhang     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
11410c27e3dSHong Zhang 
11510c27e3dSHong Zhang     while (prow < k){
11610c27e3dSHong Zhang       /* merge row prow into k-th row */
11710c27e3dSHong Zhang       jmin = iu[prow] + 1; jmax = iu[prow+1];
11810c27e3dSHong Zhang       qm = k;
11910c27e3dSHong Zhang       for (j=jmin; j<jmax; j++){
12010c27e3dSHong Zhang         vj = ju[j];
12110c27e3dSHong Zhang         do {
12210c27e3dSHong Zhang           m = qm; qm = q[m];
12310c27e3dSHong Zhang         } while (qm < vj);
12410c27e3dSHong Zhang         if (qm != vj){
12510c27e3dSHong Zhang          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
12610c27e3dSHong Zhang         }
12710c27e3dSHong Zhang       }
12810c27e3dSHong Zhang       prow = jl[prow]; /* next pivot row */
12910c27e3dSHong Zhang     }
13010c27e3dSHong Zhang 
13110c27e3dSHong Zhang     /* add k to row list for first nonzero element in k-th row */
13210c27e3dSHong Zhang     if (nzk > 0){
13310c27e3dSHong Zhang       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
13410c27e3dSHong Zhang       jl[k] = jl[i]; jl[i] = k;
13510c27e3dSHong Zhang     }
13610c27e3dSHong Zhang     iu[k+1] = iu[k] + nzk;
13710c27e3dSHong Zhang 
13810c27e3dSHong Zhang     /* allocate more space to ju if needed */
13910c27e3dSHong Zhang     if (iu[k+1] > umax) {
14010c27e3dSHong Zhang       /* estimate how much additional space we will need */
14110c27e3dSHong Zhang       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
14210c27e3dSHong Zhang       /* just double the memory each time */
14310c27e3dSHong Zhang       maxadd = umax;
14410c27e3dSHong Zhang       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
14510c27e3dSHong Zhang       umax += maxadd;
14610c27e3dSHong Zhang 
14710c27e3dSHong Zhang       /* allocate a longer ju */
14810c27e3dSHong Zhang       ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
14910c27e3dSHong Zhang       ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
15010c27e3dSHong Zhang       ierr = PetscFree(ju);CHKERRQ(ierr);
15110c27e3dSHong Zhang       ju   = jutmp;
15210c27e3dSHong Zhang       reallocs++; /* count how many times we realloc */
15310c27e3dSHong Zhang     }
15410c27e3dSHong Zhang 
15510c27e3dSHong Zhang     /* save nonzero structure of k-th row in ju */
15610c27e3dSHong Zhang     i=k;
15710c27e3dSHong Zhang     while (nzk --) {
15810c27e3dSHong Zhang       i           = q[i];
15910c27e3dSHong Zhang       ju[juidx++] = i;
16010c27e3dSHong Zhang     }
16110c27e3dSHong Zhang   }
16210c27e3dSHong Zhang 
16310c27e3dSHong Zhang   if (ai[mbs] != 0) {
16410c27e3dSHong Zhang     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
16510c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
16610c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
16710c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
16810c27e3dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
16910c27e3dSHong Zhang   } else {
17010c27e3dSHong Zhang      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
17110c27e3dSHong Zhang   }
17210c27e3dSHong Zhang 
17310c27e3dSHong Zhang   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
17410c27e3dSHong Zhang   ierr = PetscFree(jl);CHKERRQ(ierr);
17510c27e3dSHong Zhang 
17610c27e3dSHong Zhang   /* put together the new matrix */
17710c27e3dSHong Zhang   ierr = MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);CHKERRQ(ierr);
17810c27e3dSHong Zhang   ierr = MatSetType(*B,A->type_name);CHKERRQ(ierr);
17910c27e3dSHong Zhang   ierr = MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);CHKERRQ(ierr);
18010c27e3dSHong Zhang 
18110c27e3dSHong Zhang   /* PetscLogObjectParent(*B,iperm); */
18210c27e3dSHong Zhang   b = (Mat_SeqSBAIJ*)(*B)->data;
18310c27e3dSHong Zhang   ierr = PetscFree(b->imax);CHKERRQ(ierr);
18410c27e3dSHong Zhang   b->singlemalloc = PETSC_FALSE;
18510c27e3dSHong Zhang   /* the next line frees the default space generated by the Create() */
18610c27e3dSHong Zhang   ierr = PetscFree(b->a);CHKERRQ(ierr);
18710c27e3dSHong Zhang   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
18810c27e3dSHong Zhang   ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
18910c27e3dSHong Zhang   b->j    = ju;
19010c27e3dSHong Zhang   b->i    = iu;
19110c27e3dSHong Zhang   b->diag = 0;
19210c27e3dSHong Zhang   b->ilen = 0;
19310c27e3dSHong Zhang   b->imax = 0;
19410c27e3dSHong Zhang   b->row  = perm;
19510c27e3dSHong Zhang   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
19610c27e3dSHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
19710c27e3dSHong Zhang   b->icol = perm;
19810c27e3dSHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
19910c27e3dSHong Zhang   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
20010c27e3dSHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
20110c27e3dSHong Zhang      Allocate idnew, solve_work, new a, new j */
20210c27e3dSHong Zhang   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
20310c27e3dSHong Zhang   b->maxnz = b->nz = iu[mbs];
20410c27e3dSHong Zhang 
20510c27e3dSHong Zhang   (*B)->factor                 = FACTOR_CHOLESKY;
20610c27e3dSHong Zhang   (*B)->info.factor_mallocs    = reallocs;
20710c27e3dSHong Zhang   (*B)->info.fill_ratio_given  = f;
20810c27e3dSHong Zhang   if (ai[mbs] != 0) {
20910c27e3dSHong Zhang     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
21010c27e3dSHong Zhang   } else {
21110c27e3dSHong Zhang     (*B)->info.fill_ratio_needed = 0.0;
21210c27e3dSHong Zhang   }
21310c27e3dSHong Zhang 
21410c27e3dSHong Zhang   if (perm_identity){
21510c27e3dSHong Zhang     switch (bs) {
21610c27e3dSHong Zhang       case 1:
21710c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
21810c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
21910c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
22010c27e3dSHong Zhang         break;
22110c27e3dSHong Zhang       case 2:
22210c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
22310c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
22410c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
22510c27e3dSHong Zhang         break;
22610c27e3dSHong Zhang       case 3:
22710c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
22810c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
22910c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
23010c27e3dSHong Zhang         break;
23110c27e3dSHong Zhang       case 4:
23210c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
23310c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
23410c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
23510c27e3dSHong Zhang         break;
23610c27e3dSHong Zhang       case 5:
23710c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
23810c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
23910c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
24010c27e3dSHong Zhang         break;
24110c27e3dSHong Zhang       case 6:
24210c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
24310c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
24410c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
24510c27e3dSHong Zhang         break;
24610c27e3dSHong Zhang       case 7:
24710c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
24810c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
24910c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
25010c27e3dSHong Zhang       break;
25110c27e3dSHong Zhang       default:
25210c27e3dSHong Zhang         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
25310c27e3dSHong Zhang         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
25410c27e3dSHong Zhang         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
25510c27e3dSHong Zhang       break;
25610c27e3dSHong Zhang     }
25710c27e3dSHong Zhang   }
25810c27e3dSHong Zhang   PetscFunctionReturn(0);
25910c27e3dSHong Zhang }
26010c27e3dSHong Zhang /*
26110c27e3dSHong Zhang     Symbolic U^T*D*U factorization for SBAIJ format.
26210c27e3dSHong Zhang */
26398a8e78dSHong Zhang #include "petscbt.h"
26498a8e78dSHong Zhang #include "src/mat/utils/freespace.h"
2654a2ae208SSatish Balay #undef __FUNCT__
2664a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ"
26798a8e78dSHong Zhang PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
26849b5e25fSSatish Balay {
26998a8e78dSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
27098a8e78dSHong Zhang   Mat_SeqSBAIJ   *b;
27198a8e78dSHong Zhang   Mat            B;
2726849ba73SBarry Smith   PetscErrorCode ierr;
273671cb588SHong Zhang   PetscTruth     perm_identity;
27498a8e78dSHong Zhang   PetscReal      fill = info->fill;
2757625dc9aSHong Zhang   PetscInt       *rip,i,mbs=a->mbs,bs=A->bs,*ai,*aj,reallocs=0,prow;
27698a8e78dSHong Zhang   PetscInt       *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
2777625dc9aSHong Zhang   PetscInt       nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
27898a8e78dSHong Zhang   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
27998a8e78dSHong Zhang   PetscBT        lnkbt;
28049b5e25fSSatish Balay 
28149b5e25fSSatish Balay   PetscFunctionBegin;
28210c27e3dSHong Zhang   /*
28310c27e3dSHong Zhang    This code originally uses Modified Sparse Row (MSR) storage
28410c27e3dSHong Zhang    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
28510c27e3dSHong Zhang    Then it is rewritten so the factor B takes seqsbaij format. However the associated
28610c27e3dSHong Zhang    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
28710c27e3dSHong Zhang    thus the original code in MSR format is still used for these cases.
28810c27e3dSHong Zhang    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
28910c27e3dSHong Zhang    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
29010c27e3dSHong Zhang   */
291fff829cfSHong Zhang   if (bs > 1){
29298a8e78dSHong Zhang     ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);CHKERRQ(ierr);
293fff829cfSHong Zhang     PetscFunctionReturn(0);
294fff829cfSHong Zhang   }
29510c27e3dSHong Zhang 
29698a8e78dSHong Zhang   /* check whether perm is the identity mapping */
29798a8e78dSHong Zhang   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
29898a8e78dSHong Zhang 
299fff829cfSHong Zhang   if (perm_identity){
300fff829cfSHong Zhang     a->permute = PETSC_FALSE;
301fff829cfSHong Zhang     ai = a->i; aj = a->j;
302fff829cfSHong Zhang   } else {
303fff829cfSHong Zhang     a->permute = PETSC_TRUE;
304fff829cfSHong Zhang     ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr);
305fff829cfSHong Zhang     ai = a->inew; aj = a->jnew;
306fff829cfSHong Zhang   }
307fff829cfSHong Zhang   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
308fff829cfSHong Zhang 
309fff829cfSHong Zhang   /* initialization */
3107625dc9aSHong Zhang   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
31198a8e78dSHong Zhang   ui[0] = 0;
31298a8e78dSHong Zhang 
31398a8e78dSHong Zhang   /* jl: linked list for storing indices of the pivot rows
3147625dc9aSHong Zhang      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
3157625dc9aSHong Zhang   ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
3167625dc9aSHong Zhang   il     = jl + mbs;
3177625dc9aSHong Zhang   cols   = il + mbs;
3187625dc9aSHong Zhang   ui_ptr = (PetscInt**)(cols + mbs);
3197625dc9aSHong Zhang 
3207625dc9aSHong Zhang   for (i=0; i<mbs; i++){
3217625dc9aSHong Zhang     jl[i] = mbs; il[i] = 0;
322fff829cfSHong Zhang   }
323fff829cfSHong Zhang 
32498a8e78dSHong Zhang   /* create and initialize a linked list for storing column indices of the active row k */
3257625dc9aSHong Zhang   nlnk = mbs + 1;
3267625dc9aSHong Zhang   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
327fff829cfSHong Zhang 
3287625dc9aSHong Zhang   /* initial FreeSpace size is fill*(ai[mbs]+1) */
3297625dc9aSHong Zhang   ierr = GetMoreSpace((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
33098a8e78dSHong Zhang   current_space = free_space;
33198a8e78dSHong Zhang 
3327625dc9aSHong Zhang   for (k=0; k<mbs; k++){  /* for each active row k */
33398a8e78dSHong Zhang     /* initialize lnk by the column indices of row rip[k] of A */
33498a8e78dSHong Zhang     nzk   = 0;
33598a8e78dSHong Zhang     ncols = ai[rip[k]+1] - ai[rip[k]];
3367625dc9aSHong Zhang     for (j=0; j<ncols; j++){
3377625dc9aSHong Zhang       i = *(aj + ai[rip[k]] + j);
3387625dc9aSHong Zhang       cols[j] = rip[i];
3397625dc9aSHong Zhang     }
3407625dc9aSHong Zhang     ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
34198a8e78dSHong Zhang     nzk += nlnk;
34298a8e78dSHong Zhang 
34398a8e78dSHong Zhang     /* update lnk by computing fill-in for each pivot row to be merged in */
34498a8e78dSHong Zhang     prow = jl[k]; /* 1st pivot row */
345fff829cfSHong Zhang 
346fff829cfSHong Zhang     while (prow < k){
347fff829cfSHong Zhang       nextprow = jl[prow];
34898a8e78dSHong Zhang       /* merge prow into k-th row */
3497625dc9aSHong Zhang       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
35098a8e78dSHong Zhang       jmax = ui[prow+1];
35198a8e78dSHong Zhang       ncols = jmax-jmin;
3527625dc9aSHong Zhang       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
3537625dc9aSHong Zhang       ierr = PetscLLAdd(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
35498a8e78dSHong Zhang       nzk += nlnk;
355fff829cfSHong Zhang 
35698a8e78dSHong Zhang       /* update il and jl for prow */
357fff829cfSHong Zhang       if (jmin < jmax){
358fff829cfSHong Zhang         il[prow] = jmin;
3597625dc9aSHong Zhang         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
360fff829cfSHong Zhang       }
361fff829cfSHong Zhang       prow = nextprow;
362fff829cfSHong Zhang     }
363fff829cfSHong Zhang 
36498a8e78dSHong Zhang     /* if free space is not available, make more free space */
36598a8e78dSHong Zhang     if (current_space->local_remaining<nzk) {
3667625dc9aSHong Zhang       i = mbs - k + 1; /* num of unfactored rows */
36798a8e78dSHong Zhang       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
36898a8e78dSHong Zhang       ierr = GetMoreSpace(i,&current_space);CHKERRQ(ierr);
36998a8e78dSHong Zhang       reallocs++;
37098a8e78dSHong Zhang     }
37198a8e78dSHong Zhang 
37298a8e78dSHong Zhang     /* copy data into free space, then initialize lnk */
3737625dc9aSHong Zhang     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
37498a8e78dSHong Zhang 
37598a8e78dSHong Zhang     /* add the k-th row into il and jl */
37698a8e78dSHong Zhang     if (nzk-1 > 0){
3777625dc9aSHong Zhang       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
378fff829cfSHong Zhang       jl[k] = jl[i]; jl[i] = k;
37998a8e78dSHong Zhang       il[k] = ui[k] + 1;
380fff829cfSHong Zhang     }
3817625dc9aSHong Zhang     ui_ptr[k] = current_space->array;
38298a8e78dSHong Zhang     current_space->array           += nzk;
38398a8e78dSHong Zhang     current_space->local_used      += nzk;
38498a8e78dSHong Zhang     current_space->local_remaining -= nzk;
385fff829cfSHong Zhang 
38698a8e78dSHong Zhang     ui[k+1] = ui[k] + nzk;
387fff829cfSHong Zhang   }
388fff829cfSHong Zhang 
3897625dc9aSHong Zhang   if (ai[mbs] != 0) {
3907625dc9aSHong Zhang     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
39198a8e78dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,fill,af);
392fff829cfSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
39398a8e78dSHong Zhang     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g) for best performance.\n",af);
394fff829cfSHong Zhang   } else {
395fff829cfSHong Zhang      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
396fff829cfSHong Zhang   }
397fff829cfSHong Zhang 
398fff829cfSHong Zhang   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
399fff829cfSHong Zhang   ierr = PetscFree(jl);CHKERRQ(ierr);
400fff829cfSHong Zhang 
40198a8e78dSHong Zhang   /* destroy list of free space and other temporary array(s) */
4027625dc9aSHong Zhang   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
40398a8e78dSHong Zhang   ierr = MakeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
40498a8e78dSHong Zhang   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
405fff829cfSHong Zhang 
40698a8e78dSHong Zhang   /* put together the new matrix in MATSEQSBAIJ format */
4077625dc9aSHong Zhang   ierr = MatCreate(PETSC_COMM_SELF,mbs,mbs,mbs,mbs,fact);CHKERRQ(ierr);
40898a8e78dSHong Zhang   B    = *fact;
40998a8e78dSHong Zhang   ierr = MatSetType(B,MATSEQSBAIJ);CHKERRQ(ierr);
41098a8e78dSHong Zhang   ierr = MatSeqSBAIJSetPreallocation(B,bs,0,PETSC_NULL);CHKERRQ(ierr);
41198a8e78dSHong Zhang 
41298a8e78dSHong Zhang   b = (Mat_SeqSBAIJ*)B->data;
413fff829cfSHong Zhang   ierr = PetscFree(b->imax);CHKERRQ(ierr);
414fff829cfSHong Zhang   b->singlemalloc = PETSC_FALSE;
415fff829cfSHong Zhang   /* the next line frees the default space generated by the Create() */
416fff829cfSHong Zhang   ierr = PetscFree(b->a);CHKERRQ(ierr);
417fff829cfSHong Zhang   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
4187625dc9aSHong Zhang   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
41998a8e78dSHong Zhang   b->j    = uj;
42098a8e78dSHong Zhang   b->i    = ui;
421fff829cfSHong Zhang   b->diag = 0;
422fff829cfSHong Zhang   b->ilen = 0;
423fff829cfSHong Zhang   b->imax = 0;
424fff829cfSHong Zhang   b->row  = perm;
425fff829cfSHong Zhang   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
426fff829cfSHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
427fff829cfSHong Zhang   b->icol = perm;
428fff829cfSHong Zhang   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
4297625dc9aSHong Zhang   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
4307625dc9aSHong Zhang   PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
4317625dc9aSHong Zhang   b->maxnz = b->nz = ui[mbs];
432fff829cfSHong Zhang 
43398a8e78dSHong Zhang   B->factor                 = FACTOR_CHOLESKY;
43498a8e78dSHong Zhang   B->info.factor_mallocs    = reallocs;
43598a8e78dSHong Zhang   B->info.fill_ratio_given  = fill;
4367625dc9aSHong Zhang   if (ai[mbs] != 0) {
4377625dc9aSHong Zhang     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
438fff829cfSHong Zhang   } else {
43998a8e78dSHong Zhang     B->info.fill_ratio_needed = 0.0;
440fff829cfSHong Zhang   }
441fff829cfSHong Zhang 
442fff829cfSHong Zhang   if (perm_identity){
44398a8e78dSHong Zhang     switch (bs) {
44498a8e78dSHong Zhang       case 1:
44598a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
44698a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
4477701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
44898a8e78dSHong Zhang         break;
44998a8e78dSHong Zhang       case 2:
45098a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
45198a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
4527701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
45398a8e78dSHong Zhang         break;
45498a8e78dSHong Zhang       case 3:
45598a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
45698a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
4577701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
45898a8e78dSHong Zhang         break;
45998a8e78dSHong Zhang       case 4:
46098a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
46198a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
4627701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
46398a8e78dSHong Zhang         break;
46498a8e78dSHong Zhang       case 5:
46598a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
46698a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
4677701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
46898a8e78dSHong Zhang         break;
46998a8e78dSHong Zhang       case 6:
47098a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
47198a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
4727701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
47398a8e78dSHong Zhang         break;
47498a8e78dSHong Zhang       case 7:
47598a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
47698a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
4777701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
47898a8e78dSHong Zhang       break;
47998a8e78dSHong Zhang       default:
48098a8e78dSHong Zhang         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
48198a8e78dSHong Zhang         B->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
4827701de02SHong Zhang         PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
48398a8e78dSHong Zhang       break;
48498a8e78dSHong Zhang     }
48598a8e78dSHong Zhang   }
486064503c5SHong Zhang   PetscFunctionReturn(0);
487064503c5SHong Zhang }
4884a2ae208SSatish Balay #undef __FUNCT__
4894a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N"
490*af281ebdSHong Zhang PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,MatFactorInfo *info,Mat *B)
49149b5e25fSSatish Balay {
49249b5e25fSSatish Balay   Mat            C = *B;
4934c16a6a6SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
4944c16a6a6SHong Zhang   IS             perm = b->row;
4956849ba73SBarry Smith   PetscErrorCode ierr;
49613f74950SBarry Smith   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
49713f74950SBarry Smith   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
49813f74950SBarry Smith   PetscInt       bs=A->bs,bs2 = a->bs2;
4994c16a6a6SHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
5004c16a6a6SHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
50128de702eSHong Zhang   MatScalar      *work;
50213f74950SBarry Smith   PetscInt       *pivots;
5034c16a6a6SHong Zhang 
5044c16a6a6SHong Zhang   PetscFunctionBegin;
5054c16a6a6SHong Zhang   /* initialization */
50682502324SSatish Balay   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5074c16a6a6SHong Zhang   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
50813f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
50928de702eSHong Zhang   jl   = il + mbs;
5104c16a6a6SHong Zhang   for (i=0; i<mbs; i++) {
5114c16a6a6SHong Zhang     jl[i] = mbs; il[0] = 0;
5124c16a6a6SHong Zhang   }
513b0a32e0cSBarry Smith   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
51428de702eSHong Zhang   uik  = dk + bs2;
51528de702eSHong Zhang   work = uik + bs2;
51613f74950SBarry Smith   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
5174c16a6a6SHong Zhang 
5184c16a6a6SHong Zhang   ierr  = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
5194c16a6a6SHong Zhang 
5204c16a6a6SHong Zhang   /* check permutation */
5214c16a6a6SHong Zhang   if (!a->permute){
5224c16a6a6SHong Zhang     ai = a->i; aj = a->j; aa = a->a;
5234c16a6a6SHong Zhang   } else {
5244c16a6a6SHong Zhang     ai   = a->inew; aj = a->jnew;
52582502324SSatish Balay     ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
5264c16a6a6SHong Zhang     ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
52713f74950SBarry Smith     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
52813f74950SBarry Smith     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
5294c16a6a6SHong Zhang 
5304c16a6a6SHong Zhang     for (i=0; i<mbs; i++){
5314c16a6a6SHong Zhang       jmin = ai[i]; jmax = ai[i+1];
5324c16a6a6SHong Zhang       for (j=jmin; j<jmax; j++){
5334c16a6a6SHong Zhang         while (a2anew[j] != j){
5344c16a6a6SHong Zhang           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
5354c16a6a6SHong Zhang           for (k1=0; k1<bs2; k1++){
5364c16a6a6SHong Zhang             dk[k1]       = aa[k*bs2+k1];
5374c16a6a6SHong Zhang             aa[k*bs2+k1] = aa[j*bs2+k1];
5384c16a6a6SHong Zhang             aa[j*bs2+k1] = dk[k1];
5394c16a6a6SHong Zhang           }
5404c16a6a6SHong Zhang         }
5414c16a6a6SHong Zhang         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
5424c16a6a6SHong Zhang         if (i > aj[j]){
5434c16a6a6SHong Zhang           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
5444c16a6a6SHong Zhang           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
5454c16a6a6SHong Zhang           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
5464c16a6a6SHong Zhang           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
5474c16a6a6SHong Zhang             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
5484c16a6a6SHong Zhang           }
5494c16a6a6SHong Zhang         }
5504c16a6a6SHong Zhang       }
5514c16a6a6SHong Zhang     }
552323b833fSBarry Smith     ierr = PetscFree(a2anew);CHKERRQ(ierr);
5534c16a6a6SHong Zhang   }
5544c16a6a6SHong Zhang 
5554c16a6a6SHong Zhang   /* for each row k */
5564c16a6a6SHong Zhang   for (k = 0; k<mbs; k++){
5574c16a6a6SHong Zhang 
5584c16a6a6SHong Zhang     /*initialize k-th row with elements nonzero in row perm(k) of A */
5594c16a6a6SHong Zhang     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
560057f5ba7SHong Zhang 
5614c16a6a6SHong Zhang     ap = aa + jmin*bs2;
5624c16a6a6SHong Zhang     for (j = jmin; j < jmax; j++){
5634c16a6a6SHong Zhang       vj = perm_ptr[aj[j]];         /* block col. index */
5644c16a6a6SHong Zhang       rtmp_ptr = rtmp + vj*bs2;
5654c16a6a6SHong Zhang       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
5664c16a6a6SHong Zhang     }
5674c16a6a6SHong Zhang 
5684c16a6a6SHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
5694c16a6a6SHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5704c16a6a6SHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
5714c16a6a6SHong Zhang 
572057f5ba7SHong Zhang     while (i < k){
5734c16a6a6SHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
5744c16a6a6SHong Zhang 
5754c16a6a6SHong Zhang       /* compute multiplier */
5764c16a6a6SHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
5774c16a6a6SHong Zhang 
5784c16a6a6SHong Zhang       /* uik = -inv(Di)*U_bar(i,k) */
5794c16a6a6SHong Zhang       diag = ba + i*bs2;
5804c16a6a6SHong Zhang       u    = ba + ili*bs2;
5814c16a6a6SHong Zhang       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5824c16a6a6SHong Zhang       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
5834c16a6a6SHong Zhang 
5844c16a6a6SHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
5854c16a6a6SHong Zhang       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
5864c16a6a6SHong Zhang 
5874c16a6a6SHong Zhang       /* update -U(i,k) */
5884c16a6a6SHong Zhang       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5894c16a6a6SHong Zhang 
5904c16a6a6SHong Zhang       /* add multiple of row i to k-th row ... */
5914c16a6a6SHong Zhang       jmin = ili + 1; jmax = bi[i+1];
5924c16a6a6SHong Zhang       if (jmin < jmax){
5934c16a6a6SHong Zhang         for (j=jmin; j<jmax; j++) {
5944c16a6a6SHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j) */
5954c16a6a6SHong Zhang           rtmp_ptr = rtmp + bj[j]*bs2;
5964c16a6a6SHong Zhang           u = ba + j*bs2;
5974c16a6a6SHong Zhang           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
5984c16a6a6SHong Zhang         }
5994c16a6a6SHong Zhang 
6004c16a6a6SHong Zhang         /* ... add i to row list for next nonzero entry */
6014c16a6a6SHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
6024c16a6a6SHong Zhang         j     = bj[jmin];
6034c16a6a6SHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
6044c16a6a6SHong Zhang       }
6054c16a6a6SHong Zhang       i = nexti;
6064c16a6a6SHong Zhang     }
6074c16a6a6SHong Zhang 
6084c16a6a6SHong Zhang     /* save nonzero entries in k-th row of U ... */
6094c16a6a6SHong Zhang 
6104c16a6a6SHong Zhang     /* invert diagonal block */
6114c16a6a6SHong Zhang     diag = ba+k*bs2;
6124c16a6a6SHong Zhang     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
613d230e6fdSBarry Smith     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
6144c16a6a6SHong Zhang 
6154c16a6a6SHong Zhang     jmin = bi[k]; jmax = bi[k+1];
6164c16a6a6SHong Zhang     if (jmin < jmax) {
6174c16a6a6SHong Zhang       for (j=jmin; j<jmax; j++){
6184c16a6a6SHong Zhang          vj = bj[j];           /* block col. index of U */
6194c16a6a6SHong Zhang          u   = ba + j*bs2;
6204c16a6a6SHong Zhang          rtmp_ptr = rtmp + vj*bs2;
6214c16a6a6SHong Zhang          for (k1=0; k1<bs2; k1++){
6224c16a6a6SHong Zhang            *u++        = *rtmp_ptr;
6234c16a6a6SHong Zhang            *rtmp_ptr++ = 0.0;
6244c16a6a6SHong Zhang          }
6254c16a6a6SHong Zhang       }
6264c16a6a6SHong Zhang 
6274c16a6a6SHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
6284c16a6a6SHong Zhang       il[k] = jmin;
6294c16a6a6SHong Zhang       i     = bj[jmin];
6304c16a6a6SHong Zhang       jl[k] = jl[i]; jl[i] = k;
6314c16a6a6SHong Zhang     }
6324c16a6a6SHong Zhang   }
6334c16a6a6SHong Zhang 
6344c16a6a6SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
6354c16a6a6SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
6364c16a6a6SHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
6374c16a6a6SHong Zhang   ierr = PetscFree(pivots);CHKERRQ(ierr);
6384c16a6a6SHong Zhang   if (a->permute){
6394c16a6a6SHong Zhang     ierr = PetscFree(aa);CHKERRQ(ierr);
6404c16a6a6SHong Zhang   }
6414c16a6a6SHong Zhang 
6424c16a6a6SHong Zhang   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
6434c16a6a6SHong Zhang   C->factor       = FACTOR_CHOLESKY;
6444c16a6a6SHong Zhang   C->assembled    = PETSC_TRUE;
6454c16a6a6SHong Zhang   C->preallocated = PETSC_TRUE;
646b0a32e0cSBarry Smith   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
6474c16a6a6SHong Zhang   PetscFunctionReturn(0);
6484c16a6a6SHong Zhang }
649d4edadd4SHong Zhang 
6504a2ae208SSatish Balay #undef __FUNCT__
6514a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering"
652*af281ebdSHong Zhang PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
653671cb588SHong Zhang {
654671cb588SHong Zhang   Mat            C = *B;
655671cb588SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
656dfbe8321SBarry Smith   PetscErrorCode ierr;
65713f74950SBarry Smith   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
65813f74950SBarry Smith   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
65913f74950SBarry Smith   PetscInt       bs=A->bs,bs2 = a->bs2;
660671cb588SHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
661671cb588SHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
66228de702eSHong Zhang   MatScalar      *work;
66313f74950SBarry Smith   PetscInt       *pivots;
664671cb588SHong Zhang 
665671cb588SHong Zhang   PetscFunctionBegin;
666671cb588SHong Zhang   /* initialization */
667671cb588SHong Zhang 
66882502324SSatish Balay   ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
669671cb588SHong Zhang   ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr);
67013f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
67128de702eSHong Zhang   jl   = il + mbs;
672671cb588SHong Zhang   for (i=0; i<mbs; i++) {
673671cb588SHong Zhang     jl[i] = mbs; il[0] = 0;
674671cb588SHong Zhang   }
675b0a32e0cSBarry Smith   ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr);
67628de702eSHong Zhang   uik  = dk + bs2;
67728de702eSHong Zhang   work = uik + bs2;
67813f74950SBarry Smith   ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr);
679671cb588SHong Zhang 
680671cb588SHong Zhang   ai = a->i; aj = a->j; aa = a->a;
681671cb588SHong Zhang 
682671cb588SHong Zhang   /* for each row k */
683671cb588SHong Zhang   for (k = 0; k<mbs; k++){
684671cb588SHong Zhang 
685671cb588SHong Zhang     /*initialize k-th row with elements nonzero in row k of A */
686671cb588SHong Zhang     jmin = ai[k]; jmax = ai[k+1];
687671cb588SHong Zhang     ap = aa + jmin*bs2;
688671cb588SHong Zhang     for (j = jmin; j < jmax; j++){
689671cb588SHong Zhang       vj = aj[j];         /* block col. index */
690671cb588SHong Zhang       rtmp_ptr = rtmp + vj*bs2;
691671cb588SHong Zhang       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
692671cb588SHong Zhang     }
693671cb588SHong Zhang 
694671cb588SHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
695671cb588SHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
696671cb588SHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
697671cb588SHong Zhang 
698057f5ba7SHong Zhang     while (i < k){
699671cb588SHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
700671cb588SHong Zhang 
701671cb588SHong Zhang       /* compute multiplier */
702671cb588SHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
703671cb588SHong Zhang 
704671cb588SHong Zhang       /* uik = -inv(Di)*U_bar(i,k) */
705671cb588SHong Zhang       diag = ba + i*bs2;
706671cb588SHong Zhang       u    = ba + ili*bs2;
707671cb588SHong Zhang       ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
708671cb588SHong Zhang       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
709671cb588SHong Zhang 
710671cb588SHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
711671cb588SHong Zhang       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
712671cb588SHong Zhang 
713671cb588SHong Zhang       /* update -U(i,k) */
714671cb588SHong Zhang       ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr);
715671cb588SHong Zhang 
716671cb588SHong Zhang       /* add multiple of row i to k-th row ... */
717671cb588SHong Zhang       jmin = ili + 1; jmax = bi[i+1];
718671cb588SHong Zhang       if (jmin < jmax){
719671cb588SHong Zhang         for (j=jmin; j<jmax; j++) {
720671cb588SHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j) */
721671cb588SHong Zhang           rtmp_ptr = rtmp + bj[j]*bs2;
722671cb588SHong Zhang           u = ba + j*bs2;
723671cb588SHong Zhang           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
724671cb588SHong Zhang         }
725671cb588SHong Zhang 
726671cb588SHong Zhang         /* ... add i to row list for next nonzero entry */
727671cb588SHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
728671cb588SHong Zhang         j     = bj[jmin];
729671cb588SHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
730671cb588SHong Zhang       }
731671cb588SHong Zhang       i = nexti;
732671cb588SHong Zhang     }
733671cb588SHong Zhang 
734671cb588SHong Zhang     /* save nonzero entries in k-th row of U ... */
735671cb588SHong Zhang 
736671cb588SHong Zhang     /* invert diagonal block */
737671cb588SHong Zhang     diag = ba+k*bs2;
738671cb588SHong Zhang     ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr);
739d230e6fdSBarry Smith     ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr);
740671cb588SHong Zhang 
741671cb588SHong Zhang     jmin = bi[k]; jmax = bi[k+1];
742671cb588SHong Zhang     if (jmin < jmax) {
743671cb588SHong Zhang       for (j=jmin; j<jmax; j++){
744671cb588SHong Zhang          vj = bj[j];           /* block col. index of U */
745671cb588SHong Zhang          u   = ba + j*bs2;
746671cb588SHong Zhang          rtmp_ptr = rtmp + vj*bs2;
747671cb588SHong Zhang          for (k1=0; k1<bs2; k1++){
748671cb588SHong Zhang            *u++        = *rtmp_ptr;
749671cb588SHong Zhang            *rtmp_ptr++ = 0.0;
750671cb588SHong Zhang          }
751671cb588SHong Zhang       }
752671cb588SHong Zhang 
753671cb588SHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
754671cb588SHong Zhang       il[k] = jmin;
755671cb588SHong Zhang       i     = bj[jmin];
756671cb588SHong Zhang       jl[k] = jl[i]; jl[i] = k;
757671cb588SHong Zhang     }
758671cb588SHong Zhang   }
759671cb588SHong Zhang 
760671cb588SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
761671cb588SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
762671cb588SHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
763671cb588SHong Zhang   ierr = PetscFree(pivots);CHKERRQ(ierr);
764671cb588SHong Zhang 
765671cb588SHong Zhang   C->factor    = FACTOR_CHOLESKY;
766671cb588SHong Zhang   C->assembled = PETSC_TRUE;
767671cb588SHong Zhang   C->preallocated = PETSC_TRUE;
768b0a32e0cSBarry Smith   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
769671cb588SHong Zhang   PetscFunctionReturn(0);
770671cb588SHong Zhang }
771671cb588SHong Zhang 
77249b5e25fSSatish Balay /*
773fcf159c0SHong Zhang     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
774cc0c071aSHong Zhang     Version for blocks 2 by 2.
77549b5e25fSSatish Balay */
7764a2ae208SSatish Balay #undef __FUNCT__
7774a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2"
778*af281ebdSHong Zhang PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,MatFactorInfo *info,Mat *B)
77949b5e25fSSatish Balay {
78049b5e25fSSatish Balay   Mat            C = *B;
78191602c66SHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
782cc0c071aSHong Zhang   IS             perm = b->row;
7836849ba73SBarry Smith   PetscErrorCode ierr;
78413f74950SBarry Smith   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
78513f74950SBarry Smith   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
786a1723e09SHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
787cc0c071aSHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
78849b5e25fSSatish Balay 
78949b5e25fSSatish Balay   PetscFunctionBegin;
79091602c66SHong Zhang   /* initialization */
79191602c66SHong Zhang   /* il and jl record the first nonzero element in each row of the accessing
79291602c66SHong Zhang      window U(0:k, k:mbs-1).
79391602c66SHong Zhang      jl:    list of rows to be added to uneliminated rows
79491602c66SHong Zhang             i>= k: jl(i) is the first row to be added to row i
79591602c66SHong Zhang             i<  k: jl(i) is the row following row i in some list of rows
79691602c66SHong Zhang             jl(i) = mbs indicates the end of a list
79791602c66SHong Zhang      il(i): points to the first nonzero element in columns k,...,mbs-1 of
79891602c66SHong Zhang             row i of U */
79982502324SSatish Balay   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
800cc0c071aSHong Zhang   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
80113f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
80228de702eSHong Zhang   jl   = il + mbs;
80391602c66SHong Zhang   for (i=0; i<mbs; i++) {
8043845f261SHong Zhang     jl[i] = mbs; il[0] = 0;
80591602c66SHong Zhang   }
80682502324SSatish Balay   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
80728de702eSHong Zhang   uik  = dk + 4;
808cc0c071aSHong Zhang   ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr);
809cc0c071aSHong Zhang 
810cc0c071aSHong Zhang   /* check permutation */
811cc0c071aSHong Zhang   if (!a->permute){
812cc0c071aSHong Zhang     ai = a->i; aj = a->j; aa = a->a;
813cc0c071aSHong Zhang   } else {
814cc0c071aSHong Zhang     ai   = a->inew; aj = a->jnew;
81582502324SSatish Balay     ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr);
816cc0c071aSHong Zhang     ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr);
81713f74950SBarry Smith     ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr);
81813f74950SBarry Smith     ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr);
819cc0c071aSHong Zhang 
820cc0c071aSHong Zhang     for (i=0; i<mbs; i++){
821cc0c071aSHong Zhang       jmin = ai[i]; jmax = ai[i+1];
822cc0c071aSHong Zhang       for (j=jmin; j<jmax; j++){
823cc0c071aSHong Zhang         while (a2anew[j] != j){
824cc0c071aSHong Zhang           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
825cc0c071aSHong Zhang           for (k1=0; k1<4; k1++){
826cc0c071aSHong Zhang             dk[k1]       = aa[k*4+k1];
827cc0c071aSHong Zhang             aa[k*4+k1] = aa[j*4+k1];
828cc0c071aSHong Zhang             aa[j*4+k1] = dk[k1];
829cc0c071aSHong Zhang           }
830cc0c071aSHong Zhang         }
831cc0c071aSHong Zhang         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
832cc0c071aSHong Zhang         if (i > aj[j]){
833a1723e09SHong Zhang           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
834cc0c071aSHong Zhang           ap = aa + j*4;     /* ptr to the beginning of the block */
835cc0c071aSHong Zhang           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
836cc0c071aSHong Zhang           ap[1] = ap[2];
837cc0c071aSHong Zhang           ap[2] = dk[1];
838cc0c071aSHong Zhang         }
839cc0c071aSHong Zhang       }
840cc0c071aSHong Zhang     }
841ac355199SBarry Smith     ierr = PetscFree(a2anew);CHKERRQ(ierr);
842cc0c071aSHong Zhang   }
8433845f261SHong Zhang 
84491602c66SHong Zhang   /* for each row k */
84591602c66SHong Zhang   for (k = 0; k<mbs; k++){
84691602c66SHong Zhang 
84791602c66SHong Zhang     /*initialize k-th row with elements nonzero in row perm(k) of A */
848cc0c071aSHong Zhang     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
849cc0c071aSHong Zhang     ap = aa + jmin*4;
85091602c66SHong Zhang     for (j = jmin; j < jmax; j++){
851cc0c071aSHong Zhang       vj = perm_ptr[aj[j]];         /* block col. index */
852cc0c071aSHong Zhang       rtmp_ptr = rtmp + vj*4;
853cc0c071aSHong Zhang       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
85491602c66SHong Zhang     }
85591602c66SHong Zhang 
85691602c66SHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
857cc0c071aSHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
85891602c66SHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
85991602c66SHong Zhang 
860057f5ba7SHong Zhang     while (i < k){
86191602c66SHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
86291602c66SHong Zhang 
8633845f261SHong Zhang       /* compute multiplier */
86491602c66SHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
8653845f261SHong Zhang 
8663845f261SHong Zhang       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
867cc0c071aSHong Zhang       diag = ba + i*4;
868cc0c071aSHong Zhang       u    = ba + ili*4;
869cc0c071aSHong Zhang       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
870cc0c071aSHong Zhang       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
871cc0c071aSHong Zhang       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
872cc0c071aSHong Zhang       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
8733845f261SHong Zhang 
8743845f261SHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
875cc0c071aSHong Zhang       dk[0] += uik[0]*u[0] + uik[1]*u[1];
876cc0c071aSHong Zhang       dk[1] += uik[2]*u[0] + uik[3]*u[1];
877cc0c071aSHong Zhang       dk[2] += uik[0]*u[2] + uik[1]*u[3];
878cc0c071aSHong Zhang       dk[3] += uik[2]*u[2] + uik[3]*u[3];
8793845f261SHong Zhang 
8803845f261SHong Zhang       /* update -U(i,k): ba[ili] = uik */
881cc0c071aSHong Zhang       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
88291602c66SHong Zhang 
88391602c66SHong Zhang       /* add multiple of row i to k-th row ... */
88491602c66SHong Zhang       jmin = ili + 1; jmax = bi[i+1];
88591602c66SHong Zhang       if (jmin < jmax){
8863845f261SHong Zhang         for (j=jmin; j<jmax; j++) {
8873845f261SHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
888cc0c071aSHong Zhang           rtmp_ptr = rtmp + bj[j]*4;
889cc0c071aSHong Zhang           u = ba + j*4;
890cc0c071aSHong Zhang           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
891cc0c071aSHong Zhang           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
892cc0c071aSHong Zhang           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
893cc0c071aSHong Zhang           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
8943845f261SHong Zhang         }
8953845f261SHong Zhang 
89691602c66SHong Zhang         /* ... add i to row list for next nonzero entry */
89791602c66SHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
89891602c66SHong Zhang         j     = bj[jmin];
89991602c66SHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
90091602c66SHong Zhang       }
901a1723e09SHong Zhang       i = nexti;
90291602c66SHong Zhang     }
903cc0c071aSHong Zhang 
90491602c66SHong Zhang     /* save nonzero entries in k-th row of U ... */
9053845f261SHong Zhang 
906cc0c071aSHong Zhang     /* invert diagonal block */
907cc0c071aSHong Zhang     diag = ba+k*4;
908cc0c071aSHong Zhang     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
9093845f261SHong Zhang     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
9103845f261SHong Zhang 
91191602c66SHong Zhang     jmin = bi[k]; jmax = bi[k+1];
91291602c66SHong Zhang     if (jmin < jmax) {
91391602c66SHong Zhang       for (j=jmin; j<jmax; j++){
914cc0c071aSHong Zhang          vj = bj[j];           /* block col. index of U */
915cc0c071aSHong Zhang          u   = ba + j*4;
916cc0c071aSHong Zhang          rtmp_ptr = rtmp + vj*4;
917cc0c071aSHong Zhang          for (k1=0; k1<4; k1++){
918cc0c071aSHong Zhang            *u++        = *rtmp_ptr;
919cc0c071aSHong Zhang            *rtmp_ptr++ = 0.0;
920cc0c071aSHong Zhang          }
921cc0c071aSHong Zhang       }
9223845f261SHong Zhang 
92391602c66SHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
92491602c66SHong Zhang       il[k] = jmin;
92591602c66SHong Zhang       i     = bj[jmin];
92691602c66SHong Zhang       jl[k] = jl[i]; jl[i] = k;
92791602c66SHong Zhang     }
92891602c66SHong Zhang   }
9293845f261SHong Zhang 
93049b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
93191602c66SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
9323845f261SHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
93391602c66SHong Zhang   if (a->permute) {
93491602c66SHong Zhang     ierr = PetscFree(aa);CHKERRQ(ierr);
93591602c66SHong Zhang   }
936cc0c071aSHong Zhang   ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr);
93791602c66SHong Zhang   C->factor    = FACTOR_CHOLESKY;
93849b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
9395c0bcdfcSHong Zhang   C->preallocated = PETSC_TRUE;
940b0a32e0cSBarry Smith   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
94149b5e25fSSatish Balay   PetscFunctionReturn(0);
94249b5e25fSSatish Balay }
94391602c66SHong Zhang 
94449b5e25fSSatish Balay /*
94549b5e25fSSatish Balay       Version for when blocks are 2 by 2 Using natural ordering
94649b5e25fSSatish Balay */
9474a2ae208SSatish Balay #undef __FUNCT__
9484a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
949*af281ebdSHong Zhang PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
95049b5e25fSSatish Balay {
95149b5e25fSSatish Balay   Mat            C = *B;
952ab27746eSHong Zhang   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
953dfbe8321SBarry Smith   PetscErrorCode ierr;
95413f74950SBarry Smith   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
95513f74950SBarry Smith   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
956ab27746eSHong Zhang   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
957ab27746eSHong Zhang   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
95849b5e25fSSatish Balay 
95949b5e25fSSatish Balay   PetscFunctionBegin;
960ab27746eSHong Zhang   /* initialization */
961ab27746eSHong Zhang   /* il and jl record the first nonzero element in each row of the accessing
962ab27746eSHong Zhang      window U(0:k, k:mbs-1).
963ab27746eSHong Zhang      jl:    list of rows to be added to uneliminated rows
964ab27746eSHong Zhang             i>= k: jl(i) is the first row to be added to row i
965ab27746eSHong Zhang             i<  k: jl(i) is the row following row i in some list of rows
966ab27746eSHong Zhang             jl(i) = mbs indicates the end of a list
967ab27746eSHong Zhang      il(i): points to the first nonzero element in columns k,...,mbs-1 of
968ab27746eSHong Zhang             row i of U */
96982502324SSatish Balay   ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
970ab27746eSHong Zhang   ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr);
97113f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
97228de702eSHong Zhang   jl   = il + mbs;
973ab27746eSHong Zhang   for (i=0; i<mbs; i++) {
974ab27746eSHong Zhang     jl[i] = mbs; il[0] = 0;
97549b5e25fSSatish Balay   }
97682502324SSatish Balay   ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr);
97728de702eSHong Zhang   uik  = dk + 4;
978ab27746eSHong Zhang 
979ab27746eSHong Zhang   ai = a->i; aj = a->j; aa = a->a;
980ab27746eSHong Zhang 
981ab27746eSHong Zhang   /* for each row k */
982ab27746eSHong Zhang   for (k = 0; k<mbs; k++){
983ab27746eSHong Zhang 
984ab27746eSHong Zhang     /*initialize k-th row with elements nonzero in row k of A */
985ab27746eSHong Zhang     jmin = ai[k]; jmax = ai[k+1];
986ab27746eSHong Zhang     ap = aa + jmin*4;
987ab27746eSHong Zhang     for (j = jmin; j < jmax; j++){
988ab27746eSHong Zhang       vj = aj[j];         /* block col. index */
989ab27746eSHong Zhang       rtmp_ptr = rtmp + vj*4;
990ab27746eSHong Zhang       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
99149b5e25fSSatish Balay     }
992ab27746eSHong Zhang 
993ab27746eSHong Zhang     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
994ab27746eSHong Zhang     ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr);
995ab27746eSHong Zhang     i = jl[k]; /* first row to be added to k_th row  */
996ab27746eSHong Zhang 
997057f5ba7SHong Zhang     while (i < k){
998ab27746eSHong Zhang       nexti = jl[i]; /* next row to be added to k_th row */
999ab27746eSHong Zhang 
1000ab27746eSHong Zhang       /* compute multiplier */
1001ab27746eSHong Zhang       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1002ab27746eSHong Zhang 
1003ab27746eSHong Zhang       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1004ab27746eSHong Zhang       diag = ba + i*4;
1005ab27746eSHong Zhang       u    = ba + ili*4;
1006ab27746eSHong Zhang       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1007ab27746eSHong Zhang       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1008ab27746eSHong Zhang       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1009ab27746eSHong Zhang       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1010ab27746eSHong Zhang 
1011ab27746eSHong Zhang       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1012ab27746eSHong Zhang       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1013ab27746eSHong Zhang       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1014ab27746eSHong Zhang       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1015ab27746eSHong Zhang       dk[3] += uik[2]*u[2] + uik[3]*u[3];
1016ab27746eSHong Zhang 
1017ab27746eSHong Zhang       /* update -U(i,k): ba[ili] = uik */
1018ab27746eSHong Zhang       ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr);
1019ab27746eSHong Zhang 
1020ab27746eSHong Zhang       /* add multiple of row i to k-th row ... */
1021ab27746eSHong Zhang       jmin = ili + 1; jmax = bi[i+1];
1022ab27746eSHong Zhang       if (jmin < jmax){
1023ab27746eSHong Zhang         for (j=jmin; j<jmax; j++) {
1024ab27746eSHong Zhang           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1025ab27746eSHong Zhang           rtmp_ptr = rtmp + bj[j]*4;
1026ab27746eSHong Zhang           u = ba + j*4;
1027ab27746eSHong Zhang           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1028ab27746eSHong Zhang           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1029ab27746eSHong Zhang           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1030ab27746eSHong Zhang           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
103149b5e25fSSatish Balay         }
1032ab27746eSHong Zhang 
1033ab27746eSHong Zhang         /* ... add i to row list for next nonzero entry */
1034ab27746eSHong Zhang         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1035ab27746eSHong Zhang         j     = bj[jmin];
1036ab27746eSHong Zhang         jl[i] = jl[j]; jl[j] = i; /* update jl */
103749b5e25fSSatish Balay       }
1038ab27746eSHong Zhang       i = nexti;
103949b5e25fSSatish Balay     }
1040ab27746eSHong Zhang 
1041ab27746eSHong Zhang     /* save nonzero entries in k-th row of U ... */
1042ab27746eSHong Zhang 
104349b5e25fSSatish Balay     /* invert diagonal block */
1044ab27746eSHong Zhang     diag = ba+k*4;
1045ab27746eSHong Zhang     ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr);
1046ab27746eSHong Zhang     ierr = Kernel_A_gets_inverse_A_2(diag);CHKERRQ(ierr);
1047ab27746eSHong Zhang 
1048ab27746eSHong Zhang     jmin = bi[k]; jmax = bi[k+1];
1049ab27746eSHong Zhang     if (jmin < jmax) {
1050ab27746eSHong Zhang       for (j=jmin; j<jmax; j++){
1051ab27746eSHong Zhang          vj = bj[j];           /* block col. index of U */
1052ab27746eSHong Zhang          u   = ba + j*4;
1053ab27746eSHong Zhang          rtmp_ptr = rtmp + vj*4;
1054ab27746eSHong Zhang          for (k1=0; k1<4; k1++){
1055ab27746eSHong Zhang            *u++        = *rtmp_ptr;
1056ab27746eSHong Zhang            *rtmp_ptr++ = 0.0;
1057ab27746eSHong Zhang          }
1058ab27746eSHong Zhang       }
1059ab27746eSHong Zhang 
1060ab27746eSHong Zhang       /* ... add k to row list for first nonzero entry in k-th row */
1061ab27746eSHong Zhang       il[k] = jmin;
1062ab27746eSHong Zhang       i     = bj[jmin];
1063ab27746eSHong Zhang       jl[k] = jl[i]; jl[i] = k;
1064ab27746eSHong Zhang     }
106549b5e25fSSatish Balay   }
106649b5e25fSSatish Balay 
106749b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1068ab27746eSHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
1069ab27746eSHong Zhang   ierr = PetscFree(dk);CHKERRQ(ierr);
1070ab27746eSHong Zhang 
1071ab27746eSHong Zhang   C->factor    = FACTOR_CHOLESKY;
107249b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
10735c0bcdfcSHong Zhang   C->preallocated = PETSC_TRUE;
1074b0a32e0cSBarry Smith   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
107549b5e25fSSatish Balay   PetscFunctionReturn(0);
107649b5e25fSSatish Balay }
107749b5e25fSSatish Balay 
107849b5e25fSSatish Balay /*
107998a8e78dSHong Zhang     Numeric U^T*D*U factorization for SBAIJ format.
108091602c66SHong Zhang     Version for blocks are 1 by 1.
108149b5e25fSSatish Balay */
10824a2ae208SSatish Balay #undef __FUNCT__
10834a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1"
1084*af281ebdSHong Zhang PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,MatFactorInfo *info,Mat *B)
108549b5e25fSSatish Balay {
108649b5e25fSSatish Balay   Mat            C = *B;
108749b5e25fSSatish Balay   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
108849b5e25fSSatish Balay   IS             ip=b->row;
10896849ba73SBarry Smith   PetscErrorCode ierr;
1090997a0949SHong Zhang   PetscInt       *rip,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1091fff829cfSHong Zhang   PetscInt       *ai,*aj,*a2anew;
1092997a0949SHong Zhang   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1093997a0949SHong Zhang   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1094997a0949SHong Zhang   PetscReal      damping=b->factor_damping,zeropivot=b->factor_zeropivot,shift_amount;
1095997a0949SHong Zhang   PetscTruth     damp,chshift;
1096997a0949SHong Zhang   PetscInt       nshift=0,ndamp=0;
109749b5e25fSSatish Balay 
109849b5e25fSSatish Balay   PetscFunctionBegin;
109949b5e25fSSatish Balay   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
1100cb718733SHong Zhang   if (!a->permute){
11012d007305SHong Zhang     ai = a->i; aj = a->j; aa = a->a;
11022d007305SHong Zhang   } else {
11032d007305SHong Zhang     ai = a->inew; aj = a->jnew;
1104fff829cfSHong Zhang     nz = ai[mbs];
1105fff829cfSHong Zhang     ierr = PetscMalloc(nz*sizeof(MatScalar),&aa);CHKERRQ(ierr);
1106fff829cfSHong Zhang     a2anew = a->a2anew;
1107997a0949SHong Zhang     bval   = a->a;
1108fff829cfSHong Zhang     for (j=0; j<nz; j++){
1109997a0949SHong Zhang       aa[a2anew[j]] = *(bval++);
11102d007305SHong Zhang     }
11112d007305SHong Zhang   }
11122d007305SHong Zhang 
111391602c66SHong Zhang   /* initialization */
111449b5e25fSSatish Balay   /* il and jl record the first nonzero element in each row of the accessing
111549b5e25fSSatish Balay      window U(0:k, k:mbs-1).
111649b5e25fSSatish Balay      jl:    list of rows to be added to uneliminated rows
111749b5e25fSSatish Balay             i>= k: jl(i) is the first row to be added to row i
111849b5e25fSSatish Balay             i<  k: jl(i) is the row following row i in some list of rows
111949b5e25fSSatish Balay             jl(i) = mbs indicates the end of a list
112049b5e25fSSatish Balay      il(i): points to the first nonzero element in columns k,...,mbs-1 of
112149b5e25fSSatish Balay             row i of U */
1122997a0949SHong Zhang   nz   = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
1123997a0949SHong Zhang   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
112428de702eSHong Zhang   jl   = il + mbs;
1125997a0949SHong Zhang   rtmp = (MatScalar*)(jl + mbs);
1126997a0949SHong Zhang 
1127997a0949SHong Zhang   shift_amount = 0;
1128997a0949SHong Zhang   do {
1129997a0949SHong Zhang     damp = PETSC_FALSE;
1130997a0949SHong Zhang     chshift = PETSC_FALSE;
113149b5e25fSSatish Balay     for (i=0; i<mbs; i++) {
113249b5e25fSSatish Balay       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
113349b5e25fSSatish Balay     }
1134997a0949SHong Zhang 
113549b5e25fSSatish Balay     for (k = 0; k<mbs; k++){
1136997a0949SHong Zhang       /*initialize k-th row by the perm[k]-th row of A */
113749b5e25fSSatish Balay       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
11387701de02SHong Zhang       bval = ba + bi[k];
113949b5e25fSSatish Balay       for (j = jmin; j < jmax; j++){
1140997a0949SHong Zhang         col = rip[aj[j]];
1141997a0949SHong Zhang         rtmp[col] = aa[j];
11427701de02SHong Zhang         *bval++  = 0.0; /* for in-place factorization */
114349b5e25fSSatish Balay       }
1144997a0949SHong Zhang       /* damp the diagonal of the matrix */
1145997a0949SHong Zhang       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
114649b5e25fSSatish Balay 
114791602c66SHong Zhang       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
114849b5e25fSSatish Balay       dk = rtmp[k];
114949b5e25fSSatish Balay       i = jl[k]; /* first row to be added to k_th row  */
115049b5e25fSSatish Balay 
1151057f5ba7SHong Zhang       while (i < k){
115249b5e25fSSatish Balay         nexti = jl[i]; /* next row to be added to k_th row */
115349b5e25fSSatish Balay 
1154fff829cfSHong Zhang         /* compute multiplier, update diag(k) and U(i,k) */
115549b5e25fSSatish Balay         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1156997a0949SHong Zhang         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
115749b5e25fSSatish Balay         dk += uikdi*ba[ili];
1158658e7b3eSHong Zhang         ba[ili] = uikdi; /* -U(i,k) */
115949b5e25fSSatish Balay 
1160997a0949SHong Zhang         /* add multiple of row i to k-th row */
116149b5e25fSSatish Balay         jmin = ili + 1; jmax = bi[i+1];
116249b5e25fSSatish Balay         if (jmin < jmax){
116349b5e25fSSatish Balay           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1164fff829cfSHong Zhang           /* update il and jl for row i */
1165fff829cfSHong Zhang           il[i] = jmin;
1166fff829cfSHong Zhang           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
116749b5e25fSSatish Balay         }
1168ab27746eSHong Zhang         i = nexti;
116949b5e25fSSatish Balay       }
117049b5e25fSSatish Balay 
1171997a0949SHong Zhang       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1172997a0949SHong Zhang 	/* calculate a shift that would make this row diagonally dominant */
1173997a0949SHong Zhang 	PetscReal rs = PetscAbs(PetscRealPart(dk));
1174997a0949SHong Zhang 	jmin      = bi[k]+1;
1175997a0949SHong Zhang 	nz        = bi[k+1] - jmin;
1176997a0949SHong Zhang 	if (nz){
1177997a0949SHong Zhang 	  bcol = bj + jmin;
1178997a0949SHong Zhang 	  bval = ba + jmin;
1179997a0949SHong Zhang 	  while (nz--){
1180997a0949SHong Zhang 	    rs += PetscAbsScalar(rtmp[*bcol++]);
1181997a0949SHong Zhang 	  }
1182997a0949SHong Zhang 	}
1183997a0949SHong Zhang 	/* if this shift is less than the previous, just up the previous
1184997a0949SHong Zhang 	   one by a bit */
1185997a0949SHong Zhang 	shift_amount = PetscMax(rs,1.1*shift_amount);
1186997a0949SHong Zhang 	chshift  = PETSC_TRUE;
1187997a0949SHong Zhang 	/* Unlike in the ILU case there is no exit condition on nshift:
1188997a0949SHong Zhang 	   we increase the shift until it converges. There is no guarantee that
1189997a0949SHong Zhang 	   this algorithm converges faster or slower, or is better or worse
1190997a0949SHong Zhang 	   than the ILU algorithm. */
1191997a0949SHong Zhang 	nshift++;
1192997a0949SHong Zhang 	break;
1193997a0949SHong Zhang       }
1194997a0949SHong Zhang       if (PetscRealPart(dk) < zeropivot){
1195997a0949SHong Zhang         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1196997a0949SHong Zhang         if (damping > 0.0) {
1197997a0949SHong Zhang           if (ndamp) damping *= 2.0;
1198997a0949SHong Zhang           damp = PETSC_TRUE;
1199997a0949SHong Zhang           ndamp++;
1200997a0949SHong Zhang           break;
1201997a0949SHong Zhang         } else if (PetscAbsScalar(dk) < zeropivot){
1202997a0949SHong Zhang           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1203997a0949SHong Zhang         } else {
1204997a0949SHong Zhang           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %D of Cholesky factorization\n",PetscRealPart(dk),k);
1205997a0949SHong Zhang         }
120649b5e25fSSatish Balay       }
120749b5e25fSSatish Balay 
1208997a0949SHong Zhang       /* copy data into U(k,:) */
120998a8e78dSHong Zhang       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1210fff829cfSHong Zhang       jmin = bi[k]+1; jmax = bi[k+1];
121149b5e25fSSatish Balay       if (jmin < jmax) {
121249b5e25fSSatish Balay         for (j=jmin; j<jmax; j++){
1213997a0949SHong Zhang           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
121449b5e25fSSatish Balay         }
121598a8e78dSHong Zhang         /* add the k-th row into il and jl */
121649b5e25fSSatish Balay         il[k] = jmin;
121798a8e78dSHong Zhang         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
121849b5e25fSSatish Balay       }
121949b5e25fSSatish Balay     }
1220997a0949SHong Zhang   } while (damp||chshift);
122149b5e25fSSatish Balay   ierr = PetscFree(il);CHKERRQ(ierr);
122298a8e78dSHong Zhang   if (a->permute){ierr = PetscFree(aa);CHKERRQ(ierr);}
122349b5e25fSSatish Balay 
122449b5e25fSSatish Balay   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
12258be8c0c7SHong Zhang   C->factor       = FACTOR_CHOLESKY;
122649b5e25fSSatish Balay   C->assembled    = PETSC_TRUE;
12275c0bcdfcSHong Zhang   C->preallocated = PETSC_TRUE;
122898a8e78dSHong Zhang   PetscLogFlops(C->m);
1229997a0949SHong Zhang   if (ndamp) {
1230997a0949SHong Zhang     PetscLogInfo(0,"MatCholeskyFactorNumerical_SeqSBAIJ_1: number of damping tries %D damping value %g\n",ndamp,damping);
1231997a0949SHong Zhang   }
1232997a0949SHong Zhang   if (nshift) {
1233997a0949SHong Zhang     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1 diagonal shifted %D shifts\n",nshift);
1234997a0949SHong Zhang   }
123549b5e25fSSatish Balay   PetscFunctionReturn(0);
123649b5e25fSSatish Balay }
123749b5e25fSSatish Balay 
1238671cb588SHong Zhang /*
1239671cb588SHong Zhang   Version for when blocks are 1 by 1 Using natural ordering
1240671cb588SHong Zhang */
12414a2ae208SSatish Balay #undef __FUNCT__
12424a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering"
1243*af281ebdSHong Zhang PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
1244671cb588SHong Zhang {
1245671cb588SHong Zhang   Mat            C = *B;
1246671cb588SHong Zhang   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1247dfbe8321SBarry Smith   PetscErrorCode ierr;
124813f74950SBarry Smith   PetscInt       i,j,mbs = a->mbs;
124913f74950SBarry Smith   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
125013f74950SBarry Smith   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz,ndamp = 0;
1251653a6975SHong Zhang   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
125221c26570Svictorle   PetscReal      damping=b->factor_damping, zeropivot=b->factor_zeropivot,shift_amount;
125321c26570Svictorle   PetscTruth     damp,chshift;
125413f74950SBarry Smith   PetscInt       nshift=0;
1255653a6975SHong Zhang 
1256653a6975SHong Zhang   PetscFunctionBegin;
1257653a6975SHong Zhang   /* initialization */
1258653a6975SHong Zhang   /* il and jl record the first nonzero element in each row of the accessing
1259653a6975SHong Zhang      window U(0:k, k:mbs-1).
1260653a6975SHong Zhang      jl:    list of rows to be added to uneliminated rows
1261653a6975SHong Zhang             i>= k: jl(i) is the first row to be added to row i
1262653a6975SHong Zhang             i<  k: jl(i) is the row following row i in some list of rows
1263653a6975SHong Zhang             jl(i) = mbs indicates the end of a list
1264653a6975SHong Zhang      il(i): points to the first nonzero element in U(i,k:mbs-1)
1265653a6975SHong Zhang   */
1266653a6975SHong Zhang   ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
126713f74950SBarry Smith   ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr);
1268653a6975SHong Zhang   jl   = il + mbs;
1269b00f7748SHong Zhang 
127021c26570Svictorle   shift_amount = 0;
1271b00f7748SHong Zhang   do {
1272b00f7748SHong Zhang     damp = PETSC_FALSE;
127321c26570Svictorle     chshift = PETSC_FALSE;
1274653a6975SHong Zhang     for (i=0; i<mbs; i++) {
1275653a6975SHong Zhang       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1276653a6975SHong Zhang     }
1277653a6975SHong Zhang 
1278997a0949SHong Zhang     for (k = 0; k<mbs; k++){
1279653a6975SHong Zhang     /*initialize k-th row with elements nonzero in row perm(k) of A */
1280653a6975SHong Zhang       nz   = ai[k+1] - ai[k];
1281653a6975SHong Zhang       acol = aj + ai[k];
1282653a6975SHong Zhang       aval = aa + ai[k];
1283653a6975SHong Zhang       bval = ba + bi[k];
1284653a6975SHong Zhang       while (nz -- ){
1285653a6975SHong Zhang         rtmp[*acol++] = *aval++;
1286653a6975SHong Zhang         *bval++       = 0.0; /* for in-place factorization */
1287653a6975SHong Zhang       }
1288b00f7748SHong Zhang       /* damp the diagonal of the matrix */
128921c26570Svictorle       if (ndamp||nshift) rtmp[k] += damping+shift_amount;
1290653a6975SHong Zhang 
1291653a6975SHong Zhang       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1292653a6975SHong Zhang       dk = rtmp[k];
1293653a6975SHong Zhang       i  = jl[k]; /* first row to be added to k_th row  */
1294653a6975SHong Zhang 
1295653a6975SHong Zhang       while (i < k){
1296653a6975SHong Zhang         nexti = jl[i]; /* next row to be added to k_th row */
1297653a6975SHong Zhang         /* compute multiplier, update D(k) and U(i,k) */
1298653a6975SHong Zhang         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1299653a6975SHong Zhang         uikdi = - ba[ili]*ba[bi[i]];
1300653a6975SHong Zhang         dk   += uikdi*ba[ili];
1301653a6975SHong Zhang         ba[ili] = uikdi; /* -U(i,k) */
1302653a6975SHong Zhang 
1303653a6975SHong Zhang         /* add multiple of row i to k-th row ... */
1304653a6975SHong Zhang         jmin = ili + 1;
1305653a6975SHong Zhang         nz   = bi[i+1] - jmin;
1306653a6975SHong Zhang         if (nz > 0){
1307653a6975SHong Zhang           bcol = bj + jmin;
1308653a6975SHong Zhang           bval = ba + jmin;
1309653a6975SHong Zhang           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1310997a0949SHong Zhang           /* update il and jl for i-th row */
1311997a0949SHong Zhang           il[i] = jmin;
1312997a0949SHong Zhang           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1313653a6975SHong Zhang         }
1314653a6975SHong Zhang         i = nexti;
1315653a6975SHong Zhang       }
1316653a6975SHong Zhang 
131721c26570Svictorle       if (PetscRealPart(dk) < zeropivot && b->factor_shift){
1318d530a6e7Svictorle 	/* calculate a shift that would make this row diagonally dominant */
131929f69fd4SBarry Smith 	PetscReal rs = PetscAbs(PetscRealPart(dk));
132021c26570Svictorle 	jmin      = bi[k]+1;
132121c26570Svictorle 	nz        = bi[k+1] - jmin;
132221c26570Svictorle 	if (nz){
132321c26570Svictorle 	  bcol = bj + jmin;
132421c26570Svictorle 	  bval = ba + jmin;
132521c26570Svictorle 	  while (nz--){
132621c26570Svictorle 	    rs += PetscAbsScalar(rtmp[*bcol++]);
132721c26570Svictorle 	  }
132821c26570Svictorle 	}
1329d530a6e7Svictorle 	/* if this shift is less than the previous, just up the previous
1330d530a6e7Svictorle 	   one by a bit */
1331d530a6e7Svictorle 	shift_amount = PetscMax(rs,1.1*shift_amount);
133221c26570Svictorle 	chshift  = PETSC_TRUE;
1333d530a6e7Svictorle 	/* Unlike in the ILU case there is no exit condition on nshift:
1334d530a6e7Svictorle 	   we increase the shift until it converges. There is no guarantee that
1335d530a6e7Svictorle 	   this algorithm converges faster or slower, or is better or worse
1336d530a6e7Svictorle 	   than the ILU algorithm. */
133721c26570Svictorle 	nshift++;
133821c26570Svictorle 	break;
133921c26570Svictorle       }
1340ffa0b812SHong Zhang       if (PetscRealPart(dk) < zeropivot){
1341ffa0b812SHong Zhang         if (damping == (PetscReal) PETSC_DECIDE) damping = -PetscRealPart(dk)/(k+1);
1342b00f7748SHong Zhang         if (damping > 0.0) {
1343b00f7748SHong Zhang           if (ndamp) damping *= 2.0;
1344b00f7748SHong Zhang           damp = PETSC_TRUE;
1345b00f7748SHong Zhang           ndamp++;
1346b00f7748SHong Zhang           break;
1347481c2c92SHong Zhang         } else if (PetscAbsScalar(dk) < zeropivot){
134877431f27SBarry Smith           SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g",k,PetscRealPart(dk),zeropivot);
1349481c2c92SHong Zhang         } else {
135077431f27SBarry Smith           PetscLogInfo((PetscObject)A,"Negative pivot %g in row %D of Cholesky factorization\n",PetscRealPart(dk),k);
1351b00f7748SHong Zhang         }
1352064503c5SHong Zhang       }
1353653a6975SHong Zhang 
1354997a0949SHong Zhang       /* copy data into U(k,:) */
1355653a6975SHong Zhang       ba[bi[k]] = 1.0/dk;
1356653a6975SHong Zhang       jmin      = bi[k]+1;
1357653a6975SHong Zhang       nz        = bi[k+1] - jmin;
1358653a6975SHong Zhang       if (nz){
1359653a6975SHong Zhang         bcol = bj + jmin;
1360653a6975SHong Zhang         bval = ba + jmin;
1361653a6975SHong Zhang         while (nz--){
1362653a6975SHong Zhang           *bval++       = rtmp[*bcol];
1363653a6975SHong Zhang           rtmp[*bcol++] = 0.0;
1364653a6975SHong Zhang         }
1365997a0949SHong Zhang         /* add k-th row into il and jl */
1366653a6975SHong Zhang         il[k] = jmin;
1367997a0949SHong Zhang         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1368653a6975SHong Zhang       }
1369b00f7748SHong Zhang     } /* end of for (k = 0; k<mbs; k++) */
137021c26570Svictorle   } while (damp||chshift);
1371653a6975SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1372653a6975SHong Zhang   ierr = PetscFree(il);CHKERRQ(ierr);
1373653a6975SHong Zhang 
1374653a6975SHong Zhang   C->factor       = FACTOR_CHOLESKY;
1375653a6975SHong Zhang   C->assembled    = PETSC_TRUE;
1376653a6975SHong Zhang   C->preallocated = PETSC_TRUE;
1377997a0949SHong Zhang   PetscLogFlops(C->m);
1378b00f7748SHong Zhang   if (ndamp) {
1379*af281ebdSHong Zhang     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering: number of damping tries %D damping value %g\n",ndamp,damping);
1380b00f7748SHong Zhang   }
1381fb10cecfSBarry Smith   if (nshift) {
138277431f27SBarry Smith     PetscLogInfo(0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering diagonal shifted %D shifts\n",nshift);
1383fb10cecfSBarry Smith   }
1384653a6975SHong Zhang   PetscFunctionReturn(0);
1385653a6975SHong Zhang }
1386653a6975SHong Zhang 
1387653a6975SHong Zhang #undef __FUNCT__
13884a2ae208SSatish Balay #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ"
1389dfbe8321SBarry Smith PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
139049b5e25fSSatish Balay {
1391dfbe8321SBarry Smith   PetscErrorCode ierr;
139249b5e25fSSatish Balay   Mat            C;
139349b5e25fSSatish Balay 
139449b5e25fSSatish Balay   PetscFunctionBegin;
139515e8a5b3SHong Zhang   ierr = MatCholeskyFactorSymbolic(A,perm,info,&C);CHKERRQ(ierr);
1396*af281ebdSHong Zhang   ierr = MatCholeskyFactorNumeric(A,info,&C);CHKERRQ(ierr);
13974445ddedSHong Zhang   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
139849b5e25fSSatish Balay   PetscFunctionReturn(0);
139949b5e25fSSatish Balay }
140049b5e25fSSatish Balay 
140149b5e25fSSatish Balay 
1402