#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <petscbt.h>
#include <../src/mat/utils/freespace.h>

static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERPETSC;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  if (PetscDefined(USE_COMPLEX) && (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) && A->hermitian == PETSC_BOOL3_TRUE && A->symmetric != PETSC_BOOL3_TRUE) {
    PetscCall(PetscInfo(A, "Hermitian MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC are not supported. Use MAT_FACTOR_LU instead.\n"));
    *B = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetType(*B, MATSEQAIJ));

    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;

    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    PetscCall(MatSetType(*B, MATSEQSBAIJ));
    PetscCall(MatSeqSBAIJSetPreallocation(*B, 1, MAT_SKIP_ALLOCATION, NULL));

    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
  (*B)->factortype = ftype;

  PetscCall(PetscFree((*B)->solvertype));
  PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorSymbolic_SeqAIJ(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b;
  IS                 isicol;
  const PetscInt    *r, *ic, *ai = a->i, *aj = a->j, *ajtmp;
  PetscInt           i, n = A->rmap->n;
  PetscInt          *bi, *bj;
  PetscInt          *bdiag, row, nnz, nzi, reallocs = 0, nzbd, *im;
  PetscReal          f;
  PetscInt           nlnk, *lnk, k, **bi_ptr;
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  PetscBT            lnkbt;
  PetscBool          diagDense;

  PetscFunctionBegin;
  PetscCheck(A->rmap->N == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "matrix must be square");
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
  PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");

  PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));

  /* get new row and diagonal pointers, must be allocated separately because they will be given to the Mat_SeqAIJ and freed separately */
  PetscCall(PetscShmgetAllocateArray(n + 1, sizeof(PetscInt), (void **)&bi));
  PetscCall(PetscMalloc1(n + 1, &bdiag));
  bi[0] = bdiag[0] = 0;

  /* linked list for storing column indices of the active row */
  nlnk = n + 1;
  PetscCall(PetscLLCreate(n, n, nlnk, lnk, lnkbt));

  PetscCall(PetscMalloc2(n + 1, &bi_ptr, n + 1, &im));

  /* initial FreeSpace size is f*(ai[n]+1) */
  f = info->fill;
  if (n == 1) f = 1; /* prevent failure in corner case of 1x1 matrix with fill < 0.5 */
  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space));
  current_space = free_space;

  for (i = 0; i < n; i++) {
    /* copy previous fill into linked list */
    nzi   = 0;
    nnz   = ai[r[i] + 1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    PetscCall(PetscLLAddPerm(nnz, ajtmp, ic, n, &nlnk, lnk, lnkbt));
    nzi += nlnk;

    /* add pivot rows into linked list */
    row = lnk[n];
    while (row < i) {
      nzbd  = bdiag[row] + 1;     /* num of entries in the row with column index <= row */
      ajtmp = bi_ptr[row] + nzbd; /* points to the entry next to the diagonal */
      PetscCall(PetscLLAddSortedLU(ajtmp, row, &nlnk, lnk, lnkbt, i, nzbd, im));
      nzi += nlnk;
      row = lnk[row];
    }
    bi[i + 1] = bi[i] + nzi;
    im[i]     = nzi;

    /* mark bdiag */
    nzbd = 0;
    nnz  = nzi;
    k    = lnk[n];
    while (nnz-- && k < i) {
      nzbd++;
      k = lnk[k];
    }
    bdiag[i] = nzbd; /* note: bdiag[i] = nnzL as input for PetscFreeSpaceContiguous_LU() */

    /* if free space is not available, make more free space */
    if (current_space->local_remaining < nzi) {
      /* estimated additional space needed */
      nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(n - 1, nzi));
      PetscCall(PetscFreeSpaceGet(nnz, &current_space));
      reallocs++;
    }

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLClean(n, n, nzi, lnk, current_space->array, lnkbt));

    bi_ptr[i] = current_space->array;
    current_space->array += nzi;
    current_space->local_used += nzi;
    current_space->local_remaining -= nzi;
  }

  PetscCall(ISRestoreIndices(isrow, &r));
  PetscCall(ISRestoreIndices(isicol, &ic));

  /*   copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
  PetscCall(PetscShmgetAllocateArray(bi[n], sizeof(PetscInt), (void **)&bj));
  PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFree2(bi_ptr, im));

  /* put together the new matrix */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
  b          = (Mat_SeqAIJ *)B->data;
  b->free_ij = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(bdiag[0] + 1, sizeof(PetscScalar), (void **)&b->a));
  b->free_a = PETSC_TRUE;
  b->j      = bj;
  b->i      = bi;
  b->diag   = bdiag;
  b->ilen   = NULL;
  b->imax   = NULL;
  b->row    = isrow;
  b->col    = iscol;
  PetscCall(PetscObjectReference((PetscObject)isrow));
  PetscCall(PetscObjectReference((PetscObject)iscol));
  b->icol = isicol;
  PetscCall(PetscMalloc1(n, &b->solve_work));

  /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
  b->maxnz = b->nz = bdiag[0] + 1;

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = reallocs;
  B->info.fill_ratio_given = f;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
    PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
    PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
    PetscCall(PetscInfo(A, "for best performance.\n"));
  } else {
    PetscCall(PetscInfo(A, "Empty matrix\n"));
  }
#endif
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
  if (a->inode.size_csr) B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
  PetscCall(MatSeqAIJCheckInode_FactorLU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    Trouble in factorization, should we dump the original matrix?
*/
PetscErrorCode MatFactorDumpMatrix(Mat A)
{
  PetscBool flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, NULL, "-mat_factor_dump_on_error", &flg, NULL));
  if (flg) {
    PetscViewer viewer;
    char        filename[PETSC_MAX_PATH_LEN];

    PetscCall(PetscSNPrintf(filename, PETSC_MAX_PATH_LEN, "matrix_factor_error.%d", PetscGlobalRank));
    PetscCall(PetscViewerBinaryOpen(PetscObjectComm((PetscObject)A), filename, FILE_MODE_WRITE, &viewer));
    PetscCall(MatView(A, viewer));
    PetscCall(PetscViewerDestroy(&viewer));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorNumeric_SeqAIJ(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat              C = B;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
  IS               isrow = b->row, isicol = b->icol;
  const PetscInt  *r, *ic, *ics;
  const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
  PetscInt         i, j, k, nz, nzL, row, *pj;
  const PetscInt  *ajtmp, *bjtmp;
  MatScalar       *rtmp, *pc, multiplier, *pv;
  const MatScalar *aa, *v;
  MatScalar       *ba;
  PetscBool        row_identity, col_identity;
  FactorShiftCtx   sctx;
  const PetscInt  *ddiag;
  PetscReal        rs;
  MatScalar        d;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(MatSeqAIJGetArrayWrite(B, &ba));
  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &ddiag, NULL));
    sctx.shift_top = info->zeropivot;
    for (i = 0; i < n; i++) {
      /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
      d  = aa[ddiag[i]];
      rs = -PetscAbsScalar(d) - PetscRealPart(d);
      v  = aa + ai[i];
      nz = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }

  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(n + 1, &rtmp));
  ics = ic;

  do {
    sctx.newshift = PETSC_FALSE;
    for (i = 0; i < n; i++) {
      /* zero rtmp */
      /* L part */
      nz    = bi[i + 1] - bi[i];
      bjtmp = bj + bi[i];
      for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

      /* U part */
      nz    = bdiag[i] - bdiag[i + 1];
      bjtmp = bj + bdiag[i + 1] + 1;
      for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

      /* load in initial (unfactored row) */
      nz    = ai[r[i] + 1] - ai[r[i]];
      ajtmp = aj + ai[r[i]];
      v     = aa + ai[r[i]];
      for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
      /* ZeropivotApply() */
      rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

      /* elimination */
      bjtmp = bj + bi[i];
      row   = *bjtmp++;
      nzL   = bi[i + 1] - bi[i];
      for (k = 0; k < nzL; k++) {
        pc = rtmp + row;
        if (*pc != 0.0) {
          pv         = ba + bdiag[row];
          multiplier = *pc * (*pv);
          *pc        = multiplier;

          pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
          pv = ba + bdiag[row + 1] + 1;
          nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */

          for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
          PetscCall(PetscLogFlops(1 + 2.0 * nz));
        }
        row = *bjtmp++;
      }

      /* finished row so stick it into b->a */
      rs = 0.0;
      /* L part */
      pv = ba + bi[i];
      pj = b->j + bi[i];
      nz = bi[i + 1] - bi[i];
      for (j = 0; j < nz; j++) {
        pv[j] = rtmp[pj[j]];
        rs += PetscAbsScalar(pv[j]);
      }

      /* U part */
      pv = ba + bdiag[i + 1] + 1;
      pj = b->j + bdiag[i + 1] + 1;
      nz = bdiag[i] - bdiag[i + 1] - 1;
      for (j = 0; j < nz; j++) {
        pv[j] = rtmp[pj[j]];
        rs += PetscAbsScalar(pv[j]);
      }

      sctx.rs = rs;
      sctx.pv = rtmp[i];
      PetscCall(MatPivotCheck(B, A, info, &sctx, i));
      if (sctx.newshift) break; /* break for-loop */
      rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */

      /* Mark diagonal and invert diagonal for simpler triangular solves */
      pv  = ba + bdiag[i];
      *pv = 1.0 / rtmp[i];

    } /* endof for (i=0; i<n; i++) { */

    /* MatPivotRefine() */
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
      /*
       * if no shift in this attempt & shifting & started shifting & can refine,
       * then try lower shift
       */
      sctx.shift_hi       = sctx.shift_fraction;
      sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
      sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
      sctx.newshift       = PETSC_TRUE;
      sctx.nshift++;
    }
  } while (sctx.newshift);

  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(MatSeqAIJRestoreArrayWrite(B, &ba));

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(isicol, &col_identity));
  if (b->inode.size_csr) {
    C->ops->solve = MatSolve_SeqAIJ_Inode;
  } else if (row_identity && col_identity) {
    C->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
  } else {
    C->ops->solve = MatSolve_SeqAIJ;
  }
  C->ops->solveadd          = MatSolveAdd_SeqAIJ;
  C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
  C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
  C->ops->matsolve          = MatMatSolve_SeqAIJ;
  C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
  C->assembled              = PETSC_TRUE;
  C->preallocated           = PETSC_TRUE;

  PetscCall(PetscLogFlops(C->cmap->n));

  /* MatShiftView(A,info,&sctx) */
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
      PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatSolve_SeqAIJ_inplace(Mat, Mat, Mat);
static PetscErrorCode MatSolve_SeqAIJ_NaturalOrdering_inplace(Mat, Vec, Vec);
static PetscErrorCode MatSolveAdd_SeqAIJ_inplace(Mat, Vec, Vec, Vec);

PetscErrorCode MatLUFactorNumeric_SeqAIJ_inplace(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat              C = B;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
  IS               isrow = b->row, isicol = b->icol;
  const PetscInt  *r, *ic, *ics;
  PetscInt         nz, row, i, j, n = A->rmap->n, diag;
  const PetscInt  *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt  *ajtmp, *bjtmp, *ddiag, *pj;
  MatScalar       *pv, *rtmp, *pc, multiplier, d;
  const MatScalar *v, *aa;
  MatScalar       *ba;
  PetscReal        rs = 0.0;
  FactorShiftCtx   sctx;
  PetscBool        row_identity, col_identity;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &ddiag, NULL));

  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(MatSeqAIJGetArrayWrite(B, &ba));
  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &ddiag, NULL));
    sctx.shift_top = info->zeropivot;
    for (i = 0; i < n; i++) {
      /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
      d  = aa[ddiag[i]];
      rs = -PetscAbsScalar(d) - PetscRealPart(d);
      v  = aa + ai[i];
      nz = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }

  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(n + 1, &rtmp));
  ics = ic;

  do {
    sctx.newshift = PETSC_FALSE;
    for (i = 0; i < n; i++) {
      nz    = bi[i + 1] - bi[i];
      bjtmp = bj + bi[i];
      for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

      /* load in initial (unfactored row) */
      nz    = ai[r[i] + 1] - ai[r[i]];
      ajtmp = aj + ai[r[i]];
      v     = aa + ai[r[i]];
      for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
      rtmp[ics[r[i]]] += sctx.shift_amount; /* shift the diagonal of the matrix */

      row = *bjtmp++;
      while (row < i) {
        pc = rtmp + row;
        if (*pc != 0.0) {
          pv         = ba + ddiag[row];
          pj         = b->j + ddiag[row] + 1;
          multiplier = *pc / *pv++;
          *pc        = multiplier;
          nz         = bi[row + 1] - ddiag[row] - 1;
          for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
          PetscCall(PetscLogFlops(1 + 2.0 * nz));
        }
        row = *bjtmp++;
      }
      /* finished row so stick it into b->a */
      pv   = ba + bi[i];
      pj   = b->j + bi[i];
      nz   = bi[i + 1] - bi[i];
      diag = ddiag[i] - bi[i];
      rs   = 0.0;
      for (j = 0; j < nz; j++) {
        pv[j] = rtmp[pj[j]];
        rs += PetscAbsScalar(pv[j]);
      }
      rs -= PetscAbsScalar(pv[diag]);

      sctx.rs = rs;
      sctx.pv = pv[diag];
      PetscCall(MatPivotCheck(B, A, info, &sctx, i));
      if (sctx.newshift) break;
      pv[diag] = sctx.pv;
    }

    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
      /*
       * if no shift in this attempt & shifting & started shifting & can refine,
       * then try lower shift
       */
      sctx.shift_hi       = sctx.shift_fraction;
      sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
      sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
      sctx.newshift       = PETSC_TRUE;
      sctx.nshift++;
    }
  } while (sctx.newshift);

  /* invert diagonal entries for simpler triangular solves */
  for (i = 0; i < n; i++) ba[ddiag[i]] = 1.0 / ba[ddiag[i]];
  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(MatSeqAIJRestoreArrayWrite(B, &ba));

  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(isicol, &col_identity));
  if (row_identity && col_identity) {
    C->ops->solve = MatSolve_SeqAIJ_NaturalOrdering_inplace;
  } else {
    C->ops->solve = MatSolve_SeqAIJ_inplace;
  }
  C->ops->solveadd          = MatSolveAdd_SeqAIJ_inplace;
  C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
  C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
  C->ops->matsolve          = MatMatSolve_SeqAIJ_inplace;
  C->ops->matsolvetranspose = NULL;

  C->assembled    = PETSC_TRUE;
  C->preallocated = PETSC_TRUE;

  PetscCall(PetscLogFlops(C->cmap->n));
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    }
  }
  C->ops->solve          = MatSolve_SeqAIJ_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;

  PetscCall(MatSeqAIJCheckInode(C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJ_InplaceWithPerm(Mat, Vec, Vec);

/*
   This routine implements inplace ILU(0) with row or/and column permutations.
   Input:
     A - original matrix
   Output;
     A - a->i (rowptr) is same as original rowptr, but factored i-the row is stored in rowperm[i]
         a->j (col index) is permuted by the inverse of colperm, then sorted
         a->a reordered accordingly with a->j
         a->diag (ptr to diagonal elements) is updated.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJ_InplaceWithPerm(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ     *a     = (Mat_SeqAIJ *)A->data;
  IS              isrow = a->row, isicol = a->icol;
  const PetscInt *r, *ic, *ics;
  PetscInt        i, j, n = A->rmap->n, *ai = a->i, *aj = a->j;
  PetscInt       *ajtmp, nz, row;
  PetscInt        nbdiag, *pj;
  PetscScalar    *rtmp, *pc, multiplier, d;
  MatScalar      *pv, *v;
  PetscReal       rs;
  FactorShiftCtx  sctx;
  MatScalar      *aa, *vtmp;
  PetscInt       *diag;

  PetscFunctionBegin;
  PetscCheck(A == B, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "input and output matrix must have same address");

  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, (const PetscInt **)&diag, NULL));
  PetscCall(MatSeqAIJGetArray(A, &aa));
  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
    const PetscInt *ddiag;

    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &ddiag, NULL));
    sctx.shift_top = info->zeropivot;
    for (i = 0; i < n; i++) {
      /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
      d    = aa[ddiag[i]];
      rs   = -PetscAbsScalar(d) - PetscRealPart(d);
      vtmp = aa + ai[i];
      nz   = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(vtmp[j]);
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }

  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(n + 1, &rtmp));
  PetscCall(PetscArrayzero(rtmp, n + 1));
  ics = ic;

#if defined(MV)
  sctx.shift_top      = 0.;
  sctx.nshift_max     = 0;
  sctx.shift_lo       = 0.;
  sctx.shift_hi       = 0.;
  sctx.shift_fraction = 0.;

  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
    sctx.shift_top = 0.;
    for (i = 0; i < n; i++) {
      /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
      d  = aa[diag[i]];
      rs = -PetscAbsScalar(d) - PetscRealPart(d);
      v  = aa + ai[i];
      nz = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    if (sctx.shift_top < info->zeropivot) sctx.shift_top = info->zeropivot;
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }

  sctx.shift_amount = 0.;
  sctx.nshift       = 0;
#endif

  do {
    sctx.newshift = PETSC_FALSE;
    for (i = 0; i < n; i++) {
      /* load in initial unfactored row */
      nz    = ai[r[i] + 1] - ai[r[i]];
      ajtmp = aj + ai[r[i]];
      v     = aa + ai[r[i]];
      /* sort permuted ajtmp and values v accordingly */
      for (j = 0; j < nz; j++) ajtmp[j] = ics[ajtmp[j]];
      PetscCall(PetscSortIntWithScalarArray(nz, ajtmp, v));

      diag[r[i]] = ai[r[i]];
      for (j = 0; j < nz; j++) {
        rtmp[ajtmp[j]] = v[j];
        if (ajtmp[j] < i) diag[r[i]]++; /* update a->diag */
      }
      rtmp[r[i]] += sctx.shift_amount; /* shift the diagonal of the matrix */

      row = *ajtmp++;
      while (row < i) {
        pc = rtmp + row;
        if (*pc != 0.0) {
          pv = aa + diag[r[row]];
          pj = aj + diag[r[row]] + 1;

          multiplier = *pc / *pv++;
          *pc        = multiplier;
          nz         = ai[r[row] + 1] - diag[r[row]] - 1;
          for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
          PetscCall(PetscLogFlops(1 + 2.0 * nz));
        }
        row = *ajtmp++;
      }
      /* finished row so overwrite it onto aa */
      pv     = aa + ai[r[i]];
      pj     = aj + ai[r[i]];
      nz     = ai[r[i] + 1] - ai[r[i]];
      nbdiag = diag[r[i]] - ai[r[i]]; /* num of entries before the diagonal */

      rs = 0.0;
      for (j = 0; j < nz; j++) {
        pv[j] = rtmp[pj[j]];
        if (j != nbdiag) rs += PetscAbsScalar(pv[j]);
      }

      sctx.rs = rs;
      sctx.pv = pv[nbdiag];
      PetscCall(MatPivotCheck(B, A, info, &sctx, i));
      if (sctx.newshift) break;
      pv[nbdiag] = sctx.pv;
    }

    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
      /*
       * if no shift in this attempt & shifting & started shifting & can refine,
       * then try lower shift
       */
      sctx.shift_hi       = sctx.shift_fraction;
      sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
      sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
      sctx.newshift       = PETSC_TRUE;
      sctx.nshift++;
    }
  } while (sctx.newshift);

  /* invert diagonal entries for simpler triangular solves */
  for (i = 0; i < n; i++) aa[diag[r[i]]] = 1.0 / aa[diag[r[i]]];

  PetscCall(MatSeqAIJRestoreArray(A, &aa));
  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  A->ops->solve             = MatSolve_SeqAIJ_InplaceWithPerm;
  A->ops->solveadd          = MatSolveAdd_SeqAIJ_inplace;
  A->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
  A->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;

  A->assembled    = PETSC_TRUE;
  A->preallocated = PETSC_TRUE;

  PetscCall(PetscLogFlops(A->cmap->n));
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactor_SeqAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
{
  Mat C;

  PetscFunctionBegin;
  PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
  PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
  PetscCall(MatLUFactorNumeric(C, A, info));

  A->ops->solve          = C->ops->solve;
  A->ops->solvetranspose = C->ops->solvetranspose;

  PetscCall(MatHeaderMerge(A, &C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqAIJ_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n = A->rmap->n, *vi, *ai = a->i, *aj = a->j;
  PetscInt           nz;
  const PetscInt    *rout, *cout, *r, *c;
  PetscScalar       *x, *tmp, *tmps, sum;
  const PetscScalar *b;
  const MatScalar   *aa, *v;
  const PetscInt    *adiag;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  tmp[0] = b[*r++];
  tmps   = tmp;
  for (i = 1; i < n; i++) {
    v   = aa + ai[i];
    vi  = aj + ai[i];
    nz  = adiag[i] - ai[i];
    sum = b[*r++];
    PetscSparseDenseMinusDot(sum, tmps, v, vi, nz);
    tmp[i] = sum;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + adiag[i] + 1;
    vi  = aj + adiag[i] + 1;
    nz  = ai[i + 1] - adiag[i] - 1;
    sum = tmp[i];
    PetscSparseDenseMinusDot(sum, tmps, v, vi, nz);
    x[*c--] = tmp[i] = sum * aa[adiag[i]];
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatSolve_SeqAIJ_inplace(Mat A, Mat B, Mat X)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n = A->rmap->n, *vi, *ai = a->i, *aj = a->j;
  PetscInt           nz, neq, ldb, ldx;
  const PetscInt    *rout, *cout, *r, *c;
  PetscScalar       *x, *tmp = a->solve_work, *tmps, sum;
  const PetscScalar *b, *aa, *v;
  PetscBool          isdense;
  const PetscInt    *adiag;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
  PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
  if (X != B) {
    PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
    PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
  }
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(MatDenseGetArrayRead(B, &b));
  PetscCall(MatDenseGetLDA(B, &ldb));
  PetscCall(MatDenseGetArray(X, &x));
  PetscCall(MatDenseGetLDA(X, &ldx));
  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;
  for (neq = 0; neq < B->cmap->n; neq++) {
    /* forward solve the lower triangular */
    tmp[0] = b[r[0]];
    tmps   = tmp;
    for (i = 1; i < n; i++) {
      v   = aa + ai[i];
      vi  = aj + ai[i];
      nz  = adiag[i] - ai[i];
      sum = b[r[i]];
      PetscSparseDenseMinusDot(sum, tmps, v, vi, nz);
      tmp[i] = sum;
    }
    /* backward solve the upper triangular */
    for (i = n - 1; i >= 0; i--) {
      v   = aa + adiag[i] + 1;
      vi  = aj + adiag[i] + 1;
      nz  = ai[i + 1] - adiag[i] - 1;
      sum = tmp[i];
      PetscSparseDenseMinusDot(sum, tmps, v, vi, nz);
      x[c[i]] = tmp[i] = sum * aa[adiag[i]];
    }
    b += ldb;
    x += ldx;
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatDenseRestoreArrayRead(B, &b));
  PetscCall(MatDenseRestoreArray(X, &x));
  PetscCall(PetscLogFlops(B->cmap->n * (2.0 * a->nz - n)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMatSolve_SeqAIJ(Mat A, Mat B, Mat X)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n = A->rmap->n, *vi, *ai = a->i, *aj = a->j;
  const PetscInt    *adiag;
  PetscInt           nz, neq, ldb, ldx;
  const PetscInt    *rout, *cout, *r, *c;
  PetscScalar       *x, *tmp = a->solve_work, sum;
  const PetscScalar *b, *aa, *v;
  PetscBool          isdense;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
  PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
  if (X != B) {
    PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
    PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
  }
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(MatDenseGetArrayRead(B, &b));
  PetscCall(MatDenseGetLDA(B, &ldb));
  PetscCall(MatDenseGetArray(X, &x));
  PetscCall(MatDenseGetLDA(X, &ldx));
  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;
  for (neq = 0; neq < B->cmap->n; neq++) {
    /* forward solve the lower triangular */
    tmp[0] = b[r[0]];
    v      = aa;
    vi     = aj;
    for (i = 1; i < n; i++) {
      nz  = ai[i + 1] - ai[i];
      sum = b[r[i]];
      PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
      tmp[i] = sum;
      v += nz;
      vi += nz;
    }
    /* backward solve the upper triangular */
    for (i = n - 1; i >= 0; i--) {
      v   = aa + adiag[i + 1] + 1;
      vi  = aj + adiag[i + 1] + 1;
      nz  = adiag[i] - adiag[i + 1] - 1;
      sum = tmp[i];
      PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
      x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
    }
    b += ldb;
    x += ldx;
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatDenseRestoreArrayRead(B, &b));
  PetscCall(MatDenseRestoreArray(X, &x));
  PetscCall(PetscLogFlops(B->cmap->n * (2.0 * a->nz - n)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMatSolveTranspose_SeqAIJ(Mat A, Mat B, Mat X)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n = A->rmap->n, *vi, *ai = a->i, *aj = a->j, j;
  const PetscInt    *adiag = a->diag;
  PetscInt           nz, neq, ldb, ldx;
  const PetscInt    *rout, *cout, *r, *c;
  PetscScalar       *x, *tmp = a->solve_work, s1;
  const PetscScalar *b, *aa, *v;
  PetscBool          isdense;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
  PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
  if (X != B) {
    PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
    PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
  }
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(MatDenseGetArrayRead(B, &b));
  PetscCall(MatDenseGetLDA(B, &ldb));
  PetscCall(MatDenseGetArray(X, &x));
  PetscCall(MatDenseGetLDA(X, &ldx));
  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;
  for (neq = 0; neq < B->cmap->n; neq++) {
    /* copy the b into temp work space according to permutation */
    for (i = 0; i < n; i++) tmp[i] = b[c[i]];

    /* forward solve the U^T */
    for (i = 0; i < n; i++) {
      v  = aa + adiag[i + 1] + 1;
      vi = aj + adiag[i + 1] + 1;
      nz = adiag[i] - adiag[i + 1] - 1;
      s1 = tmp[i];
      s1 *= v[nz]; /* multiply by inverse of diagonal entry */
      for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
      tmp[i] = s1;
    }

    /* backward solve the L^T */
    for (i = n - 1; i >= 0; i--) {
      v  = aa + ai[i];
      vi = aj + ai[i];
      nz = ai[i + 1] - ai[i];
      s1 = tmp[i];
      for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
    }

    /* copy tmp into x according to permutation */
    for (i = 0; i < n; i++) x[r[i]] = tmp[i];
    b += ldb;
    x += ldx;
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatDenseRestoreArrayRead(B, &b));
  PetscCall(MatDenseRestoreArray(X, &x));
  PetscCall(PetscLogFlops(B->cmap->n * (2.0 * a->nz - n)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJ_InplaceWithPerm(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout, *adiag;
  PetscInt           i, n = A->rmap->n, *vi, *ai = a->i, *aj = a->j;
  PetscInt           nz, row;
  PetscScalar       *x, *tmp, *tmps, sum;
  const PetscScalar *b;
  const MatScalar   *aa, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  tmp[0] = b[*r++];
  tmps   = tmp;
  for (row = 1; row < n; row++) {
    i   = rout[row]; /* permuted row */
    v   = aa + ai[i];
    vi  = aj + ai[i];
    nz  = adiag[i] - ai[i];
    sum = b[*r++];
    PetscSparseDenseMinusDot(sum, tmps, v, vi, nz);
    tmp[row] = sum;
  }

  /* backward solve the upper triangular */
  for (row = n - 1; row >= 0; row--) {
    i   = rout[row]; /* permuted row */
    v   = aa + adiag[i] + 1;
    vi  = aj + adiag[i] + 1;
    nz  = ai[i + 1] - adiag[i] - 1;
    sum = tmp[row];
    PetscSparseDenseMinusDot(sum, tmps, v, vi, nz);
    x[*c--] = tmp[row] = sum * aa[adiag[i]];
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/aij/seq/ftn-kernels/fsolve.h>
static PetscErrorCode MatSolve_SeqAIJ_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
  PetscInt           n  = A->rmap->n;
  const PetscInt    *ai = a->i, *aj = a->j, *adiag;
  PetscScalar       *x;
  const PetscScalar *b;
  const MatScalar   *aa;
#if !defined(PETSC_USE_FORTRAN_KERNEL_SOLVEAIJ)
  PetscInt         adiag_i, i, nz, ai_i;
  const PetscInt  *vi;
  const MatScalar *v;
  PetscScalar      sum;
#endif

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEAIJ)
  fortransolveaij_(&n, x, ai, aj, adiag, aa, b);
#else
  /* forward solve the lower triangular */
  x[0] = b[0];
  for (i = 1; i < n; i++) {
    ai_i = ai[i];
    v    = aa + ai_i;
    vi   = aj + ai_i;
    nz   = adiag[i] - ai_i;
    sum  = b[i];
    PetscSparseDenseMinusDot(sum, x, v, vi, nz);
    x[i] = sum;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    adiag_i = adiag[i];
    v       = aa + adiag_i + 1;
    vi      = aj + adiag_i + 1;
    nz      = ai[i + 1] - adiag_i - 1;
    sum     = x[i];
    PetscSparseDenseMinusDot(sum, x, v, vi, nz);
    x[i] = sum * aa[adiag_i];
  }
#endif
  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveAdd_SeqAIJ_inplace(Mat A, Vec bb, Vec yy, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n                  = A->rmap->n, j;
  PetscInt           nz;
  const PetscInt    *rout, *cout, *r, *c, *vi, *ai = a->i, *aj = a->j, *adiag;
  PetscScalar       *x, *tmp, sum;
  const PetscScalar *b;
  const MatScalar   *aa, *v;

  PetscFunctionBegin;
  if (yy != xx) PetscCall(VecCopy(yy, xx));

  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  tmp[0] = b[*r++];
  for (i = 1; i < n; i++) {
    v   = aa + ai[i];
    vi  = aj + ai[i];
    nz  = adiag[i] - ai[i];
    sum = b[*r++];
    for (j = 0; j < nz; j++) sum -= v[j] * tmp[vi[j]];
    tmp[i] = sum;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + adiag[i] + 1;
    vi  = aj + adiag[i] + 1;
    nz  = ai[i + 1] - adiag[i] - 1;
    sum = tmp[i];
    for (j = 0; j < nz; j++) sum -= v[j] * tmp[vi[j]];
    tmp[i] = sum * aa[adiag[i]];
    x[*c--] += tmp[i];
  }

  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolveAdd_SeqAIJ(Mat A, Vec bb, Vec yy, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n                  = A->rmap->n, j;
  PetscInt           nz;
  const PetscInt    *rout, *cout, *r, *c, *vi, *ai = a->i, *aj = a->j, *adiag;
  PetscScalar       *x, *tmp, sum;
  const PetscScalar *b;
  const MatScalar   *aa, *v;

  PetscFunctionBegin;
  if (yy != xx) PetscCall(VecCopy(yy, xx));

  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  tmp[0] = b[r[0]];
  v      = aa;
  vi     = aj;
  for (i = 1; i < n; i++) {
    nz  = ai[i + 1] - ai[i];
    sum = b[r[i]];
    for (j = 0; j < nz; j++) sum -= v[j] * tmp[vi[j]];
    tmp[i] = sum;
    v += nz;
    vi += nz;
  }

  /* backward solve the upper triangular */
  v  = aa + adiag[n - 1];
  vi = aj + adiag[n - 1];
  for (i = n - 1; i >= 0; i--) {
    nz  = adiag[i] - adiag[i + 1] - 1;
    sum = tmp[i];
    for (j = 0; j < nz; j++) sum -= v[j] * tmp[vi[j]];
    tmp[i] = sum * v[nz];
    x[c[i]] += tmp[i];
    v += nz + 1;
    vi += nz + 1;
  }

  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolveTranspose_SeqAIJ_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *rout, *cout, *r, *c, *diag = a->diag, *ai = a->i, *aj = a->j, *vi;
  PetscInt           i, n = A->rmap->n, j;
  PetscInt           nz;
  PetscScalar       *x, *tmp, s1;
  const MatScalar   *aa, *v;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* copy the b into temp work space according to permutation */
  for (i = 0; i < n; i++) tmp[i] = b[c[i]];

  /* forward solve the U^T */
  for (i = 0; i < n; i++) {
    v  = aa + diag[i];
    vi = aj + diag[i] + 1;
    nz = ai[i + 1] - diag[i] - 1;
    s1 = tmp[i];
    s1 *= (*v++); /* multiply by inverse of diagonal entry */
    for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
    tmp[i] = s1;
  }

  /* backward solve the L^T */
  for (i = n - 1; i >= 0; i--) {
    v  = aa + diag[i] - 1;
    vi = aj + diag[i] - 1;
    nz = diag[i] - ai[i];
    s1 = tmp[i];
    for (j = 0; j > -nz; j--) tmp[vi[j]] -= s1 * v[j];
  }

  /* copy tmp into x according to permutation */
  for (i = 0; i < n; i++) x[r[i]] = tmp[i];

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));

  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolveTranspose_SeqAIJ(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *rout, *cout, *r, *c, *adiag, *ai = a->i, *aj = a->j, *vi;
  PetscInt           i, n = A->rmap->n, j;
  PetscInt           nz;
  PetscScalar       *x, *tmp, s1;
  const MatScalar   *aa, *v;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* copy the b into temp work space according to permutation */
  for (i = 0; i < n; i++) tmp[i] = b[c[i]];

  /* forward solve the U^T */
  for (i = 0; i < n; i++) {
    v  = aa + adiag[i + 1] + 1;
    vi = aj + adiag[i + 1] + 1;
    nz = adiag[i] - adiag[i + 1] - 1;
    s1 = tmp[i];
    s1 *= v[nz]; /* multiply by inverse of diagonal entry */
    for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
    tmp[i] = s1;
  }

  /* backward solve the L^T */
  for (i = n - 1; i >= 0; i--) {
    v  = aa + ai[i];
    vi = aj + ai[i];
    nz = ai[i + 1] - ai[i];
    s1 = tmp[i];
    for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
  }

  /* copy tmp into x according to permutation */
  for (i = 0; i < n; i++) x[r[i]] = tmp[i];

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));

  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolveTransposeAdd_SeqAIJ_inplace(Mat A, Vec bb, Vec zz, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *rout, *cout, *r, *c, *diag = a->diag, *ai = a->i, *aj = a->j, *vi;
  PetscInt           i, n = A->rmap->n, j;
  PetscInt           nz;
  PetscScalar       *x, *tmp, s1;
  const MatScalar   *aa, *v;
  const PetscScalar *b;

  PetscFunctionBegin;
  if (zz != xx) PetscCall(VecCopy(zz, xx));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* copy the b into temp work space according to permutation */
  for (i = 0; i < n; i++) tmp[i] = b[c[i]];

  /* forward solve the U^T */
  for (i = 0; i < n; i++) {
    v  = aa + diag[i];
    vi = aj + diag[i] + 1;
    nz = ai[i + 1] - diag[i] - 1;
    s1 = tmp[i];
    s1 *= (*v++); /* multiply by inverse of diagonal entry */
    for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
    tmp[i] = s1;
  }

  /* backward solve the L^T */
  for (i = n - 1; i >= 0; i--) {
    v  = aa + diag[i] - 1;
    vi = aj + diag[i] - 1;
    nz = diag[i] - ai[i];
    s1 = tmp[i];
    for (j = 0; j > -nz; j--) tmp[vi[j]] -= s1 * v[j];
  }

  /* copy tmp into x according to permutation */
  for (i = 0; i < n; i++) x[r[i]] += tmp[i];

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));

  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolveTransposeAdd_SeqAIJ(Mat A, Vec bb, Vec zz, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *rout, *cout, *r, *c, *adiag, *ai = a->i, *aj = a->j, *vi;
  PetscInt           i, n = A->rmap->n, j;
  PetscInt           nz;
  PetscScalar       *x, *tmp, s1;
  const MatScalar   *aa, *v;
  const PetscScalar *b;

  PetscFunctionBegin;
  if (zz != xx) PetscCall(VecCopy(zz, xx));
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* copy the b into temp work space according to permutation */
  for (i = 0; i < n; i++) tmp[i] = b[c[i]];

  /* forward solve the U^T */
  for (i = 0; i < n; i++) {
    v  = aa + adiag[i + 1] + 1;
    vi = aj + adiag[i + 1] + 1;
    nz = adiag[i] - adiag[i + 1] - 1;
    s1 = tmp[i];
    s1 *= v[nz]; /* multiply by inverse of diagonal entry */
    for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
    tmp[i] = s1;
  }

  /* backward solve the L^T */
  for (i = n - 1; i >= 0; i--) {
    v  = aa + ai[i];
    vi = aj + ai[i];
    nz = ai[i + 1] - ai[i];
    s1 = tmp[i];
    for (j = 0; j < nz; j++) tmp[vi[j]] -= s1 * v[j];
  }

  /* copy tmp into x according to permutation */
  for (i = 0; i < n; i++) x[r[i]] += tmp[i];

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));

  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   ilu() under revised new data structure.
   Factored arrays bj and ba are stored as
     L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)

   bi=fact->i is an array of size n+1, in which
     bi[i]:  points to 1st entry of L(i,:),i=0,...,n-1
     bi[n]:  points to L(n-1,n-1)+1

  bdiag=fact->diag is an array of size n+1,in which
     bdiag[i]: points to diagonal of U(i,:), i=0,...,n-1
     bdiag[n]: points to entry of U(n-1,0)-1

   U(i,:) contains bdiag[i] as its last entry, i.e.,
    U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b;
  const PetscInt n = A->rmap->n, *ai = a->i, *aj, *adiag;
  PetscInt       i, j, k = 0, nz, *bi, *bj, *bdiag;
  IS             isicol;

  PetscFunctionBegin;
  PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE));
  b = (Mat_SeqAIJ *)fact->data;

  /* allocate matrix arrays for new data structure */
  PetscCall(PetscShmgetAllocateArray(ai[n], sizeof(PetscScalar), (void **)&b->a));
  PetscCall(PetscShmgetAllocateArray(ai[n], sizeof(PetscInt), (void **)&b->j));
  PetscCall(PetscShmgetAllocateArray(n + 1, sizeof(PetscInt), (void **)&b->i));
  if (n > 0) PetscCall(PetscArrayzero(b->a, ai[n]));
  b->free_a  = PETSC_TRUE;
  b->free_ij = PETSC_TRUE;

  if (!b->diag) PetscCall(PetscMalloc1(n + 1, &b->diag));
  bdiag = b->diag;

  /* set bi and bj with new data structure */
  bi = b->i;
  bj = b->j;

  /* L part */
  bi[0] = 0;
  for (i = 0; i < n; i++) {
    nz        = adiag[i] - ai[i];
    bi[i + 1] = bi[i] + nz;
    aj        = a->j + ai[i];
    for (j = 0; j < nz; j++) bj[k++] = aj[j];
  }

  /* U part */
  bdiag[n] = bi[n] - 1;
  for (i = n - 1; i >= 0; i--) {
    nz = ai[i + 1] - adiag[i] - 1;
    aj = a->j + adiag[i] + 1;
    for (j = 0; j < nz; j++) bj[k++] = aj[j];
    /* diag[i] */
    bj[k++]  = i;
    bdiag[i] = bdiag[i + 1] + nz + 1;
  }

  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;
  fact->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ;
  PetscCall(MatSeqAIJCheckInode_FactorLU(fact));

  b       = (Mat_SeqAIJ *)fact->data;
  b->row  = isrow;
  b->col  = iscol;
  b->icol = isicol;
  PetscCall(PetscMalloc1(fact->rmap->n, &b->solve_work));
  PetscCall(PetscObjectReference((PetscObject)isrow));
  PetscCall(PetscObjectReference((PetscObject)iscol));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatILUFactorSymbolic_SeqAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b;
  IS                 isicol;
  const PetscInt    *r, *ic;
  PetscInt           n = A->rmap->n, *ai = a->i, *aj = a->j;
  PetscInt          *bi, *cols, nnz, *cols_lvl;
  PetscInt          *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0;
  PetscInt           i, levels, diagonal_fill;
  PetscBool          col_identity, row_identity;
  PetscReal          f;
  PetscInt           nlnk, *lnk, *lnk_lvl = NULL;
  PetscBT            lnkbt;
  PetscInt           nzi, *bj, **bj_ptr, **bjlvl_ptr;
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
  PetscBool          diagDense;

  PetscFunctionBegin;
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
  PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");

  levels = (PetscInt)info->levels;
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!levels && row_identity && col_identity) {
    /* special case: ilu(0) with natural ordering */
    PetscCall(MatILUFactorSymbolic_SeqAIJ_ilu0(fact, A, isrow, iscol, info));
    if (a->inode.size_csr) fact->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));

  /* get new row and diagonal pointers, must be allocated separately because they will be given to the Mat_SeqAIJ and freed separately */
  PetscCall(PetscShmgetAllocateArray(n + 1, sizeof(PetscInt), (void **)&bi));
  PetscCall(PetscMalloc1(n + 1, &bdiag));
  bi[0] = bdiag[0] = 0;
  PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr));

  /* create a linked list for storing column indices of the active row */
  nlnk = n + 1;
  PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt));

  /* initial FreeSpace size is f*(ai[n]+1) */
  f             = info->fill;
  diagonal_fill = (PetscInt)info->diagonal_fill;
  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space));
  current_space = free_space;
  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl));
  current_space_lvl = free_space_lvl;
  for (i = 0; i < n; i++) {
    nzi = 0;
    /* copy current row into linked list */
    nnz = ai[r[i] + 1] - ai[r[i]];
    PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
    cols   = aj + ai[r[i]];
    lnk[i] = -1; /* marker to indicate if diagonal exists */
    PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt));
    nzi += nlnk;

    /* make sure diagonal entry is included */
    if (diagonal_fill && lnk[i] == -1) {
      fm = n;
      while (lnk[fm] < i) fm = lnk[fm];
      lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
      lnk[fm]    = i;
      lnk_lvl[i] = 0;
      nzi++;
      dcount++;
    }

    /* add pivot rows into the active row */
    nzbd = 0;
    prow = lnk[n];
    while (prow < i) {
      nnz      = bdiag[prow];
      cols     = bj_ptr[prow] + nnz + 1;
      cols_lvl = bjlvl_ptr[prow] + nnz + 1;
      nnz      = bi[prow + 1] - bi[prow] - nnz - 1;
      PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow));
      nzi += nlnk;
      prow = lnk[prow];
      nzbd++;
    }
    bdiag[i]  = nzbd;
    bi[i + 1] = bi[i] + nzi;
    /* if free space is not available, make more free space */
    if (current_space->local_remaining < nzi) {
      nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, n - i)); /* estimated and max additional space needed */
      PetscCall(PetscFreeSpaceGet(nnz, &current_space));
      PetscCall(PetscFreeSpaceGet(nnz, &current_space_lvl));
      reallocs++;
    }

    /* copy data into free_space and free_space_lvl, then initialize lnk */
    PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
    bj_ptr[i]    = current_space->array;
    bjlvl_ptr[i] = current_space_lvl->array;

    /* make sure the active row i has diagonal entry */
    PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix, try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i);

    current_space->array += nzi;
    current_space->local_used += nzi;
    current_space->local_remaining -= nzi;
    current_space_lvl->array += nzi;
    current_space_lvl->local_used += nzi;
    current_space_lvl->local_remaining -= nzi;
  }

  PetscCall(ISRestoreIndices(isrow, &r));
  PetscCall(ISRestoreIndices(isicol, &ic));
  /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
  PetscCall(PetscShmgetAllocateArray(bi[n], sizeof(PetscInt), (void **)&bj));
  PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag));

  PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
  PetscCall(PetscFree2(bj_ptr, bjlvl_ptr));

#if defined(PETSC_USE_INFO)
  {
    PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
    PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
    PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af));
    PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af));
    PetscCall(PetscInfo(A, "for best performance.\n"));
    if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount));
  }
#endif
  /* put together the new matrix */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(fact, MAT_SKIP_ALLOCATION, NULL));
  b          = (Mat_SeqAIJ *)fact->data;
  b->free_ij = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(bdiag[0] + 1, sizeof(PetscScalar), (void **)&b->a));
  b->free_a = PETSC_TRUE;
  b->j      = bj;
  b->i      = bi;
  b->diag   = bdiag;
  b->ilen   = NULL;
  b->imax   = NULL;
  b->row    = isrow;
  b->col    = iscol;
  PetscCall(PetscObjectReference((PetscObject)isrow));
  PetscCall(PetscObjectReference((PetscObject)iscol));
  b->icol = isicol;

  PetscCall(PetscMalloc1(n, &b->solve_work));
  /* In b structure:  Free imax, ilen, old a, old j.
     Allocate bdiag, solve_work, new a, new j */
  b->maxnz = b->nz = bdiag[0] + 1;

  fact->info.factor_mallocs    = reallocs;
  fact->info.fill_ratio_given  = f;
  fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
  fact->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ;
  if (a->inode.size_csr) fact->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
  PetscCall(MatSeqAIJCheckInode_FactorLU(fact));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCholeskyFactorNumeric_SeqAIJ(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat              C  = B;
  Mat_SeqAIJ      *a  = (Mat_SeqAIJ *)A->data;
  Mat_SeqSBAIJ    *b  = (Mat_SeqSBAIJ *)C->data;
  IS               ip = b->row, iip = b->icol;
  const PetscInt  *rip, *riip;
  PetscInt         i, j, mbs = A->rmap->n, *bi = b->i, *bj = b->j, *bdiag = b->diag, *bjtmp;
  PetscInt        *ai = a->i, *aj = a->j;
  PetscInt         k, jmin, jmax, *c2r, *il, col, nexti, ili, nz;
  MatScalar       *rtmp, *ba = b->a, *bval, dk, uikdi;
  PetscBool        perm_identity;
  FactorShiftCtx   sctx;
  PetscReal        rs;
  const MatScalar *aa, *v;
  MatScalar        d;
  const PetscInt  *adiag;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
    sctx.shift_top = info->zeropivot;
    for (i = 0; i < mbs; i++) {
      /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
      d  = aa[adiag[i]];
      rs = -PetscAbsScalar(d) - PetscRealPart(d);
      v  = aa + ai[i];
      nz = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }

  PetscCall(ISGetIndices(ip, &rip));
  PetscCall(ISGetIndices(iip, &riip));

  /* allocate working arrays
     c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
     il:  for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
  */
  PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &c2r));

  do {
    sctx.newshift = PETSC_FALSE;

    for (i = 0; i < mbs; i++) c2r[i] = mbs;
    if (mbs) il[0] = 0;

    for (k = 0; k < mbs; k++) {
      /* zero rtmp */
      nz    = bi[k + 1] - bi[k];
      bjtmp = bj + bi[k];
      for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

      /* load in initial unfactored row */
      bval = ba + bi[k];
      jmin = ai[rip[k]];
      jmax = ai[rip[k] + 1];
      for (j = jmin; j < jmax; j++) {
        col = riip[aj[j]];
        if (col >= k) { /* only take upper triangular entry */
          rtmp[col] = aa[j];
          *bval++   = 0.0; /* for in-place factorization */
        }
      }
      /* shift the diagonal of the matrix: ZeropivotApply() */
      rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */

      /* modify k-th row by adding in those rows i with U(i,k)!=0 */
      dk = rtmp[k];
      i  = c2r[k]; /* first row to be added to k_th row  */

      while (i < k) {
        nexti = c2r[i]; /* next row to be added to k_th row */

        /* compute multiplier, update diag(k) and U(i,k) */
        ili   = il[i];                   /* index of first nonzero element in U(i,k:bms-1) */
        uikdi = -ba[ili] * ba[bdiag[i]]; /* diagonal(k) */
        dk += uikdi * ba[ili];           /* update diag[k] */
        ba[ili] = uikdi;                 /* -U(i,k) */

        /* add multiple of row i to k-th row */
        jmin = ili + 1;
        jmax = bi[i + 1];
        if (jmin < jmax) {
          for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
          /* update il and c2r for row i */
          il[i]  = jmin;
          j      = bj[jmin];
          c2r[i] = c2r[j];
          c2r[j] = i;
        }
        i = nexti;
      }

      /* copy data into U(k,:) */
      rs   = 0.0;
      jmin = bi[k];
      jmax = bi[k + 1] - 1;
      if (jmin < jmax) {
        for (j = jmin; j < jmax; j++) {
          col   = bj[j];
          ba[j] = rtmp[col];
          rs += PetscAbsScalar(ba[j]);
        }
        /* add the k-th row into il and c2r */
        il[k]  = jmin;
        i      = bj[jmin];
        c2r[k] = c2r[i];
        c2r[i] = k;
      }

      /* MatPivotCheck() */
      sctx.rs = rs;
      sctx.pv = dk;
      PetscCall(MatPivotCheck(B, A, info, &sctx, i));
      if (sctx.newshift) break;
      dk = sctx.pv;

      ba[bdiag[k]] = 1.0 / dk; /* U(k,k) */
    }
  } while (sctx.newshift);

  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(PetscFree3(rtmp, il, c2r));
  PetscCall(ISRestoreIndices(ip, &rip));
  PetscCall(ISRestoreIndices(iip, &riip));

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering;
    B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
    B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
    B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqSBAIJ_1;
    B->ops->solvetranspose = MatSolve_SeqSBAIJ_1;
    B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1;
    B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1;
  }

  C->assembled    = PETSC_TRUE;
  C->preallocated = PETSC_TRUE;

  PetscCall(PetscLogFlops(C->rmap->n));

  /* MatPivotView() */
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
      PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCholeskyFactorNumeric_SeqAIJ_inplace(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat              C  = B;
  Mat_SeqAIJ      *a  = (Mat_SeqAIJ *)A->data;
  Mat_SeqSBAIJ    *b  = (Mat_SeqSBAIJ *)C->data;
  IS               ip = b->row, iip = b->icol;
  const PetscInt  *rip, *riip;
  PetscInt         i, j, mbs = A->rmap->n, *bi = b->i, *bj = b->j, *bcol, *bjtmp;
  PetscInt        *ai = a->i, *aj = a->j;
  PetscInt         k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
  MatScalar       *rtmp, *ba = b->a, *bval, dk, uikdi;
  const MatScalar *aa, *v;
  PetscBool        perm_identity;
  FactorShiftCtx   sctx;
  PetscReal        rs;
  MatScalar        d;
  const PetscInt  *adiag;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
    sctx.shift_top = info->zeropivot;
    for (i = 0; i < mbs; i++) {
      /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
      d  = aa[adiag[i]];
      rs = -PetscAbsScalar(d) - PetscRealPart(d);
      v  = aa + ai[i];
      nz = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }

  PetscCall(ISGetIndices(ip, &rip));
  PetscCall(ISGetIndices(iip, &riip));

  /* initialization */
  PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));

  do {
    sctx.newshift = PETSC_FALSE;

    for (i = 0; i < mbs; i++) jl[i] = mbs;
    il[0] = 0;

    for (k = 0; k < mbs; k++) {
      /* zero rtmp */
      nz    = bi[k + 1] - bi[k];
      bjtmp = bj + bi[k];
      for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

      bval = ba + bi[k];
      /* initialize k-th row by the perm[k]-th row of A */
      jmin = ai[rip[k]];
      jmax = ai[rip[k] + 1];
      for (j = jmin; j < jmax; j++) {
        col = riip[aj[j]];
        if (col >= k) { /* only take upper triangular entry */
          rtmp[col] = aa[j];
          *bval++   = 0.0; /* for in-place factorization */
        }
      }
      /* shift the diagonal of the matrix */
      if (sctx.nshift) rtmp[k] += sctx.shift_amount;

      /* modify k-th row by adding in those rows i with U(i,k)!=0 */
      dk = rtmp[k];
      i  = jl[k]; /* first row to be added to k_th row  */

      while (i < k) {
        nexti = jl[i]; /* next row to be added to k_th row */

        /* compute multiplier, update diag(k) and U(i,k) */
        ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
        uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
        dk += uikdi * ba[ili];
        ba[ili] = uikdi; /* -U(i,k) */

        /* add multiple of row i to k-th row */
        jmin = ili + 1;
        jmax = bi[i + 1];
        if (jmin < jmax) {
          for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
          /* update il and jl for row i */
          il[i] = jmin;
          j     = bj[jmin];
          jl[i] = jl[j];
          jl[j] = i;
        }
        i = nexti;
      }

      /* shift the diagonals when zero pivot is detected */
      /* compute rs=sum of abs(off-diagonal) */
      rs   = 0.0;
      jmin = bi[k] + 1;
      nz   = bi[k + 1] - jmin;
      bcol = bj + jmin;
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(rtmp[bcol[j]]);

      sctx.rs = rs;
      sctx.pv = dk;
      PetscCall(MatPivotCheck(B, A, info, &sctx, k));
      if (sctx.newshift) break;
      dk = sctx.pv;

      /* copy data into U(k,:) */
      ba[bi[k]] = 1.0 / dk; /* U(k,k) */
      jmin      = bi[k] + 1;
      jmax      = bi[k + 1];
      if (jmin < jmax) {
        for (j = jmin; j < jmax; j++) {
          col   = bj[j];
          ba[j] = rtmp[col];
        }
        /* add the k-th row into il and jl */
        il[k] = jmin;
        i     = bj[jmin];
        jl[k] = jl[i];
        jl[i] = k;
      }
    }
  } while (sctx.newshift);

  PetscCall(PetscFree3(rtmp, il, jl));
  PetscCall(ISRestoreIndices(ip, &rip));
  PetscCall(ISRestoreIndices(iip, &riip));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
    B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
    B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
    B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
  } else {
    B->ops->solve          = MatSolve_SeqSBAIJ_1_inplace;
    B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
    B->ops->forwardsolve   = MatForwardSolve_SeqSBAIJ_1_inplace;
    B->ops->backwardsolve  = MatBackwardSolve_SeqSBAIJ_1_inplace;
  }

  C->assembled    = PETSC_TRUE;
  C->preallocated = PETSC_TRUE;

  PetscCall(PetscLogFlops(C->rmap->n));
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   icc() under revised new data structure.
   Factored arrays bj and ba are stored as
     U(0,:),...,U(i,:),U(n-1,:)

   ui=fact->i is an array of size n+1, in which
   ui+
     ui[i]:  points to 1st entry of U(i,:),i=0,...,n-1
     ui[n]:  points to U(n-1,n-1)+1

  udiag=fact->diag is an array of size n,in which
     udiag[i]: points to diagonal of U(i,:), i=0,...,n-1

   U(i,:) contains udiag[i] as its last entry, i.e.,
    U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
*/

PetscErrorCode MatICCFactorSymbolic_SeqAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqSBAIJ      *b;
  PetscBool          perm_identity;
  PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = A->rmap->n, *ui, *udiag;
  const PetscInt    *rip, *riip, *adiag;
  PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
  PetscInt           nlnk, *lnk, *lnk_lvl = NULL;
  PetscInt           ncols, ncols_upper, *cols, *ajtmp, *uj, **uj_ptr, **uj_lvl_ptr;
  PetscReal          fill = info->fill, levels = info->levels;
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
  PetscBT            lnkbt;
  IS                 iperm;
  PetscBool          diagDense;

  PetscFunctionBegin;
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, &diagDense));
  PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
  PetscCall(ISIdentity(perm, &perm_identity));
  PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &iperm));

  PetscCall(PetscShmgetAllocateArray(am + 1, sizeof(PetscInt), (void **)&ui));
  PetscCall(PetscMalloc1(am + 1, &udiag));
  ui[0] = 0;

  /* ICC(0) without matrix ordering: simply rearrange column indices */
  if (!levels && perm_identity) {
    for (i = 0; i < am; i++) {
      ncols     = ai[i + 1] - adiag[i];
      ui[i + 1] = ui[i] + ncols;
      udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
    }
    PetscCall(PetscMalloc1(ui[am] + 1, &uj));
    cols = uj;
    for (i = 0; i < am; i++) {
      aj    = a->j + adiag[i] + 1; /* 1st entry of U(i,:) without diagonal */
      ncols = ai[i + 1] - adiag[i] - 1;
      for (j = 0; j < ncols; j++) *cols++ = aj[j];
      *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
    }
  } else { /* case: levels>0 || (levels=0 && !perm_identity) */
    PetscCall(ISGetIndices(iperm, &riip));
    PetscCall(ISGetIndices(perm, &rip));

    /* initialization */
    PetscCall(PetscMalloc1(am + 1, &ajtmp));

    /* jl: linked list for storing indices of the pivot rows
       il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
    PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &jl, am, &il));
    for (i = 0; i < am; i++) {
      jl[i] = am;
      il[i] = 0;
    }

    /* create and initialize a linked list for storing column indices of the active row k */
    nlnk = am + 1;
    PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

    /* initial FreeSpace size is fill*(ai[am]+am)/2 */
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, (ai[am] + am) / 2), &free_space));
    current_space = free_space;
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, (ai[am] + am) / 2), &free_space_lvl));
    current_space_lvl = free_space_lvl;

    for (k = 0; k < am; k++) { /* for each active row k */
      /* initialize lnk by the column indices of row rip[k] of A */
      nzk   = 0;
      ncols = ai[rip[k] + 1] - ai[rip[k]];
      PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, rip[k], k);
      ncols_upper = 0;
      for (j = 0; j < ncols; j++) {
        i = *(aj + ai[rip[k]] + j); /* unpermuted column index */
        if (riip[i] >= k) {         /* only take upper triangular entry */
          ajtmp[ncols_upper] = i;
          ncols_upper++;
        }
      }
      PetscCall(PetscIncompleteLLInit(ncols_upper, ajtmp, am, riip, &nlnk, lnk, lnk_lvl, lnkbt));
      nzk += nlnk;

      /* update lnk by computing fill-in for each pivot row to be merged in */
      prow = jl[k]; /* 1st pivot row */

      while (prow < k) {
        nextprow = jl[prow];

        /* merge prow into k-th row */
        jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
        jmax  = ui[prow + 1];
        ncols = jmax - jmin;
        i     = jmin - ui[prow];
        cols  = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
        uj    = uj_lvl_ptr[prow] + i; /* levels of cols */
        j     = *(uj - 1);
        PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
        nzk += nlnk;

        /* update il and jl for prow */
        if (jmin < jmax) {
          il[prow] = jmin;
          j        = *cols;
          jl[prow] = jl[j];
          jl[j]    = prow;
        }
        prow = nextprow;
      }

      /* if free space is not available, make more free space */
      if (current_space->local_remaining < nzk) {
        i = am - k + 1;                                    /* num of unfactored rows */
        i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
        PetscCall(PetscFreeSpaceGet(i, &current_space));
        PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
        reallocs++;
      }

      /* copy data into free_space and free_space_lvl, then initialize lnk */
      PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
      PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

      /* add the k-th row into il and jl */
      if (nzk > 1) {
        i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
        jl[k] = jl[i];
        jl[i] = k;
        il[k] = ui[k] + 1;
      }
      uj_ptr[k]     = current_space->array;
      uj_lvl_ptr[k] = current_space_lvl->array;

      current_space->array += nzk;
      current_space->local_used += nzk;
      current_space->local_remaining -= nzk;

      current_space_lvl->array += nzk;
      current_space_lvl->local_used += nzk;
      current_space_lvl->local_remaining -= nzk;

      ui[k + 1] = ui[k] + nzk;
    }

    PetscCall(ISRestoreIndices(perm, &rip));
    PetscCall(ISRestoreIndices(iperm, &riip));
    PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, jl, il));
    PetscCall(PetscFree(ajtmp));

    /* copy free_space into uj and free free_space; set ui, uj, udiag in new datastructure; */
    PetscCall(PetscShmgetAllocateArray(ui[am] + 1, sizeof(PetscInt), (void **)&uj));
    PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor  */
    PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
    PetscCall(PetscFreeSpaceDestroy(free_space_lvl));

  } /* end of case: levels>0 || (levels=0 && !perm_identity) */

  /* put together the new matrix in MATSEQSBAIJ format */
  b          = (Mat_SeqSBAIJ *)fact->data;
  b->free_ij = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a));
  b->free_a = PETSC_TRUE;
  b->j      = uj;
  b->i      = ui;
  b->diag   = udiag;
  b->ilen   = NULL;
  b->imax   = NULL;
  b->row    = perm;
  b->col    = perm;
  PetscCall(PetscObjectReference((PetscObject)perm));
  PetscCall(PetscObjectReference((PetscObject)perm));
  b->icol          = iperm;
  b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

  PetscCall(PetscMalloc1(am, &b->solve_work));

  b->maxnz = b->nz = ui[am];

  fact->info.factor_mallocs   = reallocs;
  fact->info.fill_ratio_given = fill;
  if (ai[am] != 0) {
    /* nonzeros in lower triangular part of A (including diagonals) = (ai[am]+am)/2 */
    fact->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
  } else {
    fact->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[am] != 0) {
    PetscReal af = fact->info.fill_ratio_needed;
    PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
    PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
    PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
  } else {
    PetscCall(PetscInfo(A, "Empty matrix\n"));
  }
#endif
  fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJ;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqSBAIJ      *b;
  PetscBool          perm_identity;
  PetscReal          fill = info->fill;
  const PetscInt    *rip, *riip;
  PetscInt           i, am = A->rmap->n, *ai = a->i, *aj = a->j, reallocs = 0, prow;
  PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
  PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr, *udiag;
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  PetscBT            lnkbt;
  IS                 iperm;
  PetscBool          diagDense;

  PetscFunctionBegin;
  PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
  PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");

  /* check whether perm is the identity mapping */
  PetscCall(ISIdentity(perm, &perm_identity));
  PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &iperm));
  PetscCall(ISGetIndices(iperm, &riip));
  PetscCall(ISGetIndices(perm, &rip));

  /* initialization */
  PetscCall(PetscShmgetAllocateArray(am + 1, sizeof(PetscInt), (void **)&ui));
  PetscCall(PetscMalloc1(am + 1, &udiag));
  ui[0] = 0;

  /* jl: linked list for storing indices of the pivot rows
     il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
  PetscCall(PetscMalloc4(am, &ui_ptr, am, &jl, am, &il, am, &cols));
  for (i = 0; i < am; i++) {
    jl[i] = am;
    il[i] = 0;
  }

  /* create and initialize a linked list for storing column indices of the active row k */
  nlnk = am + 1;
  PetscCall(PetscLLCreate(am, am, nlnk, lnk, lnkbt));

  /* initial FreeSpace size is fill*(ai[am]+am)/2 */
  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, (ai[am] + am) / 2), &free_space));
  current_space = free_space;

  for (k = 0; k < am; k++) { /* for each active row k */
    /* initialize lnk by the column indices of row rip[k] of A */
    nzk   = 0;
    ncols = ai[rip[k] + 1] - ai[rip[k]];
    PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, rip[k], k);
    ncols_upper = 0;
    for (j = 0; j < ncols; j++) {
      i = riip[*(aj + ai[rip[k]] + j)];
      if (i >= k) { /* only take upper triangular entry */
        cols[ncols_upper] = i;
        ncols_upper++;
      }
    }
    PetscCall(PetscLLAdd(ncols_upper, cols, am, &nlnk, lnk, lnkbt));
    nzk += nlnk;

    /* update lnk by computing fill-in for each pivot row to be merged in */
    prow = jl[k]; /* 1st pivot row */

    while (prow < k) {
      nextprow = jl[prow];
      /* merge prow into k-th row */
      jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
      jmax   = ui[prow + 1];
      ncols  = jmax - jmin;
      uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:am-1) */
      PetscCall(PetscLLAddSorted(ncols, uj_ptr, am, &nlnk, lnk, lnkbt));
      nzk += nlnk;

      /* update il and jl for prow */
      if (jmin < jmax) {
        il[prow] = jmin;
        j        = *uj_ptr;
        jl[prow] = jl[j];
        jl[j]    = prow;
      }
      prow = nextprow;
    }

    /* if free space is not available, make more free space */
    if (current_space->local_remaining < nzk) {
      i = am - k + 1;                                    /* num of unfactored rows */
      i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
      PetscCall(PetscFreeSpaceGet(i, &current_space));
      reallocs++;
    }

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLClean(am, am, nzk, lnk, current_space->array, lnkbt));

    /* add the k-th row into il and jl */
    if (nzk > 1) {
      i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
      jl[k] = jl[i];
      jl[i] = k;
      il[k] = ui[k] + 1;
    }
    ui_ptr[k] = current_space->array;

    current_space->array += nzk;
    current_space->local_used += nzk;
    current_space->local_remaining -= nzk;

    ui[k + 1] = ui[k] + nzk;
  }

  PetscCall(ISRestoreIndices(perm, &rip));
  PetscCall(ISRestoreIndices(iperm, &riip));
  PetscCall(PetscFree4(ui_ptr, jl, il, cols));

  /* copy free_space into uj and free free_space; set ui, uj, udiag in new datastructure; */
  PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscInt), (void **)&uj));
  PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* put together the new matrix in MATSEQSBAIJ format */
  b          = (Mat_SeqSBAIJ *)fact->data;
  b->free_ij = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a));
  b->free_a = PETSC_TRUE;
  b->j      = uj;
  b->i      = ui;
  b->diag   = udiag;
  b->ilen   = NULL;
  b->imax   = NULL;
  b->row    = perm;
  b->col    = perm;

  PetscCall(PetscObjectReference((PetscObject)perm));
  PetscCall(PetscObjectReference((PetscObject)perm));

  b->icol          = iperm;
  b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

  PetscCall(PetscMalloc1(am, &b->solve_work));

  b->maxnz = b->nz = ui[am];

  fact->info.factor_mallocs   = reallocs;
  fact->info.fill_ratio_given = fill;
  if (ai[am] != 0) {
    /* nonzeros in lower triangular part of A (including diagonals) = (ai[am]+am)/2 */
    fact->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
  } else {
    fact->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[am] != 0) {
    PetscReal af = fact->info.fill_ratio_needed;
    PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
    PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
    PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
  } else {
    PetscCall(PetscInfo(A, "Empty matrix\n"));
  }
#endif
  fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJ;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqAIJ_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
  PetscInt           n  = A->rmap->n;
  const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
  PetscScalar       *x, sum;
  const PetscScalar *b;
  const MatScalar   *aa, *v;
  PetscInt           i, nz;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));

  /* forward solve the lower triangular */
  x[0] = b[0];
  v    = aa;
  vi   = aj;
  for (i = 1; i < n; i++) {
    nz  = ai[i + 1] - ai[i];
    sum = b[i];
    PetscSparseDenseMinusDot(sum, x, v, vi, nz);
    v += nz;
    vi += nz;
    x[i] = sum;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + adiag[i + 1] + 1;
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    sum = x[i];
    PetscSparseDenseMinusDot(sum, x, v, vi, nz);
    x[i] = sum * v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
  }

  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqAIJ(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n = A->rmap->n, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
  const PetscInt    *rout, *cout, *r, *c;
  PetscScalar       *x, *tmp, sum;
  const PetscScalar *b;
  const MatScalar   *aa, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(MatSeqAIJGetArrayRead(A, &aa));
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArrayWrite(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  tmp[0] = b[r[0]];
  v      = aa;
  vi     = aj;
  for (i = 1; i < n; i++) {
    nz  = ai[i + 1] - ai[i];
    sum = b[r[i]];
    PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
    tmp[i] = sum;
    v += nz;
    vi += nz;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + adiag[i + 1] + 1;
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    sum = tmp[i];
    PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
    x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArrayWrite(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
