/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
  MatScalar      *rtmp, *pc, *mwork, *pv;
  MatScalar      *aa             = a->a, *v;
  PetscReal       shift          = info->shiftamount;
  const PetscBool allowzeropivot = PetscNot(A->erroriffailure);

  PetscFunctionBegin;
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  for (PetscInt i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    PetscInt *pj;
    PetscInt  nzL, nz = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[r[i] + 1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2 * ai[r[i]];
    for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (PetscInt k = 0; k < nzL; k++) {
      PetscBool flg = PETSC_FALSE;
      PetscInt  row = bjtmp[k];

      pc = rtmp + bs2 * row;
      for (PetscInt j = 0; j < bs2; j++) {
        if (pc[j] != (PetscScalar)0.0) {
          flg = PETSC_TRUE;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
        for (PetscInt j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + 4 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
          pv += 4;
        }
        PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    {
      PetscBool zeropivotdetected;

      PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    }

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }

  PetscCall(PetscFree2(rtmp, mwork));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_2;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt        i, j, k, nz, nzL, row, *pj;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
  MatScalar      *rtmp, *pc, *mwork, *pv;
  MatScalar      *aa = a->a, *v;
  PetscInt        flg;
  PetscReal       shift = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2 * ai[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != (PetscScalar)0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + 4 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
          pv += 4;
        }
        PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    /*
    pv = b->a + bs2*bi[2*n-i];
    pj = b->j + bi[2*n-i];
    nz = bi[2*n-i+1] - bi[2*n-i] - 1;
    */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }
  PetscCall(PetscFree2(rtmp, mwork));

  C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
  C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
  C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt       *ajtmpold, *ajtmp, nz, row;
  PetscInt        idx, *ai = a->i, *aj = a->j, *pj;
  const PetscInt *diag_offset;
  MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
  MatScalar       p1, p2, p3, p4;
  MatScalar      *ba = b->a, *aa = a->a;
  PetscReal       shift = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
  A->factortype = MAT_FACTOR_NONE;
  PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
  A->factortype  = MAT_FACTOR_ILU;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 4 * ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx + 1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 4 * ai[idx];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 4 * ic[ajtmpold[j]];
      x[0] = v[0];
      x[1] = v[1];
      x[2] = v[2];
      x[3] = v[3];
      v += 4;
    }
    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + 4 * row;
      p1 = pc[0];
      p2 = pc[1];
      p3 = pc[2];
      p4 = pc[3];
      if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
        pv    = ba + 4 * diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];
        x2    = pv[1];
        x3    = pv[2];
        x4    = pv[3];
        pc[0] = m1 = p1 * x1 + p3 * x2;
        pc[1] = m2 = p2 * x1 + p4 * x2;
        pc[2] = m3 = p1 * x3 + p3 * x4;
        pc[3] = m4 = p2 * x3 + p4 * x4;
        nz         = bi[row + 1] - diag_offset[row] - 1;
        pv += 4;
        for (j = 0; j < nz; j++) {
          x1 = pv[0];
          x2 = pv[1];
          x3 = pv[2];
          x4 = pv[3];
          x  = rtmp + 4 * pj[j];
          x[0] -= m1 * x1 + m3 * x2;
          x[1] -= m2 * x1 + m4 * x2;
          x[2] -= m1 * x3 + m3 * x4;
          x[3] -= m2 * x3 + m4 * x4;
          pv += 4;
        }
        PetscCall(PetscLogFlops(16.0 * nz + 12.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 4 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 4 * pj[j];
      pv[0] = x[0];
      pv[1] = x[1];
      pv[2] = x[2];
      pv[3] = x[3];
      pv += 4;
    }
    /* invert diagonal block */
    w = ba + 4 * diag_offset[i];
    PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
  }

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
      Version for when blocks are 2 by 2 Using natural ordering
*/
PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt       *ajtmpold, *ajtmp, nz, row;
  PetscInt       *ai = a->i, *aj = a->j, *pj;
  const PetscInt *diag_offset;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
  MatScalar      *ba = b->a, *aa = a->a;
  PetscReal       shift = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
  A->factortype = MAT_FACTOR_NONE;
  PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
  A->factortype  = MAT_FACTOR_ILU;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 4 * ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 4 * ai[i];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 4 * ajtmpold[j];
      x[0] = v[0];
      x[1] = v[1];
      x[2] = v[2];
      x[3] = v[3];
      v += 4;
    }
    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + 4 * row;
      p1 = pc[0];
      p2 = pc[1];
      p3 = pc[2];
      p4 = pc[3];
      if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
        pv    = ba + 4 * diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];
        x2    = pv[1];
        x3    = pv[2];
        x4    = pv[3];
        pc[0] = m1 = p1 * x1 + p3 * x2;
        pc[1] = m2 = p2 * x1 + p4 * x2;
        pc[2] = m3 = p1 * x3 + p3 * x4;
        pc[3] = m4 = p2 * x3 + p4 * x4;
        nz         = bi[row + 1] - diag_offset[row] - 1;
        pv += 4;
        for (j = 0; j < nz; j++) {
          x1 = pv[0];
          x2 = pv[1];
          x3 = pv[2];
          x4 = pv[3];
          x  = rtmp + 4 * pj[j];
          x[0] -= m1 * x1 + m3 * x2;
          x[1] -= m2 * x1 + m4 * x2;
          x[2] -= m1 * x3 + m3 * x4;
          x[3] -= m2 * x3 + m4 * x4;
          pv += 4;
        }
        PetscCall(PetscLogFlops(16.0 * nz + 12.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 4 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 4 * pj[j];
      pv[0] = x[0];
      pv[1] = x[1];
      pv[2] = x[2];
      pv[3] = x[3];
      /*
      printf(" col %d:",pj[j]);
      PetscInt j1;
      for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
      printf("\n");
      */
      pv += 4;
    }
    /* invert diagonal block */
    w = ba + 4 * diag_offset[i];
    PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
     Version for when blocks are 1 by 1.
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat              C = B;
  Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS               isrow = b->row, isicol = b->icol;
  const PetscInt  *r, *ic, *ics;
  const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
  PetscInt         i, j, k, nz, nzL, row, *pj;
  const PetscInt  *ajtmp, *bjtmp;
  MatScalar       *rtmp, *pc, multiplier, *pv;
  const MatScalar *aa = a->a, *v;
  PetscBool        row_identity, col_identity;
  FactorShiftCtx   sctx;
  const PetscInt  *ddiag;
  PetscReal        rs;
  MatScalar        d;

  PetscFunctionBegin;
  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
    PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
    sctx.shift_top = info->zeropivot;
    for (i = 0; i < n; i++) {
      /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
      d  = (aa)[ddiag[i]];
      rs = -PetscAbsScalar(d) - PetscRealPart(d);
      v  = aa + ai[i];
      nz = ai[i + 1] - ai[i];
      for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
      if (rs > sctx.shift_top) sctx.shift_top = rs;
    }
    sctx.shift_top *= 1.1;
    sctx.nshift_max = 5;
    sctx.shift_lo   = 0.;
    sctx.shift_hi   = 1.;
  }

  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(n + 1, &rtmp));
  ics = ic;

  do {
    sctx.newshift = PETSC_FALSE;
    for (i = 0; i < n; i++) {
      /* zero rtmp */
      /* L part */
      nz    = bi[i + 1] - bi[i];
      bjtmp = bj + bi[i];
      for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

      /* U part */
      nz    = bdiag[i] - bdiag[i + 1];
      bjtmp = bj + bdiag[i + 1] + 1;
      for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

      /* load in initial (unfactored row) */
      nz    = ai[r[i] + 1] - ai[r[i]];
      ajtmp = aj + ai[r[i]];
      v     = aa + ai[r[i]];
      for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];

      /* ZeropivotApply() */
      rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

      /* elimination */
      bjtmp = bj + bi[i];
      row   = *bjtmp++;
      nzL   = bi[i + 1] - bi[i];
      for (k = 0; k < nzL; k++) {
        pc = rtmp + row;
        if (*pc != (PetscScalar)0.0) {
          pv         = b->a + bdiag[row];
          multiplier = *pc * (*pv);
          *pc        = multiplier;

          pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
          pv = b->a + bdiag[row + 1] + 1;
          nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
          for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
          PetscCall(PetscLogFlops(2.0 * nz));
        }
        row = *bjtmp++;
      }

      /* finished row so stick it into b->a */
      rs = 0.0;
      /* L part */
      pv = b->a + bi[i];
      pj = b->j + bi[i];
      nz = bi[i + 1] - bi[i];
      for (j = 0; j < nz; j++) {
        pv[j] = rtmp[pj[j]];
        rs += PetscAbsScalar(pv[j]);
      }

      /* U part */
      pv = b->a + bdiag[i + 1] + 1;
      pj = b->j + bdiag[i + 1] + 1;
      nz = bdiag[i] - bdiag[i + 1] - 1;
      for (j = 0; j < nz; j++) {
        pv[j] = rtmp[pj[j]];
        rs += PetscAbsScalar(pv[j]);
      }

      sctx.rs = rs;
      sctx.pv = rtmp[i];
      PetscCall(MatPivotCheck(B, A, info, &sctx, i));
      if (sctx.newshift) break; /* break for-loop */
      rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */

      /* Mark diagonal and invert diagonal for simpler triangular solves */
      pv  = b->a + bdiag[i];
      *pv = (PetscScalar)1.0 / rtmp[i];

    } /* endof for (i=0; i<n; i++) { */

    /* MatPivotRefine() */
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
      /*
       * if no shift in this attempt & shifting & started shifting & can refine,
       * then try lower shift
       */
      sctx.shift_hi       = sctx.shift_fraction;
      sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
      sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
      sctx.newshift       = PETSC_TRUE;
      sctx.nshift++;
    }
  } while (sctx.newshift);

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(isicol, &col_identity));
  if (row_identity && col_identity) {
    C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
    C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
    C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
    C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
  } else {
    C->ops->solve          = MatSolve_SeqBAIJ_1;
    C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
  }
  C->assembled = PETSC_TRUE;
  PetscCall(PetscLogFlops(C->cmap->n));

  /* MatShiftView(A,info,&sctx) */
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
      PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
  PetscInt        diag, *pj;
  const PetscInt *diag_offset;
  MatScalar      *pv, *v, *rtmp, multiplier, *pc;
  MatScalar      *ba = b->a, *aa = a->a;
  PetscBool       row_identity, col_identity;

  PetscFunctionBegin;
  /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
  A->factortype = MAT_FACTOR_NONE;
  PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
  A->factortype = MAT_FACTOR_ILU;
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(n + 1, &rtmp));

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;

    /* load in initial (unfactored row) */
    nz       = ai[r[i] + 1] - ai[r[i]];
    ajtmpold = aj + ai[r[i]];
    v        = aa + ai[r[i]];
    for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];

    row = *ajtmp++;
    while (row < i) {
      pc = rtmp + row;
      if (*pc != 0.0) {
        pv         = ba + diag_offset[row];
        pj         = bj + diag_offset[row] + 1;
        multiplier = *pc * *pv++;
        *pc        = multiplier;
        nz         = bi[row + 1] - diag_offset[row] - 1;
        for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
        PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
    diag = diag_offset[i] - bi[i];
    /* check pivot entry for current row */
    PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
    pv[diag] = 1.0 / pv[diag];
  }

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(isicol, &col_identity));
  if (row_identity && col_identity) {
    C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
    C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
  } else {
    C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
    C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
  }
  C->assembled = PETSC_TRUE;
  PetscCall(PetscLogFlops(C->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERPETSC;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(!PetscDefined(USE_COMPLEX) || A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetType(*B, MATSEQBAIJ));

    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    PetscCall(MatSetType(*B, MATSEQSBAIJ));
    PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));

    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
    /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
  (*B)->factortype     = ftype;
  (*B)->canuseordering = PETSC_TRUE;

  PetscCall(PetscFree((*B)->solvertype));
  PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
{
  Mat C;

  PetscFunctionBegin;
  PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
  PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
  PetscCall(MatLUFactorNumeric(C, A, info));

  A->ops->solve          = C->ops->solve;
  A->ops->solvetranspose = C->ops->solvetranspose;

  PetscCall(MatHeaderMerge(A, &C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/sbaij/seq/sbaij.h>
PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
  IS              ip = b->row;
  const PetscInt *rip;
  PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
  PetscInt       *ai = a->i, *aj = a->j;
  PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
  MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
  PetscReal       rs;
  FactorShiftCtx  sctx;

  PetscFunctionBegin;
  if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
    if (!a->sbaijMat) {
      A->symmetric = PETSC_BOOL3_TRUE;
      if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
      PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
    }
    PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
    PetscCall(MatDestroy(&a->sbaijMat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  PetscCall(ISGetIndices(ip, &rip));
  PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));

  sctx.shift_amount = 0.;
  sctx.nshift       = 0;
  do {
    sctx.newshift = PETSC_FALSE;
    for (i = 0; i < mbs; i++) {
      rtmp[i] = 0.0;
      jl[i]   = mbs;
      il[0]   = 0;
    }

    for (k = 0; k < mbs; k++) {
      bval = ba + bi[k];
      /* initialize k-th row by the perm[k]-th row of A */
      jmin = ai[rip[k]];
      jmax = ai[rip[k] + 1];
      for (j = jmin; j < jmax; j++) {
        col = rip[aj[j]];
        if (col >= k) { /* only take upper triangular entry */
          rtmp[col] = aa[j];
          *bval++   = 0.0; /* for in-place factorization */
        }
      }

      /* shift the diagonal of the matrix */
      if (sctx.nshift) rtmp[k] += sctx.shift_amount;

      /* modify k-th row by adding in those rows i with U(i,k)!=0 */
      dk = rtmp[k];
      i  = jl[k]; /* first row to be added to k_th row  */

      while (i < k) {
        nexti = jl[i]; /* next row to be added to k_th row */

        /* compute multiplier, update diag(k) and U(i,k) */
        ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
        uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
        dk += uikdi * ba[ili];
        ba[ili] = uikdi; /* -U(i,k) */

        /* add multiple of row i to k-th row */
        jmin = ili + 1;
        jmax = bi[i + 1];
        if (jmin < jmax) {
          for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
          /* update il and jl for row i */
          il[i] = jmin;
          j     = bj[jmin];
          jl[i] = jl[j];
          jl[j] = i;
        }
        i = nexti;
      }

      /* shift the diagonals when zero pivot is detected */
      /* compute rs=sum of abs(off-diagonal) */
      rs   = 0.0;
      jmin = bi[k] + 1;
      nz   = bi[k + 1] - jmin;
      if (nz) {
        bcol = bj + jmin;
        while (nz--) {
          rs += PetscAbsScalar(rtmp[*bcol]);
          bcol++;
        }
      }

      sctx.rs = rs;
      sctx.pv = dk;
      PetscCall(MatPivotCheck(C, A, info, &sctx, k));
      if (sctx.newshift) break;
      dk = sctx.pv;

      /* copy data into U(k,:) */
      ba[bi[k]] = 1.0 / dk; /* U(k,k) */
      jmin      = bi[k] + 1;
      jmax      = bi[k + 1];
      if (jmin < jmax) {
        for (j = jmin; j < jmax; j++) {
          col       = bj[j];
          ba[j]     = rtmp[col];
          rtmp[col] = 0.0;
        }
        /* add the k-th row into il and jl */
        il[k] = jmin;
        i     = bj[jmin];
        jl[k] = jl[i];
        jl[i] = k;
      }
    }
  } while (sctx.newshift);
  PetscCall(PetscFree3(rtmp, il, jl));

  PetscCall(ISRestoreIndices(ip, &rip));

  C->assembled    = PETSC_TRUE;
  C->preallocated = PETSC_TRUE;

  PetscCall(PetscLogFlops(C->rmap->N));
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
  Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
  PetscInt       i, j, am = a->mbs;
  PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
  MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
  PetscReal      rs;
  FactorShiftCtx sctx;

  PetscFunctionBegin;
  /* MatPivotSetUp(): initialize shift context sctx */
  PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

  PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));

  do {
    sctx.newshift = PETSC_FALSE;
    for (i = 0; i < am; i++) {
      rtmp[i] = 0.0;
      jl[i]   = am;
      il[0]   = 0;
    }

    for (k = 0; k < am; k++) {
      /* initialize k-th row with elements nonzero in row perm(k) of A */
      nz   = ai[k + 1] - ai[k];
      acol = aj + ai[k];
      aval = aa + ai[k];
      bval = ba + bi[k];
      while (nz--) {
        if (*acol < k) { /* skip lower triangular entries */
          acol++;
          aval++;
        } else {
          rtmp[*acol++] = *aval++;
          *bval++       = 0.0; /* for in-place factorization */
        }
      }

      /* shift the diagonal of the matrix */
      if (sctx.nshift) rtmp[k] += sctx.shift_amount;

      /* modify k-th row by adding in those rows i with U(i,k)!=0 */
      dk = rtmp[k];
      i  = jl[k]; /* first row to be added to k_th row  */

      while (i < k) {
        nexti = jl[i]; /* next row to be added to k_th row */
        /* compute multiplier, update D(k) and U(i,k) */
        ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
        uikdi = -ba[ili] * ba[bi[i]];
        dk += uikdi * ba[ili];
        ba[ili] = uikdi; /* -U(i,k) */

        /* add multiple of row i to k-th row ... */
        jmin = ili + 1;
        nz   = bi[i + 1] - jmin;
        if (nz > 0) {
          bcol = bj + jmin;
          bval = ba + jmin;
          while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
          /* update il and jl for i-th row */
          il[i] = jmin;
          j     = bj[jmin];
          jl[i] = jl[j];
          jl[j] = i;
        }
        i = nexti;
      }

      /* shift the diagonals when zero pivot is detected */
      /* compute rs=sum of abs(off-diagonal) */
      rs   = 0.0;
      jmin = bi[k] + 1;
      nz   = bi[k + 1] - jmin;
      if (nz) {
        bcol = bj + jmin;
        while (nz--) {
          rs += PetscAbsScalar(rtmp[*bcol]);
          bcol++;
        }
      }

      sctx.rs = rs;
      sctx.pv = dk;
      PetscCall(MatPivotCheck(C, A, info, &sctx, k));
      if (sctx.newshift) break; /* sctx.shift_amount is updated */
      dk = sctx.pv;

      /* copy data into U(k,:) */
      ba[bi[k]] = 1.0 / dk;
      jmin      = bi[k] + 1;
      nz        = bi[k + 1] - jmin;
      if (nz) {
        bcol = bj + jmin;
        bval = ba + jmin;
        while (nz--) {
          *bval++       = rtmp[*bcol];
          rtmp[*bcol++] = 0.0;
        }
        /* add k-th row into il and jl */
        il[k] = jmin;
        i     = bj[jmin];
        jl[k] = jl[i];
        jl[i] = k;
      }
    }
  } while (sctx.newshift);
  PetscCall(PetscFree3(rtmp, il, jl));

  C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;
  C->preallocated        = PETSC_TRUE;

  PetscCall(PetscLogFlops(C->rmap->N));
  if (sctx.nshift) {
    if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
      PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
      PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <petscbt.h>
#include <../src/mat/utils/freespace.h>
PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  Mat_SeqSBAIJ      *b;
  Mat                B;
  PetscBool          perm_identity;
  PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
  const PetscInt    *rip;
  PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
  PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
  PetscReal          fill = info->fill, levels = info->levels;
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
  PetscBT            lnkbt;
  PetscBool          diagDense;
  const PetscInt    *adiag;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
  PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");

  if (bs > 1) {
    if (!a->sbaijMat) {
      A->symmetric = PETSC_BOOL3_TRUE;
      if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
      PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
    }
    fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

    PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(ISIdentity(perm, &perm_identity));
  PetscCall(ISGetIndices(perm, &rip));

  /* special case that simply copies fill pattern */
  if (!levels && perm_identity) {
    PetscCall(PetscMalloc1(am + 1, &ui));
    for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
    B = fact;
    PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));

    b  = (Mat_SeqSBAIJ *)B->data;
    uj = b->j;
    for (i = 0; i < am; i++) {
      aj = a->j + adiag[i];
      for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
      b->ilen[i] = ui[i];
    }
    PetscCall(PetscFree(ui));

    B->factortype = MAT_FACTOR_NONE;

    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
    B->factortype = MAT_FACTOR_ICC;

    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* initialization */
  PetscCall(PetscMalloc1(am + 1, &ui));
  ui[0] = 0;
  PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));

  /* jl: linked list for storing indices of the pivot rows
     il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
  PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
  for (i = 0; i < am; i++) {
    jl[i] = am;
    il[i] = 0;
  }

  /* create and initialize a linked list for storing column indices of the active row k */
  nlnk = am + 1;
  PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

  /* initial FreeSpace size is fill*(ai[am]+am)/2 */
  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));

  current_space = free_space;

  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
  current_space_lvl = free_space_lvl;

  for (k = 0; k < am; k++) { /* for each active row k */
    /* initialize lnk by the column indices of row rip[k] of A */
    nzk         = 0;
    ncols       = ai[rip[k] + 1] - ai[rip[k]];
    ncols_upper = 0;
    cols        = cols_lvl + am;
    for (j = 0; j < ncols; j++) {
      i = rip[*(aj + ai[rip[k]] + j)];
      if (i >= k) { /* only take upper triangular entry */
        cols[ncols_upper]     = i;
        cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
        ncols_upper++;
      }
    }
    PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
    nzk += nlnk;

    /* update lnk by computing fill-in for each pivot row to be merged in */
    prow = jl[k]; /* 1st pivot row */

    while (prow < k) {
      nextprow = jl[prow];

      /* merge prow into k-th row */
      jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
      jmax  = ui[prow + 1];
      ncols = jmax - jmin;
      i     = jmin - ui[prow];
      cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
      for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
      PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
      nzk += nlnk;

      /* update il and jl for prow */
      if (jmin < jmax) {
        il[prow] = jmin;

        j        = *cols;
        jl[prow] = jl[j];
        jl[j]    = prow;
      }
      prow = nextprow;
    }

    /* if free space is not available, make more free space */
    if (current_space->local_remaining < nzk) {
      i = am - k + 1;                                                             /* num of unfactored rows */
      i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
      PetscCall(PetscFreeSpaceGet(i, &current_space));
      PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
      reallocs++;
    }

    /* copy data into free_space and free_space_lvl, then initialize lnk */
    PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

    /* add the k-th row into il and jl */
    if (nzk - 1 > 0) {
      i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
      jl[k] = jl[i];
      jl[i] = k;
      il[k] = ui[k] + 1;
    }
    uj_ptr[k]     = current_space->array;
    uj_lvl_ptr[k] = current_space_lvl->array;

    current_space->array += nzk;
    current_space->local_used += nzk;
    current_space->local_remaining -= nzk;

    current_space_lvl->array += nzk;
    current_space_lvl->local_used += nzk;
    current_space_lvl->local_remaining -= nzk;

    ui[k + 1] = ui[k] + nzk;
  }

  PetscCall(ISRestoreIndices(perm, &rip));
  PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
  PetscCall(PetscFree(cols_lvl));

  /* copy free_space into uj and free free_space; set uj in new datastructure; */
  PetscCall(PetscMalloc1(ui[am] + 1, &uj));
  PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
  PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space_lvl));

  /* put together the new matrix in MATSEQSBAIJ format */
  B = fact;
  PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));

  b          = (Mat_SeqSBAIJ *)B->data;
  b->free_a  = PETSC_TRUE;
  b->free_ij = PETSC_TRUE;

  PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

  b->j             = uj;
  b->i             = ui;
  b->diag          = NULL;
  b->ilen          = NULL;
  b->imax          = NULL;
  b->row           = perm;
  b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

  PetscCall(PetscObjectReference((PetscObject)perm));

  b->icol = perm;

  PetscCall(PetscObjectReference((PetscObject)perm));
  PetscCall(PetscMalloc1(am + 1, &b->solve_work));

  b->maxnz = b->nz = ui[am];

  B->info.factor_mallocs   = reallocs;
  B->info.fill_ratio_given = fill;
  if (ai[am] != 0.) {
    /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
    B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[am] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
    PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
    PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
  } else {
    PetscCall(PetscInfo(A, "Empty matrix\n"));
  }
#endif
  if (perm_identity) {
    B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
    B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
  } else {
    fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  Mat_SeqSBAIJ      *b;
  Mat                B;
  PetscBool          perm_identity;
  PetscReal          fill = info->fill;
  const PetscInt    *rip;
  PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
  PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
  PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  PetscBT            lnkbt;
  PetscBool          diagDense;

  PetscFunctionBegin;
  if (bs > 1) { /* convert to seqsbaij */
    if (!a->sbaijMat) {
      A->symmetric = PETSC_BOOL3_TRUE;
      if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
      PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
    }
    fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

    PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
  PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");

  /* check whether perm is the identity mapping */
  PetscCall(ISIdentity(perm, &perm_identity));
  PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
  PetscCall(ISGetIndices(perm, &rip));

  /* initialization */
  PetscCall(PetscMalloc1(mbs + 1, &ui));
  ui[0] = 0;

  /* jl: linked list for storing indices of the pivot rows
     il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
  PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
  for (i = 0; i < mbs; i++) {
    jl[i] = mbs;
    il[i] = 0;
  }

  /* create and initialize a linked list for storing column indices of the active row k */
  nlnk = mbs + 1;
  PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

  /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));

  current_space = free_space;

  for (k = 0; k < mbs; k++) { /* for each active row k */
    /* initialize lnk by the column indices of row rip[k] of A */
    nzk         = 0;
    ncols       = ai[rip[k] + 1] - ai[rip[k]];
    ncols_upper = 0;
    for (j = 0; j < ncols; j++) {
      i = rip[*(aj + ai[rip[k]] + j)];
      if (i >= k) { /* only take upper triangular entry */
        cols[ncols_upper] = i;
        ncols_upper++;
      }
    }
    PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
    nzk += nlnk;

    /* update lnk by computing fill-in for each pivot row to be merged in */
    prow = jl[k]; /* 1st pivot row */

    while (prow < k) {
      nextprow = jl[prow];
      /* merge prow into k-th row */
      jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
      jmax   = ui[prow + 1];
      ncols  = jmax - jmin;
      uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
      PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
      nzk += nlnk;

      /* update il and jl for prow */
      if (jmin < jmax) {
        il[prow] = jmin;
        j        = *uj_ptr;
        jl[prow] = jl[j];
        jl[j]    = prow;
      }
      prow = nextprow;
    }

    /* if free space is not available, make more free space */
    if (current_space->local_remaining < nzk) {
      i = mbs - k + 1;                                                            /* num of unfactored rows */
      i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
      PetscCall(PetscFreeSpaceGet(i, &current_space));
      reallocs++;
    }

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));

    /* add the k-th row into il and jl */
    if (nzk - 1 > 0) {
      i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
      jl[k] = jl[i];
      jl[i] = k;
      il[k] = ui[k] + 1;
    }
    ui_ptr[k] = current_space->array;
    current_space->array += nzk;
    current_space->local_used += nzk;
    current_space->local_remaining -= nzk;

    ui[k + 1] = ui[k] + nzk;
  }

  PetscCall(ISRestoreIndices(perm, &rip));
  PetscCall(PetscFree4(ui_ptr, il, jl, cols));

  /* copy free_space into uj and free free_space; set uj in new datastructure; */
  PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
  PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* put together the new matrix in MATSEQSBAIJ format */
  B = fact;
  PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));

  b          = (Mat_SeqSBAIJ *)B->data;
  b->free_a  = PETSC_TRUE;
  b->free_ij = PETSC_TRUE;

  PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));

  b->j             = uj;
  b->i             = ui;
  b->diag          = NULL;
  b->ilen          = NULL;
  b->imax          = NULL;
  b->row           = perm;
  b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

  PetscCall(PetscObjectReference((PetscObject)perm));
  b->icol = perm;
  PetscCall(PetscObjectReference((PetscObject)perm));
  PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
  b->maxnz = b->nz = ui[mbs];

  B->info.factor_mallocs   = reallocs;
  B->info.fill_ratio_given = fill;
  if (ai[mbs] != 0.) {
    /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
    B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[mbs] != 0.) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
    PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
    PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
  } else {
    PetscCall(PetscInfo(A, "Empty matrix\n"));
  }
#endif
  if (perm_identity) {
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
  } else {
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
  const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
  PetscInt           i, k, n                       = a->mbs;
  PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, *s, *t, *ls;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  /* forward solve the lower triangular */
  PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */

  for (i = 1; i < n; i++) {
    v  = aa + bs2 * ai[i];
    vi = aj + ai[i];
    nz = ai[i + 1] - ai[i];
    s  = t + bs * i;
    PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
    for (k = 0; k < nz; k++) {
      PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
      v += bs2;
    }
  }

  /* backward solve the upper triangular */
  ls = a->solve_work + A->cmap->n;
  for (i = n - 1; i >= 0; i--) {
    v  = aa + bs2 * (adiag[i + 1] + 1);
    vi = aj + adiag[i + 1] + 1;
    nz = adiag[i] - adiag[i + 1] - 1;
    PetscCall(PetscArraycpy(ls, t + i * bs, bs));
    for (k = 0; k < nz; k++) {
      PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
      v += bs2;
    }
    PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
    PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
  PetscInt           i, m, n = a->mbs;
  PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, *s, *t, *ls;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
  for (i = 1; i < n; i++) {
    v  = aa + bs2 * ai[i];
    vi = aj + ai[i];
    nz = ai[i + 1] - ai[i];
    s  = t + bs * i;
    PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
    for (m = 0; m < nz; m++) {
      PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
      v += bs2;
    }
  }

  /* backward solve the upper triangular */
  ls = a->solve_work + A->cmap->n;
  for (i = n - 1; i >= 0; i--) {
    v  = aa + bs2 * (adiag[i + 1] + 1);
    vi = aj + adiag[i + 1] + 1;
    nz = adiag[i] - adiag[i + 1] - 1;
    PetscCall(PetscArraycpy(ls, t + i * bs, bs));
    for (m = 0; m < nz; m++) {
      PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
      v += bs2;
    }
    PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
    PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
  }
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
