#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
  PetscInt           i, nz;
  const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, *s, *t, *ls;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
  for (i = 1; i < n; i++) {
    v  = aa + bs2 * ai[i];
    vi = aj + ai[i];
    nz = a->diag[i] - ai[i];
    s  = t + bs * i;
    PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
    while (nz--) {
      PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
      v += bs2;
    }
  }
  /* backward solve the upper triangular */
  ls = a->solve_work + A->cmap->n;
  for (i = n - 1; i >= 0; i--) {
    v  = aa + bs2 * (a->diag[i] + 1);
    vi = aj + a->diag[i] + 1;
    nz = ai[i + 1] - a->diag[i] - 1;
    PetscCall(PetscArraycpy(ls, t + i * bs, bs));
    while (nz--) {
      PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
      v += bs2;
    }
    PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
    PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
  const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
  PetscInt           i, nz, idx, idt, idc;
  const MatScalar   *aa = a->a, *v;
  PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  idx  = 7 * (*r++);
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  t[4] = b[4 + idx];
  t[5] = b[5 + idx];
  t[6] = b[6 + idx];

  for (i = 1; i < n; i++) {
    v   = aa + 49 * ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 7 * (*r++);
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    s5  = b[4 + idx];
    s6  = b[5 + idx];
    s7  = b[6 + idx];
    while (nz--) {
      idx = 7 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      x7  = t[6 + idx];
      s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
      s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
      s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
      s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
      s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
      s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
      s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
      v += 49;
    }
    idx        = 7 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
    t[4 + idx] = s5;
    t[5 + idx] = s6;
    t[6 + idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 49 * diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i + 1] - diag[i] - 1;
    idt = 7 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    s5  = t[4 + idt];
    s6  = t[5 + idt];
    s7  = t[6 + idt];
    while (nz--) {
      idx = 7 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      x7  = t[6 + idx];
      s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
      s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
      s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
      s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
      s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
      s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
      s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
      v += 49;
    }
    idc    = 7 * (*c--);
    v      = aa + 49 * diag[i];
    x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
    x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
    x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
    x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
  const PetscInt     n = a->mbs, *rout, *cout, *vi;
  PetscInt           i, nz, idx, idt, idc, m;
  const MatScalar   *aa = a->a, *v;
  PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  idx  = 7 * r[0];
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  t[4] = b[4 + idx];
  t[5] = b[5 + idx];
  t[6] = b[6 + idx];

  for (i = 1; i < n; i++) {
    v   = aa + 49 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = 7 * r[i];
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    s5  = b[4 + idx];
    s6  = b[5 + idx];
    s7  = b[6 + idx];
    for (m = 0; m < nz; m++) {
      idx = 7 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      x7  = t[6 + idx];
      s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
      s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
      s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
      s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
      s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
      s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
      s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
      v += 49;
    }
    idx        = 7 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
    t[4 + idx] = s5;
    t[5 + idx] = s6;
    t[6 + idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 49 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = 7 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    s5  = t[4 + idt];
    s6  = t[5 + idt];
    s7  = t[6 + idt];
    for (m = 0; m < nz; m++) {
      idx = 7 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      x7  = t[6 + idx];
      s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
      s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
      s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
      s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
      s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
      s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
      s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
      v += 49;
    }
    idc    = 7 * c[i];
    x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
    x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
    x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
    x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout;
  const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
  PetscInt           i, nz, idx, idt, idc;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  idx  = 6 * (*r++);
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  t[4] = b[4 + idx];
  t[5] = b[5 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 36 * ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 6 * (*r++);
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    s5  = b[4 + idx];
    s6  = b[5 + idx];
    while (nz--) {
      idx = 6 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
      s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
      s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
      s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
      s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
      s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
      v += 36;
    }
    idx        = 6 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
    t[4 + idx] = s5;
    t[5 + idx] = s6;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 36 * diag[i] + 36;
    vi  = aj + diag[i] + 1;
    nz  = ai[i + 1] - diag[i] - 1;
    idt = 6 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    s5  = t[4 + idt];
    s6  = t[5 + idt];
    while (nz--) {
      idx = 6 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
      s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
      s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
      s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
      s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
      s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
      v += 36;
    }
    idc    = 6 * (*c--);
    v      = aa + 36 * diag[i];
    x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
    x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
    x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, nz, idx, idt, idc, m;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  idx  = 6 * r[0];
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  t[4] = b[4 + idx];
  t[5] = b[5 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 36 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = 6 * r[i];
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    s5  = b[4 + idx];
    s6  = b[5 + idx];
    for (m = 0; m < nz; m++) {
      idx = 6 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
      s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
      s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
      s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
      s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
      s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
      v += 36;
    }
    idx        = 6 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
    t[4 + idx] = s5;
    t[5 + idx] = s6;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 36 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = 6 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    s5  = t[4 + idt];
    s6  = t[5 + idt];
    for (m = 0; m < nz; m++) {
      idx = 6 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      x6  = t[5 + idx];
      s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
      s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
      s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
      s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
      s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
      s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
      v += 36;
    }
    idc    = 6 * c[i];
    x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
    x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
    x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
  PetscInt           i, nz, idx, idt, idc;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  idx  = 5 * (*r++);
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  t[4] = b[4 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 25 * ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 5 * (*r++);
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    s5  = b[4 + idx];
    while (nz--) {
      idx = 5 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
      s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
      s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
      s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
      s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
      v += 25;
    }
    idx        = 5 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
    t[4 + idx] = s5;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 25 * diag[i] + 25;
    vi  = aj + diag[i] + 1;
    nz  = ai[i + 1] - diag[i] - 1;
    idt = 5 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    s5  = t[4 + idt];
    while (nz--) {
      idx = 5 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
      s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
      s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
      s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
      s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
      v += 25;
    }
    idc    = 5 * (*c--);
    v      = aa + 25 * diag[i];
    x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
    x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt    *r, *c, *rout, *cout;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, nz, idx, idt, idc, m;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  idx  = 5 * r[0];
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  t[4] = b[4 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 25 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = 5 * r[i];
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    s5  = b[4 + idx];
    for (m = 0; m < nz; m++) {
      idx = 5 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
      s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
      s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
      s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
      s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
      v += 25;
    }
    idx        = 5 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
    t[4 + idx] = s5;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 25 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = 5 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    s5  = t[4 + idt];
    for (m = 0; m < nz; m++) {
      idx = 5 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      x5  = t[4 + idx];
      s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
      s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
      s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
      s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
      s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
      v += 25;
    }
    idc    = 5 * c[i];
    x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
    x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
  PetscInt           i, nz, idx, idt, idc;
  const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  idx  = 4 * (*r++);
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 16 * ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 4 * (*r++);
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    while (nz--) {
      idx = 4 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
      v += 16;
    }
    idx        = 4 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 16 * diag[i] + 16;
    vi  = aj + diag[i] + 1;
    nz  = ai[i + 1] - diag[i] - 1;
    idt = 4 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    while (nz--) {
      idx = 4 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
      v += 16;
    }
    idc    = 4 * (*c--);
    v      = aa + 16 * diag[i];
    x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, nz, idx, idt, idc, m;
  const PetscInt    *r, *c, *rout, *cout;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  idx  = 4 * r[0];
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  t[3] = b[3 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 16 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = 4 * r[i];
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    for (m = 0; m < nz; m++) {
      idx = 4 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
      v += 16;
    }
    idx        = 4 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
    t[3 + idx] = s4;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 16 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = 4 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    s4  = t[3 + idt];
    for (m = 0; m < nz; m++) {
      idx = 4 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      x4  = t[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
      v += 16;
    }
    idc    = 4 * c[i];
    x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
    x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
  PetscInt           i, nz, idx, idt, idc;
  const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  idx  = 3 * (*r++);
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 9 * ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 3 * (*r++);
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    while (nz--) {
      idx = 3 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
      s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
      s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
      v += 9;
    }
    idx        = 3 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 9 * diag[i] + 9;
    vi  = aj + diag[i] + 1;
    nz  = ai[i + 1] - diag[i] - 1;
    idt = 3 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    while (nz--) {
      idx = 3 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
      s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
      s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
      v += 9;
    }
    idc    = 3 * (*c--);
    v      = aa + 9 * diag[i];
    x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
  }
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, nz, idx, idt, idc, m;
  const PetscInt    *r, *c, *rout, *cout;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  idx  = 3 * r[0];
  t[0] = b[idx];
  t[1] = b[1 + idx];
  t[2] = b[2 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 9 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = 3 * r[i];
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    for (m = 0; m < nz; m++) {
      idx = 3 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
      s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
      s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
      v += 9;
    }
    idx        = 3 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
    t[2 + idx] = s3;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 9 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = 3 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    s3  = t[2 + idt];
    for (m = 0; m < nz; m++) {
      idx = 3 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      x3  = t[2 + idx];
      s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
      s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
      s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
      v += 9;
    }
    idc    = 3 * c[i];
    x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
    x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
  }
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
  PetscInt           i, nz, idx, idt, idc;
  const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, x1, x2, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  idx  = 2 * (*r++);
  t[0] = b[idx];
  t[1] = b[1 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 4 * ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 2 * (*r++);
    s1  = b[idx];
    s2  = b[1 + idx];
    while (nz--) {
      idx = 2 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      s1 -= v[0] * x1 + v[2] * x2;
      s2 -= v[1] * x1 + v[3] * x2;
      v += 4;
    }
    idx        = 2 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 4 * diag[i] + 4;
    vi  = aj + diag[i] + 1;
    nz  = ai[i + 1] - diag[i] - 1;
    idt = 2 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    while (nz--) {
      idx = 2 * (*vi++);
      x1  = t[idx];
      x2  = t[1 + idx];
      s1 -= v[0] * x1 + v[2] * x2;
      s2 -= v[1] * x1 + v[3] * x2;
      v += 4;
    }
    idc    = 2 * (*c--);
    v      = aa + 4 * diag[i];
    x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
  }
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, nz, idx, jdx, idt, idc, m;
  const PetscInt    *r, *c, *rout, *cout;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, s2, x1, x2, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  idx  = 2 * r[0];
  t[0] = b[idx];
  t[1] = b[1 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + 4 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = 2 * r[i];
    s1  = b[idx];
    s2  = b[1 + idx];
    for (m = 0; m < nz; m++) {
      jdx = 2 * vi[m];
      x1  = t[jdx];
      x2  = t[1 + jdx];
      s1 -= v[0] * x1 + v[2] * x2;
      s2 -= v[1] * x1 + v[3] * x2;
      v += 4;
    }
    idx        = 2 * i;
    t[idx]     = s1;
    t[1 + idx] = s2;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + 4 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = 2 * i;
    s1  = t[idt];
    s2  = t[1 + idt];
    for (m = 0; m < nz; m++) {
      idx = 2 * vi[m];
      x1  = t[idx];
      x2  = t[1 + idx];
      s1 -= v[0] * x1 + v[2] * x2;
      s2 -= v[1] * x1 + v[3] * x2;
      v += 4;
    }
    idc    = 2 * c[i];
    x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
    x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
  }
  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
  PetscInt           i, nz;
  const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x, s1, *t;
  const PetscScalar *b;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  t = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout + (n - 1);

  /* forward solve the lower triangular */
  t[0] = b[*r++];
  for (i = 1; i < n; i++) {
    v  = aa + ai[i];
    vi = aj + ai[i];
    nz = diag[i] - ai[i];
    s1 = b[*r++];
    while (nz--) s1 -= (*v++) * t[*vi++];
    t[i] = s1;
  }
  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v  = aa + diag[i] + 1;
    vi = aj + diag[i] + 1;
    nz = ai[i + 1] - diag[i] - 1;
    s1 = t[i];
    while (nz--) s1 -= (*v++) * t[*vi++];
    x[*c--] = t[i] = aa[diag[i]] * s1;
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  IS                 iscol = a->col, isrow = a->row;
  PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
  const PetscInt    *rout, *cout, *r, *c;
  PetscScalar       *x, *tmp, sum;
  const PetscScalar *b;
  const MatScalar   *aa = a->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  tmp = a->solve_work;

  PetscCall(ISGetIndices(isrow, &rout));
  r = rout;
  PetscCall(ISGetIndices(iscol, &cout));
  c = cout;

  /* forward solve the lower triangular */
  tmp[0] = b[r[0]];
  v      = aa;
  vi     = aj;
  for (i = 1; i < n; i++) {
    nz  = ai[i + 1] - ai[i];
    sum = b[r[i]];
    PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
    tmp[i] = sum;
    v += nz;
    vi += nz;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + adiag[i + 1] + 1;
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    sum = tmp[i];
    PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
    x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
  }

  PetscCall(ISRestoreIndices(isrow, &rout));
  PetscCall(ISRestoreIndices(iscol, &cout));
  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
