
#include <../src/mat/impls/maij/maij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>

/*@
   MatMAIJGetAIJ - Get the `MATAIJ` matrix describing the blockwise action of the `MATMAIJ` matrix

   Not Collective, but if the `MATMAIJ` matrix is parallel, the `MATAIJ` matrix is also parallel

   Input Parameter:
.  A - the `MATMAIJ` matrix

   Output Parameter:
.  B - the `MATAIJ` matrix

   Level: advanced

   Note:
    The reference count on the `MATAIJ` matrix is not increased so you should not destroy it.

.seealso: `MATMAIJ`, `MATAIJ`, `MatCreateMAIJ()`
@*/
PetscErrorCode MatMAIJGetAIJ(Mat A, Mat *B) {
  PetscBool ismpimaij, isseqmaij;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIMAIJ, &ismpimaij));
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQMAIJ, &isseqmaij));
  if (ismpimaij) {
    Mat_MPIMAIJ *b = (Mat_MPIMAIJ *)A->data;

    *B = b->A;
  } else if (isseqmaij) {
    Mat_SeqMAIJ *b = (Mat_SeqMAIJ *)A->data;

    *B = b->AIJ;
  } else {
    *B = A;
  }
  PetscFunctionReturn(0);
}

/*@
   MatMAIJRedimension - Get a new `MATMAIJ` matrix with the same action, but for a different block size

   Logically Collective

   Input Parameters:
+  A - the `MATMAIJ` matrix
-  dof - the block size for the new matrix

   Output Parameter:
.  B - the new `MATMAIJ` matrix

   Level: advanced

.seealso: `MATMAIJ`, `MatCreateMAIJ()`
@*/
PetscErrorCode MatMAIJRedimension(Mat A, PetscInt dof, Mat *B) {
  Mat Aij = NULL;

  PetscFunctionBegin;
  PetscValidLogicalCollectiveInt(A, dof, 2);
  PetscCall(MatMAIJGetAIJ(A, &Aij));
  PetscCall(MatCreateMAIJ(Aij, dof, B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_SeqMAIJ(Mat A) {
  Mat_SeqMAIJ *b = (Mat_SeqMAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->AIJ));
  PetscCall(PetscFree(A->data));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqmaij_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqmaij_seqaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaij_seqmaij_C", NULL));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetUp_MAIJ(Mat A) {
  PetscFunctionBegin;
  SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Must use MatCreateMAIJ() to create MAIJ matrices");
}

PetscErrorCode MatView_SeqMAIJ(Mat A, PetscViewer viewer) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
  PetscCall(MatView(B, viewer));
  PetscCall(MatDestroy(&B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatView_MPIMAIJ(Mat A, PetscViewer viewer) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(MatConvert(A, MATMPIAIJ, MAT_INITIAL_MATRIX, &B));
  PetscCall(MatView(B, viewer));
  PetscCall(MatDestroy(&B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIMAIJ(Mat A) {
  Mat_MPIMAIJ *b = (Mat_MPIMAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->AIJ));
  PetscCall(MatDestroy(&b->OAIJ));
  PetscCall(MatDestroy(&b->A));
  PetscCall(VecScatterDestroy(&b->ctx));
  PetscCall(VecDestroy(&b->w));
  PetscCall(PetscFree(A->data));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpimaij_mpiaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpimaij_mpiaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_mpiaij_mpimaij_C", NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
  PetscFunctionReturn(0);
}

/*MC
  MATMAIJ - MATMAIJ = "maij" - A matrix type to be used for restriction and interpolation operations for
  multicomponent problems, interpolating or restricting each component the same way independently.
  The matrix type is based on `MATSEQAIJ` for sequential matrices, and `MATMPIAIJ` for distributed matrices.

  Operations provided:
.vb
    MatMult()
    MatMultTranspose()
    MatMultAdd()
    MatMultTransposeAdd()
.ve

  Level: advanced

.seealso: `MATAIJ`, `MatMAIJGetAIJ()`, `MatMAIJRedimension()`, `MatCreateMAIJ()`
M*/

PETSC_EXTERN PetscErrorCode MatCreate_MAIJ(Mat A) {
  Mat_MPIMAIJ *b;
  PetscMPIInt  size;

  PetscFunctionBegin;
  PetscCall(PetscNewLog(A, &b));
  A->data = (void *)b;

  PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));

  A->ops->setup = MatSetUp_MAIJ;

  b->AIJ  = NULL;
  b->dof  = 0;
  b->OAIJ = NULL;
  b->ctx  = NULL;
  b->w    = NULL;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size == 1) {
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQMAIJ));
  } else {
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIMAIJ));
  }
  A->preallocated = PETSC_TRUE;
  A->assembled    = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_2(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2;
  PetscInt           nonzerorow = 0, n, i, jrow, j;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[2 * idx[jrow]];
      sum2 += v[jrow] * x[2 * idx[jrow] + 1];
      jrow++;
    }
    y[2 * i]     = sum1;
    y[2 * i + 1] = sum2;
  }

  PetscCall(PetscLogFlops(4.0 * a->nz - 2.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_2(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[2 * i];
    alpha2 = x[2 * i + 1];
    while (n-- > 0) {
      y[2 * (*idx)] += alpha1 * (*v);
      y[2 * (*idx) + 1] += alpha2 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(4.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2;
  PetscInt           n, i, jrow, j;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[2 * idx[jrow]];
      sum2 += v[jrow] * x[2 * idx[jrow] + 1];
      jrow++;
    }
    y[2 * i] += sum1;
    y[2 * i + 1] += sum2;
  }

  PetscCall(PetscLogFlops(4.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}
PetscErrorCode MatMultTransposeAdd_SeqMAIJ_2(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[2 * i];
    alpha2 = x[2 * i + 1];
    while (n-- > 0) {
      y[2 * (*idx)] += alpha1 * (*v);
      y[2 * (*idx) + 1] += alpha2 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(4.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}
/* --------------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_3(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3;
  PetscInt           nonzerorow = 0, n, i, jrow, j;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[3 * idx[jrow]];
      sum2 += v[jrow] * x[3 * idx[jrow] + 1];
      sum3 += v[jrow] * x[3 * idx[jrow] + 2];
      jrow++;
    }
    y[3 * i]     = sum1;
    y[3 * i + 1] = sum2;
    y[3 * i + 2] = sum3;
  }

  PetscCall(PetscLogFlops(6.0 * a->nz - 3.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_3(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[3 * i];
    alpha2 = x[3 * i + 1];
    alpha3 = x[3 * i + 2];
    while (n-- > 0) {
      y[3 * (*idx)] += alpha1 * (*v);
      y[3 * (*idx) + 1] += alpha2 * (*v);
      y[3 * (*idx) + 2] += alpha3 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(6.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3;
  PetscInt           n, i, jrow, j;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[3 * idx[jrow]];
      sum2 += v[jrow] * x[3 * idx[jrow] + 1];
      sum3 += v[jrow] * x[3 * idx[jrow] + 2];
      jrow++;
    }
    y[3 * i] += sum1;
    y[3 * i + 1] += sum2;
    y[3 * i + 2] += sum3;
  }

  PetscCall(PetscLogFlops(6.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}
PetscErrorCode MatMultTransposeAdd_SeqMAIJ_3(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[3 * i];
    alpha2 = x[3 * i + 1];
    alpha3 = x[3 * i + 2];
    while (n-- > 0) {
      y[3 * (*idx)] += alpha1 * (*v);
      y[3 * (*idx) + 1] += alpha2 * (*v);
      y[3 * (*idx) + 2] += alpha3 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(6.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/* ------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_4(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4;
  PetscInt           nonzerorow = 0, n, i, jrow, j;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[4 * idx[jrow]];
      sum2 += v[jrow] * x[4 * idx[jrow] + 1];
      sum3 += v[jrow] * x[4 * idx[jrow] + 2];
      sum4 += v[jrow] * x[4 * idx[jrow] + 3];
      jrow++;
    }
    y[4 * i]     = sum1;
    y[4 * i + 1] = sum2;
    y[4 * i + 2] = sum3;
    y[4 * i + 3] = sum4;
  }

  PetscCall(PetscLogFlops(8.0 * a->nz - 4.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_4(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[4 * i];
    alpha2 = x[4 * i + 1];
    alpha3 = x[4 * i + 2];
    alpha4 = x[4 * i + 3];
    while (n-- > 0) {
      y[4 * (*idx)] += alpha1 * (*v);
      y[4 * (*idx) + 1] += alpha2 * (*v);
      y[4 * (*idx) + 2] += alpha3 * (*v);
      y[4 * (*idx) + 3] += alpha4 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(8.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4;
  PetscInt           n, i, jrow, j;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[4 * idx[jrow]];
      sum2 += v[jrow] * x[4 * idx[jrow] + 1];
      sum3 += v[jrow] * x[4 * idx[jrow] + 2];
      sum4 += v[jrow] * x[4 * idx[jrow] + 3];
      jrow++;
    }
    y[4 * i] += sum1;
    y[4 * i + 1] += sum2;
    y[4 * i + 2] += sum3;
    y[4 * i + 3] += sum4;
  }

  PetscCall(PetscLogFlops(8.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}
PetscErrorCode MatMultTransposeAdd_SeqMAIJ_4(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[4 * i];
    alpha2 = x[4 * i + 1];
    alpha3 = x[4 * i + 2];
    alpha4 = x[4 * i + 3];
    while (n-- > 0) {
      y[4 * (*idx)] += alpha1 * (*v);
      y[4 * (*idx) + 1] += alpha2 * (*v);
      y[4 * (*idx) + 2] += alpha3 * (*v);
      y[4 * (*idx) + 3] += alpha4 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(8.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}
/* ------------------------------------------------------------------------------*/

PetscErrorCode MatMult_SeqMAIJ_5(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5;
  PetscInt           nonzerorow = 0, n, i, jrow, j;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[5 * idx[jrow]];
      sum2 += v[jrow] * x[5 * idx[jrow] + 1];
      sum3 += v[jrow] * x[5 * idx[jrow] + 2];
      sum4 += v[jrow] * x[5 * idx[jrow] + 3];
      sum5 += v[jrow] * x[5 * idx[jrow] + 4];
      jrow++;
    }
    y[5 * i]     = sum1;
    y[5 * i + 1] = sum2;
    y[5 * i + 2] = sum3;
    y[5 * i + 3] = sum4;
    y[5 * i + 4] = sum5;
  }

  PetscCall(PetscLogFlops(10.0 * a->nz - 5.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_5(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[5 * i];
    alpha2 = x[5 * i + 1];
    alpha3 = x[5 * i + 2];
    alpha4 = x[5 * i + 3];
    alpha5 = x[5 * i + 4];
    while (n-- > 0) {
      y[5 * (*idx)] += alpha1 * (*v);
      y[5 * (*idx) + 1] += alpha2 * (*v);
      y[5 * (*idx) + 2] += alpha3 * (*v);
      y[5 * (*idx) + 3] += alpha4 * (*v);
      y[5 * (*idx) + 4] += alpha5 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(10.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5;
  PetscInt           n, i, jrow, j;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[5 * idx[jrow]];
      sum2 += v[jrow] * x[5 * idx[jrow] + 1];
      sum3 += v[jrow] * x[5 * idx[jrow] + 2];
      sum4 += v[jrow] * x[5 * idx[jrow] + 3];
      sum5 += v[jrow] * x[5 * idx[jrow] + 4];
      jrow++;
    }
    y[5 * i] += sum1;
    y[5 * i + 1] += sum2;
    y[5 * i + 2] += sum3;
    y[5 * i + 3] += sum4;
    y[5 * i + 4] += sum5;
  }

  PetscCall(PetscLogFlops(10.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_5(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[5 * i];
    alpha2 = x[5 * i + 1];
    alpha3 = x[5 * i + 2];
    alpha4 = x[5 * i + 3];
    alpha5 = x[5 * i + 4];
    while (n-- > 0) {
      y[5 * (*idx)] += alpha1 * (*v);
      y[5 * (*idx) + 1] += alpha2 * (*v);
      y[5 * (*idx) + 2] += alpha3 * (*v);
      y[5 * (*idx) + 3] += alpha4 * (*v);
      y[5 * (*idx) + 4] += alpha5 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(10.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/* ------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_6(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6;
  PetscInt           nonzerorow = 0, n, i, jrow, j;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[6 * idx[jrow]];
      sum2 += v[jrow] * x[6 * idx[jrow] + 1];
      sum3 += v[jrow] * x[6 * idx[jrow] + 2];
      sum4 += v[jrow] * x[6 * idx[jrow] + 3];
      sum5 += v[jrow] * x[6 * idx[jrow] + 4];
      sum6 += v[jrow] * x[6 * idx[jrow] + 5];
      jrow++;
    }
    y[6 * i]     = sum1;
    y[6 * i + 1] = sum2;
    y[6 * i + 2] = sum3;
    y[6 * i + 3] = sum4;
    y[6 * i + 4] = sum5;
    y[6 * i + 5] = sum6;
  }

  PetscCall(PetscLogFlops(12.0 * a->nz - 6.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_6(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[6 * i];
    alpha2 = x[6 * i + 1];
    alpha3 = x[6 * i + 2];
    alpha4 = x[6 * i + 3];
    alpha5 = x[6 * i + 4];
    alpha6 = x[6 * i + 5];
    while (n-- > 0) {
      y[6 * (*idx)] += alpha1 * (*v);
      y[6 * (*idx) + 1] += alpha2 * (*v);
      y[6 * (*idx) + 2] += alpha3 * (*v);
      y[6 * (*idx) + 3] += alpha4 * (*v);
      y[6 * (*idx) + 4] += alpha5 * (*v);
      y[6 * (*idx) + 5] += alpha6 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(12.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6;
  PetscInt           n, i, jrow, j;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[6 * idx[jrow]];
      sum2 += v[jrow] * x[6 * idx[jrow] + 1];
      sum3 += v[jrow] * x[6 * idx[jrow] + 2];
      sum4 += v[jrow] * x[6 * idx[jrow] + 3];
      sum5 += v[jrow] * x[6 * idx[jrow] + 4];
      sum6 += v[jrow] * x[6 * idx[jrow] + 5];
      jrow++;
    }
    y[6 * i] += sum1;
    y[6 * i + 1] += sum2;
    y[6 * i + 2] += sum3;
    y[6 * i + 3] += sum4;
    y[6 * i + 4] += sum5;
    y[6 * i + 5] += sum6;
  }

  PetscCall(PetscLogFlops(12.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_6(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6;
  PetscInt           n, i;
  const PetscInt     m = b->AIJ->rmap->n, *idx;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[6 * i];
    alpha2 = x[6 * i + 1];
    alpha3 = x[6 * i + 2];
    alpha4 = x[6 * i + 3];
    alpha5 = x[6 * i + 4];
    alpha6 = x[6 * i + 5];
    while (n-- > 0) {
      y[6 * (*idx)] += alpha1 * (*v);
      y[6 * (*idx) + 1] += alpha2 * (*v);
      y[6 * (*idx) + 2] += alpha3 * (*v);
      y[6 * (*idx) + 3] += alpha4 * (*v);
      y[6 * (*idx) + 4] += alpha5 * (*v);
      y[6 * (*idx) + 5] += alpha6 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(12.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/* ------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_7(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           nonzerorow = 0, n, i, jrow, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;
    sum7 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[7 * idx[jrow]];
      sum2 += v[jrow] * x[7 * idx[jrow] + 1];
      sum3 += v[jrow] * x[7 * idx[jrow] + 2];
      sum4 += v[jrow] * x[7 * idx[jrow] + 3];
      sum5 += v[jrow] * x[7 * idx[jrow] + 4];
      sum6 += v[jrow] * x[7 * idx[jrow] + 5];
      sum7 += v[jrow] * x[7 * idx[jrow] + 6];
      jrow++;
    }
    y[7 * i]     = sum1;
    y[7 * i + 1] = sum2;
    y[7 * i + 2] = sum3;
    y[7 * i + 3] = sum4;
    y[7 * i + 4] = sum5;
    y[7 * i + 5] = sum6;
    y[7 * i + 6] = sum7;
  }

  PetscCall(PetscLogFlops(14.0 * a->nz - 7.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_7(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[7 * i];
    alpha2 = x[7 * i + 1];
    alpha3 = x[7 * i + 2];
    alpha4 = x[7 * i + 3];
    alpha5 = x[7 * i + 4];
    alpha6 = x[7 * i + 5];
    alpha7 = x[7 * i + 6];
    while (n-- > 0) {
      y[7 * (*idx)] += alpha1 * (*v);
      y[7 * (*idx) + 1] += alpha2 * (*v);
      y[7 * (*idx) + 2] += alpha3 * (*v);
      y[7 * (*idx) + 3] += alpha4 * (*v);
      y[7 * (*idx) + 4] += alpha5 * (*v);
      y[7 * (*idx) + 5] += alpha6 * (*v);
      y[7 * (*idx) + 6] += alpha7 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(14.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;
    sum7 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[7 * idx[jrow]];
      sum2 += v[jrow] * x[7 * idx[jrow] + 1];
      sum3 += v[jrow] * x[7 * idx[jrow] + 2];
      sum4 += v[jrow] * x[7 * idx[jrow] + 3];
      sum5 += v[jrow] * x[7 * idx[jrow] + 4];
      sum6 += v[jrow] * x[7 * idx[jrow] + 5];
      sum7 += v[jrow] * x[7 * idx[jrow] + 6];
      jrow++;
    }
    y[7 * i] += sum1;
    y[7 * i + 1] += sum2;
    y[7 * i + 2] += sum3;
    y[7 * i + 3] += sum4;
    y[7 * i + 4] += sum5;
    y[7 * i + 5] += sum6;
    y[7 * i + 6] += sum7;
  }

  PetscCall(PetscLogFlops(14.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_7(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[7 * i];
    alpha2 = x[7 * i + 1];
    alpha3 = x[7 * i + 2];
    alpha4 = x[7 * i + 3];
    alpha5 = x[7 * i + 4];
    alpha6 = x[7 * i + 5];
    alpha7 = x[7 * i + 6];
    while (n-- > 0) {
      y[7 * (*idx)] += alpha1 * (*v);
      y[7 * (*idx) + 1] += alpha2 * (*v);
      y[7 * (*idx) + 2] += alpha3 * (*v);
      y[7 * (*idx) + 3] += alpha4 * (*v);
      y[7 * (*idx) + 4] += alpha5 * (*v);
      y[7 * (*idx) + 5] += alpha6 * (*v);
      y[7 * (*idx) + 6] += alpha7 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(14.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_SeqMAIJ_8(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           nonzerorow = 0, n, i, jrow, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;
    sum7 = 0.0;
    sum8 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[8 * idx[jrow]];
      sum2 += v[jrow] * x[8 * idx[jrow] + 1];
      sum3 += v[jrow] * x[8 * idx[jrow] + 2];
      sum4 += v[jrow] * x[8 * idx[jrow] + 3];
      sum5 += v[jrow] * x[8 * idx[jrow] + 4];
      sum6 += v[jrow] * x[8 * idx[jrow] + 5];
      sum7 += v[jrow] * x[8 * idx[jrow] + 6];
      sum8 += v[jrow] * x[8 * idx[jrow] + 7];
      jrow++;
    }
    y[8 * i]     = sum1;
    y[8 * i + 1] = sum2;
    y[8 * i + 2] = sum3;
    y[8 * i + 3] = sum4;
    y[8 * i + 4] = sum5;
    y[8 * i + 5] = sum6;
    y[8 * i + 6] = sum7;
    y[8 * i + 7] = sum8;
  }

  PetscCall(PetscLogFlops(16.0 * a->nz - 8.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_8(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[8 * i];
    alpha2 = x[8 * i + 1];
    alpha3 = x[8 * i + 2];
    alpha4 = x[8 * i + 3];
    alpha5 = x[8 * i + 4];
    alpha6 = x[8 * i + 5];
    alpha7 = x[8 * i + 6];
    alpha8 = x[8 * i + 7];
    while (n-- > 0) {
      y[8 * (*idx)] += alpha1 * (*v);
      y[8 * (*idx) + 1] += alpha2 * (*v);
      y[8 * (*idx) + 2] += alpha3 * (*v);
      y[8 * (*idx) + 3] += alpha4 * (*v);
      y[8 * (*idx) + 4] += alpha5 * (*v);
      y[8 * (*idx) + 5] += alpha6 * (*v);
      y[8 * (*idx) + 6] += alpha7 * (*v);
      y[8 * (*idx) + 7] += alpha8 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(16.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_8(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;
    sum7 = 0.0;
    sum8 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[8 * idx[jrow]];
      sum2 += v[jrow] * x[8 * idx[jrow] + 1];
      sum3 += v[jrow] * x[8 * idx[jrow] + 2];
      sum4 += v[jrow] * x[8 * idx[jrow] + 3];
      sum5 += v[jrow] * x[8 * idx[jrow] + 4];
      sum6 += v[jrow] * x[8 * idx[jrow] + 5];
      sum7 += v[jrow] * x[8 * idx[jrow] + 6];
      sum8 += v[jrow] * x[8 * idx[jrow] + 7];
      jrow++;
    }
    y[8 * i] += sum1;
    y[8 * i + 1] += sum2;
    y[8 * i + 2] += sum3;
    y[8 * i + 3] += sum4;
    y[8 * i + 4] += sum5;
    y[8 * i + 5] += sum6;
    y[8 * i + 6] += sum7;
    y[8 * i + 7] += sum8;
  }

  PetscCall(PetscLogFlops(16.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_8(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[8 * i];
    alpha2 = x[8 * i + 1];
    alpha3 = x[8 * i + 2];
    alpha4 = x[8 * i + 3];
    alpha5 = x[8 * i + 4];
    alpha6 = x[8 * i + 5];
    alpha7 = x[8 * i + 6];
    alpha8 = x[8 * i + 7];
    while (n-- > 0) {
      y[8 * (*idx)] += alpha1 * (*v);
      y[8 * (*idx) + 1] += alpha2 * (*v);
      y[8 * (*idx) + 2] += alpha3 * (*v);
      y[8 * (*idx) + 3] += alpha4 * (*v);
      y[8 * (*idx) + 4] += alpha5 * (*v);
      y[8 * (*idx) + 5] += alpha6 * (*v);
      y[8 * (*idx) + 6] += alpha7 * (*v);
      y[8 * (*idx) + 7] += alpha8 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(16.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/* ------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_9(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           nonzerorow = 0, n, i, jrow, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;
    sum7 = 0.0;
    sum8 = 0.0;
    sum9 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[9 * idx[jrow]];
      sum2 += v[jrow] * x[9 * idx[jrow] + 1];
      sum3 += v[jrow] * x[9 * idx[jrow] + 2];
      sum4 += v[jrow] * x[9 * idx[jrow] + 3];
      sum5 += v[jrow] * x[9 * idx[jrow] + 4];
      sum6 += v[jrow] * x[9 * idx[jrow] + 5];
      sum7 += v[jrow] * x[9 * idx[jrow] + 6];
      sum8 += v[jrow] * x[9 * idx[jrow] + 7];
      sum9 += v[jrow] * x[9 * idx[jrow] + 8];
      jrow++;
    }
    y[9 * i]     = sum1;
    y[9 * i + 1] = sum2;
    y[9 * i + 2] = sum3;
    y[9 * i + 3] = sum4;
    y[9 * i + 4] = sum5;
    y[9 * i + 5] = sum6;
    y[9 * i + 6] = sum7;
    y[9 * i + 7] = sum8;
    y[9 * i + 8] = sum9;
  }

  PetscCall(PetscLogFlops(18.0 * a->nz - 9 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

/* ------------------------------------------------------------------------------*/

PetscErrorCode MatMultTranspose_SeqMAIJ_9(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8, alpha9;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[9 * i];
    alpha2 = x[9 * i + 1];
    alpha3 = x[9 * i + 2];
    alpha4 = x[9 * i + 3];
    alpha5 = x[9 * i + 4];
    alpha6 = x[9 * i + 5];
    alpha7 = x[9 * i + 6];
    alpha8 = x[9 * i + 7];
    alpha9 = x[9 * i + 8];
    while (n-- > 0) {
      y[9 * (*idx)] += alpha1 * (*v);
      y[9 * (*idx) + 1] += alpha2 * (*v);
      y[9 * (*idx) + 2] += alpha3 * (*v);
      y[9 * (*idx) + 3] += alpha4 * (*v);
      y[9 * (*idx) + 4] += alpha5 * (*v);
      y[9 * (*idx) + 5] += alpha6 * (*v);
      y[9 * (*idx) + 6] += alpha7 * (*v);
      y[9 * (*idx) + 7] += alpha8 * (*v);
      y[9 * (*idx) + 8] += alpha9 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(18.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_9(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sum1 = 0.0;
    sum2 = 0.0;
    sum3 = 0.0;
    sum4 = 0.0;
    sum5 = 0.0;
    sum6 = 0.0;
    sum7 = 0.0;
    sum8 = 0.0;
    sum9 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[9 * idx[jrow]];
      sum2 += v[jrow] * x[9 * idx[jrow] + 1];
      sum3 += v[jrow] * x[9 * idx[jrow] + 2];
      sum4 += v[jrow] * x[9 * idx[jrow] + 3];
      sum5 += v[jrow] * x[9 * idx[jrow] + 4];
      sum6 += v[jrow] * x[9 * idx[jrow] + 5];
      sum7 += v[jrow] * x[9 * idx[jrow] + 6];
      sum8 += v[jrow] * x[9 * idx[jrow] + 7];
      sum9 += v[jrow] * x[9 * idx[jrow] + 8];
      jrow++;
    }
    y[9 * i] += sum1;
    y[9 * i + 1] += sum2;
    y[9 * i + 2] += sum3;
    y[9 * i + 3] += sum4;
    y[9 * i + 4] += sum5;
    y[9 * i + 5] += sum6;
    y[9 * i + 6] += sum7;
    y[9 * i + 7] += sum8;
    y[9 * i + 8] += sum9;
  }

  PetscCall(PetscLogFlops(18.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_9(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8, alpha9;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx    = a->j + a->i[i];
    v      = a->a + a->i[i];
    n      = a->i[i + 1] - a->i[i];
    alpha1 = x[9 * i];
    alpha2 = x[9 * i + 1];
    alpha3 = x[9 * i + 2];
    alpha4 = x[9 * i + 3];
    alpha5 = x[9 * i + 4];
    alpha6 = x[9 * i + 5];
    alpha7 = x[9 * i + 6];
    alpha8 = x[9 * i + 7];
    alpha9 = x[9 * i + 8];
    while (n-- > 0) {
      y[9 * (*idx)] += alpha1 * (*v);
      y[9 * (*idx) + 1] += alpha2 * (*v);
      y[9 * (*idx) + 2] += alpha3 * (*v);
      y[9 * (*idx) + 3] += alpha4 * (*v);
      y[9 * (*idx) + 4] += alpha5 * (*v);
      y[9 * (*idx) + 5] += alpha6 * (*v);
      y[9 * (*idx) + 6] += alpha7 * (*v);
      y[9 * (*idx) + 7] += alpha8 * (*v);
      y[9 * (*idx) + 8] += alpha9 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(18.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}
PetscErrorCode MatMult_SeqMAIJ_10(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           nonzerorow = 0, n, i, jrow, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[10 * idx[jrow]];
      sum2 += v[jrow] * x[10 * idx[jrow] + 1];
      sum3 += v[jrow] * x[10 * idx[jrow] + 2];
      sum4 += v[jrow] * x[10 * idx[jrow] + 3];
      sum5 += v[jrow] * x[10 * idx[jrow] + 4];
      sum6 += v[jrow] * x[10 * idx[jrow] + 5];
      sum7 += v[jrow] * x[10 * idx[jrow] + 6];
      sum8 += v[jrow] * x[10 * idx[jrow] + 7];
      sum9 += v[jrow] * x[10 * idx[jrow] + 8];
      sum10 += v[jrow] * x[10 * idx[jrow] + 9];
      jrow++;
    }
    y[10 * i]     = sum1;
    y[10 * i + 1] = sum2;
    y[10 * i + 2] = sum3;
    y[10 * i + 3] = sum4;
    y[10 * i + 4] = sum5;
    y[10 * i + 5] = sum6;
    y[10 * i + 6] = sum7;
    y[10 * i + 7] = sum8;
    y[10 * i + 8] = sum9;
    y[10 * i + 9] = sum10;
  }

  PetscCall(PetscLogFlops(20.0 * a->nz - 10.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_10(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[10 * idx[jrow]];
      sum2 += v[jrow] * x[10 * idx[jrow] + 1];
      sum3 += v[jrow] * x[10 * idx[jrow] + 2];
      sum4 += v[jrow] * x[10 * idx[jrow] + 3];
      sum5 += v[jrow] * x[10 * idx[jrow] + 4];
      sum6 += v[jrow] * x[10 * idx[jrow] + 5];
      sum7 += v[jrow] * x[10 * idx[jrow] + 6];
      sum8 += v[jrow] * x[10 * idx[jrow] + 7];
      sum9 += v[jrow] * x[10 * idx[jrow] + 8];
      sum10 += v[jrow] * x[10 * idx[jrow] + 9];
      jrow++;
    }
    y[10 * i] += sum1;
    y[10 * i + 1] += sum2;
    y[10 * i + 2] += sum3;
    y[10 * i + 3] += sum4;
    y[10 * i + 4] += sum5;
    y[10 * i + 5] += sum6;
    y[10 * i + 6] += sum7;
    y[10 * i + 7] += sum8;
    y[10 * i + 8] += sum9;
    y[10 * i + 9] += sum10;
  }

  PetscCall(PetscLogFlops(20.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_10(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8, alpha9, alpha10;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[10 * i];
    alpha2  = x[10 * i + 1];
    alpha3  = x[10 * i + 2];
    alpha4  = x[10 * i + 3];
    alpha5  = x[10 * i + 4];
    alpha6  = x[10 * i + 5];
    alpha7  = x[10 * i + 6];
    alpha8  = x[10 * i + 7];
    alpha9  = x[10 * i + 8];
    alpha10 = x[10 * i + 9];
    while (n-- > 0) {
      y[10 * (*idx)] += alpha1 * (*v);
      y[10 * (*idx) + 1] += alpha2 * (*v);
      y[10 * (*idx) + 2] += alpha3 * (*v);
      y[10 * (*idx) + 3] += alpha4 * (*v);
      y[10 * (*idx) + 4] += alpha5 * (*v);
      y[10 * (*idx) + 5] += alpha6 * (*v);
      y[10 * (*idx) + 6] += alpha7 * (*v);
      y[10 * (*idx) + 7] += alpha8 * (*v);
      y[10 * (*idx) + 8] += alpha9 * (*v);
      y[10 * (*idx) + 9] += alpha10 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(20.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_10(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8, alpha9, alpha10;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[10 * i];
    alpha2  = x[10 * i + 1];
    alpha3  = x[10 * i + 2];
    alpha4  = x[10 * i + 3];
    alpha5  = x[10 * i + 4];
    alpha6  = x[10 * i + 5];
    alpha7  = x[10 * i + 6];
    alpha8  = x[10 * i + 7];
    alpha9  = x[10 * i + 8];
    alpha10 = x[10 * i + 9];
    while (n-- > 0) {
      y[10 * (*idx)] += alpha1 * (*v);
      y[10 * (*idx) + 1] += alpha2 * (*v);
      y[10 * (*idx) + 2] += alpha3 * (*v);
      y[10 * (*idx) + 3] += alpha4 * (*v);
      y[10 * (*idx) + 4] += alpha5 * (*v);
      y[10 * (*idx) + 5] += alpha6 * (*v);
      y[10 * (*idx) + 6] += alpha7 * (*v);
      y[10 * (*idx) + 7] += alpha8 * (*v);
      y[10 * (*idx) + 8] += alpha9 * (*v);
      y[10 * (*idx) + 9] += alpha10 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(20.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/*--------------------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_11(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           nonzerorow = 0, n, i, jrow, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    sum11 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[11 * idx[jrow]];
      sum2 += v[jrow] * x[11 * idx[jrow] + 1];
      sum3 += v[jrow] * x[11 * idx[jrow] + 2];
      sum4 += v[jrow] * x[11 * idx[jrow] + 3];
      sum5 += v[jrow] * x[11 * idx[jrow] + 4];
      sum6 += v[jrow] * x[11 * idx[jrow] + 5];
      sum7 += v[jrow] * x[11 * idx[jrow] + 6];
      sum8 += v[jrow] * x[11 * idx[jrow] + 7];
      sum9 += v[jrow] * x[11 * idx[jrow] + 8];
      sum10 += v[jrow] * x[11 * idx[jrow] + 9];
      sum11 += v[jrow] * x[11 * idx[jrow] + 10];
      jrow++;
    }
    y[11 * i]      = sum1;
    y[11 * i + 1]  = sum2;
    y[11 * i + 2]  = sum3;
    y[11 * i + 3]  = sum4;
    y[11 * i + 4]  = sum5;
    y[11 * i + 5]  = sum6;
    y[11 * i + 6]  = sum7;
    y[11 * i + 7]  = sum8;
    y[11 * i + 8]  = sum9;
    y[11 * i + 9]  = sum10;
    y[11 * i + 10] = sum11;
  }

  PetscCall(PetscLogFlops(22.0 * a->nz - 11 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    sum11 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[11 * idx[jrow]];
      sum2 += v[jrow] * x[11 * idx[jrow] + 1];
      sum3 += v[jrow] * x[11 * idx[jrow] + 2];
      sum4 += v[jrow] * x[11 * idx[jrow] + 3];
      sum5 += v[jrow] * x[11 * idx[jrow] + 4];
      sum6 += v[jrow] * x[11 * idx[jrow] + 5];
      sum7 += v[jrow] * x[11 * idx[jrow] + 6];
      sum8 += v[jrow] * x[11 * idx[jrow] + 7];
      sum9 += v[jrow] * x[11 * idx[jrow] + 8];
      sum10 += v[jrow] * x[11 * idx[jrow] + 9];
      sum11 += v[jrow] * x[11 * idx[jrow] + 10];
      jrow++;
    }
    y[11 * i] += sum1;
    y[11 * i + 1] += sum2;
    y[11 * i + 2] += sum3;
    y[11 * i + 3] += sum4;
    y[11 * i + 4] += sum5;
    y[11 * i + 5] += sum6;
    y[11 * i + 6] += sum7;
    y[11 * i + 7] += sum8;
    y[11 * i + 8] += sum9;
    y[11 * i + 9] += sum10;
    y[11 * i + 10] += sum11;
  }

  PetscCall(PetscLogFlops(22.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_11(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8, alpha9, alpha10, alpha11;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[11 * i];
    alpha2  = x[11 * i + 1];
    alpha3  = x[11 * i + 2];
    alpha4  = x[11 * i + 3];
    alpha5  = x[11 * i + 4];
    alpha6  = x[11 * i + 5];
    alpha7  = x[11 * i + 6];
    alpha8  = x[11 * i + 7];
    alpha9  = x[11 * i + 8];
    alpha10 = x[11 * i + 9];
    alpha11 = x[11 * i + 10];
    while (n-- > 0) {
      y[11 * (*idx)] += alpha1 * (*v);
      y[11 * (*idx) + 1] += alpha2 * (*v);
      y[11 * (*idx) + 2] += alpha3 * (*v);
      y[11 * (*idx) + 3] += alpha4 * (*v);
      y[11 * (*idx) + 4] += alpha5 * (*v);
      y[11 * (*idx) + 5] += alpha6 * (*v);
      y[11 * (*idx) + 6] += alpha7 * (*v);
      y[11 * (*idx) + 7] += alpha8 * (*v);
      y[11 * (*idx) + 8] += alpha9 * (*v);
      y[11 * (*idx) + 9] += alpha10 * (*v);
      y[11 * (*idx) + 10] += alpha11 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(22.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_11(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8, alpha9, alpha10, alpha11;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[11 * i];
    alpha2  = x[11 * i + 1];
    alpha3  = x[11 * i + 2];
    alpha4  = x[11 * i + 3];
    alpha5  = x[11 * i + 4];
    alpha6  = x[11 * i + 5];
    alpha7  = x[11 * i + 6];
    alpha8  = x[11 * i + 7];
    alpha9  = x[11 * i + 8];
    alpha10 = x[11 * i + 9];
    alpha11 = x[11 * i + 10];
    while (n-- > 0) {
      y[11 * (*idx)] += alpha1 * (*v);
      y[11 * (*idx) + 1] += alpha2 * (*v);
      y[11 * (*idx) + 2] += alpha3 * (*v);
      y[11 * (*idx) + 3] += alpha4 * (*v);
      y[11 * (*idx) + 4] += alpha5 * (*v);
      y[11 * (*idx) + 5] += alpha6 * (*v);
      y[11 * (*idx) + 6] += alpha7 * (*v);
      y[11 * (*idx) + 7] += alpha8 * (*v);
      y[11 * (*idx) + 8] += alpha9 * (*v);
      y[11 * (*idx) + 9] += alpha10 * (*v);
      y[11 * (*idx) + 10] += alpha11 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(22.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/*--------------------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_16(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8;
  PetscScalar        sum9, sum10, sum11, sum12, sum13, sum14, sum15, sum16;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           nonzerorow = 0, n, i, jrow, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    sum11 = 0.0;
    sum12 = 0.0;
    sum13 = 0.0;
    sum14 = 0.0;
    sum15 = 0.0;
    sum16 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[16 * idx[jrow]];
      sum2 += v[jrow] * x[16 * idx[jrow] + 1];
      sum3 += v[jrow] * x[16 * idx[jrow] + 2];
      sum4 += v[jrow] * x[16 * idx[jrow] + 3];
      sum5 += v[jrow] * x[16 * idx[jrow] + 4];
      sum6 += v[jrow] * x[16 * idx[jrow] + 5];
      sum7 += v[jrow] * x[16 * idx[jrow] + 6];
      sum8 += v[jrow] * x[16 * idx[jrow] + 7];
      sum9 += v[jrow] * x[16 * idx[jrow] + 8];
      sum10 += v[jrow] * x[16 * idx[jrow] + 9];
      sum11 += v[jrow] * x[16 * idx[jrow] + 10];
      sum12 += v[jrow] * x[16 * idx[jrow] + 11];
      sum13 += v[jrow] * x[16 * idx[jrow] + 12];
      sum14 += v[jrow] * x[16 * idx[jrow] + 13];
      sum15 += v[jrow] * x[16 * idx[jrow] + 14];
      sum16 += v[jrow] * x[16 * idx[jrow] + 15];
      jrow++;
    }
    y[16 * i]      = sum1;
    y[16 * i + 1]  = sum2;
    y[16 * i + 2]  = sum3;
    y[16 * i + 3]  = sum4;
    y[16 * i + 4]  = sum5;
    y[16 * i + 5]  = sum6;
    y[16 * i + 6]  = sum7;
    y[16 * i + 7]  = sum8;
    y[16 * i + 8]  = sum9;
    y[16 * i + 9]  = sum10;
    y[16 * i + 10] = sum11;
    y[16 * i + 11] = sum12;
    y[16 * i + 12] = sum13;
    y[16 * i + 13] = sum14;
    y[16 * i + 14] = sum15;
    y[16 * i + 15] = sum16;
  }

  PetscCall(PetscLogFlops(32.0 * a->nz - 16.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_16(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8;
  PetscScalar        alpha9, alpha10, alpha11, alpha12, alpha13, alpha14, alpha15, alpha16;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[16 * i];
    alpha2  = x[16 * i + 1];
    alpha3  = x[16 * i + 2];
    alpha4  = x[16 * i + 3];
    alpha5  = x[16 * i + 4];
    alpha6  = x[16 * i + 5];
    alpha7  = x[16 * i + 6];
    alpha8  = x[16 * i + 7];
    alpha9  = x[16 * i + 8];
    alpha10 = x[16 * i + 9];
    alpha11 = x[16 * i + 10];
    alpha12 = x[16 * i + 11];
    alpha13 = x[16 * i + 12];
    alpha14 = x[16 * i + 13];
    alpha15 = x[16 * i + 14];
    alpha16 = x[16 * i + 15];
    while (n-- > 0) {
      y[16 * (*idx)] += alpha1 * (*v);
      y[16 * (*idx) + 1] += alpha2 * (*v);
      y[16 * (*idx) + 2] += alpha3 * (*v);
      y[16 * (*idx) + 3] += alpha4 * (*v);
      y[16 * (*idx) + 4] += alpha5 * (*v);
      y[16 * (*idx) + 5] += alpha6 * (*v);
      y[16 * (*idx) + 6] += alpha7 * (*v);
      y[16 * (*idx) + 7] += alpha8 * (*v);
      y[16 * (*idx) + 8] += alpha9 * (*v);
      y[16 * (*idx) + 9] += alpha10 * (*v);
      y[16 * (*idx) + 10] += alpha11 * (*v);
      y[16 * (*idx) + 11] += alpha12 * (*v);
      y[16 * (*idx) + 12] += alpha13 * (*v);
      y[16 * (*idx) + 13] += alpha14 * (*v);
      y[16 * (*idx) + 14] += alpha15 * (*v);
      y[16 * (*idx) + 15] += alpha16 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(32.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_16(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8;
  PetscScalar        sum9, sum10, sum11, sum12, sum13, sum14, sum15, sum16;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    sum11 = 0.0;
    sum12 = 0.0;
    sum13 = 0.0;
    sum14 = 0.0;
    sum15 = 0.0;
    sum16 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[16 * idx[jrow]];
      sum2 += v[jrow] * x[16 * idx[jrow] + 1];
      sum3 += v[jrow] * x[16 * idx[jrow] + 2];
      sum4 += v[jrow] * x[16 * idx[jrow] + 3];
      sum5 += v[jrow] * x[16 * idx[jrow] + 4];
      sum6 += v[jrow] * x[16 * idx[jrow] + 5];
      sum7 += v[jrow] * x[16 * idx[jrow] + 6];
      sum8 += v[jrow] * x[16 * idx[jrow] + 7];
      sum9 += v[jrow] * x[16 * idx[jrow] + 8];
      sum10 += v[jrow] * x[16 * idx[jrow] + 9];
      sum11 += v[jrow] * x[16 * idx[jrow] + 10];
      sum12 += v[jrow] * x[16 * idx[jrow] + 11];
      sum13 += v[jrow] * x[16 * idx[jrow] + 12];
      sum14 += v[jrow] * x[16 * idx[jrow] + 13];
      sum15 += v[jrow] * x[16 * idx[jrow] + 14];
      sum16 += v[jrow] * x[16 * idx[jrow] + 15];
      jrow++;
    }
    y[16 * i] += sum1;
    y[16 * i + 1] += sum2;
    y[16 * i + 2] += sum3;
    y[16 * i + 3] += sum4;
    y[16 * i + 4] += sum5;
    y[16 * i + 5] += sum6;
    y[16 * i + 6] += sum7;
    y[16 * i + 7] += sum8;
    y[16 * i + 8] += sum9;
    y[16 * i + 9] += sum10;
    y[16 * i + 10] += sum11;
    y[16 * i + 11] += sum12;
    y[16 * i + 12] += sum13;
    y[16 * i + 13] += sum14;
    y[16 * i + 14] += sum15;
    y[16 * i + 15] += sum16;
  }

  PetscCall(PetscLogFlops(32.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_16(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8;
  PetscScalar        alpha9, alpha10, alpha11, alpha12, alpha13, alpha14, alpha15, alpha16;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[16 * i];
    alpha2  = x[16 * i + 1];
    alpha3  = x[16 * i + 2];
    alpha4  = x[16 * i + 3];
    alpha5  = x[16 * i + 4];
    alpha6  = x[16 * i + 5];
    alpha7  = x[16 * i + 6];
    alpha8  = x[16 * i + 7];
    alpha9  = x[16 * i + 8];
    alpha10 = x[16 * i + 9];
    alpha11 = x[16 * i + 10];
    alpha12 = x[16 * i + 11];
    alpha13 = x[16 * i + 12];
    alpha14 = x[16 * i + 13];
    alpha15 = x[16 * i + 14];
    alpha16 = x[16 * i + 15];
    while (n-- > 0) {
      y[16 * (*idx)] += alpha1 * (*v);
      y[16 * (*idx) + 1] += alpha2 * (*v);
      y[16 * (*idx) + 2] += alpha3 * (*v);
      y[16 * (*idx) + 3] += alpha4 * (*v);
      y[16 * (*idx) + 4] += alpha5 * (*v);
      y[16 * (*idx) + 5] += alpha6 * (*v);
      y[16 * (*idx) + 6] += alpha7 * (*v);
      y[16 * (*idx) + 7] += alpha8 * (*v);
      y[16 * (*idx) + 8] += alpha9 * (*v);
      y[16 * (*idx) + 9] += alpha10 * (*v);
      y[16 * (*idx) + 10] += alpha11 * (*v);
      y[16 * (*idx) + 11] += alpha12 * (*v);
      y[16 * (*idx) + 12] += alpha13 * (*v);
      y[16 * (*idx) + 13] += alpha14 * (*v);
      y[16 * (*idx) + 14] += alpha15 * (*v);
      y[16 * (*idx) + 15] += alpha16 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(32.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/*--------------------------------------------------------------------------------------------*/
PetscErrorCode MatMult_SeqMAIJ_18(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8;
  PetscScalar        sum9, sum10, sum11, sum12, sum13, sum14, sum15, sum16, sum17, sum18;
  const PetscInt     m          = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           nonzerorow = 0, n, i, jrow, j;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    sum11 = 0.0;
    sum12 = 0.0;
    sum13 = 0.0;
    sum14 = 0.0;
    sum15 = 0.0;
    sum16 = 0.0;
    sum17 = 0.0;
    sum18 = 0.0;

    nonzerorow += (n > 0);
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[18 * idx[jrow]];
      sum2 += v[jrow] * x[18 * idx[jrow] + 1];
      sum3 += v[jrow] * x[18 * idx[jrow] + 2];
      sum4 += v[jrow] * x[18 * idx[jrow] + 3];
      sum5 += v[jrow] * x[18 * idx[jrow] + 4];
      sum6 += v[jrow] * x[18 * idx[jrow] + 5];
      sum7 += v[jrow] * x[18 * idx[jrow] + 6];
      sum8 += v[jrow] * x[18 * idx[jrow] + 7];
      sum9 += v[jrow] * x[18 * idx[jrow] + 8];
      sum10 += v[jrow] * x[18 * idx[jrow] + 9];
      sum11 += v[jrow] * x[18 * idx[jrow] + 10];
      sum12 += v[jrow] * x[18 * idx[jrow] + 11];
      sum13 += v[jrow] * x[18 * idx[jrow] + 12];
      sum14 += v[jrow] * x[18 * idx[jrow] + 13];
      sum15 += v[jrow] * x[18 * idx[jrow] + 14];
      sum16 += v[jrow] * x[18 * idx[jrow] + 15];
      sum17 += v[jrow] * x[18 * idx[jrow] + 16];
      sum18 += v[jrow] * x[18 * idx[jrow] + 17];
      jrow++;
    }
    y[18 * i]      = sum1;
    y[18 * i + 1]  = sum2;
    y[18 * i + 2]  = sum3;
    y[18 * i + 3]  = sum4;
    y[18 * i + 4]  = sum5;
    y[18 * i + 5]  = sum6;
    y[18 * i + 6]  = sum7;
    y[18 * i + 7]  = sum8;
    y[18 * i + 8]  = sum9;
    y[18 * i + 9]  = sum10;
    y[18 * i + 10] = sum11;
    y[18 * i + 11] = sum12;
    y[18 * i + 12] = sum13;
    y[18 * i + 13] = sum14;
    y[18 * i + 14] = sum15;
    y[18 * i + 15] = sum16;
    y[18 * i + 16] = sum17;
    y[18 * i + 17] = sum18;
  }

  PetscCall(PetscLogFlops(36.0 * a->nz - 18.0 * nonzerorow));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_18(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8;
  PetscScalar        alpha9, alpha10, alpha11, alpha12, alpha13, alpha14, alpha15, alpha16, alpha17, alpha18;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));

  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[18 * i];
    alpha2  = x[18 * i + 1];
    alpha3  = x[18 * i + 2];
    alpha4  = x[18 * i + 3];
    alpha5  = x[18 * i + 4];
    alpha6  = x[18 * i + 5];
    alpha7  = x[18 * i + 6];
    alpha8  = x[18 * i + 7];
    alpha9  = x[18 * i + 8];
    alpha10 = x[18 * i + 9];
    alpha11 = x[18 * i + 10];
    alpha12 = x[18 * i + 11];
    alpha13 = x[18 * i + 12];
    alpha14 = x[18 * i + 13];
    alpha15 = x[18 * i + 14];
    alpha16 = x[18 * i + 15];
    alpha17 = x[18 * i + 16];
    alpha18 = x[18 * i + 17];
    while (n-- > 0) {
      y[18 * (*idx)] += alpha1 * (*v);
      y[18 * (*idx) + 1] += alpha2 * (*v);
      y[18 * (*idx) + 2] += alpha3 * (*v);
      y[18 * (*idx) + 3] += alpha4 * (*v);
      y[18 * (*idx) + 4] += alpha5 * (*v);
      y[18 * (*idx) + 5] += alpha6 * (*v);
      y[18 * (*idx) + 6] += alpha7 * (*v);
      y[18 * (*idx) + 7] += alpha8 * (*v);
      y[18 * (*idx) + 8] += alpha9 * (*v);
      y[18 * (*idx) + 9] += alpha10 * (*v);
      y[18 * (*idx) + 10] += alpha11 * (*v);
      y[18 * (*idx) + 11] += alpha12 * (*v);
      y[18 * (*idx) + 12] += alpha13 * (*v);
      y[18 * (*idx) + 13] += alpha14 * (*v);
      y[18 * (*idx) + 14] += alpha15 * (*v);
      y[18 * (*idx) + 15] += alpha16 * (*v);
      y[18 * (*idx) + 16] += alpha17 * (*v);
      y[18 * (*idx) + 17] += alpha18 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(36.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_18(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8;
  PetscScalar        sum9, sum10, sum11, sum12, sum13, sum14, sum15, sum16, sum17, sum18;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow  = ii[i];
    n     = ii[i + 1] - jrow;
    sum1  = 0.0;
    sum2  = 0.0;
    sum3  = 0.0;
    sum4  = 0.0;
    sum5  = 0.0;
    sum6  = 0.0;
    sum7  = 0.0;
    sum8  = 0.0;
    sum9  = 0.0;
    sum10 = 0.0;
    sum11 = 0.0;
    sum12 = 0.0;
    sum13 = 0.0;
    sum14 = 0.0;
    sum15 = 0.0;
    sum16 = 0.0;
    sum17 = 0.0;
    sum18 = 0.0;
    for (j = 0; j < n; j++) {
      sum1 += v[jrow] * x[18 * idx[jrow]];
      sum2 += v[jrow] * x[18 * idx[jrow] + 1];
      sum3 += v[jrow] * x[18 * idx[jrow] + 2];
      sum4 += v[jrow] * x[18 * idx[jrow] + 3];
      sum5 += v[jrow] * x[18 * idx[jrow] + 4];
      sum6 += v[jrow] * x[18 * idx[jrow] + 5];
      sum7 += v[jrow] * x[18 * idx[jrow] + 6];
      sum8 += v[jrow] * x[18 * idx[jrow] + 7];
      sum9 += v[jrow] * x[18 * idx[jrow] + 8];
      sum10 += v[jrow] * x[18 * idx[jrow] + 9];
      sum11 += v[jrow] * x[18 * idx[jrow] + 10];
      sum12 += v[jrow] * x[18 * idx[jrow] + 11];
      sum13 += v[jrow] * x[18 * idx[jrow] + 12];
      sum14 += v[jrow] * x[18 * idx[jrow] + 13];
      sum15 += v[jrow] * x[18 * idx[jrow] + 14];
      sum16 += v[jrow] * x[18 * idx[jrow] + 15];
      sum17 += v[jrow] * x[18 * idx[jrow] + 16];
      sum18 += v[jrow] * x[18 * idx[jrow] + 17];
      jrow++;
    }
    y[18 * i] += sum1;
    y[18 * i + 1] += sum2;
    y[18 * i + 2] += sum3;
    y[18 * i + 3] += sum4;
    y[18 * i + 4] += sum5;
    y[18 * i + 5] += sum6;
    y[18 * i + 6] += sum7;
    y[18 * i + 7] += sum8;
    y[18 * i + 8] += sum9;
    y[18 * i + 9] += sum10;
    y[18 * i + 10] += sum11;
    y[18 * i + 11] += sum12;
    y[18 * i + 12] += sum13;
    y[18 * i + 13] += sum14;
    y[18 * i + 14] += sum15;
    y[18 * i + 15] += sum16;
    y[18 * i + 16] += sum17;
    y[18 * i + 17] += sum18;
  }

  PetscCall(PetscLogFlops(36.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_18(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8;
  PetscScalar        alpha9, alpha10, alpha11, alpha12, alpha13, alpha14, alpha15, alpha16, alpha17, alpha18;
  const PetscInt     m = b->AIJ->rmap->n, *idx;
  PetscInt           n, i;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx     = a->j + a->i[i];
    v       = a->a + a->i[i];
    n       = a->i[i + 1] - a->i[i];
    alpha1  = x[18 * i];
    alpha2  = x[18 * i + 1];
    alpha3  = x[18 * i + 2];
    alpha4  = x[18 * i + 3];
    alpha5  = x[18 * i + 4];
    alpha6  = x[18 * i + 5];
    alpha7  = x[18 * i + 6];
    alpha8  = x[18 * i + 7];
    alpha9  = x[18 * i + 8];
    alpha10 = x[18 * i + 9];
    alpha11 = x[18 * i + 10];
    alpha12 = x[18 * i + 11];
    alpha13 = x[18 * i + 12];
    alpha14 = x[18 * i + 13];
    alpha15 = x[18 * i + 14];
    alpha16 = x[18 * i + 15];
    alpha17 = x[18 * i + 16];
    alpha18 = x[18 * i + 17];
    while (n-- > 0) {
      y[18 * (*idx)] += alpha1 * (*v);
      y[18 * (*idx) + 1] += alpha2 * (*v);
      y[18 * (*idx) + 2] += alpha3 * (*v);
      y[18 * (*idx) + 3] += alpha4 * (*v);
      y[18 * (*idx) + 4] += alpha5 * (*v);
      y[18 * (*idx) + 5] += alpha6 * (*v);
      y[18 * (*idx) + 6] += alpha7 * (*v);
      y[18 * (*idx) + 7] += alpha8 * (*v);
      y[18 * (*idx) + 8] += alpha9 * (*v);
      y[18 * (*idx) + 9] += alpha10 * (*v);
      y[18 * (*idx) + 10] += alpha11 * (*v);
      y[18 * (*idx) + 11] += alpha12 * (*v);
      y[18 * (*idx) + 12] += alpha13 * (*v);
      y[18 * (*idx) + 13] += alpha14 * (*v);
      y[18 * (*idx) + 14] += alpha15 * (*v);
      y[18 * (*idx) + 15] += alpha16 * (*v);
      y[18 * (*idx) + 16] += alpha17 * (*v);
      y[18 * (*idx) + 17] += alpha18 * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(36.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_SeqMAIJ_N(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, *sums;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j, dof = b->dof, k;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArray(yy, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sums = y + dof * i;
    for (j = 0; j < n; j++) {
      for (k = 0; k < dof; k++) sums[k] += v[jrow] * x[dof * idx[jrow] + k];
      jrow++;
    }
  }

  PetscCall(PetscLogFlops(2.0 * dof * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_SeqMAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v;
  PetscScalar       *y, *sums;
  const PetscInt     m = b->AIJ->rmap->n, *idx, *ii;
  PetscInt           n, i, jrow, j, dof = b->dof, k;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  idx = a->j;
  v   = a->a;
  ii  = a->i;

  for (i = 0; i < m; i++) {
    jrow = ii[i];
    n    = ii[i + 1] - jrow;
    sums = y + dof * i;
    for (j = 0; j < n; j++) {
      for (k = 0; k < dof; k++) sums[k] += v[jrow] * x[dof * idx[jrow] + k];
      jrow++;
    }
  }

  PetscCall(PetscLogFlops(2.0 * dof * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_SeqMAIJ_N(Mat A, Vec xx, Vec yy) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v, *alpha;
  PetscScalar       *y;
  const PetscInt     m = b->AIJ->rmap->n, *idx, dof = b->dof;
  PetscInt           n, i, k;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecSet(yy, 0.0));
  PetscCall(VecGetArray(yy, &y));
  for (i = 0; i < m; i++) {
    idx   = a->j + a->i[i];
    v     = a->a + a->i[i];
    n     = a->i[i + 1] - a->i[i];
    alpha = x + dof * i;
    while (n-- > 0) {
      for (k = 0; k < dof; k++) y[dof * (*idx) + k] += alpha[k] * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(2.0 * dof * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_SeqMAIJ_N(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_SeqMAIJ       *b = (Mat_SeqMAIJ *)A->data;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)b->AIJ->data;
  const PetscScalar *x, *v, *alpha;
  PetscScalar       *y;
  const PetscInt     m = b->AIJ->rmap->n, *idx, dof = b->dof;
  PetscInt           n, i, k;

  PetscFunctionBegin;
  if (yy != zz) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(zz, &y));
  for (i = 0; i < m; i++) {
    idx   = a->j + a->i[i];
    v     = a->a + a->i[i];
    n     = a->i[i + 1] - a->i[i];
    alpha = x + dof * i;
    while (n-- > 0) {
      for (k = 0; k < dof; k++) y[dof * (*idx) + k] += alpha[k] * (*v);
      idx++;
      v++;
    }
  }
  PetscCall(PetscLogFlops(2.0 * dof * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(zz, &y));
  PetscFunctionReturn(0);
}

/*===================================================================================*/
PetscErrorCode MatMult_MPIMAIJ_dof(Mat A, Vec xx, Vec yy) {
  Mat_MPIMAIJ *b = (Mat_MPIMAIJ *)A->data;

  PetscFunctionBegin;
  /* start the scatter */
  PetscCall(VecScatterBegin(b->ctx, xx, b->w, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*b->AIJ->ops->mult)(b->AIJ, xx, yy));
  PetscCall(VecScatterEnd(b->ctx, xx, b->w, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*b->OAIJ->ops->multadd)(b->OAIJ, b->w, yy, yy));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIMAIJ_dof(Mat A, Vec xx, Vec yy) {
  Mat_MPIMAIJ *b = (Mat_MPIMAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*b->OAIJ->ops->multtranspose)(b->OAIJ, xx, b->w));
  PetscCall((*b->AIJ->ops->multtranspose)(b->AIJ, xx, yy));
  PetscCall(VecScatterBegin(b->ctx, b->w, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(b->ctx, b->w, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIMAIJ_dof(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_MPIMAIJ *b = (Mat_MPIMAIJ *)A->data;

  PetscFunctionBegin;
  /* start the scatter */
  PetscCall(VecScatterBegin(b->ctx, xx, b->w, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*b->AIJ->ops->multadd)(b->AIJ, xx, yy, zz));
  PetscCall(VecScatterEnd(b->ctx, xx, b->w, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*b->OAIJ->ops->multadd)(b->OAIJ, b->w, zz, zz));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTransposeAdd_MPIMAIJ_dof(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_MPIMAIJ *b = (Mat_MPIMAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*b->OAIJ->ops->multtranspose)(b->OAIJ, xx, b->w));
  PetscCall((*b->AIJ->ops->multtransposeadd)(b->AIJ, xx, yy, zz));
  PetscCall(VecScatterBegin(b->ctx, b->w, zz, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(b->ctx, b->w, zz, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* ----------------------------------------------------------------*/
PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqMAIJ(Mat C) {
  Mat_Product *product = C->product;

  PetscFunctionBegin;
  if (product->type == MATPRODUCT_PtAP) {
    C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqMAIJ;
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product type %s is not supported for SeqAIJ and SeqMAIJ matrices", MatProductTypes[product->type]);
  PetscFunctionReturn(0);
}

PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIMAIJ(Mat C) {
  Mat_Product *product = C->product;
  PetscBool    flg     = PETSC_FALSE;
  Mat          A = product->A, P = product->B;
  PetscInt     alg = 1; /* set default algorithm */
#if !defined(PETSC_HAVE_HYPRE)
  const char *algTypes[4] = {"scalable", "nonscalable", "allatonce", "allatonce_merged"};
  PetscInt    nalg        = 4;
#else
  const char *algTypes[5] = {"scalable", "nonscalable", "allatonce", "allatonce_merged", "hypre"};
  PetscInt    nalg        = 5;
#endif

  PetscFunctionBegin;
  PetscCheck(product->type == MATPRODUCT_PtAP, PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product type %s is not supported for MPIAIJ and MPIMAIJ matrices", MatProductTypes[product->type]);

  /* PtAP */
  /* Check matrix local sizes */
  PetscCheck(A->rmap->rstart == P->rmap->rstart && A->rmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Arow (%" PetscInt_FMT ", %" PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")",
             A->rmap->rstart, A->rmap->rend, P->rmap->rstart, P->rmap->rend);
  PetscCheck(A->cmap->rstart == P->rmap->rstart && A->cmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Acol (%" PetscInt_FMT ", %" PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")",
             A->cmap->rstart, A->cmap->rend, P->rmap->rstart, P->rmap->rend);

  /* Set the default algorithm */
  PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

  /* Get runtime option */
  PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
  PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
  PetscOptionsEnd();

  PetscCall(PetscStrcmp(C->product->alg, "allatonce", &flg));
  if (flg) {
    C->ops->productsymbolic = MatProductSymbolic_PtAP_MPIAIJ_MPIMAIJ;
    PetscFunctionReturn(0);
  }

  PetscCall(PetscStrcmp(C->product->alg, "allatonce_merged", &flg));
  if (flg) {
    C->ops->productsymbolic = MatProductSymbolic_PtAP_MPIAIJ_MPIMAIJ;
    PetscFunctionReturn(0);
  }

  /* Convert P from MAIJ to AIJ matrix since implementation not available for MAIJ */
  PetscCall(PetscInfo((PetscObject)A, "Converting from MAIJ to AIJ matrix since implementation not available for MAIJ\n"));
  PetscCall(MatConvert(P, MATMPIAIJ, MAT_INPLACE_MATRIX, &P));
  PetscCall(MatProductSetFromOptions(C));
  PetscFunctionReturn(0);
}

/* ----------------------------------------------------------------*/
PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A, Mat PP, PetscReal fill, Mat C) {
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  Mat_SeqMAIJ       *pp = (Mat_SeqMAIJ *)PP->data;
  Mat                P  = pp->AIJ;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *p = (Mat_SeqAIJ *)P->data, *c;
  PetscInt          *pti, *ptj, *ptJ;
  PetscInt          *ci, *cj, *ptadenserow, *ptasparserow, *denserow, *sparserow, *ptaj;
  const PetscInt     an = A->cmap->N, am = A->rmap->N, pn = P->cmap->N, pm = P->rmap->N, ppdof = pp->dof;
  PetscInt           i, j, k, dof, pshift, ptnzi, arow, anzj, ptanzi, prow, pnzj, cnzi, cn;
  MatScalar         *ca;
  const PetscInt    *pi = p->i, *pj = p->j, *pjj, *ai = a->i, *aj = a->j, *ajj;

  PetscFunctionBegin;
  /* Get ij structure of P^T */
  PetscCall(MatGetSymbolicTranspose_SeqAIJ(P, &pti, &ptj));

  cn = pn * ppdof;
  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  PetscCall(PetscMalloc1(cn + 1, &ci));
  ci[0] = 0;

  /* Work arrays for rows of P^T*A */
  PetscCall(PetscMalloc4(an, &ptadenserow, an, &ptasparserow, cn, &denserow, cn, &sparserow));
  PetscCall(PetscArrayzero(ptadenserow, an));
  PetscCall(PetscArrayzero(denserow, cn));

  /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  /* Note, aspect ratio of P is the same as the aspect ratio of SeqAIJ inside P */
  PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(ai[am] / pm, pn), &free_space));
  current_space = free_space;

  /* Determine symbolic info for each row of C: */
  for (i = 0; i < pn; i++) {
    ptnzi = pti[i + 1] - pti[i];
    ptJ   = ptj + pti[i];
    for (dof = 0; dof < ppdof; dof++) {
      ptanzi = 0;
      /* Determine symbolic row of PtA: */
      for (j = 0; j < ptnzi; j++) {
        /* Expand ptJ[j] by block size and shift by dof to get the right row of A */
        arow = ptJ[j] * ppdof + dof;
        /* Nonzeros of P^T*A will be in same locations as any element of A in that row */
        anzj = ai[arow + 1] - ai[arow];
        ajj  = aj + ai[arow];
        for (k = 0; k < anzj; k++) {
          if (!ptadenserow[ajj[k]]) {
            ptadenserow[ajj[k]]    = -1;
            ptasparserow[ptanzi++] = ajj[k];
          }
        }
      }
      /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
      ptaj = ptasparserow;
      cnzi = 0;
      for (j = 0; j < ptanzi; j++) {
        /* Get offset within block of P */
        pshift = *ptaj % ppdof;
        /* Get block row of P */
        prow   = (*ptaj++) / ppdof; /* integer division */
        /* P has same number of nonzeros per row as the compressed form */
        pnzj   = pi[prow + 1] - pi[prow];
        pjj    = pj + pi[prow];
        for (k = 0; k < pnzj; k++) {
          /* Locations in C are shifted by the offset within the block */
          /* Note: we cannot use PetscLLAdd here because of the additional offset for the write location */
          if (!denserow[pjj[k] * ppdof + pshift]) {
            denserow[pjj[k] * ppdof + pshift] = -1;
            sparserow[cnzi++]                 = pjj[k] * ppdof + pshift;
          }
        }
      }

      /* sort sparserow */
      PetscCall(PetscSortInt(cnzi, sparserow));

      /* If free space is not available, make more free space */
      /* Double the amount of total space in the list */
      if (current_space->local_remaining < cnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));

      /* Copy data into free space, and zero out denserows */
      PetscCall(PetscArraycpy(current_space->array, sparserow, cnzi));

      current_space->array += cnzi;
      current_space->local_used += cnzi;
      current_space->local_remaining -= cnzi;

      for (j = 0; j < ptanzi; j++) ptadenserow[ptasparserow[j]] = 0;
      for (j = 0; j < cnzi; j++) denserow[sparserow[j]] = 0;

      /* Aside: Perhaps we should save the pta info for the numerical factorization. */
      /*        For now, we will recompute what is needed. */
      ci[i * ppdof + 1 + dof] = ci[i * ppdof + dof] + cnzi;
    }
  }
  /* nnz is now stored in ci[ptm], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc1(ci[cn] + 1, &cj));
  PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
  PetscCall(PetscFree4(ptadenserow, ptasparserow, denserow, sparserow));

  /* Allocate space for ca */
  PetscCall(PetscCalloc1(ci[cn] + 1, &ca));

  /* put together the new matrix */
  PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), cn, cn, ci, cj, ca, NULL, C));
  PetscCall(MatSetBlockSize(C, pp->dof));

  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c          = (Mat_SeqAIJ *)(C->data);
  c->free_a  = PETSC_TRUE;
  c->free_ij = PETSC_TRUE;
  c->nonew   = 0;

  C->ops->ptapnumeric    = MatPtAPNumeric_SeqAIJ_SeqMAIJ;
  C->ops->productnumeric = MatProductNumeric_PtAP;

  /* Clean up. */
  PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(P, &pti, &ptj));
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqMAIJ(Mat A, Mat PP, Mat C) {
  /* This routine requires testing -- first draft only */
  Mat_SeqMAIJ     *pp = (Mat_SeqMAIJ *)PP->data;
  Mat              P  = pp->AIJ;
  Mat_SeqAIJ      *a  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJ      *p  = (Mat_SeqAIJ *)P->data;
  Mat_SeqAIJ      *c  = (Mat_SeqAIJ *)C->data;
  const PetscInt  *ai = a->i, *aj = a->j, *pi = p->i, *pj = p->j, *pJ, *pjj;
  const PetscInt  *ci = c->i, *cj = c->j, *cjj;
  const PetscInt   am = A->rmap->N, cn = C->cmap->N, cm = C->rmap->N, ppdof = pp->dof;
  PetscInt         i, j, k, pshift, poffset, anzi, pnzi, apnzj, nextap, pnzj, prow, crow, *apj, *apjdense;
  const MatScalar *aa = a->a, *pa = p->a, *pA, *paj;
  MatScalar       *ca = c->a, *caj, *apa;

  PetscFunctionBegin;
  /* Allocate temporary array for storage of one row of A*P */
  PetscCall(PetscCalloc3(cn, &apa, cn, &apj, cn, &apjdense));

  /* Clear old values in C */
  PetscCall(PetscArrayzero(ca, ci[cm]));

  for (i = 0; i < am; i++) {
    /* Form sparse row of A*P */
    anzi  = ai[i + 1] - ai[i];
    apnzj = 0;
    for (j = 0; j < anzi; j++) {
      /* Get offset within block of P */
      pshift = *aj % ppdof;
      /* Get block row of P */
      prow   = *aj++ / ppdof; /* integer division */
      pnzj   = pi[prow + 1] - pi[prow];
      pjj    = pj + pi[prow];
      paj    = pa + pi[prow];
      for (k = 0; k < pnzj; k++) {
        poffset = pjj[k] * ppdof + pshift;
        if (!apjdense[poffset]) {
          apjdense[poffset] = -1;
          apj[apnzj++]      = poffset;
        }
        apa[poffset] += (*aa) * paj[k];
      }
      PetscCall(PetscLogFlops(2.0 * pnzj));
      aa++;
    }

    /* Sort the j index array for quick sparse axpy. */
    /* Note: a array does not need sorting as it is in dense storage locations. */
    PetscCall(PetscSortInt(apnzj, apj));

    /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
    prow    = i / ppdof; /* integer division */
    pshift  = i % ppdof;
    poffset = pi[prow];
    pnzi    = pi[prow + 1] - poffset;
    /* Reset pJ and pA so we can traverse the same row of P 'dof' times. */
    pJ      = pj + poffset;
    pA      = pa + poffset;
    for (j = 0; j < pnzi; j++) {
      crow = (*pJ) * ppdof + pshift;
      cjj  = cj + ci[crow];
      caj  = ca + ci[crow];
      pJ++;
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k = 0, nextap = 0; nextap < apnzj; k++) {
        if (cjj[k] == apj[nextap]) caj[k] += (*pA) * apa[apj[nextap++]];
      }
      PetscCall(PetscLogFlops(2.0 * apnzj));
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j = 0; j < apnzj; j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscCall(PetscFree3(apa, apj, apjdense));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_SeqAIJ_SeqMAIJ(Mat C) {
  Mat_Product *product = C->product;
  Mat          A = product->A, P = product->B;

  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_SeqAIJ_SeqMAIJ(A, P, product->fill, C));
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIMAIJ(Mat A, Mat PP, PetscReal fill, Mat *C) {
  PetscFunctionBegin;
  SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatPtAPSymbolic is not implemented for MPIMAIJ matrix yet");
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIMAIJ(Mat A, Mat PP, Mat C) {
  PetscFunctionBegin;
  SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatPtAPNumeric is not implemented for MPIMAIJ matrix yet");
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat, Mat, PetscInt, Mat);

PETSC_INTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIMAIJ_allatonce(Mat A, Mat P, Mat C) {
  Mat_MPIMAIJ *maij = (Mat_MPIMAIJ *)P->data;

  PetscFunctionBegin;

  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, maij->A, maij->dof, C));
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat, Mat, PetscInt, PetscReal, Mat);

PETSC_INTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIMAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C) {
  Mat_MPIMAIJ *maij = (Mat_MPIMAIJ *)P->data;

  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, maij->A, maij->dof, fill, C));
  C->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIMAIJ_allatonce;
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat, Mat, PetscInt, Mat);

PETSC_INTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIMAIJ_allatonce_merged(Mat A, Mat P, Mat C) {
  Mat_MPIMAIJ *maij = (Mat_MPIMAIJ *)P->data;

  PetscFunctionBegin;

  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, maij->A, maij->dof, C));
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat, Mat, PetscInt, PetscReal, Mat);

PETSC_INTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIMAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C) {
  Mat_MPIMAIJ *maij = (Mat_MPIMAIJ *)P->data;

  PetscFunctionBegin;

  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, maij->A, maij->dof, fill, C));
  C->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIMAIJ_allatonce_merged;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIMAIJ(Mat C) {
  Mat_Product *product = C->product;
  Mat          A = product->A, P = product->B;
  PetscBool    flg;

  PetscFunctionBegin;
  PetscCall(PetscStrcmp(product->alg, "allatonce", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIMAIJ_allatonce(A, P, product->fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(0);
  }

  PetscCall(PetscStrcmp(product->alg, "allatonce_merged", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIMAIJ_allatonce_merged(A, P, product->fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(0);
  }

  SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
}

PETSC_INTERN PetscErrorCode MatConvert_SeqMAIJ_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat_SeqMAIJ *b   = (Mat_SeqMAIJ *)A->data;
  Mat          a   = b->AIJ, B;
  Mat_SeqAIJ  *aij = (Mat_SeqAIJ *)a->data;
  PetscInt     m, n, i, ncols, *ilen, nmax = 0, *icols, j, k, ii, dof = b->dof;
  PetscInt    *cols;
  PetscScalar *vals;

  PetscFunctionBegin;
  PetscCall(MatGetSize(a, &m, &n));
  PetscCall(PetscMalloc1(dof * m, &ilen));
  for (i = 0; i < m; i++) {
    nmax = PetscMax(nmax, aij->ilen[i]);
    for (j = 0; j < dof; j++) ilen[dof * i + j] = aij->ilen[i];
  }
  PetscCall(MatCreate(PETSC_COMM_SELF, &B));
  PetscCall(MatSetSizes(B, dof * m, dof * n, dof * m, dof * n));
  PetscCall(MatSetType(B, newtype));
  PetscCall(MatSeqAIJSetPreallocation(B, 0, ilen));
  PetscCall(PetscFree(ilen));
  PetscCall(PetscMalloc1(nmax, &icols));
  ii = 0;
  for (i = 0; i < m; i++) {
    PetscCall(MatGetRow_SeqAIJ(a, i, &ncols, &cols, &vals));
    for (j = 0; j < dof; j++) {
      for (k = 0; k < ncols; k++) icols[k] = dof * cols[k] + j;
      PetscCall(MatSetValues_SeqAIJ(B, 1, &ii, ncols, icols, vals, INSERT_VALUES));
      ii++;
    }
    PetscCall(MatRestoreRow_SeqAIJ(a, i, &ncols, &cols, &vals));
  }
  PetscCall(PetscFree(icols));
  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

#include <../src/mat/impls/aij/mpi/mpiaij.h>

PETSC_INTERN PetscErrorCode MatConvert_MPIMAIJ_MPIAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
  Mat_MPIMAIJ *maij    = (Mat_MPIMAIJ *)A->data;
  Mat          MatAIJ  = ((Mat_SeqMAIJ *)maij->AIJ->data)->AIJ, B;
  Mat          MatOAIJ = ((Mat_SeqMAIJ *)maij->OAIJ->data)->AIJ;
  Mat_SeqAIJ  *AIJ     = (Mat_SeqAIJ *)MatAIJ->data;
  Mat_SeqAIJ  *OAIJ    = (Mat_SeqAIJ *)MatOAIJ->data;
  Mat_MPIAIJ  *mpiaij  = (Mat_MPIAIJ *)maij->A->data;
  PetscInt     dof = maij->dof, i, j, *dnz = NULL, *onz = NULL, nmax = 0, onmax = 0;
  PetscInt    *oicols = NULL, *icols = NULL, ncols, *cols = NULL, oncols, *ocols = NULL;
  PetscInt     rstart, cstart, *garray, ii, k;
  PetscScalar *vals, *ovals;

  PetscFunctionBegin;
  PetscCall(PetscMalloc2(A->rmap->n, &dnz, A->rmap->n, &onz));
  for (i = 0; i < A->rmap->n / dof; i++) {
    nmax  = PetscMax(nmax, AIJ->ilen[i]);
    onmax = PetscMax(onmax, OAIJ->ilen[i]);
    for (j = 0; j < dof; j++) {
      dnz[dof * i + j] = AIJ->ilen[i];
      onz[dof * i + j] = OAIJ->ilen[i];
    }
  }
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
  PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
  PetscCall(MatSetType(B, newtype));
  PetscCall(MatMPIAIJSetPreallocation(B, 0, dnz, 0, onz));
  PetscCall(MatSetBlockSize(B, dof));
  PetscCall(PetscFree2(dnz, onz));

  PetscCall(PetscMalloc2(nmax, &icols, onmax, &oicols));
  rstart = dof * maij->A->rmap->rstart;
  cstart = dof * maij->A->cmap->rstart;
  garray = mpiaij->garray;

  ii = rstart;
  for (i = 0; i < A->rmap->n / dof; i++) {
    PetscCall(MatGetRow_SeqAIJ(MatAIJ, i, &ncols, &cols, &vals));
    PetscCall(MatGetRow_SeqAIJ(MatOAIJ, i, &oncols, &ocols, &ovals));
    for (j = 0; j < dof; j++) {
      for (k = 0; k < ncols; k++) icols[k] = cstart + dof * cols[k] + j;
      for (k = 0; k < oncols; k++) oicols[k] = dof * garray[ocols[k]] + j;
      PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, ncols, icols, vals, INSERT_VALUES));
      PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, oncols, oicols, ovals, INSERT_VALUES));
      ii++;
    }
    PetscCall(MatRestoreRow_SeqAIJ(MatAIJ, i, &ncols, &cols, &vals));
    PetscCall(MatRestoreRow_SeqAIJ(MatOAIJ, i, &oncols, &ocols, &ovals));
  }
  PetscCall(PetscFree2(icols, oicols));

  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscInt refct          = ((PetscObject)A)->refct; /* save ((PetscObject)A)->refct */
    ((PetscObject)A)->refct = 1;

    PetscCall(MatHeaderReplace(A, &B));

    ((PetscObject)A)->refct = refct; /* restore ((PetscObject)A)->refct */
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrix_MAIJ(Mat mat, IS isrow, IS iscol, MatReuse cll, Mat *newmat) {
  Mat A;

  PetscFunctionBegin;
  PetscCall(MatConvert(mat, MATAIJ, MAT_INITIAL_MATRIX, &A));
  PetscCall(MatCreateSubMatrix(A, isrow, iscol, cll, newmat));
  PetscCall(MatDestroy(&A));
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MAIJ(Mat mat, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *submat[]) {
  Mat A;

  PetscFunctionBegin;
  PetscCall(MatConvert(mat, MATAIJ, MAT_INITIAL_MATRIX, &A));
  PetscCall(MatCreateSubMatrices(A, n, irow, icol, scall, submat));
  PetscCall(MatDestroy(&A));
  PetscFunctionReturn(0);
}

/* ---------------------------------------------------------------------------------- */
/*@
  MatCreateMAIJ - Creates a matrix type providing restriction and interpolation
  operations for multicomponent problems.  It interpolates each component the same
  way independently.  The matrix type is based on `MATSEQAIJ` for sequential matrices,
  and `MATMPIAIJ` for distributed matrices.

  Collective

  Input Parameters:
+ A - the `MATAIJ` matrix describing the action on blocks
- dof - the block size (number of components per node)

  Output Parameter:
. maij - the new `MATMAIJ` matrix

  Operations provided:
.vb
    MatMult()
    MatMultTranspose()
    MatMultAdd()
    MatMultTransposeAdd()
    MatView()
.ve

  Level: advanced

.seealso: `MATAIJ`, `MATMAIJ`, `MatMAIJGetAIJ()`, `MatMAIJRedimension()`, `MATMAIJ`
@*/
PetscErrorCode MatCreateMAIJ(Mat A, PetscInt dof, Mat *maij) {
  PetscInt  n;
  Mat       B;
  PetscBool flg;
#if defined(PETSC_HAVE_CUDA)
  /* hack to prevent conversion to AIJ format for CUDA when used inside a parallel MAIJ */
  PetscBool convert = dof < 0 ? PETSC_FALSE : PETSC_TRUE;
#endif

  PetscFunctionBegin;
  dof = PetscAbs(dof);
  PetscCall(PetscObjectReference((PetscObject)A));

  if (dof == 1) *maij = A;
  else {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    /* propagate vec type */
    PetscCall(MatSetVecType(B, A->defaultvectype));
    PetscCall(MatSetSizes(B, dof * A->rmap->n, dof * A->cmap->n, dof * A->rmap->N, dof * A->cmap->N));
    PetscCall(PetscLayoutSetBlockSize(B->rmap, dof));
    PetscCall(PetscLayoutSetBlockSize(B->cmap, dof));
    PetscCall(PetscLayoutSetUp(B->rmap));
    PetscCall(PetscLayoutSetUp(B->cmap));

    B->assembled = PETSC_TRUE;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &flg));
    if (flg) {
      Mat_SeqMAIJ *b;

      PetscCall(MatSetType(B, MATSEQMAIJ));

      B->ops->setup   = NULL;
      B->ops->destroy = MatDestroy_SeqMAIJ;
      B->ops->view    = MatView_SeqMAIJ;

      b      = (Mat_SeqMAIJ *)B->data;
      b->dof = dof;
      b->AIJ = A;

      if (dof == 2) {
        B->ops->mult             = MatMult_SeqMAIJ_2;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_2;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_2;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_2;
      } else if (dof == 3) {
        B->ops->mult             = MatMult_SeqMAIJ_3;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_3;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_3;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_3;
      } else if (dof == 4) {
        B->ops->mult             = MatMult_SeqMAIJ_4;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_4;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_4;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_4;
      } else if (dof == 5) {
        B->ops->mult             = MatMult_SeqMAIJ_5;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_5;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_5;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_5;
      } else if (dof == 6) {
        B->ops->mult             = MatMult_SeqMAIJ_6;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_6;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_6;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_6;
      } else if (dof == 7) {
        B->ops->mult             = MatMult_SeqMAIJ_7;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_7;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_7;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_7;
      } else if (dof == 8) {
        B->ops->mult             = MatMult_SeqMAIJ_8;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_8;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_8;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_8;
      } else if (dof == 9) {
        B->ops->mult             = MatMult_SeqMAIJ_9;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_9;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_9;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_9;
      } else if (dof == 10) {
        B->ops->mult             = MatMult_SeqMAIJ_10;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_10;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_10;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_10;
      } else if (dof == 11) {
        B->ops->mult             = MatMult_SeqMAIJ_11;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_11;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_11;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_11;
      } else if (dof == 16) {
        B->ops->mult             = MatMult_SeqMAIJ_16;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_16;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_16;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_16;
      } else if (dof == 18) {
        B->ops->mult             = MatMult_SeqMAIJ_18;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_18;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_18;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_18;
      } else {
        B->ops->mult             = MatMult_SeqMAIJ_N;
        B->ops->multadd          = MatMultAdd_SeqMAIJ_N;
        B->ops->multtranspose    = MatMultTranspose_SeqMAIJ_N;
        B->ops->multtransposeadd = MatMultTransposeAdd_SeqMAIJ_N;
      }
#if defined(PETSC_HAVE_CUDA)
      PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqmaij_seqaijcusparse_C", MatConvert_SeqMAIJ_SeqAIJ));
#endif
      PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqmaij_seqaij_C", MatConvert_SeqMAIJ_SeqAIJ));
      PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqmaij_C", MatProductSetFromOptions_SeqAIJ_SeqMAIJ));
    } else {
      Mat_MPIAIJ  *mpiaij = (Mat_MPIAIJ *)A->data;
      Mat_MPIMAIJ *b;
      IS           from, to;
      Vec          gvec;

      PetscCall(MatSetType(B, MATMPIMAIJ));

      B->ops->setup   = NULL;
      B->ops->destroy = MatDestroy_MPIMAIJ;
      B->ops->view    = MatView_MPIMAIJ;

      b      = (Mat_MPIMAIJ *)B->data;
      b->dof = dof;
      b->A   = A;

      PetscCall(MatCreateMAIJ(mpiaij->A, -dof, &b->AIJ));
      PetscCall(MatCreateMAIJ(mpiaij->B, -dof, &b->OAIJ));

      PetscCall(VecGetSize(mpiaij->lvec, &n));
      PetscCall(VecCreate(PETSC_COMM_SELF, &b->w));
      PetscCall(VecSetSizes(b->w, n * dof, n * dof));
      PetscCall(VecSetBlockSize(b->w, dof));
      PetscCall(VecSetType(b->w, VECSEQ));

      /* create two temporary Index sets for build scatter gather */
      PetscCall(ISCreateBlock(PetscObjectComm((PetscObject)A), dof, n, mpiaij->garray, PETSC_COPY_VALUES, &from));
      PetscCall(ISCreateStride(PETSC_COMM_SELF, n * dof, 0, 1, &to));

      /* create temporary global vector to generate scatter context */
      PetscCall(VecCreateMPIWithArray(PetscObjectComm((PetscObject)A), dof, dof * A->cmap->n, dof * A->cmap->N, NULL, &gvec));

      /* generate the scatter context */
      PetscCall(VecScatterCreate(gvec, from, b->w, to, &b->ctx));

      PetscCall(ISDestroy(&from));
      PetscCall(ISDestroy(&to));
      PetscCall(VecDestroy(&gvec));

      B->ops->mult             = MatMult_MPIMAIJ_dof;
      B->ops->multtranspose    = MatMultTranspose_MPIMAIJ_dof;
      B->ops->multadd          = MatMultAdd_MPIMAIJ_dof;
      B->ops->multtransposeadd = MatMultTransposeAdd_MPIMAIJ_dof;

#if defined(PETSC_HAVE_CUDA)
      PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpimaij_mpiaijcusparse_C", MatConvert_MPIMAIJ_MPIAIJ));
#endif
      PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpimaij_mpiaij_C", MatConvert_MPIMAIJ_MPIAIJ));
      PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpimaij_C", MatProductSetFromOptions_MPIAIJ_MPIMAIJ));
    }
    B->ops->createsubmatrix   = MatCreateSubMatrix_MAIJ;
    B->ops->createsubmatrices = MatCreateSubMatrices_MAIJ;
    PetscCall(MatSetUp(B));
#if defined(PETSC_HAVE_CUDA)
    /* temporary until we have CUDA implementation of MAIJ */
    {
      PetscBool flg;
      if (convert) {
        PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MATAIJCUSPARSE, ""));
        if (flg) PetscCall(MatConvert(B, ((PetscObject)A)->type_name, MAT_INPLACE_MATRIX, &B));
      }
    }
#endif
    *maij = B;
  }
  PetscFunctionReturn(0);
}
