/*
   Basic functions for basic parallel dense matrices.
   Portions of this code are under:
   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*/

#include <../src/mat/impls/dense/mpi/mpidense.h> /*I   "petscmat.h"  I*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscblaslapack.h>
#include <petsc/private/vecimpl.h>
#include <petsc/private/deviceimpl.h>
#include <petsc/private/sfimpl.h>

/*@
  MatDenseGetLocalMatrix - For a `MATMPIDENSE` or `MATSEQDENSE` matrix returns the sequential
  matrix that represents the operator. For sequential matrices it returns itself.

  Input Parameter:
. A - the sequential or MPI `MATDENSE` matrix

  Output Parameter:
. B - the inner matrix

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MATMPIDENSE`, `MATSEQDENSE`
@*/
PetscErrorCode MatDenseGetLocalMatrix(Mat A, Mat *B)
{
  Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
  PetscBool     flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(B, 2);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIDENSE, &flg));
  if (flg) *B = mat->A;
  else {
    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQDENSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for matrix type %s", ((PetscObject)A)->type_name);
    *B = A;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCopy_MPIDense(Mat A, Mat B, MatStructure s)
{
  Mat_MPIDense *Amat = (Mat_MPIDense *)A->data;
  Mat_MPIDense *Bmat = (Mat_MPIDense *)B->data;

  PetscFunctionBegin;
  PetscCall(MatCopy(Amat->A, Bmat->A, s));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatShift_MPIDense(Mat A, PetscScalar alpha)
{
  Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
  PetscInt      j, lda, rstart = A->rmap->rstart, rend = A->rmap->rend, rend2;
  PetscScalar  *v;

  PetscFunctionBegin;
  PetscCall(MatDenseGetArray(mat->A, &v));
  PetscCall(MatDenseGetLDA(mat->A, &lda));
  rend2 = PetscMin(rend, A->cmap->N);
  if (rend2 > rstart) {
    for (j = rstart; j < rend2; j++) v[j - rstart + j * lda] += alpha;
    PetscCall(PetscLogFlops(rend2 - rstart));
  }
  PetscCall(MatDenseRestoreArray(mat->A, &v));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetRow_MPIDense(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
{
  Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
  PetscInt      lrow, rstart = A->rmap->rstart, rend = A->rmap->rend;

  PetscFunctionBegin;
  PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "only local rows");
  lrow = row - rstart;
  PetscCall(MatGetRow(mat->A, lrow, nz, (const PetscInt **)idx, (const PetscScalar **)v));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatRestoreRow_MPIDense(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
{
  Mat_MPIDense *mat = (Mat_MPIDense *)A->data;
  PetscInt      lrow, rstart = A->rmap->rstart, rend = A->rmap->rend;

  PetscFunctionBegin;
  PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "only local rows");
  lrow = row - rstart;
  PetscCall(MatRestoreRow(mat->A, lrow, nz, (const PetscInt **)idx, (const PetscScalar **)v));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetDiagonalBlock_MPIDense(Mat A, Mat *a)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)A->data;
  PetscInt      m = A->rmap->n, rstart = A->rmap->rstart;
  PetscScalar  *array;
  MPI_Comm      comm;
  PetscBool     flg;
  Mat           B;

  PetscFunctionBegin;
  PetscCall(MatHasCongruentLayouts(A, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only square matrices supported.");
  PetscCall(PetscObjectQuery((PetscObject)A, "DiagonalBlock", (PetscObject *)&B));
  if (!B) { /* This should use MatDenseGetSubMatrix (not create), but we would need a call like MatRestoreDiagonalBlock */
#if PetscDefined(HAVE_CUDA)
    PetscCall(PetscObjectTypeCompare((PetscObject)mdn->A, MATSEQDENSECUDA, &flg));
    PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not coded for %s. Send an email to petsc-dev@mcs.anl.gov to request this feature", MATSEQDENSECUDA);
#elif PetscDefined(HAVE_HIP)
    PetscCall(PetscObjectTypeCompare((PetscObject)mdn->A, MATSEQDENSEHIP, &flg));
    PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not coded for %s. Send an email to petsc-dev@mcs.anl.gov to request this feature", MATSEQDENSEHIP);
#endif
    PetscCall(PetscObjectGetComm((PetscObject)mdn->A, &comm));
    PetscCall(MatCreate(comm, &B));
    PetscCall(MatSetSizes(B, m, m, m, m));
    PetscCall(MatSetType(B, ((PetscObject)mdn->A)->type_name));
    PetscCall(MatDenseGetArrayRead(mdn->A, (const PetscScalar **)&array));
    PetscCall(MatSeqDenseSetPreallocation(B, array + m * rstart));
    PetscCall(MatDenseRestoreArrayRead(mdn->A, (const PetscScalar **)&array));
    PetscCall(PetscObjectCompose((PetscObject)A, "DiagonalBlock", (PetscObject)B));
    *a = B;
    PetscCall(MatDestroy(&B));
  } else *a = B;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetValues_MPIDense(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], const PetscScalar v[], InsertMode addv)
{
  Mat_MPIDense *A = (Mat_MPIDense *)mat->data;
  PetscInt      i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend, row;
  PetscBool     roworiented = A->roworiented;

  PetscFunctionBegin;
  for (i = 0; i < m; i++) {
    if (idxm[i] < 0) continue;
    PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large");
    if (idxm[i] >= rstart && idxm[i] < rend) {
      row = idxm[i] - rstart;
      if (roworiented) {
        PetscCall(MatSetValues(A->A, 1, &row, n, idxn, PetscSafePointerPlusOffset(v, i * n), addv));
      } else {
        for (j = 0; j < n; j++) {
          if (idxn[j] < 0) continue;
          PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large");
          PetscCall(MatSetValues(A->A, 1, &row, 1, &idxn[j], PetscSafePointerPlusOffset(v, i + j * m), addv));
        }
      }
    } else if (!A->donotstash) {
      mat->assembled = PETSC_FALSE;
      if (roworiented) {
        PetscCall(MatStashValuesRow_Private(&mat->stash, idxm[i], n, idxn, PetscSafePointerPlusOffset(v, i * n), PETSC_FALSE));
      } else {
        PetscCall(MatStashValuesCol_Private(&mat->stash, idxm[i], n, idxn, PetscSafePointerPlusOffset(v, i), m, PETSC_FALSE));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetValues_MPIDense(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
  PetscInt      i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend, row;

  PetscFunctionBegin;
  for (i = 0; i < m; i++) {
    if (idxm[i] < 0) continue; /* negative row */
    PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large");
    PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported");
    row = idxm[i] - rstart;
    for (j = 0; j < n; j++) {
      if (idxn[j] < 0) continue; /* negative column */
      PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large");
      PetscCall(MatGetValues(mdn->A, 1, &row, 1, &idxn[j], v + i * n + j));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseGetLDA_MPIDense(Mat A, PetscInt *lda)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDenseGetLDA(a->A, lda));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseSetLDA_MPIDense(Mat A, PetscInt lda)
{
  Mat_MPIDense *a     = (Mat_MPIDense *)A->data;
  MatType       mtype = MATSEQDENSE;

  PetscFunctionBegin;
  if (!a->A) {
    PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
    PetscCall(PetscLayoutSetUp(A->rmap));
    PetscCall(PetscLayoutSetUp(A->cmap));
    PetscCall(MatCreate(PETSC_COMM_SELF, &a->A));
    PetscCall(MatSetSizes(a->A, A->rmap->n, A->cmap->N, A->rmap->n, A->cmap->N));
#if PetscDefined(HAVE_CUDA)
    PetscBool iscuda;
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIDENSECUDA, &iscuda));
    if (iscuda) mtype = MATSEQDENSECUDA;
#elif PetscDefined(HAVE_HIP)
    PetscBool iship;
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIDENSEHIP, &iship));
    if (iship) mtype = MATSEQDENSEHIP;
#endif
    PetscCall(MatSetType(a->A, mtype));
  }
  PetscCall(MatDenseSetLDA(a->A, lda));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseGetArray_MPIDense(Mat A, PetscScalar **array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDenseGetArray(a->A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseGetArrayRead_MPIDense(Mat A, PetscScalar **array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDenseGetArrayRead(a->A, (const PetscScalar **)array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseGetArrayWrite_MPIDense(Mat A, PetscScalar **array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDenseGetArrayWrite(a->A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDensePlaceArray_MPIDense(Mat A, const PetscScalar *array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDensePlaceArray(a->A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseResetArray_MPIDense(Mat A)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDenseResetArray(a->A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseReplaceArray_MPIDense(Mat A, const PetscScalar *array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDenseReplaceArray(a->A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCreateSubMatrix_MPIDense(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
{
  Mat_MPIDense      *mat = (Mat_MPIDense *)A->data, *newmatd;
  PetscInt           lda, i, j, rstart, rend, nrows, ncols, Ncols, nlrows, nlcols;
  const PetscInt    *irow, *icol;
  const PetscScalar *v;
  PetscScalar       *bv;
  Mat                newmat;
  IS                 iscol_local;
  MPI_Comm           comm_is, comm_mat;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm_mat));
  PetscCall(PetscObjectGetComm((PetscObject)iscol, &comm_is));
  PetscCheck(comm_mat == comm_is, PETSC_COMM_SELF, PETSC_ERR_ARG_NOTSAMECOMM, "IS communicator must match matrix communicator");

  PetscCall(ISAllGather(iscol, &iscol_local));
  PetscCall(ISGetIndices(isrow, &irow));
  PetscCall(ISGetIndices(iscol_local, &icol));
  PetscCall(ISGetLocalSize(isrow, &nrows));
  PetscCall(ISGetLocalSize(iscol, &ncols));
  PetscCall(ISGetSize(iscol, &Ncols)); /* global number of columns, size of iscol_local */

  /* No parallel redistribution currently supported! Should really check each index set
     to confirm that it is OK.  ... Currently supports only submatrix same partitioning as
     original matrix! */

  PetscCall(MatGetLocalSize(A, &nlrows, &nlcols));
  PetscCall(MatGetOwnershipRange(A, &rstart, &rend));

  /* Check submatrix call */
  if (scall == MAT_REUSE_MATRIX) {
    /* SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Reused submatrix wrong size"); */
    /* Really need to test rows and column sizes! */
    newmat = *B;
  } else {
    /* Create and fill new matrix */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &newmat));
    PetscCall(MatSetSizes(newmat, nrows, ncols, PETSC_DECIDE, Ncols));
    PetscCall(MatSetType(newmat, ((PetscObject)A)->type_name));
    PetscCall(MatMPIDenseSetPreallocation(newmat, NULL));
  }

  /* Now extract the data pointers and do the copy, column at a time */
  newmatd = (Mat_MPIDense *)newmat->data;
  PetscCall(MatDenseGetArray(newmatd->A, &bv));
  PetscCall(MatDenseGetArrayRead(mat->A, &v));
  PetscCall(MatDenseGetLDA(mat->A, &lda));
  for (i = 0; i < Ncols; i++) {
    const PetscScalar *av = v + lda * icol[i];
    for (j = 0; j < nrows; j++) *bv++ = av[irow[j] - rstart];
  }
  PetscCall(MatDenseRestoreArrayRead(mat->A, &v));
  PetscCall(MatDenseRestoreArray(newmatd->A, &bv));

  /* Assemble the matrices so that the correct flags are set */
  PetscCall(MatAssemblyBegin(newmat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(newmat, MAT_FINAL_ASSEMBLY));

  /* Free work space */
  PetscCall(ISRestoreIndices(isrow, &irow));
  PetscCall(ISRestoreIndices(iscol_local, &icol));
  PetscCall(ISDestroy(&iscol_local));
  *B = newmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseRestoreArray_MPIDense(Mat A, PetscScalar **array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDenseRestoreArray(a->A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseRestoreArrayRead_MPIDense(Mat A, PetscScalar **array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDenseRestoreArrayRead(a->A, (const PetscScalar **)array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseRestoreArrayWrite_MPIDense(Mat A, PetscScalar **array)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDenseRestoreArrayWrite(a->A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyBegin_MPIDense(Mat mat, MatAssemblyType mode)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
  PetscInt      nstash, reallocs;

  PetscFunctionBegin;
  if (mdn->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
  PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
  PetscCall(PetscInfo(mdn->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_MPIDense(Mat mat, MatAssemblyType mode)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;
  PetscInt      i, *row, *col, flg, j, rstart, ncols;
  PetscMPIInt   n;
  PetscScalar  *val;

  PetscFunctionBegin;
  if (!mdn->donotstash && !mat->nooffprocentries) {
    /*  wait on receives */
    while (1) {
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
      if (!flg) break;

      for (i = 0; i < n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j = i, rstart = row[j]; j < n; j++) {
          if (row[j] != rstart) break;
        }
        if (j < n) ncols = j - i;
        else ncols = n - i;
        /* Now assemble all these values with a single function call */
        PetscCall(MatSetValues_MPIDense(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
        i = j;
      }
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
  }

  PetscCall(MatAssemblyBegin(mdn->A, mode));
  PetscCall(MatAssemblyEnd(mdn->A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_MPIDense(Mat A)
{
  Mat_MPIDense *l = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroRows_MPIDense(Mat A, PetscInt n, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIDense *l = (Mat_MPIDense *)A->data;
  PetscInt      i, len, *lrows;

  PetscFunctionBegin;
  /* get locally owned rows */
  PetscCall(PetscLayoutMapLocal(A->rmap, n, rows, &len, &lrows, NULL));
  /* fix right-hand side if needed */
  if (x && b) {
    const PetscScalar *xx;
    PetscScalar       *bb;

    PetscCall(VecGetArrayRead(x, &xx));
    PetscCall(VecGetArrayWrite(b, &bb));
    for (i = 0; i < len; ++i) bb[lrows[i]] = diag * xx[lrows[i]];
    PetscCall(VecRestoreArrayRead(x, &xx));
    PetscCall(VecRestoreArrayWrite(b, &bb));
  }
  PetscCall(MatZeroRows(l->A, len, lrows, 0.0, NULL, NULL));
  if (diag != 0.0) {
    Vec d;

    PetscCall(MatCreateVecs(A, NULL, &d));
    PetscCall(VecSet(d, diag));
    PetscCall(MatDiagonalSet(A, d, INSERT_VALUES));
    PetscCall(VecDestroy(&d));
  }
  PetscCall(PetscFree(lrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatMult_SeqDense(Mat, Vec, Vec);
PETSC_INTERN PetscErrorCode MatMultAdd_SeqDense(Mat, Vec, Vec, Vec);
PETSC_INTERN PetscErrorCode MatMultTranspose_SeqDense(Mat, Vec, Vec);
PETSC_INTERN PetscErrorCode MatMultTransposeAdd_SeqDense(Mat, Vec, Vec, Vec);

static PetscErrorCode MatMultColumnRange_MPIDense(Mat mat, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayWriteAndMemType(mdn->lvec, &ay, &aymtype));
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayWriteAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscUseMethod(mdn->A, "MatMultColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (mdn->A, mdn->lvec, yy, c_start, c_end));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMult_MPIDense(Mat mat, Vec xx, Vec yy)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayWriteAndMemType(mdn->lvec, &ay, &aymtype));
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayWriteAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscCall((*mdn->A->ops->mult)(mdn->A, mdn->lvec, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAddColumnRange_MPIDense(Mat mat, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(mdn->lvec, &ay, &aymtype));
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscUseMethod(mdn->A, "MatMultAddColumnRange_C", (Mat, Vec, Vec, Vec, PetscInt, PetscInt), (mdn->A, mdn->lvec, yy, zz, c_start, c_end));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_MPIDense(Mat mat, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)mat->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(mat));
  PetscCall(VecGetArrayReadAndMemType(xx, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(mdn->lvec, &ay, &aymtype));
  PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ax, ay, MPI_REPLACE));
  PetscCall(VecRestoreArrayAndMemType(mdn->lvec, &ay));
  PetscCall(VecRestoreArrayReadAndMemType(xx, &ax));
  PetscCall((*mdn->A->ops->multadd)(mdn->A, mdn->lvec, yy, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultHermitianTransposeColumnRange_MPIDense(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;
  PetscInt           r_start, r_end;
  PetscInt           c_start_local, c_end_local;

  PetscFunctionBegin;
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecZeroEntries(a->lvec));
  PetscCall(VecGetOwnershipRange(yy, &r_start, &r_end));
  c_start_local = PetscMax(c_start, r_start);
  c_end_local   = PetscMin(c_end, r_end);
  PetscCall(VecGetArrayAndMemType(yy, &ay, &aymtype));
  if (c_end_local > c_start_local) {
    if (PetscMemTypeHost(aymtype)) {
      PetscCall(PetscArrayzero(&ay[c_start_local], (size_t)(c_end_local - c_start_local)));
    } else {
      PetscCall(PetscDeviceRegisterMemory(ay, aymtype, sizeof(*ay) * ((size_t)(r_end - r_start))));
      PetscCall(PetscDeviceArrayZero(NULL, &ay[c_start_local], (size_t)(c_end_local - c_start_local)));
    }
  }
  PetscUseMethod(a->A, "MatMultHermitianTransposeColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (a->A, xx, a->lvec, c_start, c_end));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(yy, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTransposeKernel_MPIDense(Mat A, Vec xx, Vec yy, PetscBool herm)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecSet(yy, 0.0));
  if (herm) PetscCall((*a->A->ops->multhermitiantranspose)(a->A, xx, a->lvec));
  else PetscCall((*a->A->ops->multtranspose)(a->A, xx, a->lvec));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(yy, &ay, &aymtype));
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(yy, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultHermitianTransposeAddColumnRange_MPIDense(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecCopy(yy, zz));
  PetscCall(VecZeroEntries(a->lvec));
  PetscUseMethod(a->A, "MatMultHermitianTransposeColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (a->A, xx, a->lvec, c_start, c_end));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(zz, &ay, &aymtype));
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(zz, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTransposeAddKernel_MPIDense(Mat A, Vec xx, Vec yy, Vec zz, PetscBool herm)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  const PetscScalar *ax;
  PetscScalar       *ay;
  PetscMemType       axmtype, aymtype;

  PetscFunctionBegin;
  if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
  PetscCall(VecCopy(yy, zz));
  if (herm) PetscCall((*a->A->ops->multhermitiantranspose)(a->A, xx, a->lvec));
  else PetscCall((*a->A->ops->multtranspose)(a->A, xx, a->lvec));
  PetscCall(VecGetArrayReadAndMemType(a->lvec, &ax, &axmtype));
  PetscCall(VecGetArrayAndMemType(zz, &ay, &aymtype));
  PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, MPIU_SCALAR, axmtype, ax, aymtype, ay, MPIU_SUM));
  PetscCall(PetscSFReduceEnd(a->Mvctx, MPIU_SCALAR, ax, ay, MPIU_SUM));
  PetscCall(VecRestoreArrayReadAndMemType(a->lvec, &ax));
  PetscCall(VecRestoreArrayAndMemType(zz, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTranspose_MPIDense(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultTransposeKernel_MPIDense(A, xx, yy, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTransposeAdd_MPIDense(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultTransposeAddKernel_MPIDense(A, xx, yy, zz, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultHermitianTranspose_MPIDense(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultTransposeKernel_MPIDense(A, xx, yy, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultHermitianTransposeAdd_MPIDense(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultTransposeAddKernel_MPIDense(A, xx, yy, zz, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatGetDiagonal_MPIDense(Mat A, Vec v)
{
  Mat_MPIDense      *a = (Mat_MPIDense *)A->data;
  PetscInt           lda, len, i, nl, ng, m = A->rmap->n, radd;
  PetscScalar       *x;
  const PetscScalar *av;

  PetscFunctionBegin;
  PetscCall(VecGetArray(v, &x));
  PetscCall(VecGetSize(v, &ng));
  PetscCheck(ng == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming mat and vec");
  PetscCall(VecGetLocalSize(v, &nl));
  len  = PetscMin(a->A->rmap->n, a->A->cmap->n);
  radd = A->rmap->rstart * m;
  PetscCall(MatDenseGetArrayRead(a->A, &av));
  PetscCall(MatDenseGetLDA(a->A, &lda));
  for (i = 0; i < len; i++) x[i] = av[radd + i * lda + i];
  PetscCall(MatDenseRestoreArrayRead(a->A, &av));
  if (nl - i > 0) PetscCall(PetscArrayzero(x + i, nl - i));
  PetscCall(VecRestoreArray(v, &x));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_MPIDense(Mat mat)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *)mat->data;

  PetscFunctionBegin;
  PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
  PetscCall(MatStashDestroy_Private(&mat->stash));
  PetscCheck(!mdn->vecinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mdn->matinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDestroy(&mdn->A));
  PetscCall(VecDestroy(&mdn->lvec));
  PetscCall(PetscSFDestroy(&mdn->Mvctx));
  PetscCall(VecDestroy(&mdn->cvec));
  PetscCall(MatDestroy(&mdn->cmat));

  PetscCall(PetscFree(mat->data));
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));

  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetLDA_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseSetLDA_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDensePlaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseResetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseReplaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpiaij_C", NULL));
#if defined(PETSC_HAVE_ELEMENTAL)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_elemental_C", NULL));
#endif
#if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_scalapack_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIDenseSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaij_C", NULL));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidensecuda_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensecuda_mpiaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensecuda_mpiaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAGetArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDARestoreArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAPlaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAResetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDAReplaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseCUDASetPreallocation_C", NULL));
#endif
#if defined(PETSC_HAVE_HIP)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijhipsparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidensehip_mpidense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensehip_mpiaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidensehip_mpiaijhipsparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPGetArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArrayRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPRestoreArrayWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPPlaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPResetArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPReplaceArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseHIPSetPreallocation_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumn_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumn_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVec_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVec_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecRead_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecWrite_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetSubMatrix_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreSubMatrix_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultColumnRange_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultAddColumnRange_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeColumnRange_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeAddColumnRange_C", NULL));

  PetscCall(PetscObjectCompose((PetscObject)mat, "DiagonalBlock", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <petscdraw.h>
static PetscErrorCode MatView_MPIDense_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
{
  Mat_MPIDense     *mdn = (Mat_MPIDense *)mat->data;
  PetscMPIInt       rank;
  PetscViewerType   vtype;
  PetscBool         isascii, isdraw;
  PetscViewer       sviewer;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
  if (isascii) {
    PetscCall(PetscViewerGetType(viewer, &vtype));
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      MatInfo info;
      PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "  [%d] local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %" PetscInt_FMT " \n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
                                                   (PetscInt)info.memory));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
      if (mdn->Mvctx) PetscCall(PetscSFView(mdn->Mvctx, viewer));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else if (format == PETSC_VIEWER_ASCII_INFO) {
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  } else if (isdraw) {
    PetscDraw draw;
    PetscBool isnull;

    PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
    PetscCall(PetscDrawIsNull(draw, &isnull));
    if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
  }

  {
    /* assemble the entire matrix onto first processor. */
    Mat          A;
    PetscInt     M = mat->rmap->N, N = mat->cmap->N, m, row, i, nz;
    PetscInt    *cols;
    PetscScalar *vals;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)mat), &A));
    if (rank == 0) {
      PetscCall(MatSetSizes(A, M, N, M, N));
    } else {
      PetscCall(MatSetSizes(A, 0, 0, M, N));
    }
    /* Since this is a temporary matrix, MATMPIDENSE instead of ((PetscObject)A)->type_name here is probably acceptable. */
    PetscCall(MatSetType(A, MATMPIDENSE));
    PetscCall(MatMPIDenseSetPreallocation(A, NULL));

    /* Copy the matrix ... This isn't the most efficient means,
       but it's quick for now */
    A->insertmode = INSERT_VALUES;

    row = mat->rmap->rstart;
    m   = mdn->A->rmap->n;
    for (i = 0; i < m; i++) {
      PetscCall(MatGetRow_MPIDense(mat, row, &nz, &cols, &vals));
      PetscCall(MatSetValues_MPIDense(A, 1, &row, nz, cols, vals, INSERT_VALUES));
      PetscCall(MatRestoreRow_MPIDense(mat, row, &nz, &cols, &vals));
      row++;
    }

    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    if (rank == 0) {
      PetscCall(PetscObjectSetName((PetscObject)((Mat_MPIDense *)A->data)->A, ((PetscObject)mat)->name));
      PetscCall(MatView_SeqDense(((Mat_MPIDense *)A->data)->A, sviewer));
    }
    PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    PetscCall(MatDestroy(&A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatView_MPIDense(Mat mat, PetscViewer viewer)
{
  PetscBool isascii, isbinary, isdraw, issocket;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));

  if (isascii || issocket || isdraw) PetscCall(MatView_MPIDense_ASCIIorDraworSocket(mat, viewer));
  else if (isbinary) PetscCall(MatView_Dense_Binary(mat, viewer));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetInfo_MPIDense(Mat A, MatInfoType flag, MatInfo *info)
{
  Mat_MPIDense  *mat = (Mat_MPIDense *)A->data;
  Mat            mdn = mat->A;
  PetscLogDouble isend[5], irecv[5];

  PetscFunctionBegin;
  info->block_size = 1.0;

  PetscCall(MatGetInfo(mdn, MAT_LOCAL, info));

  isend[0] = info->nz_used;
  isend[1] = info->nz_allocated;
  isend[2] = info->nz_unneeded;
  isend[3] = info->memory;
  isend[4] = info->mallocs;
  if (flag == MAT_LOCAL) {
    info->nz_used      = isend[0];
    info->nz_allocated = isend[1];
    info->nz_unneeded  = isend[2];
    info->memory       = isend[3];
    info->mallocs      = isend[4];
  } else if (flag == MAT_GLOBAL_MAX) {
    PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)A)));

    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  } else if (flag == MAT_GLOBAL_SUM) {
    PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)A)));

    info->nz_used      = irecv[0];
    info->nz_allocated = irecv[1];
    info->nz_unneeded  = irecv[2];
    info->memory       = irecv[3];
    info->mallocs      = irecv[4];
  }
  info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
  info->fill_ratio_needed = 0;
  info->factor_mallocs    = 0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_MPIDense(Mat A, MatOption op, PetscBool flg)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  switch (op) {
  case MAT_NEW_NONZERO_LOCATIONS:
  case MAT_NEW_NONZERO_LOCATION_ERR:
  case MAT_NEW_NONZERO_ALLOCATION_ERR:
    MatCheckPreallocated(A, 1);
    PetscCall(MatSetOption(a->A, op, flg));
    break;
  case MAT_ROW_ORIENTED:
    MatCheckPreallocated(A, 1);
    a->roworiented = flg;
    PetscCall(MatSetOption(a->A, op, flg));
    break;
  case MAT_IGNORE_OFF_PROC_ENTRIES:
    a->donotstash = flg;
    break;
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
  case MAT_SPD:
  case MAT_SPD_ETERNAL:
    /* if the diagonal matrix is square it inherits some of the properties above */
    if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
    break;
  default:
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalScale_MPIDense(Mat A, Vec ll, Vec rr)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)A->data;
  const PetscScalar *l;
  PetscScalar        x, *v, *vv, *r;
  PetscInt           i, j, s2a, s3a, s2, s3, m = mdn->A->rmap->n, n = mdn->A->cmap->n, lda;

  PetscFunctionBegin;
  PetscCall(MatDenseGetArray(mdn->A, &vv));
  PetscCall(MatDenseGetLDA(mdn->A, &lda));
  PetscCall(MatGetLocalSize(A, &s2, &s3));
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &s2a));
    PetscCheck(s2a == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector non-conforming local size, %" PetscInt_FMT " != %" PetscInt_FMT, s2a, s2);
    PetscCall(VecGetArrayRead(ll, &l));
    for (i = 0; i < m; i++) {
      x = l[i];
      v = vv + i;
      for (j = 0; j < n; j++) {
        (*v) *= x;
        v += lda;
      }
    }
    PetscCall(VecRestoreArrayRead(ll, &l));
    PetscCall(PetscLogFlops(1.0 * n * m));
  }
  if (rr) {
    const PetscScalar *ar;

    PetscCall(VecGetLocalSize(rr, &s3a));
    PetscCheck(s3a == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vec non-conforming local size, %" PetscInt_FMT " != %" PetscInt_FMT ".", s3a, s3);
    PetscCall(VecGetArrayRead(rr, &ar));
    if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
    PetscCall(VecGetArray(mdn->lvec, &r));
    PetscCall(PetscSFBcastBegin(mdn->Mvctx, MPIU_SCALAR, ar, r, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, ar, r, MPI_REPLACE));
    PetscCall(VecRestoreArrayRead(rr, &ar));
    for (i = 0; i < n; i++) {
      x = r[i];
      v = vv + i * lda;
      for (j = 0; j < m; j++) (*v++) *= x;
    }
    PetscCall(VecRestoreArray(mdn->lvec, &r));
    PetscCall(PetscLogFlops(1.0 * n * m));
  }
  PetscCall(MatDenseRestoreArray(mdn->A, &vv));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatNorm_MPIDense(Mat A, NormType type, PetscReal *nrm)
{
  Mat_MPIDense      *mdn = (Mat_MPIDense *)A->data;
  PetscInt           i, j;
  PetscMPIInt        size;
  PetscReal          sum = 0.0;
  const PetscScalar *av, *v;

  PetscFunctionBegin;
  PetscCall(MatDenseGetArrayRead(mdn->A, &av));
  v = av;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size == 1) {
    PetscCall(MatNorm(mdn->A, type, nrm));
  } else {
    if (type == NORM_FROBENIUS) {
      for (i = 0; i < mdn->A->cmap->n * mdn->A->rmap->n; i++) {
        sum += PetscRealPart(PetscConj(*v) * (*v));
        v++;
      }
      PetscCallMPI(MPIU_Allreduce(&sum, nrm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
      *nrm = PetscSqrtReal(*nrm);
      PetscCall(PetscLogFlops(2.0 * mdn->A->cmap->n * mdn->A->rmap->n));
    } else if (type == NORM_1) {
      PetscReal *tmp;

      PetscCall(PetscCalloc1(A->cmap->N, &tmp));
      *nrm = 0.0;
      v    = av;
      for (j = 0; j < mdn->A->cmap->n; j++) {
        for (i = 0; i < mdn->A->rmap->n; i++) {
          tmp[j] += PetscAbsScalar(*v);
          v++;
        }
      }
      PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, tmp, A->cmap->N, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
      for (j = 0; j < A->cmap->N; j++) {
        if (tmp[j] > *nrm) *nrm = tmp[j];
      }
      PetscCall(PetscFree(tmp));
      PetscCall(PetscLogFlops(A->cmap->n * A->rmap->n));
    } else if (type == NORM_INFINITY) { /* max row norm */
      PetscCall(MatNorm(mdn->A, type, nrm));
      PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, nrm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "No support for two norm");
  }
  PetscCall(MatDenseRestoreArrayRead(mdn->A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTranspose_MPIDense(Mat A, MatReuse reuse, Mat *matout)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat           B;
  PetscInt      M = A->rmap->N, N = A->cmap->N, m, n, *rwork, rstart = A->rmap->rstart;
  PetscInt      j, i, lda;
  PetscScalar  *v;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
    PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
    PetscCall(MatMPIDenseSetPreallocation(B, NULL));
  } else B = *matout;

  m = a->A->rmap->n;
  n = a->A->cmap->n;
  PetscCall(MatDenseGetArrayRead(a->A, (const PetscScalar **)&v));
  PetscCall(MatDenseGetLDA(a->A, &lda));
  PetscCall(PetscMalloc1(m, &rwork));
  for (i = 0; i < m; i++) rwork[i] = rstart + i;
  for (j = 0; j < n; j++) {
    PetscCall(MatSetValues(B, 1, &j, m, rwork, v, INSERT_VALUES));
    v = PetscSafePointerPlusOffset(v, lda);
  }
  PetscCall(MatDenseRestoreArrayRead(a->A, (const PetscScalar **)&v));
  PetscCall(PetscFree(rwork));
  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
    *matout = B;
  } else {
    PetscCall(MatHeaderMerge(A, &B));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode       MatDuplicate_MPIDense(Mat, MatDuplicateOption, Mat *);
PETSC_INTERN PetscErrorCode MatScale_MPIDense(Mat, PetscScalar);

static PetscErrorCode MatSetUp_MPIDense(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  if (!A->preallocated) PetscCall(MatMPIDenseSetPreallocation(A, NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAXPY_MPIDense(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
{
  Mat_MPIDense *A = (Mat_MPIDense *)Y->data, *B = (Mat_MPIDense *)X->data;

  PetscFunctionBegin;
  PetscCall(MatAXPY(A->A, alpha, B->A, str));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatConjugate_MPIDense(Mat mat)
{
  Mat_MPIDense *a = (Mat_MPIDense *)mat->data;

  PetscFunctionBegin;
  PetscCall(MatConjugate(a->A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatRealPart_MPIDense(Mat A)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatRealPart(a->A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatImaginaryPart_MPIDense(Mat A)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatImaginaryPart(a->A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetColumnVector_MPIDense(Mat A, Vec v, PetscInt col)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(a->A, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing local matrix");
  PetscCheck(a->A->ops->getcolumnvector, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing get column operation");
  PetscCall((*a->A->ops->getcolumnvector)(a->A, v, col));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatGetColumnReductions_SeqDense(Mat, PetscInt, PetscReal *);

static PetscErrorCode MatGetColumnReductions_MPIDense(Mat A, PetscInt type, PetscReal *reductions)
{
  PetscInt      i, m, n;
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatGetSize(A, &m, &n));
  if (type == REDUCTION_MEAN_REALPART) {
    PetscCall(MatGetColumnReductions_SeqDense(a->A, (PetscInt)REDUCTION_SUM_REALPART, reductions));
  } else if (type == REDUCTION_MEAN_IMAGINARYPART) {
    PetscCall(MatGetColumnReductions_SeqDense(a->A, (PetscInt)REDUCTION_SUM_IMAGINARYPART, reductions));
  } else {
    PetscCall(MatGetColumnReductions_SeqDense(a->A, type, reductions));
  }
  if (type == NORM_2) {
    for (i = 0; i < n; i++) reductions[i] *= reductions[i];
  }
  if (type == NORM_INFINITY) {
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, reductions, n, MPIU_REAL, MPIU_MAX, A->hdr.comm));
  } else {
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, reductions, n, MPIU_REAL, MPIU_SUM, A->hdr.comm));
  }
  if (type == NORM_2) {
    for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
  } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
    for (i = 0; i < n; i++) reductions[i] /= m;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetRandom_MPIDense(Mat x, PetscRandom rctx)
{
  Mat_MPIDense *d = (Mat_MPIDense *)x->data;

  PetscFunctionBegin;
  PetscCall(MatSetRandom(d->A, rctx));
#if defined(PETSC_HAVE_DEVICE)
  x->offloadmask = d->A->offloadmask;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatTransposeMultSymbolic_MPIDense_MPIDense(Mat, Mat, PetscReal, Mat);
static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense(Mat, Mat, Mat);
static PetscErrorCode MatTransposeMatMultSymbolic_MPIDense_MPIDense(Mat, Mat, PetscReal, Mat);
static PetscErrorCode MatTransposeMatMultNumeric_MPIDense_MPIDense(Mat, Mat, Mat);
static PetscErrorCode MatEqual_MPIDense(Mat, Mat, PetscBool *);
static PetscErrorCode MatLoad_MPIDense(Mat, PetscViewer);
static PetscErrorCode MatProductSetFromOptions_MPIDense(Mat);

static struct _MatOps MatOps_Values = {MatSetValues_MPIDense,
                                       MatGetRow_MPIDense,
                                       MatRestoreRow_MPIDense,
                                       MatMult_MPIDense,
                                       /*  4*/ MatMultAdd_MPIDense,
                                       MatMultTranspose_MPIDense,
                                       MatMultTransposeAdd_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatTranspose_MPIDense,
                                       /* 15*/ MatGetInfo_MPIDense,
                                       MatEqual_MPIDense,
                                       MatGetDiagonal_MPIDense,
                                       MatDiagonalScale_MPIDense,
                                       MatNorm_MPIDense,
                                       /* 20*/ MatAssemblyBegin_MPIDense,
                                       MatAssemblyEnd_MPIDense,
                                       MatSetOption_MPIDense,
                                       MatZeroEntries_MPIDense,
                                       /* 24*/ MatZeroRows_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 29*/ MatSetUp_MPIDense,
                                       NULL,
                                       NULL,
                                       MatGetDiagonalBlock_MPIDense,
                                       NULL,
                                       /* 34*/ MatDuplicate_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 39*/ MatAXPY_MPIDense,
                                       MatCreateSubMatrices_MPIDense,
                                       NULL,
                                       MatGetValues_MPIDense,
                                       MatCopy_MPIDense,
                                       /* 44*/ NULL,
                                       MatScale_MPIDense,
                                       MatShift_MPIDense,
                                       NULL,
                                       NULL,
                                       /* 49*/ MatSetRandom_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 54*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 59*/ MatCreateSubMatrix_MPIDense,
                                       MatDestroy_MPIDense,
                                       MatView_MPIDense,
                                       NULL,
                                       NULL,
                                       /* 64*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 69*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 74*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatLoad_MPIDense,
                                       /* 79*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 83*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatMatTransposeMultSymbolic_MPIDense_MPIDense,
                                       MatMatTransposeMultNumeric_MPIDense_MPIDense,
                                       /* 89*/ NULL,
                                       MatProductSetFromOptions_MPIDense,
                                       NULL,
                                       NULL,
                                       MatConjugate_MPIDense,
                                       /* 94*/ NULL,
                                       NULL,
                                       MatRealPart_MPIDense,
                                       MatImaginaryPart_MPIDense,
                                       NULL,
                                       /*99*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatGetColumnVector_MPIDense,
                                       /*104*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       MatMultHermitianTranspose_MPIDense,
                                       MatMultHermitianTransposeAdd_MPIDense,
                                       NULL,
                                       /*114*/ NULL,
                                       MatGetColumnReductions_MPIDense,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*120*/ NULL,
                                       MatTransposeMatMultSymbolic_MPIDense_MPIDense,
                                       MatTransposeMatMultNumeric_MPIDense_MPIDense,
                                       NULL,
                                       /*124*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*129*/ NULL,
                                       NULL,
                                       MatCreateMPIMatConcatenateSeqMat_MPIDense,
                                       NULL,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL};

static PetscErrorCode MatMPIDenseSetPreallocation_MPIDense(Mat mat, PetscScalar *data)
{
  Mat_MPIDense *a     = (Mat_MPIDense *)mat->data;
  MatType       mtype = MATSEQDENSE;

  PetscFunctionBegin;
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)mat), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(PetscLayoutSetUp(mat->rmap));
  PetscCall(PetscLayoutSetUp(mat->cmap));
  if (!a->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &a->A));
    PetscCall(MatSetSizes(a->A, mat->rmap->n, mat->cmap->N, mat->rmap->n, mat->cmap->N));
  }
#if defined(PETSC_HAVE_CUDA)
  PetscBool iscuda;
  PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPIDENSECUDA, &iscuda));
  if (iscuda) mtype = MATSEQDENSECUDA;
#endif
#if defined(PETSC_HAVE_HIP)
  PetscBool iship;
  PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATMPIDENSEHIP, &iship));
  if (iship) mtype = MATSEQDENSEHIP;
#endif
  PetscCall(MatSetType(a->A, mtype));
  PetscCall(MatSeqDenseSetPreallocation(a->A, data));
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
  mat->offloadmask = a->A->offloadmask;
#endif
  mat->preallocated = PETSC_TRUE;
  mat->assembled    = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIDense(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
{
  Mat B, C;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &C));
  PetscCall(MatConvert_SeqAIJ_SeqDense(C, MATSEQDENSE, MAT_INITIAL_MATRIX, &B));
  PetscCall(MatDestroy(&C));
  if (reuse == MAT_REUSE_MATRIX) {
    C = *newmat;
  } else C = NULL;
  PetscCall(MatCreateMPIMatConcatenateSeqMat(PetscObjectComm((PetscObject)A), B, A->cmap->n, !C ? MAT_INITIAL_MATRIX : MAT_REUSE_MATRIX, &C));
  PetscCall(MatDestroy(&B));
  if (reuse == MAT_INPLACE_MATRIX) PetscCall(MatHeaderReplace(A, &C));
  else if (reuse == MAT_INITIAL_MATRIX) *newmat = C;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatConvert_MPIDense_MPIAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
{
  Mat B, C;

  PetscFunctionBegin;
  PetscCall(MatDenseGetLocalMatrix(A, &C));
  PetscCall(MatConvert_SeqDense_SeqAIJ(C, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
  if (reuse == MAT_REUSE_MATRIX) {
    C = *newmat;
  } else C = NULL;
  PetscCall(MatCreateMPIMatConcatenateSeqMat(PetscObjectComm((PetscObject)A), B, A->cmap->n, !C ? MAT_INITIAL_MATRIX : MAT_REUSE_MATRIX, &C));
  PetscCall(MatDestroy(&B));
  if (reuse == MAT_INPLACE_MATRIX) PetscCall(MatHeaderReplace(A, &C));
  else if (reuse == MAT_INITIAL_MATRIX) *newmat = C;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_ELEMENTAL)
PETSC_INTERN PetscErrorCode MatConvert_MPIDense_Elemental(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat           mat_elemental;
  PetscScalar  *v;
  PetscInt      m = A->rmap->n, N = A->cmap->N, rstart = A->rmap->rstart, i, *rows, *cols, lda;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) {
    mat_elemental = *newmat;
    PetscCall(MatZeroEntries(*newmat));
  } else {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &mat_elemental));
    PetscCall(MatSetSizes(mat_elemental, PETSC_DECIDE, PETSC_DECIDE, A->rmap->N, A->cmap->N));
    PetscCall(MatSetType(mat_elemental, MATELEMENTAL));
    PetscCall(MatSetUp(mat_elemental));
    PetscCall(MatSetOption(mat_elemental, MAT_ROW_ORIENTED, PETSC_FALSE));
  }

  PetscCall(PetscMalloc2(m, &rows, N, &cols));
  for (i = 0; i < N; i++) cols[i] = i;
  for (i = 0; i < m; i++) rows[i] = rstart + i;

  /* PETSc-Elemental interface uses axpy for setting off-processor entries, only ADD_VALUES is allowed */
  PetscCall(MatDenseGetArray(A, &v));
  PetscCall(MatDenseGetLDA(a->A, &lda));
  if (lda == m) PetscCall(MatSetValues(mat_elemental, m, rows, N, cols, v, ADD_VALUES));
  else {
    for (i = 0; i < N; i++) PetscCall(MatSetValues(mat_elemental, m, rows, 1, &i, v + lda * i, ADD_VALUES));
  }
  PetscCall(MatAssemblyBegin(mat_elemental, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(mat_elemental, MAT_FINAL_ASSEMBLY));
  PetscCall(MatDenseRestoreArray(A, &v));
  PetscCall(PetscFree2(rows, cols));

  if (reuse == MAT_INPLACE_MATRIX) {
    PetscCall(MatHeaderReplace(A, &mat_elemental));
  } else {
    *newmat = mat_elemental;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatDenseGetColumn_MPIDense(Mat A, PetscInt col, PetscScalar **vals)
{
  Mat_MPIDense *mat = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDenseGetColumn(mat->A, col, vals));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseRestoreColumn_MPIDense(Mat A, PetscScalar **vals)
{
  Mat_MPIDense *mat = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCall(MatDenseRestoreColumn(mat->A, vals));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIDense(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
{
  Mat_MPIDense *mat;
  PetscInt      m, nloc, N;

  PetscFunctionBegin;
  PetscCall(MatGetSize(inmat, &m, &N));
  PetscCall(MatGetLocalSize(inmat, NULL, &nloc));
  if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
    PetscInt sum;

    if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
    /* Check sum(n) = N */
    PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
    PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);

    PetscCall(MatCreateDense(comm, m, n, PETSC_DETERMINE, N, NULL, outmat));
    PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  }

  /* numeric phase */
  mat = (Mat_MPIDense *)(*outmat)->data;
  PetscCall(MatCopy(inmat, mat->A, SAME_NONZERO_PATTERN));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDenseGetColumnVec_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  PetscInt      lda;

  PetscFunctionBegin;
  PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
  a->vecinuse = col + 1;
  PetscCall(MatDenseGetLDA(a->A, &lda));
  PetscCall(MatDenseGetArray(a->A, (PetscScalar **)&a->ptrinuse));
  PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
  *v = a->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDenseRestoreColumnVec_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
  VecCheckAssembled(a->cvec);
  a->vecinuse = 0;
  PetscCall(MatDenseRestoreArray(a->A, (PetscScalar **)&a->ptrinuse));
  PetscCall(VecResetArray(a->cvec));
  if (v) *v = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDenseGetColumnVecRead_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  PetscInt      lda;

  PetscFunctionBegin;
  PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
  a->vecinuse = col + 1;
  PetscCall(MatDenseGetLDA(a->A, &lda));
  PetscCall(MatDenseGetArrayRead(a->A, &a->ptrinuse));
  PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
  PetscCall(VecLockReadPush(a->cvec));
  *v = a->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDenseRestoreColumnVecRead_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(a->cvec, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal column vector");
  VecCheckAssembled(a->cvec);
  a->vecinuse = 0;
  PetscCall(MatDenseRestoreArrayRead(a->A, &a->ptrinuse));
  PetscCall(VecLockReadPop(a->cvec));
  PetscCall(VecResetArray(a->cvec));
  if (v) *v = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDenseGetColumnVecWrite_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  PetscInt      lda;

  PetscFunctionBegin;
  PetscCheck(!a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (!a->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &a->cvec));
  a->vecinuse = col + 1;
  PetscCall(MatDenseGetLDA(a->A, &lda));
  PetscCall(MatDenseGetArrayWrite(a->A, (PetscScalar **)&a->ptrinuse));
  PetscCall(VecPlaceArray(a->cvec, PetscSafePointerPlusOffset(a->ptrinuse, (size_t)col * (size_t)lda)));
  *v = a->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDenseRestoreColumnVecWrite_MPIDense(Mat A, PetscInt col, Vec *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  PetscCheck(a->vecinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(a->cvec, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal column vector");
  VecCheckAssembled(a->cvec);
  a->vecinuse = 0;
  PetscCall(MatDenseRestoreArrayWrite(a->A, (PetscScalar **)&a->ptrinuse));
  PetscCall(VecResetArray(a->cvec));
  if (v) *v = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseGetSubMatrix_MPIDense(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat_MPIDense *c;
  MPI_Comm      comm;
  PetscInt      prbegin, prend, pcbegin, pcend;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCheck(!a->vecinuse, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!a->matinuse, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  prbegin = PetscMax(0, PetscMin(A->rmap->rend, rbegin) - A->rmap->rstart);
  prend   = PetscMin(A->rmap->n, PetscMax(0, rend - A->rmap->rstart));
  pcbegin = PetscMax(0, PetscMin(A->cmap->rend, cbegin) - A->cmap->rstart);
  pcend   = PetscMin(A->cmap->n, PetscMax(0, cend - A->cmap->rstart));
  if (!a->cmat) {
    PetscCall(MatCreate(comm, &a->cmat));
    PetscCall(MatSetType(a->cmat, ((PetscObject)A)->type_name));
    if (rend - rbegin == A->rmap->N) PetscCall(PetscLayoutReference(A->rmap, &a->cmat->rmap));
    else {
      PetscCall(PetscLayoutSetLocalSize(a->cmat->rmap, prend - prbegin));
      PetscCall(PetscLayoutSetSize(a->cmat->rmap, rend - rbegin));
      PetscCall(PetscLayoutSetUp(a->cmat->rmap));
    }
    if (cend - cbegin == A->cmap->N) PetscCall(PetscLayoutReference(A->cmap, &a->cmat->cmap));
    else {
      PetscCall(PetscLayoutSetLocalSize(a->cmat->cmap, pcend - pcbegin));
      PetscCall(PetscLayoutSetSize(a->cmat->cmap, cend - cbegin));
      PetscCall(PetscLayoutSetUp(a->cmat->cmap));
    }
    c             = (Mat_MPIDense *)a->cmat->data;
    c->sub_rbegin = rbegin;
    c->sub_rend   = rend;
    c->sub_cbegin = cbegin;
    c->sub_cend   = cend;
  }
  c = (Mat_MPIDense *)a->cmat->data;
  if (c->sub_rbegin != rbegin || c->sub_rend != rend) {
    PetscCall(PetscLayoutDestroy(&a->cmat->rmap));
    PetscCall(PetscLayoutCreate(comm, &a->cmat->rmap));
    PetscCall(PetscLayoutSetLocalSize(a->cmat->rmap, prend - prbegin));
    PetscCall(PetscLayoutSetSize(a->cmat->rmap, rend - rbegin));
    PetscCall(PetscLayoutSetUp(a->cmat->rmap));
    c->sub_rbegin = rbegin;
    c->sub_rend   = rend;
  }
  if (c->sub_cbegin != cbegin || c->sub_cend != cend) {
    // special optimization: check if all columns are owned by rank 0, in which case no communication is necessary
    if ((cend - cbegin != a->cmat->cmap->N) || (A->cmap->range[1] != A->cmap->N)) {
      PetscCall(PetscLayoutDestroy(&a->cmat->cmap));
      PetscCall(PetscLayoutCreate(comm, &a->cmat->cmap));
      PetscCall(PetscLayoutSetLocalSize(a->cmat->cmap, pcend - pcbegin));
      PetscCall(PetscLayoutSetSize(a->cmat->cmap, cend - cbegin));
      PetscCall(PetscLayoutSetUp(a->cmat->cmap));
      PetscCall(VecDestroy(&c->lvec));
      PetscCall(PetscSFDestroy(&c->Mvctx));
    }
    c->sub_cbegin = cbegin;
    c->sub_cend   = cend;
  }
  PetscCheck(!c->A, comm, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCall(MatDenseGetSubMatrix(a->A, prbegin, prend, cbegin, cend, &c->A));

  a->cmat->preallocated = PETSC_TRUE;
  a->cmat->assembled    = PETSC_TRUE;
#if defined(PETSC_HAVE_DEVICE)
  a->cmat->offloadmask = c->A->offloadmask;
#endif
  a->matinuse = cbegin + 1;
  *v          = a->cmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDenseRestoreSubMatrix_MPIDense(Mat A, Mat *v)
{
  Mat_MPIDense *a = (Mat_MPIDense *)A->data;
  Mat_MPIDense *c;

  PetscFunctionBegin;
  PetscCheck(a->matinuse, PetscObjectComm((PetscObject)A), PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
  PetscCheck(a->cmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing internal matrix");
  PetscCheck(*v == a->cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
  a->matinuse = 0;
  c           = (Mat_MPIDense *)a->cmat->data;
  PetscCall(MatDenseRestoreSubMatrix(a->A, &c->A));
  *v = NULL;
#if defined(PETSC_HAVE_DEVICE)
  A->offloadmask = a->A->offloadmask;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATMPIDENSE - MATMPIDENSE = "mpidense" - A matrix type to be used for distributed dense matrices.

   Options Database Key:
. -mat_type mpidense - sets the matrix type to `MATMPIDENSE` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateDense()`, `MATSEQDENSE`, `MATDENSE`
M*/
PetscErrorCode MatCreate_MPIDense(Mat mat)
{
  Mat_MPIDense *a;

  PetscFunctionBegin;
  PetscCall(PetscNew(&a));
  mat->data   = (void *)a;
  mat->ops[0] = MatOps_Values;

  mat->insertmode = NOT_SET_VALUES;

  /* build cache for off array entries formed */
  a->donotstash = PETSC_FALSE;

  PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)mat), 1, &mat->stash));

  /* stuff used for matrix vector multiply */
  a->lvec        = NULL;
  a->Mvctx       = NULL;
  a->roworiented = PETSC_TRUE;

  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetLDA_C", MatDenseGetLDA_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseSetLDA_C", MatDenseSetLDA_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArray_C", MatDenseGetArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArray_C", MatDenseRestoreArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayRead_C", MatDenseRestoreArrayRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreArrayWrite_C", MatDenseRestoreArrayWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDensePlaceArray_C", MatDensePlaceArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseResetArray_C", MatDenseResetArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseReplaceArray_C", MatDenseReplaceArray_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpiaij_C", MatConvert_MPIDense_MPIAIJ));
#if defined(PETSC_HAVE_ELEMENTAL)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_elemental_C", MatConvert_MPIDense_Elemental));
#endif
#if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_scalapack_C", MatConvert_Dense_ScaLAPACK));
#endif
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensecuda_C", MatConvert_MPIDense_MPIDenseCUDA));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIDenseSetPreallocation_C", MatMPIDenseSetPreallocation_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaij_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijcusparse_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijcusparse_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
#endif
#if defined(PETSC_HAVE_HIP)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpidense_mpidensehip_C", MatConvert_MPIDense_MPIDenseHIP));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaijhipsparse_mpidense_C", MatProductSetFromOptions_MPIAIJ_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpidense_mpiaijhipsparse_C", MatProductSetFromOptions_MPIDense_MPIAIJ));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseGetColumn_C", MatDenseGetColumn_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDenseRestoreColumn_C", MatDenseRestoreColumn_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultColumnRange_C", MatMultColumnRange_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultAddColumnRange_C", MatMultAddColumnRange_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_MPIDense));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_MPIDense));
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATMPIDENSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATDENSE - MATDENSE = "dense" - A matrix type to be used for dense matrices.

   This matrix type is identical to `MATSEQDENSE` when constructed with a single process communicator,
   and `MATMPIDENSE` otherwise.

   Options Database Key:
. -mat_type dense - sets the matrix type to `MATDENSE` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQDENSE`, `MATMPIDENSE`, `MATDENSECUDA`, `MATDENSEHIP`
M*/

/*@
  MatMPIDenseSetPreallocation - Sets the array used to store the matrix entries

  Collective

  Input Parameters:
+ B    - the matrix
- data - optional location of matrix data.  Set to `NULL` for PETSc
         to control all matrix memory allocation.

  Level: intermediate

  Notes:
  The dense format is fully compatible with standard Fortran
  storage by columns.

  The data input variable is intended primarily for Fortran programmers
  who wish to allocate their own matrix memory space.  Most users should
  set `data` to `NULL`.

.seealso: [](ch_matrices), `Mat`, `MATMPIDENSE`, `MatCreate()`, `MatCreateSeqDense()`, `MatSetValues()`
@*/
PetscErrorCode MatMPIDenseSetPreallocation(Mat B, PetscScalar *data)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscTryMethod(B, "MatMPIDenseSetPreallocation_C", (Mat, PetscScalar *), (B, data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatDensePlaceArray - Allows one to replace the array in a `MATDENSE` matrix with an
  array provided by the user. This is useful to avoid copying an array
  into a matrix

  Not Collective

  Input Parameters:
+ mat   - the matrix
- array - the array in column major order

  Level: developer

  Note:
  Adding `const` to `array` was an oversight, see notes in `VecPlaceArray()`.

  You can return to the original array with a call to `MatDenseResetArray()`. The user is responsible for freeing this array; it will not be
  freed when the matrix is destroyed.

.seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatDenseGetArray()`, `MatDenseResetArray()`, `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecResetArray()`,
          `MatDenseReplaceArray()`
@*/
PetscErrorCode MatDensePlaceArray(Mat mat, const PetscScalar *array)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
  PetscUseMethod(mat, "MatDensePlaceArray_C", (Mat, const PetscScalar *), (mat, array));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat));
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
  mat->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatDenseResetArray - Resets the matrix array to that it previously had before the call to `MatDensePlaceArray()`

  Not Collective

  Input Parameter:
. mat - the matrix

  Level: developer

  Note:
  You can only call this after a call to `MatDensePlaceArray()`

.seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatDenseGetArray()`, `MatDensePlaceArray()`, `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecResetArray()`
@*/
PetscErrorCode MatDenseResetArray(Mat mat)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
  PetscUseMethod(mat, "MatDenseResetArray_C", (Mat), (mat));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatDenseReplaceArray - Allows one to replace the array in a dense matrix with an
  array provided by the user. This is useful to avoid copying an array
  into a matrix

  Not Collective

  Input Parameters:
+ mat   - the matrix
- array - the array in column major order

  Level: developer

  Note:
  Adding `const` to `array` was an oversight, see notes in `VecPlaceArray()`.

  The memory passed in MUST be obtained with `PetscMalloc()` and CANNOT be
  freed by the user. It will be freed when the matrix is destroyed.

.seealso: [](ch_matrices), `Mat`, `MatDensePlaceArray()`, `MatDenseGetArray()`, `VecReplaceArray()`
@*/
PetscErrorCode MatDenseReplaceArray(Mat mat, const PetscScalar *array)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
  PetscUseMethod(mat, "MatDenseReplaceArray_C", (Mat, const PetscScalar *), (mat, array));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat));
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
  mat->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateDense - Creates a matrix in `MATDENSE` format.

  Collective

  Input Parameters:
+ comm - MPI communicator
. m    - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
. n    - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
. M    - number of global rows (or `PETSC_DECIDE` to have calculated if `m` is given)
. N    - number of global columns (or `PETSC_DECIDE` to have calculated if `n` is given)
- data - optional location of matrix data.  Set data to `NULL` (`PETSC_NULL_SCALAR_ARRAY` for Fortran users) for PETSc
   to control all matrix memory allocation.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  The dense format is fully compatible with standard Fortran
  storage by columns.

  Although local portions of the matrix are stored in column-major
  order, the matrix is partitioned across MPI ranks by row.

  The data input variable is intended primarily for Fortran programmers
  who wish to allocate their own matrix memory space.  Most users should
  set `data` to `NULL` (`PETSC_NULL_SCALAR_ARRAY` for Fortran users).

  The user MUST specify either the local or global matrix dimensions
  (possibly both).

.seealso: [](ch_matrices), `Mat`, `MATDENSE`, `MatCreate()`, `MatCreateSeqDense()`, `MatSetValues()`
@*/
PetscErrorCode MatCreateDense(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscScalar data[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCall(MatSetType(*A, MATDENSE));
  PetscCall(MatSeqDenseSetPreallocation(*A, data));
  PetscCall(MatMPIDenseSetPreallocation(*A, data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDuplicate_MPIDense(Mat A, MatDuplicateOption cpvalues, Mat *newmat)
{
  Mat           mat;
  Mat_MPIDense *a, *oldmat = (Mat_MPIDense *)A->data;

  PetscFunctionBegin;
  *newmat = NULL;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &mat));
  PetscCall(MatSetSizes(mat, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
  PetscCall(MatSetType(mat, ((PetscObject)A)->type_name));
  a = (Mat_MPIDense *)mat->data;

  mat->factortype   = A->factortype;
  mat->assembled    = PETSC_TRUE;
  mat->preallocated = PETSC_TRUE;

  mat->insertmode = NOT_SET_VALUES;
  a->donotstash   = oldmat->donotstash;

  PetscCall(PetscLayoutReference(A->rmap, &mat->rmap));
  PetscCall(PetscLayoutReference(A->cmap, &mat->cmap));

  PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));

  *newmat = mat;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatLoad_MPIDense(Mat newMat, PetscViewer viewer)
{
  PetscBool isbinary;
#if defined(PETSC_HAVE_HDF5)
  PetscBool ishdf5;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
  /* force binary viewer to load .info file if it has not yet done so */
  PetscCall(PetscViewerSetUp(viewer));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
#if defined(PETSC_HAVE_HDF5)
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
#endif
  if (isbinary) {
    PetscCall(MatLoad_Dense_Binary(newMat, viewer));
#if defined(PETSC_HAVE_HDF5)
  } else if (ishdf5) {
    PetscCall(MatLoad_Dense_HDF5(newMat, viewer));
#endif
  } else SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatEqual_MPIDense(Mat A, Mat B, PetscBool *flag)
{
  Mat_MPIDense *matB = (Mat_MPIDense *)B->data, *matA = (Mat_MPIDense *)A->data;
  Mat           a, b;

  PetscFunctionBegin;
  a = matA->A;
  b = matB->A;
  PetscCall(MatEqual(a, b, flag));
  PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, flag, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductCtxDestroy_MatTransMatMult_MPIDense_MPIDense(PetscCtxRt data)
{
  MatProductCtx_TransMatMultDense *atb = *(MatProductCtx_TransMatMultDense **)data;

  PetscFunctionBegin;
  PetscCall(PetscFree2(atb->sendbuf, atb->recvcounts));
  PetscCall(MatDestroy(&atb->atb));
  PetscCall(PetscFree(atb));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductCtxDestroy_MatMatTransMult_MPIDense_MPIDense(PetscCtxRt data)
{
  MatProductCtx_MatTransMultDense *abt = *(MatProductCtx_MatTransMultDense **)data;

  PetscFunctionBegin;
  PetscCall(PetscFree2(abt->buf[0], abt->buf[1]));
  PetscCall(PetscFree2(abt->recvcounts, abt->recvdispls));
  PetscCall(PetscFree(abt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTransposeMatMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
{
  Mat_MPIDense                    *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
  MatProductCtx_TransMatMultDense *atb;
  MPI_Comm                         comm;
  PetscMPIInt                      size, *recvcounts;
  PetscScalar                     *carray, *sendbuf;
  const PetscScalar               *atbarray;
  PetscInt                         i, cN = C->cmap->N, proc, k, j, lda;
  const PetscInt                  *ranges;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  atb        = (MatProductCtx_TransMatMultDense *)C->product->data;
  recvcounts = atb->recvcounts;
  sendbuf    = atb->sendbuf;

  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  /* compute atbarray = aseq^T * bseq */
  PetscCall(MatTransposeMatMult(a->A, b->A, atb->atb ? MAT_REUSE_MATRIX : MAT_INITIAL_MATRIX, PETSC_DETERMINE, &atb->atb));

  PetscCall(MatGetOwnershipRanges(C, &ranges));

  if (ranges[1] == C->rmap->N) {
    /* all of the values are being reduced to rank 0: optimize this case to use MPI_Reduce and GPU aware MPI if available */
    PetscInt           atb_lda, c_lda;
    Mat                atb_local = atb->atb;
    Mat                atb_alloc = NULL;
    Mat                c_local   = c->A;
    Mat                c_alloc   = NULL;
    PetscMemType       atb_memtype, c_memtype;
    const PetscScalar *atb_array = NULL;
    MPI_Datatype       vector_type;
    PetscScalar       *c_array = NULL;
    PetscMPIInt        rank;

    PetscCallMPI(MPI_Comm_rank(comm, &rank));

    PetscCall(MatDenseGetLDA(atb_local, &atb_lda));
    if (atb_lda != C->rmap->N) {
      // copy atb to a matrix that will have lda == the number of rows
      PetscCall(MatDuplicate(atb_local, MAT_DO_NOT_COPY_VALUES, &atb_alloc));
      PetscCall(MatCopy(atb_local, atb_alloc, DIFFERENT_NONZERO_PATTERN));
      atb_local = atb_alloc;
    }

    if (rank == 0) {
      PetscCall(MatDenseGetLDA(c_local, &c_lda));
      if (c_lda != C->rmap->N) {
        // copy c to a matrix that will have lda == the number of rows
        PetscCall(MatDuplicate(c_local, MAT_DO_NOT_COPY_VALUES, &c_alloc));
        c_local = c_alloc;
      }
      PetscCall(MatZeroEntries(c_local));
    }
    /* atb_local and c_local have nrows = lda = A->cmap->N and ncols =
     * B->cmap->N: use the a->Mvctx to use the best reduction method */
    if (!a->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A));
    vector_type = MPIU_SCALAR;
    if (B->cmap->N > 1) {
      PetscMPIInt mpi_N;

      PetscCall(PetscMPIIntCast(B->cmap->N, &mpi_N));
      PetscCallMPI(MPI_Type_contiguous(mpi_N, MPIU_SCALAR, &vector_type));
      PetscCallMPI(MPI_Type_commit(&vector_type));
    }
    PetscCall(MatDenseGetArrayReadAndMemType(atb_local, &atb_array, &atb_memtype));
    PetscCall(MatDenseGetArrayWriteAndMemType(c_local, &c_array, &c_memtype));
    PetscCall(PetscSFReduceWithMemTypeBegin(a->Mvctx, vector_type, atb_memtype, atb_array, c_memtype, c_array, MPIU_SUM));
    PetscCall(PetscSFReduceEnd(a->Mvctx, vector_type, atb_array, c_array, MPIU_SUM));
    PetscCall(MatDenseRestoreArrayWriteAndMemType(c_local, &c_array));
    PetscCall(MatDenseRestoreArrayReadAndMemType(atb_local, &atb_array));
    if (rank == 0 && c_local != c->A) PetscCall(MatCopy(c_local, c->A, DIFFERENT_NONZERO_PATTERN));
    if (B->cmap->N > 1) PetscCallMPI(MPI_Type_free(&vector_type));
    PetscCall(MatDestroy(&atb_alloc));
    PetscCall(MatDestroy(&c_alloc));
    PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
    PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* arrange atbarray into sendbuf */
  PetscCall(MatDenseGetArrayRead(atb->atb, &atbarray));
  PetscCall(MatDenseGetLDA(atb->atb, &lda));
  for (proc = 0, k = 0; proc < size; proc++) {
    for (j = 0; j < cN; j++) {
      for (i = ranges[proc]; i < ranges[proc + 1]; i++) sendbuf[k++] = atbarray[i + j * lda];
    }
  }
  PetscCall(MatDenseRestoreArrayRead(atb->atb, &atbarray));

  /* sum all atbarray to local values of C */
  PetscCall(MatDenseGetArrayWrite(c->A, &carray));
  PetscCallMPI(MPI_Reduce_scatter(sendbuf, carray, recvcounts, MPIU_SCALAR, MPIU_SUM, comm));
  PetscCall(MatDenseRestoreArrayWrite(c->A, &carray));
  PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTransposeMatMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
{
  MPI_Comm                         comm;
  PetscMPIInt                      size;
  PetscInt                         cm = A->cmap->n, cM, cN = B->cmap->N;
  MatProductCtx_TransMatMultDense *atb;
  PetscBool                        cisdense = PETSC_FALSE;
  const PetscInt                  *ranges;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, comm, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart,
             A->rmap->rend, B->rmap->rstart, B->rmap->rend);

  /* create matrix product C */
  PetscCall(MatSetSizes(C, cm, B->cmap->n, A->cmap->N, B->cmap->N));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, ""));
#endif
#if defined(PETSC_HAVE_HIP)
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSEHIP, ""));
#endif
  if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
  PetscCall(MatSetUp(C));

  /* create data structure for reuse C */
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(PetscNew(&atb));
  cM = C->rmap->N;
  PetscCall(PetscMalloc2(cM * cN, &atb->sendbuf, size, &atb->recvcounts));
  PetscCall(MatGetOwnershipRanges(C, &ranges));
  for (PetscMPIInt i = 0; i < size; i++) PetscCall(PetscMPIIntCast((ranges[i + 1] - ranges[i]) * cN, &atb->recvcounts[i]));
  C->product->data    = atb;
  C->product->destroy = MatProductCtxDestroy_MatTransMatMult_MPIDense_MPIDense;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatTransposeMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
{
  MPI_Comm                         comm;
  PetscMPIInt                      i, size;
  PetscInt                         maxRows, bufsiz;
  PetscMPIInt                      tag;
  PetscInt                         alg;
  MatProductCtx_MatTransMultDense *abt;
  Mat_Product                     *product = C->product;
  PetscBool                        flg;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
  /* check local size of A and B */
  PetscCheck(A->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local column dimensions are incompatible, A (%" PetscInt_FMT ") != B (%" PetscInt_FMT ")", A->cmap->n, B->cmap->n);

  PetscCall(PetscStrcmp(product->alg, "allgatherv", &flg));
  alg = flg ? 0 : 1;

  /* setup matrix product C */
  PetscCall(MatSetSizes(C, A->rmap->n, B->rmap->n, A->rmap->N, B->rmap->N));
  PetscCall(MatSetType(C, MATMPIDENSE));
  PetscCall(MatSetUp(C));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag));

  /* create data structure for reuse C */
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(PetscNew(&abt));
  abt->tag = tag;
  abt->alg = alg;
  switch (alg) {
  case 1: /* alg: "cyclic" */
    for (maxRows = 0, i = 0; i < size; i++) maxRows = PetscMax(maxRows, B->rmap->range[i + 1] - B->rmap->range[i]);
    bufsiz = A->cmap->N * maxRows;
    PetscCall(PetscMalloc2(bufsiz, &abt->buf[0], bufsiz, &abt->buf[1]));
    break;
  default: /* alg: "allgatherv" */
    PetscCall(PetscMalloc2(B->rmap->n * B->cmap->N, &abt->buf[0], B->rmap->N * B->cmap->N, &abt->buf[1]));
    PetscCall(PetscMalloc2(size, &abt->recvcounts, size + 1, &abt->recvdispls));
    for (i = 0; i <= size; i++) PetscCall(PetscMPIIntCast(B->rmap->range[i] * A->cmap->N, &abt->recvdispls[i]));
    for (i = 0; i < size; i++) PetscCall(PetscMPIIntCast(abt->recvdispls[i + 1] - abt->recvdispls[i], &abt->recvcounts[i]));
    break;
  }

  C->product->data                = abt;
  C->product->destroy             = MatProductCtxDestroy_MatMatTransMult_MPIDense_MPIDense;
  C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_MPIDense_MPIDense;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense_Cyclic(Mat A, Mat B, Mat C)
{
  Mat_MPIDense                    *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
  MatProductCtx_MatTransMultDense *abt;
  MPI_Comm                         comm;
  PetscMPIInt                      rank, size, sendto, recvfrom, recvisfrom;
  PetscScalar                     *sendbuf, *recvbuf = NULL, *cv;
  PetscInt                         i, cK             = A->cmap->N, sendsiz, recvsiz, k, j, bn;
  PetscScalar                      _DOne = 1.0, _DZero = 0.0;
  const PetscScalar               *av, *bv;
  PetscBLASInt                     cm, cn, ck, alda, blda = 0, clda;
  MPI_Request                      reqs[2];
  const PetscInt                  *ranges;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  abt = (MatProductCtx_MatTransMultDense *)C->product->data;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(MatDenseGetArrayRead(a->A, &av));
  PetscCall(MatDenseGetArrayRead(b->A, &bv));
  PetscCall(MatDenseGetArrayWrite(c->A, &cv));
  PetscCall(MatDenseGetLDA(a->A, &i));
  PetscCall(PetscBLASIntCast(i, &alda));
  PetscCall(MatDenseGetLDA(b->A, &i));
  PetscCall(PetscBLASIntCast(i, &blda));
  PetscCall(MatDenseGetLDA(c->A, &i));
  PetscCall(PetscBLASIntCast(i, &clda));
  PetscCall(MatGetOwnershipRanges(B, &ranges));
  bn = B->rmap->n;
  if (blda == bn) {
    sendbuf = (PetscScalar *)bv;
  } else {
    sendbuf = abt->buf[0];
    for (k = 0, i = 0; i < cK; i++) {
      for (j = 0; j < bn; j++, k++) sendbuf[k] = bv[i * blda + j];
    }
  }
  if (size > 1) {
    sendto   = (rank + size - 1) % size;
    recvfrom = (rank + size + 1) % size;
  } else {
    sendto = recvfrom = 0;
  }
  PetscCall(PetscBLASIntCast(cK, &ck));
  PetscCall(PetscBLASIntCast(c->A->rmap->n, &cm));
  recvisfrom = rank;
  for (i = 0; i < size; i++) {
    /* we have finished receiving in sending, bufs can be read/modified */
    PetscMPIInt nextrecvisfrom = (recvisfrom + 1) % size; /* which process the next recvbuf will originate on */
    PetscInt    nextbn         = ranges[nextrecvisfrom + 1] - ranges[nextrecvisfrom];

    if (nextrecvisfrom != rank) {
      /* start the cyclic sends from sendbuf, to recvbuf (which will switch to sendbuf) */
      sendsiz = cK * bn;
      recvsiz = cK * nextbn;
      recvbuf = (i & 1) ? abt->buf[0] : abt->buf[1];
      PetscCallMPI(MPIU_Isend(sendbuf, sendsiz, MPIU_SCALAR, sendto, abt->tag, comm, &reqs[0]));
      PetscCallMPI(MPIU_Irecv(recvbuf, recvsiz, MPIU_SCALAR, recvfrom, abt->tag, comm, &reqs[1]));
    }

    /* local aseq * sendbuf^T */
    PetscCall(PetscBLASIntCast(ranges[recvisfrom + 1] - ranges[recvisfrom], &cn));
    if (cm && cn && ck) PetscCallBLAS("BLASgemm", BLASgemm_("N", "T", &cm, &cn, &ck, &_DOne, av, &alda, sendbuf, &cn, &_DZero, cv + clda * ranges[recvisfrom], &clda));

    if (nextrecvisfrom != rank) {
      /* wait for the sends and receives to complete, swap sendbuf and recvbuf */
      PetscCallMPI(MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE));
    }
    bn         = nextbn;
    recvisfrom = nextrecvisfrom;
    sendbuf    = recvbuf;
  }
  PetscCall(MatDenseRestoreArrayRead(a->A, &av));
  PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
  PetscCall(MatDenseRestoreArrayWrite(c->A, &cv));
  PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense_Allgatherv(Mat A, Mat B, Mat C)
{
  Mat_MPIDense                    *a = (Mat_MPIDense *)A->data, *b = (Mat_MPIDense *)B->data, *c = (Mat_MPIDense *)C->data;
  MatProductCtx_MatTransMultDense *abt;
  MPI_Comm                         comm;
  PetscMPIInt                      size, ibn;
  PetscScalar                     *cv, *sendbuf, *recvbuf;
  const PetscScalar               *av, *bv;
  PetscInt                         blda, i, cK = A->cmap->N, k, j, bn;
  PetscScalar                      _DOne = 1.0, _DZero = 0.0;
  PetscBLASInt                     cm, cn, ck, alda, clda;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  abt = (MatProductCtx_MatTransMultDense *)C->product->data;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(MatDenseGetArrayRead(a->A, &av));
  PetscCall(MatDenseGetArrayRead(b->A, &bv));
  PetscCall(MatDenseGetArrayWrite(c->A, &cv));
  PetscCall(MatDenseGetLDA(a->A, &i));
  PetscCall(PetscBLASIntCast(i, &alda));
  PetscCall(MatDenseGetLDA(b->A, &blda));
  PetscCall(MatDenseGetLDA(c->A, &i));
  PetscCall(PetscBLASIntCast(i, &clda));
  /* copy transpose of B into buf[0] */
  bn      = B->rmap->n;
  sendbuf = abt->buf[0];
  recvbuf = abt->buf[1];
  for (k = 0, j = 0; j < bn; j++) {
    for (i = 0; i < cK; i++, k++) sendbuf[k] = bv[i * blda + j];
  }
  PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
  PetscCall(PetscMPIIntCast(bn * cK, &ibn));
  PetscCallMPI(MPI_Allgatherv(sendbuf, ibn, MPIU_SCALAR, recvbuf, abt->recvcounts, abt->recvdispls, MPIU_SCALAR, comm));
  PetscCall(PetscBLASIntCast(cK, &ck));
  PetscCall(PetscBLASIntCast(c->A->rmap->n, &cm));
  PetscCall(PetscBLASIntCast(c->A->cmap->n, &cn));
  if (cm && cn && ck) PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &cm, &cn, &ck, &_DOne, av, &alda, recvbuf, &ck, &_DZero, cv, &clda));
  PetscCall(MatDenseRestoreArrayRead(a->A, &av));
  PetscCall(MatDenseRestoreArrayRead(b->A, &bv));
  PetscCall(MatDenseRestoreArrayWrite(c->A, &cv));
  PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatTransposeMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
{
  MatProductCtx_MatTransMultDense *abt;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  abt = (MatProductCtx_MatTransMultDense *)C->product->data;
  switch (abt->alg) {
  case 1:
    PetscCall(MatMatTransposeMultNumeric_MPIDense_MPIDense_Cyclic(A, B, C));
    break;
  default:
    PetscCall(MatMatTransposeMultNumeric_MPIDense_MPIDense_Allgatherv(A, B, C));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductCtxDestroy_MatMatMult_MPIDense_MPIDense(PetscCtxRt data)
{
  MatProductCtx_MatMultDense *ab = *(MatProductCtx_MatMultDense **)data;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&ab->Ce));
  PetscCall(MatDestroy(&ab->Ae));
  PetscCall(MatDestroy(&ab->Be));
  PetscCall(PetscFree(ab));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatMultNumeric_MPIDense_MPIDense(Mat A, Mat B, Mat C)
{
  MatProductCtx_MatMultDense *ab;
  Mat_MPIDense               *mdn = (Mat_MPIDense *)A->data;
  Mat_MPIDense               *b   = (Mat_MPIDense *)B->data;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Missing product data");
  ab = (MatProductCtx_MatMultDense *)C->product->data;
  if (ab->Ae && ab->Ce) {
#if PetscDefined(HAVE_ELEMENTAL)
    PetscCall(MatConvert_MPIDense_Elemental(A, MATELEMENTAL, MAT_REUSE_MATRIX, &ab->Ae));
    PetscCall(MatConvert_MPIDense_Elemental(B, MATELEMENTAL, MAT_REUSE_MATRIX, &ab->Be));
    PetscCall(MatMatMultNumeric_Elemental(ab->Ae, ab->Be, ab->Ce));
    PetscCall(MatConvert(ab->Ce, MATMPIDENSE, MAT_REUSE_MATRIX, &C));
#else
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "PETSC_HAVE_ELEMENTAL not defined");
#endif
  } else {
    MPI_Comm           comm;
    const PetscScalar *read;
    PetscScalar       *write;
    PetscInt           lda;
    const PetscInt    *ranges;
    PetscMPIInt        size;

    if (!mdn->Mvctx) PetscCall(MatSetUpMultiply_MPIDense(A)); /* cannot be done during the symbolic phase because of possible calls to MatProductReplaceMats() */
    comm = PetscObjectComm((PetscObject)B);
    PetscCallMPI(MPI_Comm_size(comm, &size));
    PetscCall(PetscLayoutGetRanges(B->rmap, &ranges));
    if (ranges[1] == ranges[size]) {
      // optimize for the case where the B matrix is broadcast from rank 0
      PetscInt           b_lda, be_lda;
      Mat                b_local  = b->A;
      Mat                b_alloc  = NULL;
      Mat                be_local = ab->Be;
      Mat                be_alloc = NULL;
      PetscMemType       b_memtype, be_memtype;
      const PetscScalar *b_array = NULL;
      MPI_Datatype       vector_type;
      PetscScalar       *be_array = NULL;
      PetscMPIInt        rank;

      PetscCallMPI(MPI_Comm_rank(comm, &rank));
      PetscCall(MatDenseGetLDA(be_local, &be_lda));
      if (be_lda != B->rmap->N) {
        PetscCall(MatDuplicate(be_local, MAT_DO_NOT_COPY_VALUES, &be_alloc));
        be_local = be_alloc;
      }

      if (rank == 0) {
        PetscCall(MatDenseGetLDA(b_local, &b_lda));
        if (b_lda != B->rmap->N) {
          PetscCall(MatDuplicate(b_local, MAT_DO_NOT_COPY_VALUES, &b_alloc));
          PetscCall(MatCopy(b_local, b_alloc, DIFFERENT_NONZERO_PATTERN));
          b_local = b_alloc;
        }
      }
      vector_type = MPIU_SCALAR;
      if (B->cmap->N > 1) {
        PetscMPIInt mpi_N;

        PetscCall(PetscMPIIntCast(B->cmap->N, &mpi_N));
        PetscCallMPI(MPI_Type_contiguous(mpi_N, MPIU_SCALAR, &vector_type));
        PetscCallMPI(MPI_Type_commit(&vector_type));
      }
      PetscCall(MatDenseGetArrayReadAndMemType(b_local, &b_array, &b_memtype));
      PetscCall(MatDenseGetArrayWriteAndMemType(be_local, &be_array, &be_memtype));
      PetscCall(PetscSFBcastWithMemTypeBegin(mdn->Mvctx, vector_type, b_memtype, b_array, be_memtype, be_array, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(mdn->Mvctx, vector_type, b_array, be_array, MPI_REPLACE));
      PetscCall(MatDenseRestoreArrayWriteAndMemType(be_local, &be_array));
      PetscCall(MatDenseRestoreArrayReadAndMemType(b_local, &b_array));
      if (be_local != ab->Be) PetscCall(MatCopy(be_local, ab->Be, DIFFERENT_NONZERO_PATTERN));
      if (B->cmap->N > 1) PetscCallMPI(MPI_Type_free(&vector_type));
      PetscCall(MatDestroy(&be_alloc));
      PetscCall(MatDestroy(&b_alloc));
    } else {
      PetscCall(MatDenseGetLDA(B, &lda));
      PetscCall(MatDenseGetArrayRead(B, &read));
      PetscCall(MatDenseGetArrayWrite(ab->Be, &write));
      for (PetscInt i = 0; i < C->cmap->N; ++i) {
        PetscCall(PetscSFBcastBegin(mdn->Mvctx, MPIU_SCALAR, read + i * lda, write + i * ab->Be->rmap->n, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(mdn->Mvctx, MPIU_SCALAR, read + i * lda, write + i * ab->Be->rmap->n, MPI_REPLACE));
      }
      PetscCall(MatDenseRestoreArrayWrite(ab->Be, &write));
      PetscCall(MatDenseRestoreArrayRead(B, &read));
    }
    PetscCall(MatMatMultNumeric_SeqDense_SeqDense(((Mat_MPIDense *)A->data)->A, ab->Be, ((Mat_MPIDense *)C->data)->A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIDense(Mat A, Mat B, PetscReal fill, Mat C)
{
  Mat_Product                *product = C->product;
  PetscInt                    alg;
  MatProductCtx_MatMultDense *ab;
  PetscBool                   flg;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
  /* check local size of A and B */
  PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ", %" PetscInt_FMT ")",
             A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);

  PetscCall(PetscStrcmp(product->alg, "petsc", &flg));
  alg = flg ? 0 : 1;

  /* setup C */
  PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
  PetscCall(MatSetType(C, MATMPIDENSE));
  PetscCall(MatSetUp(C));

  /* create data structure for reuse Cdense */
  PetscCall(PetscNew(&ab));

  switch (alg) {
  case 1: /* alg: "elemental" */
#if PetscDefined(HAVE_ELEMENTAL)
    /* create elemental matrices Ae and Be */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &ab->Ae));
    PetscCall(MatSetSizes(ab->Ae, PETSC_DECIDE, PETSC_DECIDE, A->rmap->N, A->cmap->N));
    PetscCall(MatSetType(ab->Ae, MATELEMENTAL));
    PetscCall(MatSetUp(ab->Ae));
    PetscCall(MatSetOption(ab->Ae, MAT_ROW_ORIENTED, PETSC_FALSE));

    PetscCall(MatCreate(PetscObjectComm((PetscObject)B), &ab->Be));
    PetscCall(MatSetSizes(ab->Be, PETSC_DECIDE, PETSC_DECIDE, B->rmap->N, B->cmap->N));
    PetscCall(MatSetType(ab->Be, MATELEMENTAL));
    PetscCall(MatSetUp(ab->Be));
    PetscCall(MatSetOption(ab->Be, MAT_ROW_ORIENTED, PETSC_FALSE));

    /* compute symbolic Ce = Ae*Be */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &ab->Ce));
    PetscCall(MatMatMultSymbolic_Elemental(ab->Ae, ab->Be, fill, ab->Ce));
#else
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "PETSC_HAVE_ELEMENTAL not defined");
#endif
    break;
  default: /* alg: "petsc" */
    ab->Ae = NULL;
    PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, A->cmap->N, B->cmap->N, NULL, &ab->Be));
    ab->Ce = NULL;
    break;
  }

  C->product->data       = ab;
  C->product->destroy    = MatProductCtxDestroy_MatMatMult_MPIDense_MPIDense;
  C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIDense;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSetFromOptions_MPIDense_AB(Mat C)
{
  Mat_Product *product     = C->product;
  const char  *algTypes[2] = {"petsc", "elemental"};
  PetscInt     alg, nalg = PetscDefined(HAVE_ELEMENTAL) ? 2 : 1;
  PetscBool    flg = PETSC_FALSE;

  PetscFunctionBegin;
  /* Set default algorithm */
  alg = 0; /* default is PETSc */
  PetscCall(PetscStrcmp(product->alg, "default", &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  /* Get runtime option */
  PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
  PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIDense;
  C->ops->productsymbolic = MatProductSymbolic_AB;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSetFromOptions_MPIDense_AtB(Mat C)
{
  Mat_Product *product = C->product;
  Mat          A = product->A, B = product->B;

  PetscFunctionBegin;
  PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
             A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);
  C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_MPIDense_MPIDense;
  C->ops->productsymbolic          = MatProductSymbolic_AtB;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSetFromOptions_MPIDense_ABt(Mat C)
{
  Mat_Product *product     = C->product;
  const char  *algTypes[2] = {"allgatherv", "cyclic"};
  PetscInt     alg, nalg = 2;
  PetscBool    flg = PETSC_FALSE;

  PetscFunctionBegin;
  /* Set default algorithm */
  alg = 0; /* default is allgatherv */
  PetscCall(PetscStrcmp(product->alg, "default", &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  /* Get runtime option */
  if (product->api_user) {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
    PetscCall(PetscOptionsEList("-matmattransmult_mpidense_mpidense_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
    PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  }
  if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));

  C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_MPIDense_MPIDense;
  C->ops->productsymbolic          = MatProductSymbolic_ABt;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSetFromOptions_MPIDense(Mat C)
{
  Mat_Product *product = C->product;

  PetscFunctionBegin;
  switch (product->type) {
  case MATPRODUCT_AB:
    PetscCall(MatProductSetFromOptions_MPIDense_AB(C));
    break;
  case MATPRODUCT_AtB:
    PetscCall(MatProductSetFromOptions_MPIDense_AtB(C));
    break;
  case MATPRODUCT_ABt:
    PetscCall(MatProductSetFromOptions_MPIDense_ABt(C));
    break;
  default:
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDenseScatter_Private(PetscSF sf, Mat X, Mat Y, InsertMode mode, ScatterMode smode)
{
  const PetscScalar *in;
  PetscScalar       *out;
  PetscSF            vsf;
  PetscInt           N, ny, rld, lld;
  PetscMemType       mtype[2];
  MPI_Op             op = MPI_OP_NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidHeaderSpecific(X, MAT_CLASSID, 2);
  PetscValidHeaderSpecific(Y, MAT_CLASSID, 3);
  if (mode == INSERT_VALUES) op = MPI_REPLACE;
  else if (mode == ADD_VALUES) op = MPIU_SUM;
  else if (mode == MAX_VALUES) op = MPIU_MAX;
  else if (mode == MIN_VALUES) op = MPIU_MIN;
  PetscCheck(op != MPI_OP_NULL, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "Unsupported InsertMode %d in MatDenseScatter_Private()", mode);
  PetscCheck(smode == SCATTER_FORWARD || smode == SCATTER_REVERSE, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "Unsupported ScatterMode %d in MatDenseScatter_Private()", smode);
  PetscCall(MatGetSize(X, NULL, &N));
  PetscCall(MatGetSize(Y, NULL, &ny));
  PetscCheck(N == ny, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_SIZ, "Matrix column sizes must match: %" PetscInt_FMT " != %" PetscInt_FMT, N, ny);
  PetscCall(MatDenseGetLDA(X, &rld));
  PetscCall(MatDenseGetLDA(Y, &lld));
  /* get cached or create new strided PetscSF when the number of columns is greater than one */
  if (N > 1) {
    PetscCall(PetscObjectQuery((PetscObject)sf, "_MatDenseScatter_StridedSF", (PetscObject *)&vsf));
    if (vsf) {
      PetscInt nr[2], nl[2];

      PetscCall(PetscSFGetGraph(sf, nr, nl, NULL, NULL));
      PetscCall(PetscSFGetGraph(vsf, nr + 1, nl + 1, NULL, NULL));
      if (N * nr[0] != nr[1] || N * nl[0] != nl[1]) vsf = NULL;
    }
    if (!vsf) {
      PetscCall(PetscSFCreateStridedSF(sf, N, rld, lld, &vsf));
      PetscCall(PetscObjectCompose((PetscObject)sf, "_MatDenseScatter_StridedSF", (PetscObject)vsf));
      PetscCall(PetscObjectDereference((PetscObject)vsf));
    }
  } else vsf = sf;
  /* the output array is accessed in read and write mode,
    but write-only in the INSERT_VALUES case could be worth exploring */
  PetscCall(MatDenseGetArrayReadAndMemType(X, &in, &mtype[0]));
  PetscCall(MatDenseGetArrayAndMemType(Y, &out, &mtype[1]));
  if (smode == SCATTER_FORWARD) {
    PetscCall(PetscSFBcastWithMemTypeBegin(vsf, vsf->vscat.unit, mtype[0], in, mtype[1], out, op));
    PetscCall(PetscSFBcastEnd(vsf, vsf->vscat.unit, in, out, op));
  } else {
    PetscCall(PetscSFReduceWithMemTypeBegin(vsf, vsf->vscat.unit, mtype[0], in, mtype[1], out, op));
    PetscCall(PetscSFReduceEnd(vsf, vsf->vscat.unit, in, out, op));
  }
  PetscCall(MatDenseRestoreArrayAndMemType(Y, &out));
  PetscCall(MatDenseRestoreArrayReadAndMemType(X, &in));
  PetscFunctionReturn(PETSC_SUCCESS);
}
