#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/sell/mpi/mpisell.h> /*I "petscmat.h" I*/
#include <petsc/private/vecimpl.h>
#include <petsc/private/isimpl.h>
#include <petscblaslapack.h>
#include <petscsf.h>

/*MC
   MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.

   This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
   and `MATMPISELL` otherwise.  As a result, for single process communicators,
  `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
  for communicators controlling multiple processes.  It is recommended that you call both of
  the above preallocation routines for simplicity.

   Options Database Key:
. -mat_type sell - sets the matrix type to `MATSELL` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: `Mat`, `MATAIJ`, `MATBAIJ`, `MATSBAIJ`, `MatCreateSELL()`, `MatCreateSeqSELL()`, `MATSEQSELL`, `MATMPISELL`
M*/

static PetscErrorCode MatDiagonalSet_MPISELL(Mat Y, Vec D, InsertMode is)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)Y->data;
  PetscBool    diagonal_is_local;

  PetscFunctionBegin;
  /* The fast path applies only when the matrix is assembled and the row and
     column ownership ranges coincide, so the whole diagonal lives in the
     local diagonal block A; otherwise fall back to the generic routine. */
  diagonal_is_local = (PetscBool)(Y->assembled && Y->rmap->rstart == Y->cmap->rstart && Y->rmap->rend == Y->cmap->rend);
  if (diagonal_is_local) PetscCall(MatDiagonalSet(mpisell->A, D, is));
  else PetscCall(MatDiagonalSet_Default(Y, D, is));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Local utility routine that creates a mapping from the global column
  number to the local number in the off-diagonal part of the local
  storage of the matrix. When PETSC_USE_CTABLE is used this is scalable at
  a slightly higher hash-table cost; without it it is not scalable (each
  process holds an order-N integer array) but access is fast.
*/
PetscErrorCode MatCreateColmap_MPISELL_Private(Mat mat)
{
  Mat_MPISELL *sell = (Mat_MPISELL *)mat->data;
  PetscInt     nB   = sell->B->cmap->n; /* number of off-diagonal (ghost) columns */

  PetscFunctionBegin;
  PetscCheck(sell->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPISELL Matrix was assembled but is missing garray");
  /* colmap maps global column -> local column + 1; 0 means "not present" */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapICreateWithSize(nB, &sell->colmap));
  for (PetscInt k = 0; k < nB; k++) PetscCall(PetscHMapISet(sell->colmap, sell->garray[k] + 1, k + 1));
#else
  PetscCall(PetscCalloc1(mat->cmap->N + 1, &sell->colmap));
  for (PetscInt k = 0; k < nB; k++) sell->colmap[sell->garray[k]] = k + 1;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetValues_SeqSELL_A_Private - inline insertion of a single entry `value`
  at local position (row,col) into the *diagonal* block A; (orow,ocol) are the
  global indices used only for error messages.

  Expands against caller-declared state: cp1/vp1 point at the start of `row`
  inside a->colidx/a->val, nrow1 is the current row length, low1/high1 bracket
  the search window, and lastcol1 caches the previous column so monotone
  insertion reuses the window. Consecutive entries of one row are strided
  `sliceheight` apart (SELL slice layout). A binary search narrows the window
  to at most 5 candidates, a linear scan finds the slot; an existing entry is
  overwritten or accumulated per `addv`; a genuinely new nonzero is dropped if
  it is an ignorable zero or nonew==1, errors if nonew==-1, and otherwise
  grows the slice via MatSeqXSELLReallocateSELL and shifts later entries up.
*/
#define MatSetValues_SeqSELL_A_Private(row, col, value, addv, orow, ocol) \
  { \
    if (col <= lastcol1) low1 = 0; \
    else high1 = nrow1; \
    lastcol1 = col; \
    while (high1 - low1 > 5) { \
      t = (low1 + high1) / 2; \
      if (cp1[sliceheight * t] > col) high1 = t; \
      else low1 = t; \
    } \
    for (_i = low1; _i < high1; _i++) { \
      if (cp1[sliceheight * _i] > col) break; \
      if (cp1[sliceheight * _i] == col) { \
        if (addv == ADD_VALUES) vp1[sliceheight * _i] += value; \
        else vp1[sliceheight * _i] = value; \
        inserted = PETSC_TRUE; \
        goto a_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    if (nonew == 1) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXSELLReallocateSELL(A, am, 1, nrow1, a->sliidx, a->sliceheight, row / sliceheight, row, col, a->colidx, a->val, cp1, vp1, nonew, MatScalar); \
    /* shift up all the later entries in this row */ \
    for (ii = nrow1 - 1; ii >= _i; ii--) { \
      cp1[sliceheight * (ii + 1)] = cp1[sliceheight * ii]; \
      vp1[sliceheight * (ii + 1)] = vp1[sliceheight * ii]; \
    } \
    cp1[sliceheight * _i] = col; \
    vp1[sliceheight * _i] = value; \
    a->nz++; \
    nrow1++; \
  a_noinsert:; \
    a->rlen[row] = nrow1; \
  }

/*
  MatSetValues_SeqSELL_B_Private - identical logic to
  MatSetValues_SeqSELL_A_Private, but targets the *off-diagonal* block B using
  the caller's cp2/vp2/nrow2/low2/high2/lastcol2 state and B's slice metadata.
  Kept as a separate macro because the two blocks use disjoint cursor
  variables inside MatSetValues_MPISELL().
*/
#define MatSetValues_SeqSELL_B_Private(row, col, value, addv, orow, ocol) \
  { \
    if (col <= lastcol2) low2 = 0; \
    else high2 = nrow2; \
    lastcol2 = col; \
    while (high2 - low2 > 5) { \
      t = (low2 + high2) / 2; \
      if (cp2[sliceheight * t] > col) high2 = t; \
      else low2 = t; \
    } \
    for (_i = low2; _i < high2; _i++) { \
      if (cp2[sliceheight * _i] > col) break; \
      if (cp2[sliceheight * _i] == col) { \
        if (addv == ADD_VALUES) vp2[sliceheight * _i] += value; \
        else vp2[sliceheight * _i] = value; \
        inserted = PETSC_TRUE; \
        goto b_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    if (nonew == 1) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXSELLReallocateSELL(B, bm, 1, nrow2, b->sliidx, b->sliceheight, row / sliceheight, row, col, b->colidx, b->val, cp2, vp2, nonew, MatScalar); \
    /* shift up all the later entries in this row */ \
    for (ii = nrow2 - 1; ii >= _i; ii--) { \
      cp2[sliceheight * (ii + 1)] = cp2[sliceheight * ii]; \
      vp2[sliceheight * (ii + 1)] = vp2[sliceheight * ii]; \
    } \
    cp2[sliceheight * _i] = col; \
    vp2[sliceheight * _i] = value; \
    b->nz++; \
    nrow2++; \
  b_noinsert:; \
    b->rlen[row] = nrow2; \
  }

/*
  MatSetValues_MPISELL - MatSetValues() implementation for MATMPISELL.

  Each (im[i], in[j]) entry is routed to one of three destinations:
    - the diagonal block A     (row and column both owned by this process),
    - the off-diagonal block B (row owned, column not), or
    - the stash                (row not owned; communicated at assembly time).
  After a first assembly, off-process columns are translated to B's local
  numbering through sell->colmap; an insertion into a column missing from B's
  pattern triggers MatDisAssemble_MPISELL() so B reverts to global indices.
*/
static PetscErrorCode MatSetValues_MPISELL(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
{
  Mat_MPISELL *sell = (Mat_MPISELL *)mat->data;
  PetscScalar  value;
  PetscInt     i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend, shift1, shift2;
  PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
  PetscBool    roworiented = sell->roworiented;

  /* Some Variables required in the macro */
  Mat          A                 = sell->A;
  Mat_SeqSELL *a                 = (Mat_SeqSELL *)A->data;
  PetscBool    ignorezeroentries = a->ignorezeroentries, found;
  Mat          B                 = sell->B;
  Mat_SeqSELL *b                 = (Mat_SeqSELL *)B->data;
  PetscInt    *cp1, *cp2, ii, _i, nrow1, nrow2, low1, high1, low2, high2, t, lastcol1, lastcol2, sliceheight = a->sliceheight;
  MatScalar   *vp1, *vp2;

  PetscFunctionBegin;
  for (i = 0; i < m; i++) {
    if (im[i] < 0) continue; /* negative row indices are silently ignored */
    PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
    if (im[i] >= rstart && im[i] < rend) {
      /* locally owned row: set up search cursors into this row of A and B */
      row      = im[i] - rstart;
      lastcol1 = -1;
      shift1   = a->sliidx[row / sliceheight] + (row % sliceheight); /* starting index of the row */
      cp1      = PetscSafePointerPlusOffset(a->colidx, shift1);
      vp1      = PetscSafePointerPlusOffset(a->val, shift1);
      nrow1    = a->rlen[row];
      low1     = 0;
      high1    = nrow1;
      lastcol2 = -1;
      shift2   = b->sliidx[row / sliceheight] + (row % sliceheight); /* starting index of the row */
      cp2      = PetscSafePointerPlusOffset(b->colidx, shift2);
      vp2      = PetscSafePointerPlusOffset(b->val, shift2);
      nrow2    = b->rlen[row];
      low2     = 0;
      high2    = nrow2;

      for (j = 0; j < n; j++) {
        if (roworiented) value = v[i * n + j];
        else value = v[i + j * m];
        if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES)) continue;
        if (in[j] >= cstart && in[j] < cend) {
          /* locally owned column: the entry belongs to the diagonal block A */
          col = in[j] - cstart;
          MatSetValue_SeqSELL_Private(A, row, col, value, addv, im[i], in[j], cp1, vp1, lastcol1, low1, high1); /* set one value */
#if defined(PETSC_HAVE_CUDA)
          if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && found) A->offloadmask = PETSC_OFFLOAD_CPU;
#endif
        } else if (in[j] < 0) {
          continue; /* negative column indices are silently ignored */
        } else {
          PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
          if (mat->was_assembled) {
            /* translate global column to B's compact local numbering */
            if (!sell->colmap) PetscCall(MatCreateColmap_MPISELL_Private(mat));
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapIGetWithDefault(sell->colmap, in[j] + 1, 0, &col));
            col--;
#else
            col = sell->colmap[in[j]] - 1;
#endif
            if (col < 0 && !((Mat_SeqSELL *)sell->B->data)->nonew) {
              /* column absent from B's pattern and new nonzeros allowed:
                 disassemble so B uses global column indices again */
              PetscCall(MatDisAssemble_MPISELL(mat));
              col = in[j];
              /* Reinitialize the variables required by MatSetValues_SeqSELL_B_Private() */
              B      = sell->B;
              b      = (Mat_SeqSELL *)B->data;
              shift2 = b->sliidx[row / sliceheight] + (row % sliceheight); /* starting index of the row */
              cp2    = b->colidx + shift2;
              vp2    = b->val + shift2;
              nrow2  = b->rlen[row];
              low2   = 0;
              high2  = nrow2;
              found  = PETSC_FALSE;
            } else {
              PetscCheck(col >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
            }
          } else col = in[j];
          MatSetValue_SeqSELL_Private(B, row, col, value, addv, im[i], in[j], cp2, vp2, lastcol2, low2, high2); /* set one value */
#if defined(PETSC_HAVE_CUDA)
          if (B->offloadmask != PETSC_OFFLOAD_UNALLOCATED && found) B->offloadmask = PETSC_OFFLOAD_CPU;
#endif
        }
      }
    } else {
      /* off-process row: stash the values for exchange during assembly */
      PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
      if (!sell->donotstash) {
        mat->assembled = PETSC_FALSE;
        if (roworiented) {
          PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        } else {
          PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        }
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Fetch entries (local rows only); off-process columns are looked up through
   the global-to-local column map and read from the off-diagonal block B. */
static PetscErrorCode MatGetValues_MPISELL(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
{
  Mat_MPISELL *sell   = (Mat_MPISELL *)mat->data;
  PetscInt     rstart = mat->rmap->rstart, rend = mat->rmap->rend;
  PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;

  PetscFunctionBegin;
  for (PetscInt i = 0; i < m; i++) {
    PetscInt lrow;

    if (idxm[i] < 0) continue; /* negative row: skip */
    PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
    PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported");
    lrow = idxm[i] - rstart;
    for (PetscInt j = 0; j < n; j++) {
      PetscInt lcol;

      if (idxn[j] < 0) continue; /* negative column: skip */
      PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
      if (idxn[j] >= cstart && idxn[j] < cend) {
        /* locally owned column: read from the diagonal block */
        lcol = idxn[j] - cstart;
        PetscCall(MatGetValues(sell->A, 1, &lrow, 1, &lcol, v + i * n + j));
      } else {
        /* ghost column: translate to B's local numbering via the colmap */
        if (!sell->colmap) PetscCall(MatCreateColmap_MPISELL_Private(mat));
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscHMapIGetWithDefault(sell->colmap, idxn[j] + 1, 0, &lcol));
        lcol--;
#else
        lcol = sell->colmap[idxn[j]] - 1;
#endif
        if (lcol < 0 || sell->garray[lcol] != idxn[j]) v[i * n + j] = 0.0; /* not in the pattern */
        else PetscCall(MatGetValues(sell->B, 1, &lrow, 1, &lcol, v + i * n + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Start assembly: launch the scatter of stashed off-process entries. */
static PetscErrorCode MatAssemblyBegin_MPISELL(Mat mat, MatAssemblyType mode)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)mat->data;
  PetscInt     nstash, reallocs;

  PetscFunctionBegin;
  /* nothing to communicate when off-process entries were never stashed */
  if (mpisell->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
  PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
  PetscCall(PetscInfo(mpisell->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAssemblyEnd_MPISELL - completes assembly: drains the off-process stash,
  assembles the diagonal (A) and off-diagonal (B) blocks, performs collective
  disassembly agreement, and builds the multiply structures on the first
  final assembly.
*/
PetscErrorCode MatAssemblyEnd_MPISELL(Mat mat, MatAssemblyType mode)
{
  Mat_MPISELL *sell = (Mat_MPISELL *)mat->data;
  PetscMPIInt  n;
  PetscInt     i, flg;
  PetscInt    *row, *col;
  PetscScalar *val;
  PetscBool    all_assembled;
  /* do not use 'b = (Mat_SeqSELL*)sell->B->data' as B can be reset in disassembly */
  PetscFunctionBegin;
  if (!sell->donotstash && !mat->nooffprocentries) {
    /* drain every stash message received from other processes */
    while (1) {
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
      if (!flg) break;

      for (i = 0; i < n; i++) { /* assemble one by one */
        PetscCall(MatSetValues_MPISELL(mat, 1, row + i, 1, col + i, val + i, mat->insertmode));
      }
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
  }
#if defined(PETSC_HAVE_CUDA)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU) sell->A->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscCall(MatAssemblyBegin(sell->A, mode));
  PetscCall(MatAssemblyEnd(sell->A, mode));

  /*
     determine if any process has disassembled, if so we must
     also disassemble ourselves, in order that we may reassemble.
  */
  /*
     if nonzero structure of submatrix B cannot change then we know that
     no process disassembled thus we can skip this stuff
  */
  if (!((Mat_SeqSELL *)sell->B->data)->nonew) {
    /* collective agreement: if any rank disassembled, everyone must */
    PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &all_assembled, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
    if (mat->was_assembled && !all_assembled) PetscCall(MatDisAssemble_MPISELL(mat));
  }
  /* first final assembly: build garray/colmap and the ghost scatter Mvctx */
  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPISELL(mat));
#if defined(PETSC_HAVE_CUDA)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU && sell->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) sell->B->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscCall(MatAssemblyBegin(sell->B, mode));
  PetscCall(MatAssemblyEnd(sell->B, mode));
  /* cached row workspace and diagonal are stale after assembly */
  PetscCall(PetscFree2(sell->rowvalues, sell->rowindices));
  sell->rowvalues = NULL;
  PetscCall(VecDestroy(&sell->diag));

  /* if no new nonzero locations are allowed in matrix then only set the matrix state the first time through */
  if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqSELL *)sell->A->data)->nonew) {
    PetscObjectState state = sell->A->nonzerostate + sell->B->nonzerostate;
    PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
  }
#if defined(PETSC_HAVE_CUDA)
  mat->offloadmask = PETSC_OFFLOAD_BOTH;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero all stored values; the nonzero pattern of both blocks is retained. */
static PetscErrorCode MatZeroEntries_MPISELL(Mat A)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(mpisell->A));
  PetscCall(MatZeroEntries(mpisell->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A*xx, overlapping the ghost-value scatter with the diagonal-block product. */
static PetscErrorCode MatMult_MPISELL(Mat A, Vec xx, Vec yy)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;
  PetscInt     nx;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nx));
  PetscCheck(nx == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nx);
  /* start gathering ghost values, multiply by the local diagonal block,
     finish the gather, then accumulate the off-diagonal contribution */
  PetscCall(VecScatterBegin(mpisell->Mvctx, xx, mpisell->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpisell->A->ops->mult)(mpisell->A, xx, yy));
  PetscCall(VecScatterEnd(mpisell->Mvctx, xx, mpisell->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpisell->B->ops->multadd)(mpisell->B, mpisell->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Apply only the local diagonal block: xx = diag-block(A) * bb. */
static PetscErrorCode MatMultDiagonalBlock_MPISELL(Mat A, Vec bb, Vec xx)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatMultDiagonalBlock(mpisell->A, bb, xx));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = yy + A*xx, overlapping communication with the diagonal-block product. */
static PetscErrorCode MatMultAdd_MPISELL(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(mpisell->Mvctx, xx, mpisell->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpisell->A->ops->multadd)(mpisell->A, xx, yy, zz));
  PetscCall(VecScatterEnd(mpisell->Mvctx, xx, mpisell->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpisell->B->ops->multadd)(mpisell->B, mpisell->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T * xx: local transpose products first, then a reverse scatter
   accumulates the off-diagonal contributions onto their owning ranks. */
static PetscErrorCode MatMultTranspose_MPISELL(Mat A, Vec xx, Vec yy)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  /* off-diagonal contribution into the ghost work vector */
  PetscCall((*mpisell->B->ops->multtranspose)(mpisell->B, xx, mpisell->lvec));
  /* diagonal-block contribution */
  PetscCall((*mpisell->A->ops->multtranspose)(mpisell->A, xx, yy));
  /* add the ghost contributions back to their owners */
  PetscCall(VecScatterBegin(mpisell->Mvctx, mpisell->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(mpisell->Mvctx, mpisell->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatIsTranspose_MPISELL - tests whether Bmat == Amat^T to tolerance tol.
  First a cheap local test on the diagonal blocks; only if that passes are the
  off-diagonal parts extracted (one submatrix per rank) and compared.
  NOTE(review): `notme` holds N - (last-first) entries while the fill loops
  write first + (M-last) of them; these agree only when M == N — presumably
  callers only pass square layouts, confirm before reusing elsewhere.
*/
static PetscErrorCode MatIsTranspose_MPISELL(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
{
  MPI_Comm     comm;
  Mat_MPISELL *Asell = (Mat_MPISELL *)Amat->data, *Bsell;
  Mat          Adia  = Asell->A, Bdia, Aoff, Boff, *Aoffs, *Boffs;
  IS           Me, Notme;
  PetscInt     M, N, first, last, *notme, i;
  PetscMPIInt  size;

  PetscFunctionBegin;
  /* Easy test: symmetric diagonal block */
  Bsell = (Mat_MPISELL *)Bmat->data;
  Bdia  = Bsell->A;
  PetscCall(MatIsTranspose(Adia, Bdia, tol, f));
  if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);

  /* Hard test: off-diagonal block. This takes a MatCreateSubMatrix. */
  PetscCall(MatGetSize(Amat, &M, &N));
  PetscCall(MatGetOwnershipRange(Amat, &first, &last));
  /* columns NOT owned by this rank: [0,first) followed by [last,M) */
  PetscCall(PetscMalloc1(N - last + first, &notme));
  for (i = 0; i < first; i++) notme[i] = i;
  for (i = last; i < M; i++) notme[i - last + first] = i;
  PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
  PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
  /* compare A(Me,Notme) against B(Notme,Me): transposed index roles */
  PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
  Aoff = Aoffs[0];
  PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
  Boff = Boffs[0];
  PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
  PetscCall(MatDestroyMatrices(1, &Aoffs));
  PetscCall(MatDestroyMatrices(1, &Boffs));
  PetscCall(ISDestroy(&Me));
  PetscCall(ISDestroy(&Notme));
  PetscCall(PetscFree(notme));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = yy + A^T * xx; same structure as MatMultTranspose_MPISELL with an add. */
static PetscErrorCode MatMultTransposeAdd_MPISELL(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  /* off-diagonal contribution into the ghost work vector */
  PetscCall((*mpisell->B->ops->multtranspose)(mpisell->B, xx, mpisell->lvec));
  /* diagonal-block contribution plus yy */
  PetscCall((*mpisell->A->ops->multtransposeadd)(mpisell->A, xx, yy, zz));
  /* add the ghost contributions back to their owners */
  PetscCall(VecScatterBegin(mpisell->Mvctx, mpisell->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(mpisell->Mvctx, mpisell->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Extracts the diagonal. Valid only for square matrices whose row and column
  ownership ranges coincide, so the diagonal lives entirely in the local
  diagonal block A.
*/
static PetscErrorCode MatGetDiagonal_MPISELL(Mat A, Vec v)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
  PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
  PetscCall(MatGetDiagonal(mpisell->A, v));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Scale every stored value of both local blocks by aa. */
static PetscErrorCode MatScale_MPISELL(Mat A, PetscScalar aa)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatScale(mpisell->A, aa));
  PetscCall(MatScale(mpisell->B, aa));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatDestroy_MPISELL - releases all storage owned by the MATMPISELL
  implementation and detaches the composed methods; mat->data is freed only
  after every member it owns has been destroyed.
*/
PetscErrorCode MatDestroy_MPISELL(Mat mat)
{
  Mat_MPISELL *sell = (Mat_MPISELL *)mat->data;

  PetscFunctionBegin;
  PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
  PetscCall(MatStashDestroy_Private(&mat->stash));
  PetscCall(VecDestroy(&sell->diag));
  PetscCall(MatDestroy(&sell->A));
  PetscCall(MatDestroy(&sell->B));
  /* colmap storage differs by configuration: hash map vs dense array */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&sell->colmap));
#else
  PetscCall(PetscFree(sell->colmap));
#endif
  PetscCall(PetscFree(sell->garray));
  PetscCall(VecDestroy(&sell->lvec));
  PetscCall(VecScatterDestroy(&sell->Mvctx));
  PetscCall(PetscFree2(sell->rowvalues, sell->rowindices));
  PetscCall(PetscFree(sell->ld));
  PetscCall(PetscFree(mat->data));

  /* detach composed methods so no stale function pointers survive */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPISELLSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpisell_mpiaij_C", NULL));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpisell_mpisellcuda_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <petscdraw.h>
/*
  MatView_MPISELL_ASCIIorDraworSocket - viewer back-end for MATMPISELL.
  Info-style ASCII formats are handled per rank; otherwise the whole matrix is
  gathered onto rank 0 (as a temporary MATMPISELL with all rows there) and
  viewed sequentially.

  Bug fix: when copying the diagonal block, the global column index must be
  the local column plus the COLUMN ownership start (mat->cmap->rstart); the
  previous code added mat->rmap->rstart, which is wrong whenever the row and
  column layouts differ (e.g. rectangular matrices).
*/
static PetscErrorCode MatView_MPISELL_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
{
  Mat_MPISELL      *sell = (Mat_MPISELL *)mat->data;
  PetscMPIInt       rank = sell->rank, size = sell->size;
  PetscBool         isdraw, isascii, isbinary;
  PetscViewer       sviewer;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
  if (isascii) {
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      MatInfo   info;
      PetscInt *inodes;

      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
      PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
      PetscCall(MatInodeGetInodeSizes(sell->A, NULL, &inodes, NULL));
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      if (!inodes) {
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %" PetscInt_FMT ", not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used,
                                                     (PetscInt)info.nz_allocated, (PetscInt)info.memory));
      } else {
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %" PetscInt_FMT ", using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used,
                                                     (PetscInt)info.nz_allocated, (PetscInt)info.memory));
      }
      PetscCall(MatGetInfo(sell->A, MAT_LOCAL, &info));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
      PetscCall(MatGetInfo(sell->B, MAT_LOCAL, &info));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
      PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
      PetscCall(VecScatterView(sell->Mvctx, viewer));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else if (format == PETSC_VIEWER_ASCII_INFO) {
      PetscInt inodecount, inodelimit, *inodes;
      PetscCall(MatInodeGetInodeSizes(sell->A, &inodecount, &inodes, &inodelimit));
      if (inodes) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
      } else {
        PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
      }
      PetscFunctionReturn(PETSC_SUCCESS);
    } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  } else if (isbinary) {
    if (size == 1) {
      PetscCall(PetscObjectSetName((PetscObject)sell->A, ((PetscObject)mat)->name));
      PetscCall(MatView(sell->A, viewer));
    } else {
      /* PetscCall(MatView_MPISELL_Binary(mat,viewer)); */
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (isdraw) {
    PetscDraw draw;
    PetscBool isnull;
    PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
    PetscCall(PetscDrawIsNull(draw, &isnull));
    if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
  }

  {
    /* assemble the entire matrix onto first processor. */
    Mat          A;
    Mat_SeqSELL *Aloc;
    PetscInt     M = mat->rmap->N, N = mat->cmap->N, *acolidx, row, col, i, j;
    MatScalar   *aval;
    PetscBool    isnonzero;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)mat), &A));
    if (rank == 0) {
      PetscCall(MatSetSizes(A, M, N, M, N));
    } else {
      PetscCall(MatSetSizes(A, 0, 0, M, N));
    }
    /* This is just a temporary matrix, so explicitly using MATMPISELL is probably best */
    PetscCall(MatSetType(A, MATMPISELL));
    PetscCall(MatMPISELLSetPreallocation(A, 0, NULL, 0, NULL));
    PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_FALSE));

    /* copy over the A part */
    Aloc    = (Mat_SeqSELL *)sell->A->data;
    acolidx = Aloc->colidx;
    aval    = Aloc->val;
    for (i = 0; i < Aloc->totalslices; i++) { /* loop over slices */
      for (j = Aloc->sliidx[i]; j < Aloc->sliidx[i + 1]; j++) {
        isnonzero = (PetscBool)((j - Aloc->sliidx[i]) / Aloc->sliceheight < Aloc->rlen[i * Aloc->sliceheight + j % Aloc->sliceheight]);
        if (isnonzero) { /* check the mask bit */
          row = i * Aloc->sliceheight + j % Aloc->sliceheight + mat->rmap->rstart;
          col = *acolidx + mat->cmap->rstart; /* columns offset by the COLUMN ownership start, not the row one */
          PetscCall(MatSetValues(A, 1, &row, 1, &col, aval, INSERT_VALUES));
        }
        aval++;
        acolidx++;
      }
    }

    /* copy over the B part; its column indices are already global via garray */
    Aloc    = (Mat_SeqSELL *)sell->B->data;
    acolidx = Aloc->colidx;
    aval    = Aloc->val;
    for (i = 0; i < Aloc->totalslices; i++) {
      for (j = Aloc->sliidx[i]; j < Aloc->sliidx[i + 1]; j++) {
        isnonzero = (PetscBool)((j - Aloc->sliidx[i]) / Aloc->sliceheight < Aloc->rlen[i * Aloc->sliceheight + j % Aloc->sliceheight]);
        if (isnonzero) {
          row = i * Aloc->sliceheight + j % Aloc->sliceheight + mat->rmap->rstart;
          col = sell->garray[*acolidx];
          PetscCall(MatSetValues(A, 1, &row, 1, &col, aval, INSERT_VALUES));
        }
        aval++;
        acolidx++;
      }
    }

    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    /*
       Everyone has to call to draw the matrix since the graphics waits are
       synchronized across all processors that share the PetscDraw object
    */
    PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    if (rank == 0) {
      PetscCall(PetscObjectSetName((PetscObject)((Mat_MPISELL *)A->data)->A, ((PetscObject)mat)->name));
      PetscCall(MatView_SeqSELL(((Mat_MPISELL *)A->data)->A, sviewer));
    }
    PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    PetscCall(MatDestroy(&A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Dispatch viewing to the common back-end for the supported viewer types. */
static PetscErrorCode MatView_MPISELL(Mat mat, PetscViewer viewer)
{
  PetscBool isascii, isdraw, issocket, isbinary, supported;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
  supported = (PetscBool)(isascii || isdraw || isbinary || issocket);
  if (supported) PetscCall(MatView_MPISELL_ASCIIorDraworSocket(mat, viewer));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* The ghost columns are exactly the global columns of the off-diagonal block B. */
static PetscErrorCode MatGetGhosts_MPISELL(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)mat->data;

  PetscFunctionBegin;
  PetscCall(MatGetSize(mpisell->B, NULL, nghosts));
  if (ghosts) *ghosts = mpisell->garray;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Gather matrix statistics: local counters are the sum over the diagonal and
   off-diagonal blocks; global flavors reduce them with MPI_MAX or MPI_SUM. */
static PetscErrorCode MatGetInfo_MPISELL(Mat matin, MatInfoType flag, MatInfo *info)
{
  Mat_MPISELL   *mat = (Mat_MPISELL *)matin->data;
  Mat            A = mat->A, B = mat->B;
  PetscLogDouble local[5], global[5];

  PetscFunctionBegin;
  info->block_size = 1.0;

  /* accumulate counters of both local blocks */
  PetscCall(MatGetInfo(A, MAT_LOCAL, info));
  local[0] = info->nz_used;
  local[1] = info->nz_allocated;
  local[2] = info->nz_unneeded;
  local[3] = info->memory;
  local[4] = info->mallocs;
  PetscCall(MatGetInfo(B, MAT_LOCAL, info));
  local[0] += info->nz_used;
  local[1] += info->nz_allocated;
  local[2] += info->nz_unneeded;
  local[3] += info->memory;
  local[4] += info->mallocs;

  if (flag == MAT_LOCAL) {
    info->nz_used      = local[0];
    info->nz_allocated = local[1];
    info->nz_unneeded  = local[2];
    info->memory       = local[3];
    info->mallocs      = local[4];
  } else if (flag == MAT_GLOBAL_MAX || flag == MAT_GLOBAL_SUM) {
    PetscCallMPI(MPIU_Allreduce(local, global, 5, MPIU_PETSCLOGDOUBLE, flag == MAT_GLOBAL_MAX ? MPI_MAX : MPI_SUM, PetscObjectComm((PetscObject)matin)));
    info->nz_used      = global[0];
    info->nz_allocated = global[1];
    info->nz_unneeded  = global[2];
    info->memory       = global[3];
    info->mallocs      = global[4];
  }
  /* no parallel LU/ILU/Cholesky for this type */
  info->fill_ratio_given  = 0;
  info->fill_ratio_needed = 0;
  info->factor_mallocs    = 0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Forward options to the local blocks; symmetry-type options apply to the
   diagonal block only, and stashing options are recorded locally. */
static PetscErrorCode MatSetOption_MPISELL(Mat A, MatOption op, PetscBool flg)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  switch (op) {
  /* options propagated to both local blocks */
  case MAT_NEW_NONZERO_LOCATIONS:
  case MAT_NEW_NONZERO_ALLOCATION_ERR:
  case MAT_UNUSED_NONZERO_LOCATION_ERR:
  case MAT_KEEP_NONZERO_PATTERN:
  case MAT_NEW_NONZERO_LOCATION_ERR:
  case MAT_USE_INODES:
  case MAT_IGNORE_ZERO_ENTRIES:
    MatCheckPreallocated(A, 1);
    PetscCall(MatSetOption(a->A, op, flg));
    PetscCall(MatSetOption(a->B, op, flg));
    break;
  case MAT_ROW_ORIENTED:
    MatCheckPreallocated(A, 1);
    a->roworiented = flg; /* also remembered for MatSetValues_MPISELL */
    PetscCall(MatSetOption(a->A, op, flg));
    PetscCall(MatSetOption(a->B, op, flg));
    break;
  case MAT_IGNORE_OFF_PROC_ENTRIES:
    a->donotstash = flg;
    break;
  /* symmetry-type options are a property of the diagonal block only */
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
    MatCheckPreallocated(A, 1);
    PetscCall(MatSetOption(a->A, op, flg));
    break;
  default:
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* mat <- diag(ll) * mat * diag(rr); either vector may be NULL meaning "no scaling on that side" */
static PetscErrorCode MatDiagonalScale_MPISELL(Mat mat, Vec ll, Vec rr)
{
  Mat_MPISELL *sell = (Mat_MPISELL *)mat->data;
  Mat          Ad = sell->A, Ao = sell->B;
  PetscInt     nv, m, n;

  PetscFunctionBegin;
  PetscCall(MatGetLocalSize(mat, &m, &n));
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &nv));
    PetscCheck(nv == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
    /* start gathering the ghosted entries of rr now so communication overlaps the
       left-scaling and diagonal-block work below */
    PetscCall(VecScatterBegin(sell->Mvctx, rr, sell->lvec, INSERT_VALUES, SCATTER_FORWARD));
  }
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &nv));
    PetscCheck(nv == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
    /* left-scale the off-diagonal block; its right scaling needs the scatter result */
    PetscUseTypeMethod(Ao, diagonalscale, ll, NULL);
  }
  /* the diagonal block can be scaled on both sides with purely local data */
  PetscUseTypeMethod(Ad, diagonalscale, ll, rr);

  if (rr) {
    /* finish the scatter, then right-scale the off-diagonal block with the ghosted values */
    PetscCall(VecScatterEnd(sell->Mvctx, rr, sell->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscUseTypeMethod(Ao, diagonalscale, NULL, sell->lvec);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* clear the factored state; only the diagonal block ever holds a factorization */
static PetscErrorCode MatSetUnfactored_MPISELL(Mat A)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSetUnfactored(mpisell->A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* A == B iff, on every process, both the diagonal and off-diagonal blocks agree */
static PetscErrorCode MatEqual_MPISELL(Mat A, Mat B, PetscBool *flag)
{
  Mat_MPISELL *matA = (Mat_MPISELL *)A->data, *matB = (Mat_MPISELL *)B->data;
  PetscBool    localeq;

  PetscFunctionBegin;
  /* compare diagonal blocks first; skip the off-diagonal comparison if they already differ */
  PetscCall(MatEqual(matA->A, matB->A, &localeq));
  if (localeq) PetscCall(MatEqual(matA->B, matB->B, &localeq));
  /* logical AND over all processes so every rank returns the same answer */
  PetscCallMPI(MPIU_Allreduce(&localeq, flag, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCopy_MPISELL(Mat A, Mat B, MatStructure str)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  Mat_MPISELL *b = (Mat_MPISELL *)B->data;

  PetscFunctionBegin;
  if (str == SAME_NONZERO_PATTERN && A->ops->copy == B->ops->copy) {
    /* same pattern and same copy implementation: copy the two blocks directly */
    PetscCall(MatCopy(a->A, b->A, str));
    PetscCall(MatCopy(a->B, b->B, str));
  } else {
    /* because of the column compression in the off-processor part of the matrix a->B,
       the number of columns in a->B and b->B may be different, hence we cannot call
       the MatCopy() directly on the two parts. If need be, we can provide a more
       efficient copy than the MatCopy_Basic() by first uncompressing the a->B matrices
       then copying the submatrices */
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetUp_MPISELL(Mat A)
{
  PetscFunctionBegin;
  /* default setup: preallocate both blocks with PETSC_DEFAULT row lengths and no per-row arrays */
  PetscCall(MatMPISELLSetPreallocation(A, PETSC_DEFAULT, NULL, PETSC_DEFAULT, NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* complex-conjugate every stored entry; the two sequential blocks hold all local values */
static PetscErrorCode MatConjugate_MPISELL(Mat mat)
{
  Mat_MPISELL *a = (Mat_MPISELL *)mat->data;

  PetscFunctionBegin;
  PetscCall(MatConjugate_SeqSELL(a->A));
  PetscCall(MatConjugate_SeqSELL(a->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* delegate block-diagonal inversion to the (square, local) diagonal block and
   propagate any factorization error state back up to the parallel matrix */
static PetscErrorCode MatInvertBlockDiagonal_MPISELL(Mat A, const PetscScalar **values)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatInvertBlockDiagonal(mpisell->A, values));
  A->factorerrortype = mpisell->A->factorerrortype;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* fill both local blocks with random values, then reassemble the parallel matrix */
static PetscErrorCode MatSetRandom_MPISELL(Mat x, PetscRandom rctx)
{
  Mat_MPISELL *a = (Mat_MPISELL *)x->data;

  PetscFunctionBegin;
  PetscCall(MatSetRandom(a->A, rctx));
  PetscCall(MatSetRandom(a->B, rctx));
  /* randomizing the blocks bypasses MatSetValues(), so assembly must be redone explicitly */
  PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_MPISELL(Mat A, PetscOptionItems PetscOptionsObject)
{
  PetscFunctionBegin;
  /* no MPISELL-specific options at present; the empty head/end pair keeps the
     options-processing structure (and -help output) consistent with other types */
  PetscOptionsHeadBegin(PetscOptionsObject, "MPISELL options");
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y <- Y + a*I; ensures the diagonal block has room for the diagonal entries first */
static PetscErrorCode MatShift_MPISELL(Mat Y, PetscScalar a)
{
  Mat_MPISELL *msell = (Mat_MPISELL *)Y->data;
  Mat_SeqSELL *sell  = (Mat_SeqSELL *)msell->A->data;

  PetscFunctionBegin;
  if (!Y->preallocated) {
    /* never preallocated: reserve one slot per row in the diagonal block, none off-diagonal */
    PetscCall(MatMPISELLSetPreallocation(Y, 1, NULL, 0, NULL));
  } else if (!sell->nz) {
    /* preallocated but the diagonal block is empty: re-preallocate it while
       preserving the user's new-nonzero policy, which the call would reset */
    PetscInt nonew = sell->nonew;
    PetscCall(MatSeqSELLSetPreallocation(msell->A, 1, NULL));
    sell->nonew = nonew;
  }
  PetscCall(MatShift_Basic(Y, a));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* hand out a borrowed reference to the local diagonal block (no copy is made) */
static PetscErrorCode MatGetDiagonalBlock_MPISELL(Mat A, Mat *a)
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;

  PetscFunctionBegin;
  *a = mpisell->A;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* snapshot the numerical values of both local blocks (see MatStoreValues()) */
static PetscErrorCode MatStoreValues_MPISELL(Mat mat)
{
  Mat_MPISELL *a = (Mat_MPISELL *)mat->data;

  PetscFunctionBegin;
  PetscCall(MatStoreValues(a->A));
  PetscCall(MatStoreValues(a->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* restore the numerical values previously saved with MatStoreValues_MPISELL() */
static PetscErrorCode MatRetrieveValues_MPISELL(Mat mat)
{
  Mat_MPISELL *a = (Mat_MPISELL *)mat->data;

  PetscFunctionBegin;
  PetscCall(MatRetrieveValues(a->A));
  PetscCall(MatRetrieveValues(a->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* type-specific implementation behind MatMPISELLSetPreallocation(): creates (on first
   call) and preallocates the two sequential SELL blocks that store the local rows */
static PetscErrorCode MatMPISELLSetPreallocation_MPISELL(Mat B, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[])
{
  Mat_MPISELL *b;

  PetscFunctionBegin;
  /* layouts must be finalized before local sizes (B->rmap->n, B->cmap->n) can be read */
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  b = (Mat_MPISELL *)B->data;

  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQSELL matrices. */
    /* diagonal block: local rows x local columns */
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
    PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
    PetscCall(MatSetType(b->A, MATSEQSELL));
    /* off-diagonal block: local rows x ALL global columns (compressed later at assembly) */
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, B->cmap->N, B->rmap->n, B->cmap->N));
    PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
    PetscCall(MatSetType(b->B, MATSEQSELL));
  }

  PetscCall(MatSeqSELLSetPreallocation(b->A, d_rlenmax, d_rlen));
  PetscCall(MatSeqSELLSetPreallocation(b->B, o_rlenmax, o_rlen));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  /*
    critical for MatAssemblyEnd to work.
    MatAssemblyBegin checks it to set up was_assembled
    and MatAssemblyEnd checks was_assembled to determine whether to build garray
  */
  B->assembled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* deep-duplicate an assembled MATMPISELL matrix; cpvalues controls whether the
   numerical values (or only the structure) of the two blocks are copied */
static PetscErrorCode MatDuplicate_MPISELL(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
{
  Mat          mat;
  Mat_MPISELL *a, *oldmat = (Mat_MPISELL *)matin->data;

  PetscFunctionBegin;
  *newmat = NULL;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
  PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
  PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
  PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
  a = (Mat_MPISELL *)mat->data;

  /* mark the duplicate assembled/preallocated up front; the blocks duplicated
     below are already assembled so no assembly pass is needed */
  mat->factortype   = matin->factortype;
  mat->assembled    = PETSC_TRUE;
  mat->insertmode   = NOT_SET_VALUES;
  mat->preallocated = PETSC_TRUE;

  a->size         = oldmat->size;
  a->rank         = oldmat->rank;
  a->donotstash   = oldmat->donotstash;
  a->roworiented  = oldmat->roworiented;
  /* per-call MatGetRow() scratch space is not copied; it is rebuilt on demand */
  a->rowindices   = NULL;
  a->rowvalues    = NULL;
  a->getrowactive = PETSC_FALSE;

  PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
  PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));

  /* copy the global-to-local column map; its representation depends on PETSC_USE_CTABLE */
  if (oldmat->colmap) {
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
#else
    PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
    PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
#endif
  } else a->colmap = NULL;
  /* garray maps local columns of the off-diagonal block to global column numbers */
  if (oldmat->garray) {
    PetscInt len;
    len = oldmat->B->cmap->n;
    PetscCall(PetscMalloc1(len + 1, &a->garray));
    if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
  } else a->garray = NULL;

  /* communication machinery (ghost vector + scatter) and the two sequential blocks */
  PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
  PetscCall(VecScatterCopy(oldmat->Mvctx, &a->Mvctx));
  PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
  PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
  PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
  *newmat = mat;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* method table for MATMPISELL; entries are positional, matching the _MatOps struct
   layout (the numeric comments give the slot index), so order must not change */
static const struct _MatOps MatOps_Values = {MatSetValues_MPISELL,
                                             NULL,
                                             NULL,
                                             MatMult_MPISELL,
                                             /* 4*/ MatMultAdd_MPISELL,
                                             MatMultTranspose_MPISELL,
                                             MatMultTransposeAdd_MPISELL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*10*/ NULL,
                                             NULL,
                                             NULL,
                                             MatSOR_MPISELL,
                                             NULL,
                                             /*15*/ MatGetInfo_MPISELL,
                                             MatEqual_MPISELL,
                                             MatGetDiagonal_MPISELL,
                                             MatDiagonalScale_MPISELL,
                                             NULL,
                                             /*20*/ MatAssemblyBegin_MPISELL,
                                             MatAssemblyEnd_MPISELL,
                                             MatSetOption_MPISELL,
                                             MatZeroEntries_MPISELL,
                                             /*24*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*29*/ MatSetUp_MPISELL,
                                             NULL,
                                             NULL,
                                             MatGetDiagonalBlock_MPISELL,
                                             NULL,
                                             /*34*/ MatDuplicate_MPISELL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*39*/ NULL,
                                             NULL,
                                             NULL,
                                             MatGetValues_MPISELL,
                                             MatCopy_MPISELL,
                                             /*44*/ NULL,
                                             MatScale_MPISELL,
                                             MatShift_MPISELL,
                                             MatDiagonalSet_MPISELL,
                                             NULL,
                                             /*49*/ MatSetRandom_MPISELL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*54*/ MatFDColoringCreate_MPIXAIJ,
                                             NULL,
                                             MatSetUnfactored_MPISELL,
                                             NULL,
                                             NULL,
                                             /*59*/ NULL,
                                             MatDestroy_MPISELL,
                                             MatView_MPISELL,
                                             NULL,
                                             NULL,
                                             /*64*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*69*/ NULL,
                                             NULL,
                                             NULL,
                                             MatFDColoringApply_AIJ, /* reuse AIJ function */
                                             MatSetFromOptions_MPISELL,
                                             NULL,
                                             /*75*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*80*/ NULL,
                                             NULL,
                                             NULL,
                                             /*83*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*89*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             MatConjugate_MPISELL,
                                             /*94*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*99*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*104*/ NULL,
                                             NULL,
                                             MatGetGhosts_MPISELL,
                                             NULL,
                                             NULL,
                                             /*109*/ MatMultDiagonalBlock_MPISELL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*114*/ NULL,
                                             NULL,
                                             MatInvertBlockDiagonal_MPISELL,
                                             NULL,
                                             /*119*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*124*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*129*/ MatFDColoringSetUp_MPIXAIJ,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*134*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             /*139*/ NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL,
                                             NULL};

/*@C
  MatMPISELLSetPreallocation - Preallocates memory for a `MATMPISELL` sparse parallel matrix in sell format.
  For good matrix assembly performance the user should preallocate the matrix storage by
  setting the parameters `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).

  Collective

  Input Parameters:
+ B     - the matrix
. d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or NULL (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e 'm'.
           For matrices that will be factored, you must leave room for (and set)
           the diagonal entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or NULL (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e 'm'.

  Example usage:
  Consider the following 8x8 matrix with 34 non-zero values, that is
  assembled across 3 processors. Lets assume that proc0 owns 3 rows,
  proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
  as follows

.vb
            1  2  0  |  0  3  0  |  0  4
    Proc0   0  5  6  |  7  0  0  |  8  0
            9  0 10  | 11  0  0  | 12  0
    -------------------------------------
           13  0 14  | 15 16 17  |  0  0
    Proc1   0 18  0  | 19 20 21  |  0  0
            0  0  0  | 22 23  0  | 24  0
    -------------------------------------
    Proc2  25 26 27  |  0  0 28  | 29  0
           30  0  0  | 31 32 33  |  0 34
.ve

  This can be represented as a collection of submatrices as

.vb
      A B C
      D E F
      G H I
.ve

  Where the submatrices A,B,C are owned by proc0, D,E,F are
  owned by proc1, G,H,I are owned by proc2.

  The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
  The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
  The 'M','N' parameters are 8,8, and have the same values on all procs.

  The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
  submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
  corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
  Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
  part as `MATSEQSELL` matrices. For example, proc1 will store [E] as a `MATSEQSELL`
  matrix, and [DF] as another SeqSELL matrix.

  When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
  allocated for every row of the local DIAGONAL submatrix, and o_nz
  storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
  One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
  the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
  In this case, the values of d_nz,o_nz are
.vb
     proc0  dnz = 2, o_nz = 2
     proc1  dnz = 3, o_nz = 2
     proc2  dnz = 1, o_nz = 4
.ve
  We are allocating m*(d_nz+o_nz) storage locations for every proc. This
  translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
  for proc2, i.e. we are using 12+15+10=37 storage locations to store
  34 values.

  When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
  for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
  In the above case the values for d_nnz,o_nnz are
.vb
     proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
     proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
     proc2 d_nnz = [1,1]   and o_nnz = [4,4]
.ve
  Here the space allocated is according to nz (or maximum values in the nnz
  if nnz is provided) for DIAGONAL and OFF-DIAGONAL submatrices, i.e (2+2+3+2)*3+(1+4)*2=37

  Level: intermediate

  Notes:
  If the *_nnz parameter is given then the *_nz parameter is ignored

  The stored row and column indices begin with zero.

  The parallel matrix is partitioned such that the first m0 rows belong to
  process 0, the next m1 rows belong to process 1, the next m2 rows belong
  to process 2 etc.. where m0,m1,m2... are the input parameter 'm'.

  The DIAGONAL portion of the local submatrix of a processor can be defined
  as the submatrix which is obtained by extraction the part corresponding to
  the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
  first row that belongs to the processor, r2 is the last row belonging to
  the this processor, and c1-c2 is range of indices of the local part of a
  vector suitable for applying the matrix to.  This is an mxn matrix.  In the
  common case of a square matrix, the row and column ranges are the same and
  the DIAGONAL part is also square. The remaining portion of the local
  submatrix (mxN) constitute the OFF-DIAGONAL portion.

  If `o_nnz`, `d_nnz` are specified, then `o_nz`, and `d_nz` are ignored.

  You can call `MatGetInfo()` to get information on how effective the preallocation was;
  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
  You can also run with the option -info and look for messages with the string
  malloc in them to see if additional memory allocation was needed.

.seealso: `Mat`, `MatCreate()`, `MatCreateSeqSELL()`, `MatSetValues()`, `MatCreateSELL()`,
          `MATMPISELL`, `MatGetInfo()`, `PetscSplitOwnership()`, `MATSELL`
@*/
PetscErrorCode MatMPISELLSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidType(B, 1);
  /* dispatch to the implementation registered by the matrix type; PetscTryMethod
     is a no-op (not an error) when B is not a MATMPISELL */
  PetscTryMethod(B, "MatMPISELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATMPISELL - MATMPISELL = "mpisell" - A matrix type to be used for MPI sparse matrices,
   based on the sliced Ellpack format

   Options Database Key:
. -mat_type mpisell - sets the matrix type to `MATMPISELL` during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: `Mat`, `MatCreateSELL()`, `MATSEQSELL`, `MATSELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
M*/

/*@C
  MatCreateSELL - Creates a sparse parallel matrix in `MATSELL` format.

  Collective

  Input Parameters:
+ comm      - MPI communicator
. m         - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
              This value should be the same as the local size used in creating the
              y vector for the matrix-vector product y = Ax.
. n         - This value should be the same as the local size used in creating the
              x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
              calculated if `N` is given) For square matrices n is almost always `m`.
. M         - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N         - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_rlenmax - max number of nonzeros per row in DIAGONAL portion of local submatrix
             (same value is used for all local rows)
. d_rlen    - array containing the number of nonzeros in the various rows of the
              DIAGONAL portion of the local submatrix (possibly different for each row)
              or `NULL`, if d_rlenmax is used to specify the nonzero structure.
              The size of this array is equal to the number of local rows, i.e `m`.
. o_rlenmax - max number of nonzeros per row in the OFF-DIAGONAL portion of local
              submatrix (same value is used for all local rows).
- o_rlen    - array containing the number of nonzeros in the various rows of the
              OFF-DIAGONAL portion of the local submatrix (possibly different for
              each row) or `NULL`, if `o_rlenmax` is used to specify the nonzero
              structure. The size of this array is equal to the number
              of local rows, i.e `m`.

  Output Parameter:
. A - the matrix

  Options Database Key:
. -mat_sell_oneindex - Internally use indexing starting at 1
        rather than 0.  When calling `MatSetValues()`,
        the user still MUST index entries starting at 0!

  Example:
  Consider the following 8x8 matrix with 34 non-zero values, that is
  assembled across 3 processors. Lets assume that proc0 owns 3 rows,
  proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
  as follows

.vb
            1  2  0  |  0  3  0  |  0  4
    Proc0   0  5  6  |  7  0  0  |  8  0
            9  0 10  | 11  0  0  | 12  0
    -------------------------------------
           13  0 14  | 15 16 17  |  0  0
    Proc1   0 18  0  | 19 20 21  |  0  0
            0  0  0  | 22 23  0  | 24  0
    -------------------------------------
    Proc2  25 26 27  |  0  0 28  | 29  0
           30  0  0  | 31 32 33  |  0 34
.ve

  This can be represented as a collection of submatrices as
.vb
      A B C
      D E F
      G H I
.ve

  Where the submatrices A,B,C are owned by proc0, D,E,F are
  owned by proc1, G,H,I are owned by proc2.

  The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
  The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
  The 'M','N' parameters are 8,8, and have the same values on all procs.

  The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
  submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
  corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
  Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
  part as `MATSEQSELL` matrices. For example, proc1 will store [E] as a `MATSEQSELL`
  matrix, and [DF] as another `MATSEQSELL` matrix.

  When d_rlenmax, o_rlenmax parameters are specified, d_rlenmax storage elements are
  allocated for every row of the local DIAGONAL submatrix, and o_rlenmax
  storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
  One way to choose `d_rlenmax` and `o_rlenmax` is to use the maximum number of nonzeros over
  the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
  In this case, the values of d_rlenmax,o_rlenmax are
.vb
     proc0 - d_rlenmax = 2, o_rlenmax = 2
     proc1 - d_rlenmax = 3, o_rlenmax = 2
     proc2 - d_rlenmax = 1, o_rlenmax = 4
.ve
  We are allocating m*(d_rlenmax+o_rlenmax) storage locations for every proc. This
  translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
  for proc2, i.e. we are using 12+15+10=37 storage locations to store
  34 values.

  When `d_rlen`, `o_rlen` parameters are specified, the storage is specified
  for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
  In the above case the values for `d_rlen`, `o_rlen` are
.vb
     proc0 - d_rlen = [2,2,2] and o_rlen = [2,2,2]
     proc1 - d_rlen = [3,3,2] and o_rlen = [2,1,1]
     proc2 - d_rlen = [1,1]   and o_rlen = [4,4]
.ve
  Here the space allocated is still 37 though there are 34 nonzeros because
  the allocation is always done according to rlenmax.

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]

  If the *_rlen parameter is given then the *_rlenmax parameter is ignored

  `m`, `n`, `M`, `N` parameters specify the size of the matrix, and its partitioning across
  processors, while `d_rlenmax`, `d_rlen`, `o_rlenmax` , `o_rlen` parameters specify the approximate
  storage requirements for this matrix.

  If `PETSC_DECIDE` or  `PETSC_DETERMINE` is used for a particular argument on one
  processor than it must be used on all processors that share the object for
  that argument.

  The user MUST specify either the local or global matrix dimensions
  (possibly both).

  The parallel matrix is partitioned across processors such that the
  first m0 rows belong to process 0, the next m1 rows belong to
  process 1, the next m2 rows belong to process 2 etc.. where
  m0,m1,m2,.. are the input parameter 'm'. i.e each processor stores
  values corresponding to [`m` x `N`] submatrix.

  The columns are logically partitioned with the n0 columns belonging
  to 0th partition, the next n1 columns belonging to the next
  partition etc.. where n0,n1,n2... are the input parameter `n`.

  The DIAGONAL portion of the local submatrix on any given processor
  is the submatrix corresponding to the rows and columns `m`, `n`
  corresponding to the given processor. i.e diagonal matrix on
  process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
  etc. The remaining portion of the local submatrix [m x (N-n)]
  constitute the OFF-DIAGONAL portion. The example below better
  illustrates this concept.

  For a square global matrix we define each processor's diagonal portion
  to be its local rows and the corresponding columns (a square submatrix);
  each processor's off-diagonal portion encompasses the remainder of the
  local matrix (a rectangular submatrix).

  If `o_rlen`, `d_rlen` are specified, then `o_rlenmax`, and `d_rlenmax` are ignored.

  When calling this routine with a single process communicator, a matrix of
  type `MATSEQSELL` is returned.  If a matrix of type `MATMPISELL` is desired for this
  type of communicator, use the construction mechanism
.vb
   MatCreate(...,&A);
   MatSetType(A,MATMPISELL);
   MatSetSizes(A, m,n,M,N);
   MatMPISELLSetPreallocation(A,...);
.ve

.seealso: `Mat`, `MATSELL`, `MatCreate()`, `MatCreateSeqSELL()`, `MatSetValues()`, `MatMPISELLSetPreallocation()`, `MATMPISELL`
@*/
PetscErrorCode MatCreateSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[], Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &nproc));
  /* a single-process communicator yields the sequential format directly;
     otherwise the parallel type is used and both blocks are preallocated */
  if (nproc == 1) {
    PetscCall(MatSetType(*A, MATSEQSELL));
    PetscCall(MatSeqSELLSetPreallocation(*A, d_rlenmax, d_rlen));
  } else {
    PetscCall(MatSetType(*A, MATMPISELL));
    PetscCall(MatMPISELLSetPreallocation(*A, d_rlenmax, d_rlen, o_rlenmax, o_rlen));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatMPISELLGetSeqSELL - Returns the local pieces of this distributed matrix

  Not Collective

  Input Parameter:
. A - the `MATMPISELL` matrix

  Output Parameters:
+ Ad     - The diagonal portion of `A`
. Ao     - The off-diagonal portion of `A`
- colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix

  Level: advanced

.seealso: `Mat`, `MATSEQSELL`, `MATMPISELL`
@*/
PetscErrorCode MatMPISELLGetSeqSELL(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
{
  Mat_MPISELL *mpisell = (Mat_MPISELL *)A->data;
  PetscBool    ismpisell;

  PetscFunctionBegin;
  /* the Mat_MPISELL layout accessed below is only valid for a MATMPISELL matrix */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPISELL, &ismpisell));
  PetscCheck(ismpisell, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPISELL matrix as input");
  /* each output is optional; return only what the caller asked for */
  if (Ad) *Ad = mpisell->A;
  if (Ao) *Ao = mpisell->B;
  if (colmap) *colmap = mpisell->garray;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatMPISELLGetLocalMatCondensed - Creates a `MATSEQSELL` matrix from a `MATMPISELL` matrix by
  taking all its local rows and NON-ZERO columns

  Not Collective

  Input Parameters:
+ A     - the matrix
. scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
. row   - index sets of rows to extract (or `NULL`)
- col   - index sets of columns to extract (or `NULL`)

  Output Parameter:
. A_loc - the local sequential matrix generated

  Level: advanced

.seealso: `Mat`, `MATSEQSELL`, `MATMPISELL`, `MatGetOwnershipRange()`, `MatMPISELLGetLocalMat()`
@*/
PetscErrorCode MatMPISELLGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  PetscInt     i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
  IS           isrowa, iscola;
  Mat         *aloc;
  PetscBool    match;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPISELL, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPISELL matrix as input");
  PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
  /* Default row IS: all locally owned rows of A */
  if (!row) {
    start = A->rmap->rstart;
    end   = A->rmap->rend;
    PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
  } else {
    isrowa = *row;
  }
  /* Default column IS: the global indices of all columns with local nonzeros,
     in ascending order.  garray (the off-diagonal local->global column map) is
     sorted, so it splits at the diagonal block: entries < cmap rstart come first,
     then the diagonal columns [start, start+nzA), then the remaining garray
     entries, which are all >= the end of the diagonal range. */
  if (!col) {
    start = A->cmap->rstart;
    cmap  = a->garray;
    nzA   = a->A->cmap->n;
    nzB   = a->B->cmap->n;
    PetscCall(PetscMalloc1(nzA + nzB, &idx));
    ncols = 0;
    /* off-diagonal columns left of the diagonal block */
    for (i = 0; i < nzB; i++) {
      if (cmap[i] < start) idx[ncols++] = cmap[i];
      else break;
    }
    imark = i; /* first garray entry at/after the diagonal block */
    /* the diagonal block's own columns */
    for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
    /* off-diagonal columns right of the diagonal block */
    for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
    /* iscola takes ownership of idx; no PetscFree(idx) here */
    PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
  } else {
    iscola = *col;
  }
  /* MatCreateSubMatrices() allocates the output array itself for MAT_INITIAL_MATRIX;
     for reuse we must hand it an array holding the existing submatrix */
  if (scall != MAT_INITIAL_MATRIX) {
    PetscCall(PetscMalloc1(1, &aloc));
    aloc[0] = *A_loc;
  }
  PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
  *A_loc = aloc[0];
  PetscCall(PetscFree(aloc));
  /* destroy only the index sets we created ourselves */
  if (!row) PetscCall(ISDestroy(&isrowa));
  if (!col) PetscCall(ISDestroy(&iscola));
  PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/aij/mpi/mpiaij.h>

/* Convert a MATMPISELL matrix to MATMPIAIJ by converting its sequential
   diagonal (a->A) and off-diagonal (a->B) pieces individually. */
PetscErrorCode MatConvert_MPISELL_MPIAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPISELL *a = (Mat_MPISELL *)A->data;
  Mat          B;
  Mat_MPIAIJ  *b;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Matrix must be assembled");

  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
  } else {
    /* MAT_INITIAL_MATRIX and MAT_INPLACE_MATRIX both build a fresh B here;
       preallocation with 0/NULL is a placeholder, the real storage comes from
       the sequential conversions below */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetType(B, MATMPIAIJ));
    PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
    PetscCall(MatSetBlockSizes(B, A->rmap->bs, A->cmap->bs));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, NULL));
    PetscCall(MatMPIAIJSetPreallocation(B, 0, NULL, 0, NULL));
  }
  b = (Mat_MPIAIJ *)B->data;

  if (reuse == MAT_REUSE_MATRIX) {
    /* reuse the existing sequential pieces of B in place */
    PetscCall(MatConvert_SeqSELL_SeqAIJ(a->A, MATSEQAIJ, MAT_REUSE_MATRIX, &b->A));
    PetscCall(MatConvert_SeqSELL_SeqAIJ(a->B, MATSEQAIJ, MAT_REUSE_MATRIX, &b->B));
  } else {
    PetscCall(MatDestroy(&b->A));
    PetscCall(MatDestroy(&b->B));
    /* NOTE(review): presumably restores global column indices in A's
       off-diagonal block before conversion; A is reassembled below to undo
       this — confirm against MatDisAssemble_MPISELL */
    PetscCall(MatDisAssemble_MPISELL(A));
    PetscCall(MatConvert_SeqSELL_SeqAIJ(a->A, MATSEQAIJ, MAT_INITIAL_MATRIX, &b->A));
    PetscCall(MatConvert_SeqSELL_SeqAIJ(a->B, MATSEQAIJ, MAT_INITIAL_MATRIX, &b->B));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  }

  if (reuse == MAT_INPLACE_MATRIX) {
    /* in-place: B's guts replace A's, B itself is consumed */
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert a MATMPIAIJ matrix to MATMPISELL by converting its sequential
   diagonal (a->A) and off-diagonal (a->B) pieces individually. */
PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ  *a = (Mat_MPIAIJ *)A->data;
  Mat          B;
  Mat_MPISELL *b;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Matrix must be assembled");

  if (reuse == MAT_REUSE_MATRIX) {
    B = *newmat;
  } else {
    Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)a->A->data, *Ba = (Mat_SeqAIJ *)a->B->data;
    PetscInt    i, d_nz = 0, o_nz = 0, m = A->rmap->N, n = A->cmap->N, lm = A->rmap->n, ln = A->cmap->n;
    PetscInt   *d_nnz, *o_nnz;
    /* derive SELL preallocation from the AIJ CSR row pointers:
       per-row nonzero counts (d_nnz/o_nnz) and their maxima (d_nz/o_nz) */
    PetscCall(PetscMalloc2(lm, &d_nnz, lm, &o_nnz));
    for (i = 0; i < lm; i++) {
      d_nnz[i] = Aa->i[i + 1] - Aa->i[i];
      o_nnz[i] = Ba->i[i + 1] - Ba->i[i];
      if (d_nnz[i] > d_nz) d_nz = d_nnz[i];
      if (o_nnz[i] > o_nz) o_nz = o_nnz[i];
    }
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetType(B, MATMPISELL));
    PetscCall(MatSetSizes(B, lm, ln, m, n));
    PetscCall(MatSetBlockSizes(B, A->rmap->bs, A->cmap->bs));
    /* both calls so the preallocation takes effect whichever concrete type B has */
    PetscCall(MatSeqSELLSetPreallocation(B, d_nz, d_nnz));
    PetscCall(MatMPISELLSetPreallocation(B, d_nz, d_nnz, o_nz, o_nnz));
    PetscCall(PetscFree2(d_nnz, o_nnz));
  }
  b = (Mat_MPISELL *)B->data;

  if (reuse == MAT_REUSE_MATRIX) {
    /* reuse the existing sequential pieces of B in place */
    PetscCall(MatConvert_SeqAIJ_SeqSELL(a->A, MATSEQSELL, MAT_REUSE_MATRIX, &b->A));
    PetscCall(MatConvert_SeqAIJ_SeqSELL(a->B, MATSEQSELL, MAT_REUSE_MATRIX, &b->B));
  } else {
    /* replace the placeholder pieces created by preallocation with the converted ones */
    PetscCall(MatDestroy(&b->A));
    PetscCall(MatDestroy(&b->B));
    PetscCall(MatConvert_SeqAIJ_SeqSELL(a->A, MATSEQSELL, MAT_INITIAL_MATRIX, &b->A));
    PetscCall(MatConvert_SeqAIJ_SeqSELL(a->B, MATSEQSELL, MAT_INITIAL_MATRIX, &b->B));
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  }

  if (reuse == MAT_INPLACE_MATRIX) {
    /* in-place: B's guts replace A's, B itself is consumed */
    PetscCall(MatHeaderReplace(A, &B));
  } else {
    *newmat = B;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Local (block Jacobi) SOR for MATMPISELL: each outer iteration scatters the
   current solution, subtracts the off-diagonal contribution from the right-hand
   side, and runs a sequential SOR sweep on the diagonal block.  True parallel
   SOR is not supported. */
PetscErrorCode MatSOR_MPISELL(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
{
  Mat_MPISELL *mat = (Mat_MPISELL *)matin->data;
  Vec          bb1 = NULL; /* work vector holding bb - B*x; allocated only when needed */

  PetscFunctionBegin;
  if (flag == SOR_APPLY_UPPER) {
    PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* ~flag binds before &, so this tests that SOR_ZERO_INITIAL_GUESS is NOT set;
     bb1 is needed whenever a sweep will use an existing (nonzero) x */
  if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));

  if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      /* with x = 0 the off-diagonal term vanishes, so sweep directly on bb */
      PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
      its--;
    }

    while (its--) {
      /* gather the ghost values of xx needed by the off-diagonal block */
      PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));

      /* update rhs: bb1 = bb - B*x */
      PetscCall(VecScale(mat->lvec, -1.0));
      PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));

      /* local sweep */
      PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
    }
  } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
      its--;
    }
    while (its--) {
      PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));

      /* update rhs: bb1 = bb - B*x */
      PetscCall(VecScale(mat->lvec, -1.0));
      PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));

      /* local sweep */
      PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
    }
  } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
      its--;
    }
    while (its--) {
      PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));

      /* update rhs: bb1 = bb - B*x */
      PetscCall(VecScale(mat->lvec, -1.0));
      PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));

      /* local sweep */
      PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
    }
  } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");

  PetscCall(VecDestroy(&bb1));

  /* propagate any zero-pivot/factorization error detected in the local sweep */
  matin->factorerrortype = mat->A->factorerrortype;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_CUDA)
PETSC_INTERN PetscErrorCode MatConvert_MPISELL_MPISELLCUDA(Mat, MatType, MatReuse, Mat *);
#endif

/*MC
   MATMPISELL - MATMPISELL = "mpisell" - A matrix type to be used for parallel sparse matrices.

   Options Database Keys:
. -mat_type mpisell - sets the matrix type to `MATMPISELL` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: `Mat`, `MATSELL`, `MATSEQSELL`, `MatCreateSELL()`
M*/
PETSC_EXTERN PetscErrorCode MatCreate_MPISELL(Mat B)
{
  Mat_MPISELL *sell;

  PetscFunctionBegin;
  /* attach the zero-initialized implementation data */
  PetscCall(PetscNew(&sell));
  B->data = (void *)sell;

  /* install the function table; the matrix starts out empty and unassembled */
  B->ops[0]     = MatOps_Values;
  B->assembled  = PETSC_FALSE;
  B->insertmode = NOT_SET_VALUES;

  /* record this process's place in the communicator */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &sell->size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &sell->rank));

  /* the stash caches values destined for rows owned by other processes */
  PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));

  sell->donotstash  = PETSC_FALSE;
  sell->roworiented = PETSC_TRUE;
  sell->colmap      = NULL;
  sell->garray      = NULL;

  /* scatter machinery for MatMult(), created lazily at assembly */
  sell->lvec  = NULL;
  sell->Mvctx = NULL;

  /* scratch space used by MatGetRow() */
  sell->rowindices   = NULL;
  sell->rowvalues    = NULL;
  sell->getrowactive = PETSC_FALSE;

  /* register type-specific operations reachable through the generic Mat API */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPISELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPISELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPISELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPISELLSetPreallocation_C", MatMPISELLSetPreallocation_MPISELL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpisell_mpiaij_C", MatConvert_MPISELL_MPIAIJ));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpisell_mpisellcuda_C", MatConvert_MPISELL_MPISELLCUDA));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPISELL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPISELL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
