/*
        Provides an interface to the SuperLU_DIST sparse solver
*/

#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscpkg_version.h>

PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wundef")
EXTERN_C_BEGIN
// To suppress compiler warnings
#define DISABLE_CUSPARSE_DEPRECATED
#if defined(PETSC_USE_COMPLEX)
  #define CASTDOUBLECOMPLEX     (doublecomplex *)
  #define CASTDOUBLECOMPLEXSTAR (doublecomplex **)
  #include <superlu_zdefs.h>
  #define LUstructInit                  zLUstructInit
  #define ScalePermstructInit           zScalePermstructInit
  #define ScalePermstructFree           zScalePermstructFree
  #define LUstructFree                  zLUstructFree
  #define Destroy_LU                    zDestroy_LU
  #define ScalePermstruct_t             zScalePermstruct_t
  #define LUstruct_t                    zLUstruct_t
  #define SOLVEstruct_t                 zSOLVEstruct_t
  #define SolveFinalize                 zSolveFinalize
  #define pGetDiagU                     pzGetDiagU
  #define pgssvx                        pzgssvx
  #define allocateA_dist                zallocateA_dist
  #define Create_CompRowLoc_Matrix_dist zCreate_CompRowLoc_Matrix_dist
  #define SLU                           SLU_Z
  #if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    #define DeAllocLlu_3d              zDeAllocLlu_3d
    #define DeAllocGlu_3d              zDeAllocGlu_3d
    #define Destroy_A3d_gathered_on_2d zDestroy_A3d_gathered_on_2d
    #define pgssvx3d                   pzgssvx3d
  #endif
#elif defined(PETSC_USE_REAL_SINGLE)
  #define CASTDOUBLECOMPLEX
  #define CASTDOUBLECOMPLEXSTAR
  #include <superlu_sdefs.h>
  #define LUstructInit                  sLUstructInit
  #define ScalePermstructInit           sScalePermstructInit
  #define ScalePermstructFree           sScalePermstructFree
  #define LUstructFree                  sLUstructFree
  #define Destroy_LU                    sDestroy_LU
  #define ScalePermstruct_t             sScalePermstruct_t
  #define LUstruct_t                    sLUstruct_t
  #define SOLVEstruct_t                 sSOLVEstruct_t
  #define SolveFinalize                 sSolveFinalize
  #define pGetDiagU                     psGetDiagU
  #define pgssvx                        psgssvx
  #define allocateA_dist                sallocateA_dist
  #define Create_CompRowLoc_Matrix_dist sCreate_CompRowLoc_Matrix_dist
  #define SLU                           SLU_S
  #if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    #define DeAllocLlu_3d              sDeAllocLlu_3d
    #define DeAllocGlu_3d              sDeAllocGlu_3d
    #define Destroy_A3d_gathered_on_2d sDestroy_A3d_gathered_on_2d
    #define pgssvx3d                   psgssvx3d
  #endif
#else
  #define CASTDOUBLECOMPLEX
  #define CASTDOUBLECOMPLEXSTAR
  #include <superlu_ddefs.h>
  #define LUstructInit                  dLUstructInit
  #define ScalePermstructInit           dScalePermstructInit
  #define ScalePermstructFree           dScalePermstructFree
  #define LUstructFree                  dLUstructFree
  #define Destroy_LU                    dDestroy_LU
  #define ScalePermstruct_t             dScalePermstruct_t
  #define LUstruct_t                    dLUstruct_t
  #define SOLVEstruct_t                 dSOLVEstruct_t
  #define SolveFinalize                 dSolveFinalize
  #define pGetDiagU                     pdGetDiagU
  #define pgssvx                        pdgssvx
  #define allocateA_dist                dallocateA_dist
  #define Create_CompRowLoc_Matrix_dist dCreate_CompRowLoc_Matrix_dist
  #define SLU                           SLU_D
  #if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    #define DeAllocLlu_3d              dDeAllocLlu_3d
    #define DeAllocGlu_3d              dDeAllocGlu_3d
    #define Destroy_A3d_gathered_on_2d dDestroy_A3d_gathered_on_2d
    #define pgssvx3d                   pdgssvx3d
  #endif
#endif
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  #include <superlu_sdefs.h>
#endif
EXTERN_C_END
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()

typedef struct {
  int_t      nprow, npcol, *row, *col;
  gridinfo_t grid;
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
  PetscBool    use3d;
  int_t        npdep; /* replication factor, must be power of two */
  gridinfo3d_t grid3d;
#endif
  superlu_dist_options_t options;
  SuperMatrix            A_sup;
  ScalePermstruct_t      ScalePermstruct;
  LUstruct_t             LUstruct;
  int                    StatPrint;
  SOLVEstruct_t          SOLVEstruct;
  fact_t                 FactPattern;
  MPI_Comm               comm_superlu;
  PetscScalar           *val;
  PetscBool              matsolve_iscalled, matmatsolve_iscalled;
  PetscBool              CleanUpSuperLU_Dist; /* Flag to clean up (non-global) SuperLU objects during Destroy */
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  sScalePermstruct_t sScalePermstruct;
  sLUstruct_t        sLUstruct;
  sSOLVEstruct_t     sSOLVEstruct;
  float             *sval;
  PetscBool          singleprecision; /* use single precision SuperLU_DIST from double precision PETSc */
  float             *sbptr;
#endif
} Mat_SuperLU_DIST;

static PetscErrorCode MatSuperluDistGetDiagU_SuperLU_DIST(Mat F, PetscScalar *diagU)
{
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST *)F->data;

  PetscFunctionBegin;
  PetscStackCallExternalVoid("SuperLU_DIST:pGetDiagU", pGetDiagU(F->rmap->N, &lu->LUstruct, &lu->grid, CASTDOUBLECOMPLEX diagU));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSuperluDistGetDiagU(Mat F, PetscScalar *diagU)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(F, MAT_CLASSID, 1);
  PetscTryMethod(F, "MatSuperluDistGetDiagU_C", (Mat, PetscScalar *), (F, diagU));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*  This allows reusing the Superlu_DIST communicator and grid when only a single SuperLU_DIST matrix is used at a time */
typedef struct {
  MPI_Comm   comm;
  PetscBool  busy;
  gridinfo_t grid;
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
  PetscBool    use3d;
  gridinfo3d_t grid3d;
#endif
} PetscSuperLU_DIST;

static PetscMPIInt Petsc_Superlu_dist_keyval = MPI_KEYVAL_INVALID;

PETSC_EXTERN PetscMPIInt MPIAPI Petsc_Superlu_dist_keyval_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *attr_val, void *extra_state)
{
  PetscSuperLU_DIST *context = (PetscSuperLU_DIST *)attr_val;

  PetscFunctionBegin;
  PetscCheckReturnMPI(keyval == Petsc_Superlu_dist_keyval, PETSC_COMM_SELF, PETSC_ERR_ARG_CORRUPT, "Unexpected keyval");
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
  if (context->use3d) {
    PetscStackCallExternalVoid("SuperLU_DIST:superlu_gridexit3d", superlu_gridexit3d(&context->grid3d));
  } else
#endif
    PetscStackCallExternalVoid("SuperLU_DIST:superlu_gridexit", superlu_gridexit(&context->grid));
  PetscCallMPIReturnMPI(MPI_Comm_free(&context->comm));
  PetscCallReturnMPI(PetscFree(context));
  PetscFunctionReturn(MPI_SUCCESS);
}

/*
   Performs MPI_Comm_free_keyval() on Petsc_Superlu_dist_keyval but keeps the global variable for
   users who do not destroy all PETSc objects before PetscFinalize().

   The value Petsc_Superlu_dist_keyval is retained so that Petsc_Superlu_dist_keyval_DeleteFn()
   can still check that the keyval associated with the MPI communicator is correct when the MPI
   communicator is destroyed.

   This is called in PetscFinalize()
*/
static PetscErrorCode Petsc_Superlu_dist_keyval_free(void)
{
  PetscMPIInt Petsc_Superlu_dist_keyval_temp = Petsc_Superlu_dist_keyval;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_free_keyval(&Petsc_Superlu_dist_keyval_temp));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SuperLU_DIST(Mat A)
{
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST *)A->data;

  PetscFunctionBegin;
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  PetscCall(PetscFree(lu->sbptr));
#endif
  if (lu->CleanUpSuperLU_Dist) {
    /* Deallocate SuperLU_DIST storage */
    PetscStackCallExternalVoid("SuperLU_DIST:Destroy_CompRowLoc_Matrix_dist", Destroy_CompRowLoc_Matrix_dist(&lu->A_sup));
    if (lu->options.SolveInitialized) {
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
      if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:SolveFinalize", sSolveFinalize(&lu->options, &lu->sSOLVEstruct));
      else
#endif
        PetscStackCallExternalVoid("SuperLU_DIST:SolveFinalize", SolveFinalize(&lu->options, &lu->SOLVEstruct));
    }
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    if (lu->use3d) {
  #if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
      if (lu->singleprecision) {
        PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", sDestroy_LU(A->cmap->N, &lu->grid3d.grid2d, &lu->sLUstruct));
        PetscStackCallExternalVoid("SuperLU_DIST:Destroy_A3d_gathered_on_2d", sDestroy_A3d_gathered_on_2d(&lu->sSOLVEstruct, &lu->grid3d));
        PetscStackCallExternalVoid("SuperLU_DIST:ScalePermstructFree", sScalePermstructFree(&lu->sScalePermstruct));
        PetscStackCallExternalVoid("SuperLU_DIST:LUstructFree", sLUstructFree(&lu->sLUstruct));
      } else
  #endif
      {
        PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", Destroy_LU(A->cmap->N, &lu->grid3d.grid2d, &lu->LUstruct));
        PetscStackCallExternalVoid("SuperLU_DIST:Destroy_A3d_gathered_on_2d", Destroy_A3d_gathered_on_2d(&lu->SOLVEstruct, &lu->grid3d));
        PetscStackCallExternalVoid("SuperLU_DIST:ScalePermstructFree", ScalePermstructFree(&lu->ScalePermstruct));
        PetscStackCallExternalVoid("SuperLU_DIST:LUstructFree", LUstructFree(&lu->LUstruct));
      }
    } else
#endif
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
      if (lu->singleprecision) {
      PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", sDestroy_LU(A->cmap->N, &lu->grid, &lu->sLUstruct));
      PetscStackCallExternalVoid("SuperLU_DIST:ScalePermstructFree", sScalePermstructFree(&lu->sScalePermstruct));
      PetscStackCallExternalVoid("SuperLU_DIST:LUstructFree", sLUstructFree(&lu->sLUstruct));
    } else
#endif
    {
      PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", Destroy_LU(A->cmap->N, &lu->grid, &lu->LUstruct));
      PetscStackCallExternalVoid("SuperLU_DIST:ScalePermstructFree", ScalePermstructFree(&lu->ScalePermstruct));
      PetscStackCallExternalVoid("SuperLU_DIST:LUstructFree", LUstructFree(&lu->LUstruct));
    }
    /* Release the SuperLU_DIST process grid only if the matrix has its own copy, that is it is not in the communicator context */
    if (lu->comm_superlu) {
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
      if (lu->use3d) {
        PetscStackCallExternalVoid("SuperLU_DIST:superlu_gridexit3d", superlu_gridexit3d(&lu->grid3d));
      } else
#endif
        PetscStackCallExternalVoid("SuperLU_DIST:superlu_gridexit", superlu_gridexit(&lu->grid));
    }
  }
  /*
   * We always need to release the communicator that was created in MatGetFactor_aij_superlu_dist.
   * lu->CleanUpSuperLU_Dist was turned on in MatLUFactorSymbolic_SuperLU_DIST. There are some use
   * cases where we only create a matrix but do not solve mat. In these cases, lu->CleanUpSuperLU_Dist
   * is off, and the communicator was not released or marked as "not busy " in the old code.
   * Here we try to release comm regardless.
  */
  if (lu->comm_superlu) {
    PetscCall(PetscCommRestoreComm(PetscObjectComm((PetscObject)A), &lu->comm_superlu));
  } else {
    PetscSuperLU_DIST *context;
    MPI_Comm           comm;
    PetscMPIInt        iflg;

    PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
    PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Superlu_dist_keyval, &context, &iflg));
    if (iflg) context->busy = PETSC_FALSE;
  }

  PetscCall(PetscFree(A->data));
  /* clear composed functions */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSuperluDistGetDiagU_C", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SuperLU_DIST(Mat A, Vec b_mpi, Vec x)
{
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST *)A->data;
  PetscInt          m  = A->rmap->n;
  SuperLUStat_t     stat;
  PetscReal         berr[1];
  PetscScalar      *bptr = NULL;
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  float sberr[1];
#endif
  int              info; /* SuperLU_Dist info code is ALWAYS an int, even with long long indices */
  static PetscBool cite = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCheck(lu->options.Fact == FACTORED, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "SuperLU_DIST options.Fact must equal FACTORED");
  PetscCall(PetscCitationsRegister("@article{lidemmel03,\n  author = {Xiaoye S. Li and James W. Demmel},\n  title = {{SuperLU_DIST}: A Scalable Distributed-Memory Sparse Direct\n           Solver for Unsymmetric Linear Systems},\n  journal = {ACM "
                                   "Trans. Mathematical Software},\n  volume = {29},\n  number = {2},\n  pages = {110-140},\n  year = 2003\n}\n",
                                   &cite));

  if (lu->options.SolveInitialized && !lu->matsolve_iscalled) {
    /* see comments in MatMatSolve() */
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:SolveFinalize", sSolveFinalize(&lu->options, &lu->sSOLVEstruct));
    else
#endif
      PetscStackCallExternalVoid("SuperLU_DIST:SolveFinalize", SolveFinalize(&lu->options, &lu->SOLVEstruct));
    lu->options.SolveInitialized = NO;
  }
  PetscCall(VecCopy(b_mpi, x));
  PetscCall(VecGetArray(x, &bptr));
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  if (lu->singleprecision) {
    PetscInt n;
    PetscCall(VecGetLocalSize(x, &n));
    if (!lu->sbptr) PetscCall(PetscMalloc1(n, &lu->sbptr));
    for (PetscInt i = 0; i < n; i++) lu->sbptr[i] = PetscRealPart(bptr[i]); /* PetscRealPart() is a no-op to allow compiling with complex */
  }
#endif

  PetscStackCallExternalVoid("SuperLU_DIST:PStatInit", PStatInit(&stat)); /* Initialize the statistics variables. */
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
  if (lu->use3d) {
  #if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:pgssvx3d", psgssvx3d(&lu->options, &lu->A_sup, &lu->sScalePermstruct, lu->sbptr, (int)m, 1, &lu->grid3d, &lu->sLUstruct, &lu->sSOLVEstruct, sberr, &stat, &info));
    else
  #endif
      PetscStackCallExternalVoid("SuperLU_DIST:pgssvx3d", pgssvx3d(&lu->options, &lu->A_sup, &lu->ScalePermstruct, CASTDOUBLECOMPLEX bptr, (int)m, 1, &lu->grid3d, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &info));
    PetscCheck(!info, PETSC_COMM_SELF, PETSC_ERR_LIB, "pgssvx3d fails, info: %d", info);
  } else
#endif
  {
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:pgssvx", psgssvx(&lu->options, &lu->A_sup, &lu->sScalePermstruct, lu->sbptr, (int)m, 1, &lu->grid, &lu->sLUstruct, &lu->sSOLVEstruct, sberr, &stat, &info));
    else
#endif
      PetscStackCallExternalVoid("SuperLU_DIST:pgssvx", pgssvx(&lu->options, &lu->A_sup, &lu->ScalePermstruct, CASTDOUBLECOMPLEX bptr, (int)m, 1, &lu->grid, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &info));
    PetscCheck(!info, PETSC_COMM_SELF, PETSC_ERR_LIB, "pgssvx fails, info: %d", info);
  }
  if (lu->options.PrintStat) PetscStackCallExternalVoid("SuperLU_DIST:PStatPrint", PStatPrint(&lu->options, &stat, &lu->grid)); /* Print the statistics. */
  PetscStackCallExternalVoid("SuperLU_DIST:PStatFree", PStatFree(&stat));
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  if (lu->singleprecision) {
    PetscInt n;
    PetscCall(VecGetLocalSize(x, &n));
    for (PetscInt i = 0; i < n; i++) bptr[i] = lu->sbptr[i];
  }
#endif
  PetscCall(VecRestoreArray(x, &bptr));
  lu->matsolve_iscalled    = PETSC_TRUE;
  lu->matmatsolve_iscalled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMatSolve_SuperLU_DIST(Mat A, Mat B_mpi, Mat X)
{
  Mat_SuperLU_DIST *lu = (Mat_SuperLU_DIST *)A->data;
  PetscInt          m  = A->rmap->n, nrhs;
  SuperLUStat_t     stat;
  PetscReal         berr[1];
  PetscScalar      *bptr;
  int               info; /* SuperLU_Dist info code is ALWAYS an int, even with long long indices */
  PetscBool         flg;

  PetscFunctionBegin;
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  PetscCheck(!lu->singleprecision, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for -pc_precision single");
#endif
  PetscCheck(lu->options.Fact == FACTORED, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "SuperLU_DIST options.Fact must equal FACTORED");
  PetscCall(PetscObjectTypeCompareAny((PetscObject)B_mpi, &flg, MATSEQDENSE, MATMPIDENSE, NULL));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Matrix B must be MATDENSE matrix");
  if (X != B_mpi) {
    PetscCall(PetscObjectTypeCompareAny((PetscObject)X, &flg, MATSEQDENSE, MATMPIDENSE, NULL));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Matrix X must be MATDENSE matrix");
  }

  if (lu->options.SolveInitialized && !lu->matmatsolve_iscalled) {
    /* communication pattern of SOLVEstruct is unlikely created for matmatsolve,
       thus destroy it and create a new SOLVEstruct.
       Otherwise it may result in memory corruption or incorrect solution
       See src/mat/tests/ex125.c */
    PetscStackCallExternalVoid("SuperLU_DIST:SolveFinalize", SolveFinalize(&lu->options, &lu->SOLVEstruct));
    lu->options.SolveInitialized = NO;
  }
  if (X != B_mpi) PetscCall(MatCopy(B_mpi, X, SAME_NONZERO_PATTERN));

  PetscCall(MatGetSize(B_mpi, NULL, &nrhs));

  PetscStackCallExternalVoid("SuperLU_DIST:PStatInit", PStatInit(&stat)); /* Initialize the statistics variables. */
  PetscCall(MatDenseGetArray(X, &bptr));

#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
  if (lu->use3d) PetscStackCallExternalVoid("SuperLU_DIST:pgssvx3d", pgssvx3d(&lu->options, &lu->A_sup, &lu->ScalePermstruct, CASTDOUBLECOMPLEX bptr, (int)m, (int)nrhs, &lu->grid3d, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &info));
  else
#endif
    PetscStackCallExternalVoid("SuperLU_DIST:pgssvx", pgssvx(&lu->options, &lu->A_sup, &lu->ScalePermstruct, CASTDOUBLECOMPLEX bptr, (int)m, (int)nrhs, &lu->grid, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &info));
  PetscCheck(!info, PETSC_COMM_SELF, PETSC_ERR_LIB, "pdgssvx fails, info: %d", info);
  PetscCall(MatDenseRestoreArray(X, &bptr));

  if (lu->options.PrintStat) PetscStackCallExternalVoid("SuperLU_DIST:PStatPrint", PStatPrint(&lu->options, &stat, &lu->grid)); /* Print the statistics. */
  PetscStackCallExternalVoid("SuperLU_DIST:PStatFree", PStatFree(&stat));
  lu->matsolve_iscalled    = PETSC_FALSE;
  lu->matmatsolve_iscalled = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  input:
   F:        numeric Cholesky factor
  output:
   nneg:     total number of negative pivots
   nzero:    total number of zero pivots
   npos:     (global dimension of F) - nneg - nzero
*/
static PetscErrorCode MatGetInertia_SuperLU_DIST(Mat F, PetscInt *nneg, PetscInt *nzero, PetscInt *npos)
{
  Mat_SuperLU_DIST *lu    = (Mat_SuperLU_DIST *)F->data;
  PetscScalar      *diagU = NULL;
  PetscInt          M, i, neg = 0, zero = 0, pos = 0;
  PetscReal         r;

  PetscFunctionBegin;
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  PetscCheck(!lu->singleprecision, PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Not for -pc_precision single");
#endif
  PetscCheck(F->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Matrix factor F is not assembled");
  PetscCheck(lu->options.RowPerm == NOROWPERM, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must set NOROWPERM");
  PetscCall(MatGetSize(F, &M, NULL));
  PetscCall(PetscMalloc1(M, &diagU));
  PetscCall(MatSuperluDistGetDiagU(F, diagU));
  for (i = 0; i < M; i++) {
#if defined(PETSC_USE_COMPLEX)
    r = PetscAbsReal(PetscImaginaryPart(diagU[i]));
    PetscCheck(r < 1000 * PETSC_MACHINE_EPSILON, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "diagU[%" PetscInt_FMT "]=%g + i %g is non-real", i, (double)PetscRealPart(diagU[i]), (double)PetscImaginaryPart(diagU[i]));
    r = PetscRealPart(diagU[i]);
#else
    r = diagU[i];
#endif
    if (r > 0) {
      pos++;
    } else if (r < 0) {
      neg++;
    } else zero++;
  }

  PetscCall(PetscFree(diagU));
  if (nneg) *nneg = neg;
  if (nzero) *nzero = zero;
  if (npos) *npos = pos;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatLUFactorNumeric_SuperLU_DIST(Mat F, Mat A, const MatFactorInfo *info)
{
  Mat_SuperLU_DIST  *lu = (Mat_SuperLU_DIST *)F->data;
  Mat                Aloc;
  const PetscScalar *av;
  const PetscInt    *ai = NULL, *aj = NULL;
  PetscInt           nz, unused;
  int                sinfo; /* SuperLU_Dist info flag is always an int even with long long indices */
  SuperLUStat_t      stat;
  PetscReal         *berr = 0;
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  float *sberr = 0;
#endif
  PetscBool ismpiaij, isseqaij, flg;

  PetscFunctionBegin;
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &ismpiaij));
  if (ismpiaij) {
    PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &Aloc));
  } else if (isseqaij) {
    PetscCall(PetscObjectReference((PetscObject)A));
    Aloc = A;
  } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);

  PetscCall(MatGetRowIJ(Aloc, 0, PETSC_FALSE, PETSC_FALSE, &unused, &ai, &aj, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "GetRowIJ failed");
  PetscCall(MatSeqAIJGetArrayRead(Aloc, &av));
  nz = ai[Aloc->rmap->n];

  /* Allocations for A_sup */
  if (lu->options.Fact == DOFACT) { /* first numeric factorization */
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:allocateA_dist", sallocateA_dist(Aloc->rmap->n, nz, &lu->sval, &lu->col, &lu->row));
    else
#endif
      PetscStackCallExternalVoid("SuperLU_DIST:allocateA_dist", allocateA_dist(Aloc->rmap->n, nz, CASTDOUBLECOMPLEXSTAR & lu->val, &lu->col, &lu->row));
  } else { /* successive numeric factorization, sparsity pattern and perm_c are reused. */
    if (lu->FactPattern == SamePattern_SameRowPerm) {
      lu->options.Fact = SamePattern_SameRowPerm; /* matrix has similar numerical values */
    } else if (lu->FactPattern == SamePattern) {
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
      if (lu->use3d) {
  #if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
        if (lu->singleprecision) {
          PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", sDestroy_LU(A->cmap->N, &lu->grid3d.grid2d, &lu->sLUstruct));
          PetscStackCallExternalVoid("SuperLU_DIST:SolveFinalize", sSolveFinalize(&lu->options, &lu->sSOLVEstruct));
        } else
  #endif
        {
          PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", Destroy_LU(A->cmap->N, &lu->grid3d.grid2d, &lu->LUstruct));
          PetscStackCallExternalVoid("SuperLU_DIST:SolveFinalize", SolveFinalize(&lu->options, &lu->SOLVEstruct));
        }
      } else
#endif
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
        if (lu->singleprecision)
        PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", sDestroy_LU(A->rmap->N, &lu->grid, &lu->sLUstruct));
      else
#endif
        PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", Destroy_LU(A->rmap->N, &lu->grid, &lu->LUstruct));
      lu->options.Fact = SamePattern;
    } else if (lu->FactPattern == DOFACT) {
      PetscStackCallExternalVoid("SuperLU_DIST:Destroy_CompRowLoc_Matrix_dist", Destroy_CompRowLoc_Matrix_dist(&lu->A_sup));
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
      if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", sDestroy_LU(A->rmap->N, &lu->grid, &lu->sLUstruct));
      else
#endif
        PetscStackCallExternalVoid("SuperLU_DIST:Destroy_LU", Destroy_LU(A->rmap->N, &lu->grid, &lu->LUstruct));
      lu->options.Fact = DOFACT;
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
      if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:allocateA_dist", sallocateA_dist(Aloc->rmap->n, nz, &lu->sval, &lu->col, &lu->row));
      else
#endif
        PetscStackCallExternalVoid("SuperLU_DIST:allocateA_dist", allocateA_dist(Aloc->rmap->n, nz, CASTDOUBLECOMPLEXSTAR & lu->val, &lu->col, &lu->row));
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "options.Fact must be one of SamePattern SamePattern_SameRowPerm DOFACT");
  }

  /* Copy AIJ matrix to superlu_dist matrix */
  PetscCall(PetscArraycpy(lu->row, ai, Aloc->rmap->n + 1));
  PetscCall(PetscArraycpy(lu->col, aj, nz));
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  if (lu->singleprecision)
    for (PetscInt i = 0; i < nz; i++) lu->sval[i] = PetscRealPart(av[i]); /* PetscRealPart() is a no-op to allow compiling with complex */
  else
#endif
    PetscCall(PetscArraycpy(lu->val, av, nz));
  PetscCall(MatRestoreRowIJ(Aloc, 0, PETSC_FALSE, PETSC_FALSE, &unused, &ai, &aj, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_SUP, "RestoreRowIJ failed");
  PetscCall(MatSeqAIJRestoreArrayRead(Aloc, &av));
  PetscCall(MatDestroy(&Aloc));

  /* Create and setup A_sup */
  if (lu->options.Fact == DOFACT) {
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    if (lu->singleprecision)
      PetscStackCallExternalVoid("SuperLU_DIST:Create_CompRowLoc_Matrix_dist", sCreate_CompRowLoc_Matrix_dist(&lu->A_sup, A->rmap->N, A->cmap->N, nz, A->rmap->n, A->rmap->rstart, lu->sval, lu->col, lu->row, SLU_NR_loc, SLU_S, SLU_GE));
    else
#endif
      PetscStackCallExternalVoid("SuperLU_DIST:Create_CompRowLoc_Matrix_dist", Create_CompRowLoc_Matrix_dist(&lu->A_sup, A->rmap->N, A->cmap->N, nz, A->rmap->n, A->rmap->rstart, CASTDOUBLECOMPLEX lu->val, lu->col, lu->row, SLU_NR_loc, SLU, SLU_GE));
  }

  /* Factor the matrix. */
  PetscStackCallExternalVoid("SuperLU_DIST:PStatInit", PStatInit(&stat)); /* Initialize the statistics variables. */
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
  if (lu->use3d) {
  #if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:pgssvx3d", psgssvx3d(&lu->options, &lu->A_sup, &lu->sScalePermstruct, 0, (int)A->rmap->n, 0, &lu->grid3d, &lu->sLUstruct, &lu->sSOLVEstruct, sberr, &stat, &sinfo));
    else
  #endif
      PetscStackCallExternalVoid("SuperLU_DIST:pgssvx3d", pgssvx3d(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, (int)A->rmap->n, 0, &lu->grid3d, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &sinfo));
  } else
#endif
  {
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    if (lu->singleprecision) PetscStackCallExternalVoid("SuperLU_DIST:pgssvx", psgssvx(&lu->options, &lu->A_sup, &lu->sScalePermstruct, 0, (int)A->rmap->n, 0, &lu->grid, &lu->sLUstruct, &lu->sSOLVEstruct, sberr, &stat, &sinfo));
    else
#endif
      PetscStackCallExternalVoid("SuperLU_DIST:pgssvx", pgssvx(&lu->options, &lu->A_sup, &lu->ScalePermstruct, 0, (int)A->rmap->n, 0, &lu->grid, &lu->LUstruct, &lu->SOLVEstruct, berr, &stat, &sinfo));
  }
  if (sinfo > 0) {
    PetscCheck(!A->erroriffailure, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot in row %d", sinfo);
    if (sinfo <= lu->A_sup.ncol) {
      F->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
      PetscCall(PetscInfo(F, "U(i,i) is exactly zero, i= %d\n", sinfo));
    } else if (sinfo > lu->A_sup.ncol) {
      /*
       number of bytes allocated when memory allocation
       failure occurred, plus A->ncol.
       */
      F->factorerrortype = MAT_FACTOR_OUTMEMORY;
      PetscCall(PetscInfo(F, "Number of bytes allocated when memory allocation fails %d\n", sinfo));
    }
  } else PetscCheck(sinfo >= 0, PETSC_COMM_SELF, PETSC_ERR_LIB, "info = %d, argument in p*gssvx() had an illegal value", sinfo);

  if (lu->options.PrintStat) PetscStackCallExternalVoid("SuperLU_DIST:PStatPrint", PStatPrint(&lu->options, &stat, &lu->grid)); /* Print the statistics. */
  PetscStackCallExternalVoid("SuperLU_DIST:PStatFree", PStatFree(&stat));
  F->assembled     = PETSC_TRUE;
  F->preallocated  = PETSC_TRUE;
  lu->options.Fact = FACTORED; /* The factored form of A is supplied. Local option used by this func. only */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Note the PETSc r and c permutations are ignored */
static PetscErrorCode MatLUFactorSymbolic_SuperLU_DIST(Mat F, Mat A, IS r, IS c, const MatFactorInfo *info)
{
  Mat_SuperLU_DIST  *lu = (Mat_SuperLU_DIST *)F->data;
  PetscInt           M = A->rmap->N, N = A->cmap->N, indx;
  PetscMPIInt        size, iflg;
  PetscBool          flg, set;
  const char        *colperm[]     = {"NATURAL", "MMD_AT_PLUS_A", "MMD_ATA", "METIS_AT_PLUS_A", "PARMETIS"};
  const char        *rowperm[]     = {"NOROWPERM", "LargeDiag_MC64", "LargeDiag_AWPM", "MY_PERMR"};
  const char        *factPattern[] = {"SamePattern", "SamePattern_SameRowPerm", "DOFACT"};
  MPI_Comm           comm;
  PetscSuperLU_DIST *context = NULL;

  PetscFunctionBegin;
  /* Set options to F */
  PetscCall(PetscObjectGetComm((PetscObject)F, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  PetscOptionsBegin(PetscObjectComm((PetscObject)F), ((PetscObject)F)->prefix, "SuperLU_Dist Options", "Mat");
  PetscCall(PetscOptionsBool("-mat_superlu_dist_equil", "Equilibrate matrix", "None", lu->options.Equil ? PETSC_TRUE : PETSC_FALSE, &flg, &set));
  if (set && !flg) lu->options.Equil = NO;

  PetscCall(PetscOptionsEList("-mat_superlu_dist_rowperm", "Row permutation", "None", rowperm, 4, rowperm[1], &indx, &flg));
  if (flg) {
    switch (indx) {
    case 0:
      lu->options.RowPerm = NOROWPERM;
      break;
    case 1:
      lu->options.RowPerm = LargeDiag_MC64;
      break;
    case 2:
      lu->options.RowPerm = LargeDiag_AWPM;
      break;
    case 3:
      lu->options.RowPerm = MY_PERMR;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unknown row permutation");
    }
  }

  PetscCall(PetscOptionsEList("-mat_superlu_dist_colperm", "Column permutation", "None", colperm, 5, colperm[3], &indx, &flg));
  if (flg) {
    switch (indx) {
    case 0:
      lu->options.ColPerm = NATURAL;
      break;
    case 1:
      lu->options.ColPerm = MMD_AT_PLUS_A;
      break;
    case 2:
      lu->options.ColPerm = MMD_ATA;
      break;
    case 3:
      lu->options.ColPerm = METIS_AT_PLUS_A;
      break;
    case 4:
      lu->options.ColPerm = PARMETIS; /* only works for np>1 */
      break;
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unknown column permutation");
    }
  }

  lu->options.ReplaceTinyPivot = NO;
  PetscCall(PetscOptionsBool("-mat_superlu_dist_replacetinypivot", "Replace tiny pivots", "None", lu->options.ReplaceTinyPivot ? PETSC_TRUE : PETSC_FALSE, &flg, &set));
  if (set && flg) lu->options.ReplaceTinyPivot = YES;

  lu->options.ParSymbFact = NO;
  PetscCall(PetscOptionsBool("-mat_superlu_dist_parsymbfact", "Parallel symbolic factorization", "None", PETSC_FALSE, &flg, &set));
  if (set && flg && size > 1) {
#if defined(PETSC_HAVE_PARMETIS)
    lu->options.ParSymbFact = YES;
    lu->options.ColPerm     = PARMETIS; /* in v2.2, PARMETIS is forced for ParSymbFact regardless of user ordering setting */
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "parsymbfact needs PARMETIS");
#endif
  }

  lu->FactPattern = SamePattern;
  PetscCall(PetscOptionsEList("-mat_superlu_dist_fact", "Sparsity pattern for repeated matrix factorization", "None", factPattern, 3, factPattern[0], &indx, &flg));
  if (flg) {
    switch (indx) {
    case 0:
      lu->FactPattern = SamePattern;
      break;
    case 1:
      lu->FactPattern = SamePattern_SameRowPerm;
      break;
    case 2:
      lu->FactPattern = DOFACT;
      break;
    }
  }

  lu->options.IterRefine = NOREFINE;
  PetscCall(PetscOptionsBool("-mat_superlu_dist_iterrefine", "Use iterative refinement", "None", lu->options.IterRefine == NOREFINE ? PETSC_FALSE : PETSC_TRUE, &flg, &set));
  if (set && flg) lu->options.IterRefine = SLU_DOUBLE;

  if (PetscLogPrintInfo) lu->options.PrintStat = YES;
  else lu->options.PrintStat = NO;
  PetscCall(PetscOptionsDeprecated("-mat_superlu_dist_statprint", "-mat_superlu_dist_printstat", "3.19", NULL));
  PetscCall(PetscOptionsBool("-mat_superlu_dist_printstat", "Print factorization information", "None", (PetscBool)lu->options.PrintStat, (PetscBool *)&lu->options.PrintStat, NULL));

#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(8, 0, 0)
  lu->options.superlu_acc_offload = 1;
  PetscCall(PetscOptionsBool("-mat_superlu_dist_gpuoffload", "Offload factorization onto the GPUs", "None", (PetscBool)lu->options.superlu_acc_offload, (PetscBool *)&lu->options.superlu_acc_offload, NULL));
#endif

  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Superlu_dist_keyval, &context, &iflg));
  if (!iflg || context->busy) { /* additional options */
    if (!iflg) {
      PetscCall(PetscNew(&context));
      context->busy = PETSC_TRUE;
      PetscCallMPI(MPI_Comm_dup(comm, &context->comm));
      PetscCallMPI(MPI_Comm_set_attr(comm, Petsc_Superlu_dist_keyval, context));
    } else {
      PetscCall(PetscCommGetComm(PetscObjectComm((PetscObject)A), &lu->comm_superlu));
    }

    /* Default number of process columns and rows */
    lu->nprow = (int_t)(0.5 + PetscSqrtReal((PetscReal)size));
    if (!lu->nprow) lu->nprow = 1;
    while (lu->nprow > 0) {
      lu->npcol = (int_t)(size / lu->nprow);
      if (size == lu->nprow * lu->npcol) break;
      lu->nprow--;
    }
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    lu->use3d = PETSC_FALSE;
    lu->npdep = 1;
#endif

#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    PetscCall(PetscOptionsBool("-mat_superlu_dist_3d", "Use SuperLU_DIST 3D distribution", "None", lu->use3d, &lu->use3d, NULL));
    if (lu->use3d) {
      PetscInt t;
      PetscCall(PetscOptionsInt("-mat_superlu_dist_d", "Number of z entries in processor partition", "None", lu->npdep, (PetscInt *)&lu->npdep, NULL));
      t = (PetscInt)PetscLog2Real((PetscReal)lu->npdep);
      PetscCheck(PetscPowInt(2, t) == lu->npdep, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_OUTOFRANGE, "-mat_superlu_dist_d %lld must be a power of 2", (long long)lu->npdep);
      if (lu->npdep > 1) {
        lu->nprow = (int_t)(0.5 + PetscSqrtReal((PetscReal)(size / lu->npdep)));
        if (!lu->nprow) lu->nprow = 1;
        while (lu->nprow > 0) {
          lu->npcol = (int_t)(size / (lu->npdep * lu->nprow));
          if (size == lu->nprow * lu->npcol * lu->npdep) break;
          lu->nprow--;
        }
      }
    }
#endif
    PetscCall(PetscOptionsInt("-mat_superlu_dist_r", "Number rows in processor partition", "None", lu->nprow, (PetscInt *)&lu->nprow, NULL));
    PetscCall(PetscOptionsInt("-mat_superlu_dist_c", "Number columns in processor partition", "None", lu->npcol, (PetscInt *)&lu->npcol, NULL));
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    PetscCheck(size == lu->nprow * lu->npcol * lu->npdep, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number of processes %d must equal to nprow %lld * npcol %lld * npdep %lld", size, (long long)lu->nprow, (long long)lu->npcol, (long long)lu->npdep);
#else
    PetscCheck(size == lu->nprow * lu->npcol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number of processes %d must equal to nprow %lld * npcol %lld", size, (long long)lu->nprow, (long long)lu->npcol);
#endif
    /* end of adding additional options */

#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    if (lu->use3d) {
      PetscStackCallExternalVoid("SuperLU_DIST:superlu_gridinit3d", superlu_gridinit3d(context ? context->comm : lu->comm_superlu, (int)lu->nprow, (int)lu->npcol, (int)lu->npdep, &lu->grid3d));
      if (context) {
        context->grid3d = lu->grid3d;
        context->use3d  = lu->use3d;
      }
    } else {
#endif
      PetscStackCallExternalVoid("SuperLU_DIST:superlu_gridinit", superlu_gridinit(context ? context->comm : lu->comm_superlu, (int)lu->nprow, (int)lu->npcol, &lu->grid));
      if (context) context->grid = lu->grid;
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
    }
#endif
    PetscCall(PetscInfo(NULL, "Duplicating a communicator for SuperLU_DIST and calling superlu_gridinit()\n"));
    if (iflg) {
      PetscCall(PetscInfo(NULL, "Communicator attribute already in use so not saving communicator and SuperLU_DIST grid in communicator attribute \n"));
    } else {
      PetscCall(PetscInfo(NULL, "Storing communicator and SuperLU_DIST grid in communicator attribute\n"));
    }
  } else { /* (iflg && !context->busy) */
    PetscCall(PetscInfo(NULL, "Reusing communicator and superlu_gridinit() for SuperLU_DIST from communicator attribute.\n"));
    context->busy = PETSC_TRUE;
    lu->grid      = context->grid;
  }
  PetscOptionsEnd();

  /* Initialize ScalePermstruct and LUstruct. */
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  if (lu->singleprecision) {
    PetscStackCallExternalVoid("SuperLU_DIST:ScalePermstructInit", sScalePermstructInit(M, N, &lu->sScalePermstruct));
    PetscStackCallExternalVoid("SuperLU_DIST:LUstructInit", sLUstructInit(N, &lu->sLUstruct));
  } else
#endif
  {
    PetscStackCallExternalVoid("SuperLU_DIST:ScalePermstructInit", ScalePermstructInit(M, N, &lu->ScalePermstruct));
    PetscStackCallExternalVoid("SuperLU_DIST:LUstructInit", LUstructInit(N, &lu->LUstruct));
  }
  F->ops->lufactornumeric = MatLUFactorNumeric_SuperLU_DIST;
  F->ops->solve           = MatSolve_SuperLU_DIST;
  F->ops->matsolve        = MatMatSolve_SuperLU_DIST;
  F->ops->getinertia      = NULL;

  if (A->symmetric == PETSC_BOOL3_TRUE || A->hermitian == PETSC_BOOL3_TRUE) F->ops->getinertia = MatGetInertia_SuperLU_DIST;
  lu->CleanUpSuperLU_Dist = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SuperLU_DIST(Mat F, Mat A, IS r, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatLUFactorSymbolic_SuperLU_DIST(F, A, r, r, info));
  F->ops->choleskyfactornumeric = MatLUFactorNumeric_SuperLU_DIST;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatFactorGetSolverType_aij_superlu_dist(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERSUPERLU_DIST;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatView_Info_SuperLU_DIST(Mat A, PetscViewer viewer)
{
  Mat_SuperLU_DIST      *lu = (Mat_SuperLU_DIST *)A->data;
  superlu_dist_options_t options;

  PetscFunctionBegin;
  /* check if matrix is superlu_dist type */
  if (A->ops->solve != MatSolve_SuperLU_DIST) PetscFunctionReturn(PETSC_SUCCESS);

  options = lu->options;
  PetscCall(PetscViewerASCIIPrintf(viewer, "SuperLU_DIST run parameters:\n"));
  /* would love to use superlu 'IFMT' macro but it looks like it's inconsistently applied, the
   * format spec for int64_t is set to %d for whatever reason */
  PetscCall(PetscViewerASCIIPrintf(viewer, "  Process grid nprow %lld x npcol %lld \n", (long long)lu->nprow, (long long)lu->npcol));
#if PETSC_PKG_SUPERLU_DIST_VERSION_GE(9, 0, 0)
  if (lu->use3d) PetscCall(PetscViewerASCIIPrintf(viewer, "  Using 3d decomposition with npdep %lld \n", (long long)lu->npdep));
#endif

  PetscCall(PetscViewerASCIIPrintf(viewer, "  Equilibrate matrix %s \n", PetscBools[options.Equil != NO]));
  PetscCall(PetscViewerASCIIPrintf(viewer, "  Replace tiny pivots %s \n", PetscBools[options.ReplaceTinyPivot != NO]));
  PetscCall(PetscViewerASCIIPrintf(viewer, "  Use iterative refinement %s \n", PetscBools[options.IterRefine == SLU_DOUBLE]));
  PetscCall(PetscViewerASCIIPrintf(viewer, "  Processors in row %lld col partition %lld \n", (long long)lu->nprow, (long long)lu->npcol));

  switch (options.RowPerm) {
  case NOROWPERM:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Row permutation NOROWPERM\n"));
    break;
  case LargeDiag_MC64:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Row permutation LargeDiag_MC64\n"));
    break;
  case LargeDiag_AWPM:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Row permutation LargeDiag_AWPM\n"));
    break;
  case MY_PERMR:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Row permutation MY_PERMR\n"));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unknown column permutation");
  }

  switch (options.ColPerm) {
  case NATURAL:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Column permutation NATURAL\n"));
    break;
  case MMD_AT_PLUS_A:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Column permutation MMD_AT_PLUS_A\n"));
    break;
  case MMD_ATA:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Column permutation MMD_ATA\n"));
    break;
  /*  Even though this is called METIS, the SuperLU_DIST code sets this by default if PARMETIS is defined, not METIS */
  case METIS_AT_PLUS_A:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Column permutation METIS_AT_PLUS_A\n"));
    break;
  case PARMETIS:
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Column permutation PARMETIS\n"));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unknown column permutation");
  }

  PetscCall(PetscViewerASCIIPrintf(viewer, "  Parallel symbolic factorization %s \n", PetscBools[options.ParSymbFact != NO]));

  if (lu->FactPattern == SamePattern) {
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Repeated factorization SamePattern\n"));
  } else if (lu->FactPattern == SamePattern_SameRowPerm) {
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Repeated factorization SamePattern_SameRowPerm\n"));
  } else if (lu->FactPattern == DOFACT) {
    PetscCall(PetscViewerASCIIPrintf(viewer, "  Repeated factorization DOFACT\n"));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unknown factorization pattern");
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
  if (lu->singleprecision) PetscCall(PetscViewerASCIIPrintf(viewer, "  Using SuperLU_DIST in single precision\n"));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatView_SuperLU_DIST(Mat A, PetscViewer viewer)
{
  PetscBool         isascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  if (isascii) {
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO) PetscCall(MatView_Info_SuperLU_DIST(A, viewer));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetFactor_aij_superlu_dist(Mat A, MatFactorType ftype, Mat *F)
{
  Mat                    B;
  Mat_SuperLU_DIST      *lu;
  PetscInt               M = A->rmap->N, N = A->cmap->N;
  PetscMPIInt            size;
  superlu_dist_options_t options;
  PetscBool              flg;
  PetscPrecision         precision = PETSC_PRECISION_INVALID;

  PetscFunctionBegin;
  /* Create the factorization matrix */
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
  PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, M, N));
  PetscCall(PetscStrallocpy("superlu_dist", &((PetscObject)B)->type_name));
  PetscCall(MatSetUp(B));
  B->ops->getinfo = MatGetInfo_External;
  B->ops->view    = MatView_SuperLU_DIST;
  B->ops->destroy = MatDestroy_SuperLU_DIST;

  /* set_default_options_dist() sets the default input options to the following values:
     options.Fact                = DOFACT;
     options.Equil               = YES;
     options.ParSymbFact         = NO;
     options.ColPerm             = METIS_AT_PLUS_A;
     options.RowPerm             = LargeDiag_MC64;
     options.ReplaceTinyPivot    = NO;
     options.IterRefine          = SLU_DOUBLE;
     options.Trans               = NOTRANS;
     options.SolveInitialized    = NO; -hold the communication pattern used MatSolve() and MatMatSolve()
     options.RefineInitialized   = NO;
     options.PrintStat           = YES;
     options.superlu_acc_offload = 1;
     options.SymPattern          = NO;
  */
  set_default_options_dist(&options);

  B->trivialsymbolic = PETSC_TRUE;
  if (ftype == MAT_FACTOR_LU) {
    B->factortype            = MAT_FACTOR_LU;
    B->ops->lufactorsymbolic = MatLUFactorSymbolic_SuperLU_DIST;
  } else {
    B->factortype                  = MAT_FACTOR_CHOLESKY;
    B->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SuperLU_DIST;
    options.SymPattern             = YES;
  }

  /* set solvertype */
  PetscCall(PetscFree(B->solvertype));
  PetscCall(PetscStrallocpy(MATSOLVERSUPERLU_DIST, &B->solvertype));

  PetscCall(PetscNew(&lu));
  B->data = lu;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));

  lu->options              = options;
  lu->options.Fact         = DOFACT;
  lu->matsolve_iscalled    = PETSC_FALSE;
  lu->matmatsolve_iscalled = PETSC_FALSE;

  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "SuperLU_DIST Options", "Mat");
  PetscCall(PetscOptionsEnum("-pc_precision", "Precision used by SuperLU_DIST", "MATSOLVERSUPERLU_DIST", PetscPrecisionTypes, (PetscEnum)precision, (PetscEnum *)&precision, &flg));
  PetscOptionsEnd();
  if (flg) {
    PetscCheck(precision == PETSC_PRECISION_SINGLE || precision == PETSC_PRECISION_DOUBLE, PetscObjectComm((PetscObject)A), PETSC_ERR_USER_INPUT, "-pc_precision only accepts single or double as option for SuperLU_DIST");
#if defined(PETSC_HAVE_SUPERLU_DIST_SINGLE)
    lu->singleprecision = (PetscBool)(precision == PETSC_PRECISION_SINGLE); // It also implies PetscReal is not single; not merely SuperLU_DIST is running in single
#endif
  }

  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatFactorGetSolverType_C", MatFactorGetSolverType_aij_superlu_dist));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSuperluDistGetDiagU_C", MatSuperluDistGetDiagU_SuperLU_DIST));

  *F = B;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatSolverTypeRegister_SuperLU_DIST(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERSUPERLU_DIST, MATMPIAIJ, MAT_FACTOR_LU, MatGetFactor_aij_superlu_dist));
  PetscCall(MatSolverTypeRegister(MATSOLVERSUPERLU_DIST, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_aij_superlu_dist));
  PetscCall(MatSolverTypeRegister(MATSOLVERSUPERLU_DIST, MATMPIAIJ, MAT_FACTOR_CHOLESKY, MatGetFactor_aij_superlu_dist));
  PetscCall(MatSolverTypeRegister(MATSOLVERSUPERLU_DIST, MATSEQAIJ, MAT_FACTOR_CHOLESKY, MatGetFactor_aij_superlu_dist));
  if (Petsc_Superlu_dist_keyval == MPI_KEYVAL_INVALID) {
    PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_Superlu_dist_keyval_DeleteFn, &Petsc_Superlu_dist_keyval, NULL));
    PetscCall(PetscRegisterFinalize(Petsc_Superlu_dist_keyval_free));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERSUPERLU_DIST - Parallel direct solver package for LU factorization

  Use `./configure --download-superlu_dist --download-parmetis --download-metis --download-ptscotch`  to have PETSc installed with SuperLU_DIST

  Use `-pc_type lu` `-pc_factor_mat_solver_type superlu_dist` to use this direct solver

   Works with `MATAIJ` matrices

  Options Database Keys:
+ -mat_superlu_dist_r <n> - number of rows in processor partition
. -mat_superlu_dist_c <n> - number of columns in processor partition
. -mat_superlu_dist_3d - use 3d partition, requires SuperLU_DIST 7.2 or later
. -mat_superlu_dist_d <n> - depth in 3d partition (valid only if `-mat_superlu_dist_3d`) is provided
. -mat_superlu_dist_equil - equilibrate the matrix
. -mat_superlu_dist_rowperm <NOROWPERM,LargeDiag_MC64,LargeDiag_AWPM,MY_PERMR> - row permutation
. -mat_superlu_dist_colperm <NATURAL,MMD_AT_PLUS_A,MMD_ATA,METIS_AT_PLUS_A,PARMETIS> - column permutation
. -mat_superlu_dist_replacetinypivot - replace tiny pivots
. -mat_superlu_dist_fact <SamePattern> - (choose one of) `SamePattern`, `SamePattern_SameRowPerm`, `DOFACT`
. -mat_superlu_dist_iterrefine - use iterative refinement
. -mat_superlu_dist_printstat - print factorization information
. -mat_superlu_dist_gpuoffload - offload factorization onto the GPUs, requires SuperLU_DIST 8.0.0 or later
- -pc_precision single - use SuperLU_DIST single precision with PETSc double precision

  Level: beginner

  Note:
    If PETSc was configured with `--with-cuda` then this solver will use the GPUs by default.

.seealso: [](ch_matrices), `Mat`, `PCLU`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatGetFactor()`
M*/
