#include <petsc/private/pcbddcimpl.h>
#include <petsc/private/pcbddcprivateimpl.h>
#include <../src/mat/impls/dense/seq/dense.h>
#include <petscblaslapack.h>

static inline PetscErrorCode PCBDDCAdjGetNextLayer_Private(PetscInt *, PetscInt, PetscBT, PetscInt *, PetscInt *, PetscInt *);
static PetscErrorCode        PCBDDCComputeExplicitSchur(Mat, PetscBool, MatReuse, Mat *);
static PetscErrorCode        PCBDDCReuseSolvers_Interior(PC, Vec, Vec);
static PetscErrorCode        PCBDDCReuseSolvers_Correction(PC, Vec, Vec);

/* if v2 is not present, correction is done in-place */
PetscErrorCode PCBDDCReuseSolversBenignAdapt(PCBDDCReuseSolvers ctx, Vec v, Vec v2, PetscBool sol, PetscBool full)
{
  PetscScalar *array;
  PetscScalar *array2;

  PetscFunctionBegin;
  if (!ctx->benign_n) PetscFunctionReturn(PETSC_SUCCESS);
  if (sol && full) {
    PetscInt n_I, size_schur;

    /* get sizes */
    PetscCall(MatGetSize(ctx->benign_csAIB, &size_schur, NULL));
    PetscCall(VecGetSize(v, &n_I));
    n_I = n_I - size_schur;
    /* get schur sol from array */
    PetscCall(VecGetArray(v, &array));
    PetscCall(VecPlaceArray(ctx->benign_dummy_schur_vec, array + n_I));
    PetscCall(VecRestoreArray(v, &array));
    /* apply interior sol correction */
    PetscCall(MatMultTranspose(ctx->benign_csAIB, ctx->benign_dummy_schur_vec, ctx->benign_corr_work));
    PetscCall(VecResetArray(ctx->benign_dummy_schur_vec));
    PetscCall(MatMultAdd(ctx->benign_AIIm1ones, ctx->benign_corr_work, v, v));
  }
  if (v2) {
    PetscInt nl;

    PetscCall(VecGetArrayRead(v, (const PetscScalar **)&array));
    PetscCall(VecGetLocalSize(v2, &nl));
    PetscCall(VecGetArray(v2, &array2));
    PetscCall(PetscArraycpy(array2, array, nl));
  } else {
    PetscCall(VecGetArray(v, &array));
    array2 = array;
  }
  if (!sol) { /* change rhs */
    PetscInt n;
    for (n = 0; n < ctx->benign_n; n++) {
      PetscScalar     sum = 0.;
      const PetscInt *cols;
      PetscInt        nz, i;

      PetscCall(ISGetLocalSize(ctx->benign_zerodiag_subs[n], &nz));
      PetscCall(ISGetIndices(ctx->benign_zerodiag_subs[n], &cols));
      for (i = 0; i < nz - 1; i++) sum += array[cols[i]];
#if defined(PETSC_USE_COMPLEX)
      sum = -(PetscRealPart(sum) / nz + PETSC_i * (PetscImaginaryPart(sum) / nz));
#else
      sum = -sum / nz;
#endif
      for (i = 0; i < nz - 1; i++) array2[cols[i]] += sum;
      ctx->benign_save_vals[n] = array2[cols[nz - 1]];
      array2[cols[nz - 1]]     = sum;
      PetscCall(ISRestoreIndices(ctx->benign_zerodiag_subs[n], &cols));
    }
  } else {
    PetscInt n;
    for (n = 0; n < ctx->benign_n; n++) {
      PetscScalar     sum = 0.;
      const PetscInt *cols;
      PetscInt        nz, i;
      PetscCall(ISGetLocalSize(ctx->benign_zerodiag_subs[n], &nz));
      PetscCall(ISGetIndices(ctx->benign_zerodiag_subs[n], &cols));
      for (i = 0; i < nz - 1; i++) sum += array[cols[i]];
#if defined(PETSC_USE_COMPLEX)
      sum = -(PetscRealPart(sum) / nz + PETSC_i * (PetscImaginaryPart(sum) / nz));
#else
      sum = -sum / nz;
#endif
      for (i = 0; i < nz - 1; i++) array2[cols[i]] += sum;
      array2[cols[nz - 1]] = ctx->benign_save_vals[n];
      PetscCall(ISRestoreIndices(ctx->benign_zerodiag_subs[n], &cols));
    }
  }
  if (v2) {
    PetscCall(VecRestoreArrayRead(v, (const PetscScalar **)&array));
    PetscCall(VecRestoreArray(v2, &array2));
  } else {
    PetscCall(VecRestoreArray(v, &array));
  }
  if (!sol && full) {
    Vec      usedv;
    PetscInt n_I, size_schur;

    /* get sizes */
    PetscCall(MatGetSize(ctx->benign_csAIB, &size_schur, NULL));
    PetscCall(VecGetSize(v, &n_I));
    n_I = n_I - size_schur;
    /* compute schur rhs correction */
    if (v2) {
      usedv = v2;
    } else {
      usedv = v;
    }
    /* apply schur rhs correction */
    PetscCall(MatMultTranspose(ctx->benign_AIIm1ones, usedv, ctx->benign_corr_work));
    PetscCall(VecGetArrayRead(usedv, (const PetscScalar **)&array));
    PetscCall(VecPlaceArray(ctx->benign_dummy_schur_vec, array + n_I));
    PetscCall(VecRestoreArrayRead(usedv, (const PetscScalar **)&array));
    PetscCall(MatMultAdd(ctx->benign_csAIB, ctx->benign_corr_work, ctx->benign_dummy_schur_vec, ctx->benign_dummy_schur_vec));
    PetscCall(VecResetArray(ctx->benign_dummy_schur_vec));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolvers_Solve_Private(PC pc, Vec rhs, Vec sol, PetscBool transpose, PetscBool full)
{
  PCBDDCReuseSolvers ctx;
  PetscBool          copy = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PCShellGetContext(pc, &ctx));
  if (full) {
    PetscCall(MatMumpsSetIcntl(ctx->F, 26, -1));
#if defined(PETSC_HAVE_MKL_PARDISO)
    PetscCall(MatMkl_PardisoSetCntl(ctx->F, 70, 0));
#endif
    copy = ctx->has_vertices;
  } else { /* interior solver */
    PetscCall(MatMumpsSetIcntl(ctx->F, 26, 0));
#if defined(PETSC_HAVE_MKL_PARDISO)
    PetscCall(MatMkl_PardisoSetCntl(ctx->F, 70, 1));
#endif
    copy = PETSC_TRUE;
  }
  /* copy rhs into factored matrix workspace */
  if (copy) {
    PetscInt     n;
    PetscScalar *array, *array_solver;

    PetscCall(VecGetLocalSize(rhs, &n));
    PetscCall(VecGetArrayRead(rhs, (const PetscScalar **)&array));
    PetscCall(VecGetArray(ctx->rhs, &array_solver));
    PetscCall(PetscArraycpy(array_solver, array, n));
    PetscCall(VecRestoreArray(ctx->rhs, &array_solver));
    PetscCall(VecRestoreArrayRead(rhs, (const PetscScalar **)&array));

    PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, ctx->rhs, NULL, PETSC_FALSE, full));
    if (transpose) {
      PetscCall(MatSolveTranspose(ctx->F, ctx->rhs, ctx->sol));
    } else {
      PetscCall(MatSolve(ctx->F, ctx->rhs, ctx->sol));
    }
    PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, ctx->sol, NULL, PETSC_TRUE, full));

    /* get back data to caller worskpace */
    PetscCall(VecGetArrayRead(ctx->sol, (const PetscScalar **)&array_solver));
    PetscCall(VecGetArray(sol, &array));
    PetscCall(PetscArraycpy(array, array_solver, n));
    PetscCall(VecRestoreArray(sol, &array));
    PetscCall(VecRestoreArrayRead(ctx->sol, (const PetscScalar **)&array_solver));
  } else {
    if (ctx->benign_n) {
      PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, rhs, ctx->rhs, PETSC_FALSE, full));
      if (transpose) {
        PetscCall(MatSolveTranspose(ctx->F, ctx->rhs, sol));
      } else {
        PetscCall(MatSolve(ctx->F, ctx->rhs, sol));
      }
      PetscCall(PCBDDCReuseSolversBenignAdapt(ctx, sol, NULL, PETSC_TRUE, full));
    } else {
      if (transpose) {
        PetscCall(MatSolveTranspose(ctx->F, rhs, sol));
      } else {
        PetscCall(MatSolve(ctx->F, rhs, sol));
      }
    }
  }
  /* restore defaults */
  PetscCall(MatMumpsSetIcntl(ctx->F, 26, -1));
#if defined(PETSC_HAVE_MKL_PARDISO)
  PetscCall(MatMkl_PardisoSetCntl(ctx->F, 70, 0));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolvers_Correction(PC pc, Vec rhs, Vec sol)
{
  PetscFunctionBegin;
  PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_FALSE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolvers_CorrectionTranspose(PC pc, Vec rhs, Vec sol)
{
  PetscFunctionBegin;
  PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolvers_Interior(PC pc, Vec rhs, Vec sol)
{
  PetscFunctionBegin;
  PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolvers_InteriorTranspose(PC pc, Vec rhs, Vec sol)
{
  PetscFunctionBegin;
  PetscCall(PCBDDCReuseSolvers_Solve_Private(pc, rhs, sol, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolvers_View(PC pc, PetscViewer viewer)
{
  PCBDDCReuseSolvers ctx;
  PetscBool          isascii;

  PetscFunctionBegin;
  PetscCall(PCShellGetContext(pc, &ctx));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  if (isascii) PetscCall(PetscViewerPushFormat(viewer, PETSC_VIEWER_ASCII_INFO));
  PetscCall(MatView(ctx->F, viewer));
  if (isascii) PetscCall(PetscViewerPopFormat(viewer));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolversReset(PCBDDCReuseSolvers reuse)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&reuse->F));
  PetscCall(VecDestroy(&reuse->sol));
  PetscCall(VecDestroy(&reuse->rhs));
  PetscCall(PCDestroy(&reuse->interior_solver));
  PetscCall(PCDestroy(&reuse->correction_solver));
  PetscCall(ISDestroy(&reuse->is_R));
  PetscCall(ISDestroy(&reuse->is_B));
  PetscCall(VecScatterDestroy(&reuse->correction_scatter_B));
  PetscCall(VecDestroy(&reuse->sol_B));
  PetscCall(VecDestroy(&reuse->rhs_B));
  for (i = 0; i < reuse->benign_n; i++) PetscCall(ISDestroy(&reuse->benign_zerodiag_subs[i]));
  PetscCall(PetscFree(reuse->benign_zerodiag_subs));
  PetscCall(PetscFree(reuse->benign_save_vals));
  PetscCall(MatDestroy(&reuse->benign_csAIB));
  PetscCall(MatDestroy(&reuse->benign_AIIm1ones));
  PetscCall(VecDestroy(&reuse->benign_corr_work));
  PetscCall(VecDestroy(&reuse->benign_dummy_schur_vec));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCReuseSolvers_Destroy(PC pc)
{
  PCBDDCReuseSolvers ctx;

  PetscFunctionBegin;
  PetscCall(PCShellGetContext(pc, &ctx));
  PetscCall(PCBDDCReuseSolversReset(ctx));
  PetscCall(PetscFree(ctx));
  PetscCall(PCShellSetContext(pc, NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCBDDCComputeExplicitSchur(Mat M, PetscBool issym, MatReuse reuse, Mat *S)
{
  Mat         B, C, D, Bd, Cd, AinvBd;
  KSP         ksp;
  PC          pc;
  PetscBool   isLU, isILU, isCHOL, Bdense, Cdense;
  PetscReal   fill = 2.0;
  PetscInt    n_I;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)M), &size));
  PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not for parallel matrices");
  if (reuse == MAT_REUSE_MATRIX) {
    PetscBool Sdense;

    PetscCall(PetscObjectTypeCompare((PetscObject)*S, MATSEQDENSE, &Sdense));
    PetscCheck(Sdense, PetscObjectComm((PetscObject)M), PETSC_ERR_SUP, "S should dense");
  }
  PetscCall(MatSchurComplementGetSubMatrices(M, NULL, NULL, &B, &C, &D));
  PetscCall(MatSchurComplementGetKSP(M, &ksp));
  PetscCall(KSPGetPC(ksp, &pc));
  PetscCall(PetscObjectTypeCompare((PetscObject)pc, PCLU, &isLU));
  PetscCall(PetscObjectTypeCompare((PetscObject)pc, PCILU, &isILU));
  PetscCall(PetscObjectTypeCompare((PetscObject)pc, PCCHOLESKY, &isCHOL));
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &Bdense));
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &Cdense));
  PetscCall(MatGetSize(B, &n_I, NULL));
  if (n_I) {
    if (!Bdense) {
      PetscCall(MatConvert(B, MATSEQDENSE, MAT_INITIAL_MATRIX, &Bd));
    } else {
      Bd = B;
    }

    if (isLU || isILU || isCHOL) {
      Mat fact;
      PetscCall(KSPSetUp(ksp));
      PetscCall(PCFactorGetMatrix(pc, &fact));
      PetscCall(MatDuplicate(Bd, MAT_DO_NOT_COPY_VALUES, &AinvBd));
      PetscCall(MatMatSolve(fact, Bd, AinvBd));
    } else {
      PetscBool ex = PETSC_TRUE;

      if (ex) {
        Mat Ainvd;

        PetscCall(PCComputeOperator(pc, MATDENSE, &Ainvd));
        PetscCall(MatMatMult(Ainvd, Bd, MAT_INITIAL_MATRIX, fill, &AinvBd));
        PetscCall(MatDestroy(&Ainvd));
      } else {
        Vec          sol, rhs;
        PetscScalar *arrayrhs, *arraysol;
        PetscInt     i, nrhs, n;

        PetscCall(MatDuplicate(Bd, MAT_DO_NOT_COPY_VALUES, &AinvBd));
        PetscCall(MatGetSize(Bd, &n, &nrhs));
        PetscCall(MatDenseGetArray(Bd, &arrayrhs));
        PetscCall(MatDenseGetArray(AinvBd, &arraysol));
        PetscCall(KSPGetSolution(ksp, &sol));
        PetscCall(KSPGetRhs(ksp, &rhs));
        for (i = 0; i < nrhs; i++) {
          PetscCall(VecPlaceArray(rhs, arrayrhs + i * n));
          PetscCall(VecPlaceArray(sol, arraysol + i * n));
          PetscCall(KSPSolve(ksp, rhs, sol));
          PetscCall(VecResetArray(rhs));
          PetscCall(VecResetArray(sol));
        }
        PetscCall(MatDenseRestoreArray(Bd, &arrayrhs));
        PetscCall(MatDenseRestoreArray(AinvBd, &arrayrhs));
      }
    }
    if (!Bdense & !issym) PetscCall(MatDestroy(&Bd));

    if (!issym) {
      if (!Cdense) {
        PetscCall(MatConvert(C, MATSEQDENSE, MAT_INITIAL_MATRIX, &Cd));
      } else {
        Cd = C;
      }
      PetscCall(MatMatMult(Cd, AinvBd, reuse, fill, S));
      if (!Cdense) PetscCall(MatDestroy(&Cd));
    } else {
      PetscCall(MatTransposeMatMult(Bd, AinvBd, reuse, fill, S));
      if (!Bdense) PetscCall(MatDestroy(&Bd));
    }
    PetscCall(MatDestroy(&AinvBd));
  }

  if (D) {
    Mat       Dd;
    PetscBool Ddense;

    PetscCall(PetscObjectTypeCompare((PetscObject)D, MATSEQDENSE, &Ddense));
    if (!Ddense) {
      PetscCall(MatConvert(D, MATSEQDENSE, MAT_INITIAL_MATRIX, &Dd));
    } else {
      Dd = D;
    }
    if (n_I) {
      PetscCall(MatAYPX(*S, -1.0, Dd, SAME_NONZERO_PATTERN));
    } else {
      if (reuse == MAT_INITIAL_MATRIX) {
        PetscCall(MatDuplicate(Dd, MAT_COPY_VALUES, S));
      } else {
        PetscCall(MatCopy(Dd, *S, SAME_NONZERO_PATTERN));
      }
    }
    if (!Ddense) PetscCall(MatDestroy(&Dd));
  } else {
    PetscCall(MatScale(*S, -1.0));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PCBDDCSubSchursSetUp(PCBDDCSubSchurs sub_schurs, Mat Ain, Mat Sin, PetscBool exact_schur, PetscInt xadj[], PetscInt adjncy[], PetscInt nlayers, Vec scaling, PetscBool compute_Stilda, PetscBool reuse_solvers, PetscBool benign_trick, PetscInt benign_n, PetscInt benign_p0_lidx[], IS benign_zerodiag_subs[], Mat change, IS change_primal)
{
  Mat          F, A_II, A_IB, A_BI, A_BB, AE_II;
  Mat          S_all;
  Vec          gstash, lstash;
  VecScatter   sstash;
  IS           is_I, is_I_layer;
  IS           all_subsets, all_subsets_mult, all_subsets_n;
  PetscScalar *stasharray, *Bwork;
  PetscInt    *all_local_idx_N, *all_local_subid_N = NULL;
  PetscInt    *auxnum1, *auxnum2;
  PetscInt    *local_subs = sub_schurs->graph->local_subs;
  PetscInt     i, subset_size, max_subset_size, n_local_subs = sub_schurs->graph->n_local_subs;
  PetscInt     n_B, extra, local_size, global_size;
  PetscInt     local_stash_size;
  PetscBLASInt B_N, B_ierr, B_lwork, *pivots;
  MPI_Comm     comm_n;
  PetscBool    deluxe   = PETSC_TRUE;
  PetscBool    use_potr = PETSC_FALSE, use_sytr = PETSC_FALSE;
  PetscViewer  matl_dbg_viewer = NULL;
  PetscBool    flg, multi_element = sub_schurs->graph->multi_element;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&sub_schurs->A));
  PetscCall(MatDestroy(&sub_schurs->S));
  if (Ain) {
    PetscCall(PetscObjectReference((PetscObject)Ain));
    sub_schurs->A = Ain;
  }

  PetscCall(PetscObjectReference((PetscObject)Sin));
  sub_schurs->S = Sin;
  if (sub_schurs->schur_explicit) sub_schurs->schur_explicit = (PetscBool)(!!sub_schurs->A);

  /* preliminary checks */
  PetscCheck(sub_schurs->schur_explicit || !compute_Stilda, PetscObjectComm((PetscObject)sub_schurs->l2gmap), PETSC_ERR_SUP, "Adaptive selection of constraints requires MUMPS and/or MKL_PARDISO");

  if (benign_trick) sub_schurs->is_posdef = PETSC_FALSE;

  /* debug (MATLAB) */
  if (sub_schurs->debug) {
    PetscMPIInt size, rank;
    PetscInt    nr, *print_schurs_ranks, print_schurs = PETSC_FALSE;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &size));
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &rank));
    nr = size;
    PetscCall(PetscMalloc1(nr, &print_schurs_ranks));
    PetscOptionsBegin(PetscObjectComm((PetscObject)sub_schurs->l2gmap), sub_schurs->prefix, "BDDC sub_schurs options", "PC");
    PetscCall(PetscOptionsIntArray("-sub_schurs_debug_ranks", "Ranks to debug (all if the option is not used)", NULL, print_schurs_ranks, &nr, &flg));
    if (!flg) print_schurs = PETSC_TRUE;
    else {
      print_schurs = PETSC_FALSE;
      for (i = 0; i < nr; i++)
        if (print_schurs_ranks[i] == rank) {
          print_schurs = PETSC_TRUE;
          break;
        }
    }
    PetscOptionsEnd();
    PetscCall(PetscFree(print_schurs_ranks));
    if (print_schurs) {
      char filename[256];

      PetscCall(PetscSNPrintf(filename, sizeof(filename), "sub_schurs_Schur_r%d.m", PetscGlobalRank));
      PetscCall(PetscViewerASCIIOpen(PETSC_COMM_SELF, filename, &matl_dbg_viewer));
      PetscCall(PetscViewerPushFormat(matl_dbg_viewer, PETSC_VIEWER_ASCII_MATLAB));
    }
  }

  /* DEBUG: turn on/off multi-element code path */
  PetscCall(PetscOptionsGetBool(NULL, sub_schurs->prefix, "-sub_schurs_multielement_code", &multi_element, NULL));
  if (n_local_subs == 0) multi_element = PETSC_FALSE;

  /* restrict work on active processes */
  if (sub_schurs->restrict_comm) {
    PetscSubcomm subcomm;
    PetscMPIInt  color, rank;

    color = 0;
    if (!sub_schurs->n_subs) color = 1; /* this can happen if we are in a multilevel case or if the subdomain is disconnected */
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &rank));
    PetscCall(PetscSubcommCreate(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &subcomm));
    PetscCall(PetscSubcommSetNumber(subcomm, 2));
    PetscCall(PetscSubcommSetTypeGeneral(subcomm, color, rank));
    PetscCall(PetscCommDuplicate(PetscSubcommChild(subcomm), &comm_n, NULL));
    PetscCall(PetscSubcommDestroy(&subcomm));
    if (!sub_schurs->n_subs) {
      PetscCall(PetscCommDestroy(&comm_n));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  } else {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)sub_schurs->l2gmap), &comm_n, NULL));
  }

  /* get Schur complement matrices */
  if (!sub_schurs->schur_explicit) {
    Mat       tA_IB, tA_BI, tA_BB;
    PetscBool isseqsbaij;
    PetscCall(MatSchurComplementGetSubMatrices(sub_schurs->S, &A_II, NULL, &tA_IB, &tA_BI, &tA_BB));
    PetscCall(PetscObjectTypeCompare((PetscObject)tA_BB, MATSEQSBAIJ, &isseqsbaij));
    if (isseqsbaij) {
      PetscCall(MatConvert(tA_BB, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_BB));
      PetscCall(MatConvert(tA_IB, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_IB));
      PetscCall(MatConvert(tA_BI, MATSEQAIJ, MAT_INITIAL_MATRIX, &A_BI));
    } else {
      PetscCall(PetscObjectReference((PetscObject)tA_BB));
      A_BB = tA_BB;
      PetscCall(PetscObjectReference((PetscObject)tA_IB));
      A_IB = tA_IB;
      PetscCall(PetscObjectReference((PetscObject)tA_BI));
      A_BI = tA_BI;
    }
  } else {
    A_II = NULL;
    A_IB = NULL;
    A_BI = NULL;
    A_BB = NULL;
  }
  S_all = NULL;

  /* determine interior problems */
  PetscCall(ISGetLocalSize(sub_schurs->is_I, &i));
  if (nlayers >= 0 && i) { /* Interior problems can be different from the original one */
    PetscBT         touched;
    const PetscInt *idx_B;
    PetscInt        n_I, n_B, n_local_dofs, n_prev_added, j, layer, *local_numbering;

    PetscCheck(xadj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot request layering without adjacency");
    /* get sizes */
    PetscCall(ISGetLocalSize(sub_schurs->is_I, &n_I));
    PetscCall(ISGetLocalSize(sub_schurs->is_B, &n_B));

    PetscCall(PetscMalloc1(n_I + n_B, &local_numbering));
    PetscCall(PetscBTCreate(n_I + n_B, &touched));
    PetscCall(PetscBTMemzero(n_I + n_B, touched));

    /* all boundary dofs must be skipped when adding layers */
    PetscCall(ISGetIndices(sub_schurs->is_B, &idx_B));
    for (j = 0; j < n_B; j++) PetscCall(PetscBTSet(touched, idx_B[j]));
    PetscCall(PetscArraycpy(local_numbering, idx_B, n_B));
    PetscCall(ISRestoreIndices(sub_schurs->is_B, &idx_B));

    /* add prescribed number of layers of dofs */
    n_local_dofs = n_B;
    n_prev_added = n_B;
    for (layer = 0; layer < nlayers; layer++) {
      PetscInt n_added = 0;
      if (n_local_dofs == n_I + n_B) break;
      PetscCheck(n_local_dofs <= n_I + n_B, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error querying layer %" PetscInt_FMT ". Out of bound access (%" PetscInt_FMT " > %" PetscInt_FMT ")", layer, n_local_dofs, n_I + n_B);
      PetscCall(PCBDDCAdjGetNextLayer_Private(local_numbering + n_local_dofs, n_prev_added, touched, xadj, adjncy, &n_added));
      n_prev_added = n_added;
      n_local_dofs += n_added;
      if (!n_added) break;
    }
    PetscCall(PetscBTDestroy(&touched));

    /* IS for I layer dofs in original numbering */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)sub_schurs->is_I), n_local_dofs - n_B, local_numbering + n_B, PETSC_COPY_VALUES, &is_I_layer));
    PetscCall(PetscFree(local_numbering));
    PetscCall(ISSort(is_I_layer));
    /* IS for I layer dofs in I numbering */
    if (!sub_schurs->schur_explicit) {
      ISLocalToGlobalMapping ItoNmap;
      PetscCall(ISLocalToGlobalMappingCreateIS(sub_schurs->is_I, &ItoNmap));
      PetscCall(ISGlobalToLocalMappingApplyIS(ItoNmap, IS_GTOLM_DROP, is_I_layer, &is_I));
      PetscCall(ISLocalToGlobalMappingDestroy(&ItoNmap));

      /* II block */
      PetscCall(MatCreateSubMatrix(A_II, is_I, is_I, MAT_INITIAL_MATRIX, &AE_II));
    }
  } else {
    PetscInt n_I;

    /* IS for I dofs in original numbering */
    PetscCall(PetscObjectReference((PetscObject)sub_schurs->is_I));
    is_I_layer = sub_schurs->is_I;

    /* IS for I dofs in I numbering (strided 1) */
    if (!sub_schurs->schur_explicit) {
      PetscCall(ISGetSize(sub_schurs->is_I, &n_I));
      PetscCall(ISCreateStride(PetscObjectComm((PetscObject)sub_schurs->is_I), n_I, 0, 1, &is_I));

      /* II block is the same */
      PetscCall(PetscObjectReference((PetscObject)A_II));
      AE_II = A_II;
    }
  }

  /* Get info on subset sizes and sum of all subsets sizes */
  max_subset_size = 0;
  local_size      = 0;
  for (i = 0; i < sub_schurs->n_subs; i++) {
    PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
    max_subset_size = PetscMax(subset_size, max_subset_size);
    local_size += subset_size;
  }

  /* Work arrays for local indices */
  extra = 0;
  PetscCall(ISGetLocalSize(sub_schurs->is_B, &n_B));
  if (sub_schurs->schur_explicit && is_I_layer) PetscCall(ISGetLocalSize(is_I_layer, &extra));
  PetscCall(PetscMalloc1(n_B + extra, &all_local_idx_N));
  if (multi_element) PetscCall(PetscMalloc1(n_B + extra, &all_local_subid_N));
  if (extra) {
    const PetscInt *idxs;
    PetscCall(ISGetIndices(is_I_layer, &idxs));
    PetscCall(PetscArraycpy(all_local_idx_N, idxs, extra));
    if (multi_element)
      for (PetscInt j = 0; j < extra; j++) all_local_subid_N[j] = local_subs[idxs[j]];
    PetscCall(ISRestoreIndices(is_I_layer, &idxs));
  }
  PetscCall(PetscMalloc1(sub_schurs->n_subs, &auxnum1));
  PetscCall(PetscMalloc1(sub_schurs->n_subs, &auxnum2));

  /* Get local indices in local numbering */
  local_size       = 0;
  local_stash_size = 0;
  for (i = 0; i < sub_schurs->n_subs; i++) {
    const PetscInt *idxs;

    PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
    PetscCall(ISGetIndices(sub_schurs->is_subs[i], &idxs));
    /* start (smallest in global ordering) and multiplicity */
    auxnum1[i] = idxs[0];
    auxnum2[i] = subset_size * subset_size;
    /* subset indices in local numbering */
    PetscCall(PetscArraycpy(all_local_idx_N + local_size + extra, idxs, subset_size));
    if (multi_element)
      for (PetscInt j = 0; j < subset_size; j++) all_local_subid_N[j + local_size + extra] = local_subs[idxs[j]];
    PetscCall(ISRestoreIndices(sub_schurs->is_subs[i], &idxs));
    local_size += subset_size;
    local_stash_size += subset_size * subset_size;
  }

  /* allocate extra workspace needed only for GETRI or SYTRF when inverting the blocks or the entire Schur complement */
  use_potr = use_sytr = PETSC_FALSE;
  if (benign_trick || (sub_schurs->is_hermitian && sub_schurs->is_posdef)) {
    use_potr = PETSC_TRUE;
  } else if (sub_schurs->is_symmetric) {
    use_sytr = PETSC_TRUE;
  }
  if (local_size && !use_potr && compute_Stilda) {
    PetscScalar  lwork, dummyscalar = 0.;
    PetscBLASInt dummyint = 0;

    B_lwork = -1;
    PetscCall(PetscBLASIntCast(local_size, &B_N));
    PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
    if (use_sytr) {
      PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, &dummyscalar, &B_N, &dummyint, &lwork, &B_lwork, &B_ierr));
      PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in query to SYTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
    } else {
      PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, &dummyscalar, &B_N, &dummyint, &lwork, &B_lwork, &B_ierr));
      PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in query to GETRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
    }
    PetscCall(PetscFPTrapPop());
    PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(lwork), &B_lwork));
    PetscCall(PetscMalloc2(B_lwork, &Bwork, B_N, &pivots));
  } else {
    Bwork  = NULL;
    pivots = NULL;
  }

  /* prepare data for summing up properly schurs on subsets */
  PetscCall(ISCreateGeneral(comm_n, sub_schurs->n_subs, auxnum1, PETSC_OWN_POINTER, &all_subsets_n));
  PetscCall(ISLocalToGlobalMappingApplyIS(sub_schurs->l2gmap, all_subsets_n, &all_subsets));
  PetscCall(ISDestroy(&all_subsets_n));
  PetscCall(ISCreateGeneral(comm_n, sub_schurs->n_subs, auxnum2, PETSC_OWN_POINTER, &all_subsets_mult));
  PetscCall(ISRenumber(all_subsets, all_subsets_mult, &global_size, &all_subsets_n));
  PetscCall(ISDestroy(&all_subsets));
  PetscCall(ISDestroy(&all_subsets_mult));
  PetscCall(ISGetLocalSize(all_subsets_n, &i));
  PetscCheck(i == local_stash_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Invalid size of new subset! %" PetscInt_FMT " != %" PetscInt_FMT, i, local_stash_size);
  PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, local_stash_size, NULL, &lstash));
  PetscCall(VecCreateMPI(comm_n, PETSC_DECIDE, global_size, &gstash));
  PetscCall(VecScatterCreate(lstash, NULL, gstash, all_subsets_n, &sstash));
  PetscCall(ISDestroy(&all_subsets_n));

  /* subset indices in local boundary numbering */
  if (!sub_schurs->is_Ej_all) {
    PetscInt *all_local_idx_B;

    PetscCall(PetscMalloc1(local_size, &all_local_idx_B));
    PetscCall(ISGlobalToLocalMappingApply(sub_schurs->BtoNmap, IS_GTOLM_DROP, local_size, all_local_idx_N + extra, &subset_size, all_local_idx_B));
    PetscCheck(subset_size == local_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Error in sub_schurs serial (BtoNmap)! %" PetscInt_FMT " != %" PetscInt_FMT, subset_size, local_size);
    PetscCall(ISCreateGeneral(PETSC_COMM_SELF, local_size, all_local_idx_B, PETSC_OWN_POINTER, &sub_schurs->is_Ej_all));
  }

  if (change) {
    ISLocalToGlobalMapping BtoS;
    IS                     change_primal_B;
    IS                     change_primal_all;

    PetscCheck(!sub_schurs->change_primal_sub, PETSC_COMM_SELF, PETSC_ERR_PLIB, "This should not happen");
    PetscCheck(!sub_schurs->change, PETSC_COMM_SELF, PETSC_ERR_PLIB, "This should not happen");
    PetscCall(PetscMalloc1(sub_schurs->n_subs, &sub_schurs->change_primal_sub));
    for (i = 0; i < sub_schurs->n_subs; i++) {
      ISLocalToGlobalMapping NtoS;
      PetscCall(ISLocalToGlobalMappingCreateIS(sub_schurs->is_subs[i], &NtoS));
      PetscCall(ISGlobalToLocalMappingApplyIS(NtoS, IS_GTOLM_DROP, change_primal, &sub_schurs->change_primal_sub[i]));
      PetscCall(ISLocalToGlobalMappingDestroy(&NtoS));
    }
    PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, change_primal, &change_primal_B));
    PetscCall(ISLocalToGlobalMappingCreateIS(sub_schurs->is_Ej_all, &BtoS));
    PetscCall(ISGlobalToLocalMappingApplyIS(BtoS, IS_GTOLM_DROP, change_primal_B, &change_primal_all));
    PetscCall(ISLocalToGlobalMappingDestroy(&BtoS));
    PetscCall(ISDestroy(&change_primal_B));
    PetscCall(PetscMalloc1(sub_schurs->n_subs, &sub_schurs->change));
    for (i = 0; i < sub_schurs->n_subs; i++) {
      Mat change_sub;

      PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
      PetscCall(KSPCreate(PETSC_COMM_SELF, &sub_schurs->change[i]));
      PetscCall(KSPSetNestLevel(sub_schurs->change[i], 1)); /* do not seem to have direct access to a PC from which to get the level of nests */
      PetscCall(KSPSetType(sub_schurs->change[i], KSPPREONLY));
      if (!sub_schurs->change_with_qr) {
        PetscCall(MatCreateSubMatrix(change, sub_schurs->is_subs[i], sub_schurs->is_subs[i], MAT_INITIAL_MATRIX, &change_sub));
      } else {
        Mat change_subt;
        PetscCall(MatCreateSubMatrix(change, sub_schurs->is_subs[i], sub_schurs->is_subs[i], MAT_INITIAL_MATRIX, &change_subt));
        PetscCall(MatConvert(change_subt, MATSEQDENSE, MAT_INITIAL_MATRIX, &change_sub));
        PetscCall(MatDestroy(&change_subt));
      }
      PetscCall(KSPSetOperators(sub_schurs->change[i], change_sub, change_sub));
      PetscCall(MatDestroy(&change_sub));
      PetscCall(KSPSetOptionsPrefix(sub_schurs->change[i], sub_schurs->prefix));
      PetscCall(KSPAppendOptionsPrefix(sub_schurs->change[i], "sub_schurs_change_"));
    }
    PetscCall(ISDestroy(&change_primal_all));
  }

  /* Local matrix of all local Schur on subsets (transposed) */
  if (!sub_schurs->S_Ej_all) {
    Mat          T;
    PetscScalar *v;
    PetscInt    *ii, *jj;
    PetscInt     cum, i, j, k;

    /* MatSeqAIJSetPreallocation + MatSetValues is slow for these kind of matrices (may have large blocks)
       Allocate properly a representative matrix and duplicate */
    PetscCall(PetscMalloc3(local_size + 1, &ii, local_stash_size, &jj, local_stash_size, &v));
    ii[0] = 0;
    cum   = 0;
    for (i = 0; i < sub_schurs->n_subs; i++) {
      PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
      for (j = 0; j < subset_size; j++) {
        const PetscInt row = cum + j;
        PetscInt       col = cum;

        ii[row + 1] = ii[row] + subset_size;
        for (k = ii[row]; k < ii[row + 1]; k++) {
          jj[k] = col;
          col++;
        }
      }
      cum += subset_size;
    }
    PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, local_size, local_size, ii, jj, v, &T));
    PetscCall(MatDuplicate(T, MAT_DO_NOT_COPY_VALUES, &sub_schurs->S_Ej_all));
    PetscCall(MatDestroy(&T));
    PetscCall(PetscFree3(ii, jj, v));
  }
  /* matrices for deluxe scaling and adaptive selection */
  if (compute_Stilda) {
    if (!sub_schurs->sum_S_Ej_tilda_all) PetscCall(MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_tilda_all));
    if (!sub_schurs->sum_S_Ej_inv_all && deluxe) PetscCall(MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_inv_all));
  }

  /* Compute Schur complements explicitly */
  F = NULL;
  if (!sub_schurs->schur_explicit) {
    /* this code branch is used when MatFactor with Schur complement support is not present or when explicitly requested;
       it is not efficient, unless the economic version of the scaling is used */
    Mat          S_Ej_expl;
    PetscScalar *work;
    PetscInt     j, *dummy_idx;
    PetscBool    Sdense;

    PetscCall(PetscMalloc2(max_subset_size, &dummy_idx, max_subset_size * max_subset_size, &work));
    local_size = 0;
    for (i = 0; i < sub_schurs->n_subs; i++) {
      IS  is_subset_B;
      Mat AE_EE, AE_IE, AE_EI, S_Ej;

      /* subsets in original and boundary numbering */
      PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, sub_schurs->is_subs[i], &is_subset_B));
      /* EE block */
      PetscCall(MatCreateSubMatrix(A_BB, is_subset_B, is_subset_B, MAT_INITIAL_MATRIX, &AE_EE));
      /* IE block */
      PetscCall(MatCreateSubMatrix(A_IB, is_I, is_subset_B, MAT_INITIAL_MATRIX, &AE_IE));
      /* EI block */
      if (sub_schurs->is_symmetric) {
        PetscCall(MatCreateTranspose(AE_IE, &AE_EI));
      } else if (sub_schurs->is_hermitian) {
        PetscCall(MatCreateHermitianTranspose(AE_IE, &AE_EI));
      } else {
        PetscCall(MatCreateSubMatrix(A_BI, is_subset_B, is_I, MAT_INITIAL_MATRIX, &AE_EI));
      }
      PetscCall(ISDestroy(&is_subset_B));
      PetscCall(MatCreateSchurComplement(AE_II, AE_II, AE_IE, AE_EI, AE_EE, &S_Ej));
      PetscCall(MatDestroy(&AE_EE));
      PetscCall(MatDestroy(&AE_IE));
      PetscCall(MatDestroy(&AE_EI));
      if (AE_II == A_II) { /* we can reuse the same ksp */
        KSP ksp;
        PetscCall(MatSchurComplementGetKSP(sub_schurs->S, &ksp));
        PetscCall(MatSchurComplementSetKSP(S_Ej, ksp));
      } else { /* build new ksp object which inherits ksp and pc types from the original one */
        KSP       origksp, schurksp;
        PC        origpc, schurpc;
        KSPType   ksp_type;
        PetscInt  n_internal;
        PetscBool ispcnone;

        PetscCall(MatSchurComplementGetKSP(sub_schurs->S, &origksp));
        PetscCall(MatSchurComplementGetKSP(S_Ej, &schurksp));
        PetscCall(KSPGetType(origksp, &ksp_type));
        PetscCall(KSPSetType(schurksp, ksp_type));
        PetscCall(KSPGetPC(schurksp, &schurpc));
        PetscCall(KSPGetPC(origksp, &origpc));
        PetscCall(PetscObjectTypeCompare((PetscObject)origpc, PCNONE, &ispcnone));
        if (!ispcnone) {
          PCType pc_type;
          PetscCall(PCGetType(origpc, &pc_type));
          PetscCall(PCSetType(schurpc, pc_type));
        } else {
          PetscCall(PCSetType(schurpc, PCLU));
        }
        PetscCall(ISGetSize(is_I, &n_internal));
        if (!n_internal) { /* UMFPACK gives error with 0 sized problems */
          MatSolverType solver = NULL;
          PetscCall(PCFactorGetMatSolverType(origpc, &solver));
          if (solver) PetscCall(PCFactorSetMatSolverType(schurpc, solver));
        }
        PetscCall(KSPSetUp(schurksp));
      }
      PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
      PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &S_Ej_expl));
      PetscCall(PCBDDCComputeExplicitSchur(S_Ej, sub_schurs->is_symmetric, MAT_REUSE_MATRIX, &S_Ej_expl));
      PetscCall(PetscObjectTypeCompare((PetscObject)S_Ej_expl, MATSEQDENSE, &Sdense));
      PetscCheck(Sdense, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented for sparse matrices");
      for (j = 0; j < subset_size; j++) dummy_idx[j] = local_size + j;
      PetscCall(MatSetValues(sub_schurs->S_Ej_all, subset_size, dummy_idx, subset_size, dummy_idx, work, INSERT_VALUES));
      PetscCall(MatDestroy(&S_Ej));
      PetscCall(MatDestroy(&S_Ej_expl));
      local_size += subset_size;
    }
    PetscCall(PetscFree2(dummy_idx, work));
    /* free */
    PetscCall(ISDestroy(&is_I));
    PetscCall(MatDestroy(&AE_II));
    PetscCall(PetscFree(all_local_idx_N));
  } else {
    Mat                A, cs_AIB_mat = NULL, benign_AIIm1_ones_mat = NULL;
    Mat               *gdswA;
    Vec                Dall = NULL;
    IS                 is_A_all, *is_p_r = NULL, is_schur;
    MatType            Stype;
    PetscScalar       *work, *S_data, *schur_factor, infty = PETSC_MAX_REAL;
    PetscScalar       *SEj_arr = NULL, *SEjinv_arr = NULL;
    const PetscScalar *rS_data;
    PetscInt           n, n_I, size_schur, size_active_schur, cum, cum2;
    PetscBool          economic, solver_S, S_lower_triangular = PETSC_FALSE;
    PetscBool          schur_has_vertices, factor_workaround;
    PetscBool          use_cholesky;
#if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
    PetscBool oldpin;
#endif
    /* multi-element */
    IS *is_sub_all = NULL, *is_sub_schur_all = NULL, *is_sub_schur = NULL;

    /* get sizes */
    n_I = 0;
    if (is_I_layer) PetscCall(ISGetLocalSize(is_I_layer, &n_I));
    economic = PETSC_FALSE;
    PetscCall(ISGetLocalSize(sub_schurs->is_I, &cum));
    if (cum != n_I) economic = PETSC_TRUE;
    PetscCall(MatGetLocalSize(sub_schurs->A, &n, NULL));
    size_active_schur = local_size;

    /* import scaling vector (wrong formulation if we have 3D edges) */
    if (scaling && compute_Stilda) {
      const PetscScalar *array;
      PetscScalar       *array2;
      const PetscInt    *idxs;
      PetscInt           i;

      PetscCall(ISGetIndices(sub_schurs->is_Ej_all, &idxs));
      PetscCall(VecCreateSeq(PETSC_COMM_SELF, size_active_schur, &Dall));
      PetscCall(VecGetArrayRead(scaling, &array));
      PetscCall(VecGetArray(Dall, &array2));
      for (i = 0; i < size_active_schur; i++) array2[i] = array[idxs[i]];
      PetscCall(VecRestoreArray(Dall, &array2));
      PetscCall(VecRestoreArrayRead(scaling, &array));
      PetscCall(ISRestoreIndices(sub_schurs->is_Ej_all, &idxs));
      deluxe = PETSC_FALSE;
    }

    /* size active schurs does not count any dirichlet or vertex dof on the interface */
    factor_workaround  = PETSC_FALSE;
    schur_has_vertices = PETSC_FALSE;
    cum                = n_I + size_active_schur;
    if (sub_schurs->is_dir) {
      const PetscInt *idxs;
      PetscInt        n_dir;

      PetscCall(ISGetLocalSize(sub_schurs->is_dir, &n_dir));
      PetscCall(ISGetIndices(sub_schurs->is_dir, &idxs));
      PetscCall(PetscArraycpy(all_local_idx_N + cum, idxs, n_dir));
      if (multi_element)
        for (PetscInt j = 0; j < n_dir; j++) all_local_subid_N[j + cum] = local_subs[idxs[j]];
      PetscCall(ISRestoreIndices(sub_schurs->is_dir, &idxs));
      cum += n_dir;
      if (!sub_schurs->gdsw) factor_workaround = PETSC_TRUE;
    }
    /* include the primal vertices in the Schur complement */
    if (exact_schur && sub_schurs->is_vertices && (compute_Stilda || benign_n)) {
      PetscInt n_v;

      PetscCall(ISGetLocalSize(sub_schurs->is_vertices, &n_v));
      if (n_v) {
        const PetscInt *idxs;

        PetscCall(ISGetIndices(sub_schurs->is_vertices, &idxs));
        PetscCall(PetscArraycpy(all_local_idx_N + cum, idxs, n_v));
        if (multi_element)
          for (PetscInt j = 0; j < n_v; j++) all_local_subid_N[j + cum] = local_subs[idxs[j]];
        PetscCall(ISRestoreIndices(sub_schurs->is_vertices, &idxs));
        cum += n_v;
        if (!sub_schurs->gdsw) factor_workaround = PETSC_TRUE;
        schur_has_vertices = PETSC_TRUE;
      }
    }
    size_schur = cum - n_I;
    PetscCall(ISCreateGeneral(PETSC_COMM_SELF, cum, all_local_idx_N, PETSC_OWN_POINTER, &is_A_all));
#if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
    oldpin = sub_schurs->A->boundtocpu;
    PetscCall(MatBindToCPU(sub_schurs->A, PETSC_TRUE));
#endif
    if (cum == n) {
      PetscCall(ISSetPermutation(is_A_all));
      PetscCall(MatPermute(sub_schurs->A, is_A_all, is_A_all, &A));
    } else {
      PetscCall(MatCreateSubMatrix(sub_schurs->A, is_A_all, is_A_all, MAT_INITIAL_MATRIX, &A));
    }
#if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
    PetscCall(MatBindToCPU(sub_schurs->A, oldpin));
#endif
    PetscCall(MatSetOptionsPrefixFactor(A, sub_schurs->prefix));
    PetscCall(MatAppendOptionsPrefixFactor(A, "sub_schurs_"));
    /* subsets ordered last */
    PetscCall(ISCreateStride(PETSC_COMM_SELF, size_schur, n_I, 1, &is_schur));

    if (multi_element) {
      PetscInt *idx_sub;

      PetscCall(PetscMalloc3(n_local_subs, &is_sub_all, n_local_subs, &is_sub_schur_all, n_local_subs, &is_sub_schur));
      PetscCall(PetscMalloc1(n + size_schur, &idx_sub));
      for (PetscInt sub = 0; sub < n_local_subs; sub++) {
        PetscInt size_sub = 0, size_schur_sub = 0, size_I_sub;

        for (PetscInt j = 0; j < n_I; j++)
          if (all_local_subid_N[j] == sub) idx_sub[size_sub++] = j;
        size_I_sub = size_sub;
        for (PetscInt j = n_I; j < n_I + size_schur; j++)
          if (all_local_subid_N[j] == sub) {
            idx_sub[size_sub++]           = j;
            idx_sub[n + size_schur_sub++] = j - n_I;
          }

        PetscCall(ISCreateGeneral(PETSC_COMM_SELF, size_sub, idx_sub, PETSC_COPY_VALUES, &is_sub_all[sub]));
        PetscCall(ISCreateGeneral(PETSC_COMM_SELF, size_schur_sub, idx_sub + n, PETSC_COPY_VALUES, &is_sub_schur[sub]));
        PetscCall(ISCreateStride(PETSC_COMM_SELF, size_schur_sub, size_I_sub, 1, &is_sub_schur_all[sub]));
      }
      PetscCall(PetscFree(idx_sub));
    }

    /* if we actually change the basis for the pressures, LDL^T factors will use a lot of memory
       this is a workaround */
    if (benign_n) {
      Vec                    v, benign_AIIm1_ones;
      ISLocalToGlobalMapping N_to_reor;
      IS                     is_p0, is_p0_p;
      PetscScalar           *cs_AIB, *AIIm1_data;
      PetscInt               sizeA;

      PetscCall(ISLocalToGlobalMappingCreateIS(is_A_all, &N_to_reor));
      PetscCall(ISCreateGeneral(PETSC_COMM_SELF, benign_n, benign_p0_lidx, PETSC_COPY_VALUES, &is_p0));
      PetscCall(ISGlobalToLocalMappingApplyIS(N_to_reor, IS_GTOLM_DROP, is_p0, &is_p0_p));
      PetscCall(ISDestroy(&is_p0));
      PetscCall(MatCreateVecs(A, &v, &benign_AIIm1_ones));
      PetscCall(VecGetSize(v, &sizeA));
      PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, sizeA, benign_n, NULL, &benign_AIIm1_ones_mat));
      PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, size_schur, benign_n, NULL, &cs_AIB_mat));
      PetscCall(MatDenseGetArray(cs_AIB_mat, &cs_AIB));
      PetscCall(MatDenseGetArray(benign_AIIm1_ones_mat, &AIIm1_data));
      PetscCall(PetscMalloc1(benign_n, &is_p_r));
      /* compute colsum of A_IB restricted to pressures */
      for (i = 0; i < benign_n; i++) {
        const PetscScalar *array;
        const PetscInt    *idxs;
        PetscInt           j, nz;

        PetscCall(ISGlobalToLocalMappingApplyIS(N_to_reor, IS_GTOLM_DROP, benign_zerodiag_subs[i], &is_p_r[i]));
        PetscCall(ISGetLocalSize(is_p_r[i], &nz));
        PetscCall(ISGetIndices(is_p_r[i], &idxs));
        for (j = 0; j < nz; j++) AIIm1_data[idxs[j] + sizeA * i] = 1.;
        PetscCall(ISRestoreIndices(is_p_r[i], &idxs));
        PetscCall(VecPlaceArray(benign_AIIm1_ones, AIIm1_data + sizeA * i));
        PetscCall(MatMult(A, benign_AIIm1_ones, v));
        PetscCall(VecResetArray(benign_AIIm1_ones));
        PetscCall(VecGetArrayRead(v, &array));
        for (j = 0; j < size_schur; j++) {
#if defined(PETSC_USE_COMPLEX)
          cs_AIB[i * size_schur + j] = (PetscRealPart(array[j + n_I]) / nz + PETSC_i * (PetscImaginaryPart(array[j + n_I]) / nz));
#else
          cs_AIB[i * size_schur + j] = array[j + n_I] / nz;
#endif
        }
        PetscCall(VecRestoreArrayRead(v, &array));
      }
      PetscCall(MatDenseRestoreArray(cs_AIB_mat, &cs_AIB));
      PetscCall(MatDenseRestoreArray(benign_AIIm1_ones_mat, &AIIm1_data));
      PetscCall(VecDestroy(&v));
      PetscCall(VecDestroy(&benign_AIIm1_ones));
      PetscCall(MatSetOption(A, MAT_KEEP_NONZERO_PATTERN, PETSC_FALSE));
      PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_FALSE));
      PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
      PetscCall(MatZeroRowsColumnsIS(A, is_p0_p, 1.0, NULL, NULL));
      PetscCall(ISDestroy(&is_p0_p));
      PetscCall(ISLocalToGlobalMappingDestroy(&N_to_reor));
    }
    PetscCall(MatSetOption(A, MAT_SYMMETRIC, sub_schurs->is_symmetric));
    PetscCall(MatSetOption(A, MAT_HERMITIAN, sub_schurs->is_hermitian));
    PetscCall(MatSetOption(A, MAT_SPD, sub_schurs->is_posdef));

    /* for complexes, symmetric and hermitian at the same time implies null imaginary part */
    use_cholesky = (PetscBool)((use_potr || use_sytr) && sub_schurs->is_hermitian && sub_schurs->is_symmetric);

    /* when using the benign subspace trick, the local Schur complements are SPD */
    /* MKL_PARDISO does not handle well the computation of a Schur complement from a symmetric indefinite factorization
       Use LU and adapt pivoting perturbation (still, solution is not as accurate as with using MUMPS) */
    if (benign_trick) {
      sub_schurs->is_posdef = PETSC_TRUE;
      PetscCall(PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, &flg));
      if (flg) use_cholesky = PETSC_FALSE;
    }
    if (sub_schurs->mat_factor_type == MAT_FACTOR_NONE) sub_schurs->mat_factor_type = use_cholesky ? MAT_FACTOR_CHOLESKY : MAT_FACTOR_LU;

    if (n_I && !multi_element) {
      char      stype[64];
      PetscBool gpu = PETSC_FALSE;

      PetscCall(MatGetFactor(A, sub_schurs->mat_solver_type, sub_schurs->mat_factor_type, &F));
      PetscCheck(F, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatGetFactor not supported by matrix instance of type %s. Rerun with \"-info :mat | grep MatGetFactor_\" for additional information", ((PetscObject)A)->type_name);
      PetscCall(MatSetErrorIfFailure(A, PETSC_TRUE));
#if defined(PETSC_HAVE_MKL_PARDISO)
      if (benign_trick) PetscCall(MatMkl_PardisoSetCntl(F, 10, 10));
#endif
      PetscCall(MatFactorSetSchurIS(F, is_schur));

      /* factorization step */
      switch (sub_schurs->mat_factor_type) {
      case MAT_FACTOR_CHOLESKY:
        PetscCall(MatCholeskyFactorSymbolic(F, A, NULL, NULL));
        /* be sure that icntl 19 is not set by command line */
        PetscCall(MatMumpsSetIcntl(F, 19, 2));
        PetscCall(MatCholeskyFactorNumeric(F, A, NULL));
        S_lower_triangular = PETSC_TRUE;
        break;
      case MAT_FACTOR_LU:
        PetscCall(MatLUFactorSymbolic(F, A, NULL, NULL, NULL));
        /* be sure that icntl 19 is not set by command line */
        PetscCall(MatMumpsSetIcntl(F, 19, 3));
        PetscCall(MatLUFactorNumeric(F, A, NULL));
        break;
      default:
        SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
      }
      PetscCall(MatViewFromOptions(F, (PetscObject)A, "-mat_factor_view"));

      if (matl_dbg_viewer) {
        Mat S;
        IS  is;

        PetscCall(PetscObjectSetName((PetscObject)A, "A"));
        PetscCall(MatView(A, matl_dbg_viewer));
        PetscCall(MatFactorCreateSchurComplement(F, &S, NULL));
        PetscCall(PetscObjectSetName((PetscObject)S, "S"));
        PetscCall(MatView(S, matl_dbg_viewer));
        PetscCall(MatDestroy(&S));
        PetscCall(ISCreateStride(PETSC_COMM_SELF, n_I, 0, 1, &is));
        PetscCall(PetscObjectSetName((PetscObject)is, "I"));
        PetscCall(ISView(is, matl_dbg_viewer));
        PetscCall(ISDestroy(&is));
        PetscCall(ISCreateStride(PETSC_COMM_SELF, size_schur, n_I, 1, &is));
        PetscCall(PetscObjectSetName((PetscObject)is, "B"));
        PetscCall(ISView(is, matl_dbg_viewer));
        PetscCall(ISDestroy(&is));
        PetscCall(PetscObjectSetName((PetscObject)is_A_all, "IA"));
        PetscCall(ISView(is_A_all, matl_dbg_viewer));
        for (i = 0, cum = 0; i < sub_schurs->n_subs; i++) {
          IS   is;
          char name[16];

          PetscCall(PetscSNPrintf(name, sizeof(name), "IE%" PetscInt_FMT, i));
          PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
          PetscCall(ISCreateStride(PETSC_COMM_SELF, subset_size, cum, 1, &is));
          PetscCall(PetscObjectSetName((PetscObject)is, name));
          PetscCall(ISView(is, matl_dbg_viewer));
          PetscCall(ISDestroy(&is));
          if (sub_schurs->change) {
            Mat T;

            PetscCall(PetscSNPrintf(name, sizeof(name), "TE%" PetscInt_FMT, i));
            PetscCall(KSPGetOperators(sub_schurs->change[i], &T, NULL));
            PetscCall(PetscObjectSetName((PetscObject)T, name));
            PetscCall(MatView(T, matl_dbg_viewer));
            PetscCall(PetscSNPrintf(name, sizeof(name), "ITE%" PetscInt_FMT, i));
            PetscCall(PetscObjectSetName((PetscObject)sub_schurs->change_primal_sub[i], name));
            PetscCall(ISView(sub_schurs->change_primal_sub[i], matl_dbg_viewer));
          }
          cum += subset_size;
        }
        PetscCall(PetscViewerFlush(matl_dbg_viewer));
      }

      /* get explicit Schur Complement computed during numeric factorization */
      PetscCall(MatFactorGetSchurComplement(F, &S_all, NULL));
      PetscCall(PetscStrncpy(stype, MATSEQDENSE, sizeof(stype)));
#if defined(PETSC_HAVE_CUDA)
      PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &gpu, MATSEQAIJVIENNACL, MATSEQAIJCUSPARSE, ""));
#endif
      if (gpu) PetscCall(PetscStrncpy(stype, MATSEQDENSECUDA, sizeof(stype)));
      PetscCall(PetscOptionsGetString(NULL, sub_schurs->prefix, "-sub_schurs_schur_mat_type", stype, sizeof(stype), NULL));
      PetscCall(MatConvert(S_all, stype, MAT_INPLACE_MATRIX, &S_all));
      PetscCall(MatSetOption(S_all, MAT_SPD, sub_schurs->is_posdef));
      PetscCall(MatSetOption(S_all, MAT_HERMITIAN, sub_schurs->is_hermitian));
      PetscCall(MatGetType(S_all, &Stype));

      /* we can reuse the solvers if we are not using the economic version */
      reuse_solvers = (PetscBool)(reuse_solvers && !economic && !sub_schurs->graph->multi_element);
      if (!sub_schurs->gdsw) {
        factor_workaround = (PetscBool)(reuse_solvers && factor_workaround);
        if (!sub_schurs->is_posdef && factor_workaround && compute_Stilda && size_active_schur) reuse_solvers = factor_workaround = PETSC_FALSE;
      }
      solver_S = PETSC_TRUE;

      /* update the Schur complement with the change of basis on the pressures */
      if (benign_n) {
        const PetscScalar *cs_AIB;
        PetscScalar       *S_data, *AIIm1_data;
        Mat                S2 = NULL, S3 = NULL; /* dbg */
        PetscScalar       *S2_data, *S3_data;    /* dbg */
        Vec                v, benign_AIIm1_ones;
        PetscInt           sizeA;

        PetscCall(MatDenseGetArray(S_all, &S_data));
        PetscCall(MatCreateVecs(A, &v, &benign_AIIm1_ones));
        PetscCall(VecGetSize(v, &sizeA));
        PetscCall(MatMumpsSetIcntl(F, 26, 0));
#if defined(PETSC_HAVE_MKL_PARDISO)
        PetscCall(MatMkl_PardisoSetCntl(F, 70, 1));
#endif
        PetscCall(MatDenseGetArrayRead(cs_AIB_mat, &cs_AIB));
        PetscCall(MatDenseGetArray(benign_AIIm1_ones_mat, &AIIm1_data));
        if (matl_dbg_viewer) {
          PetscCall(MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S2));
          PetscCall(MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S3));
          PetscCall(MatDenseGetArray(S2, &S2_data));
          PetscCall(MatDenseGetArray(S3, &S3_data));
        }
        for (i = 0; i < benign_n; i++) {
          PetscScalar    *array, sum = 0., one = 1., *sums;
          const PetscInt *idxs;
          PetscInt        k, j, nz;
          PetscBLASInt    B_k, B_n;

          PetscCall(PetscCalloc1(benign_n, &sums));
          PetscCall(VecPlaceArray(benign_AIIm1_ones, AIIm1_data + sizeA * i));
          PetscCall(VecCopy(benign_AIIm1_ones, v));
          PetscCall(MatSolve(F, v, benign_AIIm1_ones));
          PetscCall(MatMult(A, benign_AIIm1_ones, v));
          PetscCall(VecResetArray(benign_AIIm1_ones));
          /* p0 dofs (eliminated) are excluded from the sums */
          for (k = 0; k < benign_n; k++) {
            PetscCall(ISGetLocalSize(is_p_r[k], &nz));
            PetscCall(ISGetIndices(is_p_r[k], &idxs));
            for (j = 0; j < nz - 1; j++) sums[k] -= AIIm1_data[idxs[j] + sizeA * i];
            PetscCall(ISRestoreIndices(is_p_r[k], &idxs));
          }
          PetscCall(VecGetArrayRead(v, (const PetscScalar **)&array));
          if (matl_dbg_viewer) {
            Vec  vv;
            char name[16];

            PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, size_schur, array + n_I, &vv));
            PetscCall(PetscSNPrintf(name, sizeof(name), "Pvs%" PetscInt_FMT, i));
            PetscCall(PetscObjectSetName((PetscObject)vv, name));
            PetscCall(VecView(vv, matl_dbg_viewer));
          }
          /* perform sparse rank updates on symmetric Schur (TODO: move outside of the loop?) */
          /* cs_AIB already scaled by 1./nz */
          B_k = 1;
          PetscCall(PetscBLASIntCast(size_schur, &B_n));
          for (k = 0; k < benign_n; k++) {
            sum = sums[k];

            if (PetscAbsScalar(sum) == 0.0) continue;
            if (k == i) {
              if (B_n) PetscCallBLAS("BLASsyrk", BLASsyrk_("L", "N", &B_n, &B_k, &sum, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
              if (matl_dbg_viewer && B_n) PetscCallBLAS("BLASsyrk", BLASsyrk_("L", "N", &B_n, &B_k, &sum, cs_AIB + i * size_schur, &B_n, &one, S3_data, &B_n));
            } else { /* XXX Is it correct to use symmetric rank-2 update with half of the sum? */
              sum /= 2.0;
              if (B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, cs_AIB + k * size_schur, &B_n, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
              if (matl_dbg_viewer && B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, cs_AIB + k * size_schur, &B_n, cs_AIB + i * size_schur, &B_n, &one, S3_data, &B_n));
            }
          }
          sum = 1.;
          if (B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, array + n_I, &B_n, cs_AIB + i * size_schur, &B_n, &one, S_data, &B_n));
          if (matl_dbg_viewer && B_n) PetscCallBLAS("BLASsyr2k", BLASsyr2k_("L", "N", &B_n, &B_k, &sum, array + n_I, &B_n, cs_AIB + i * size_schur, &B_n, &one, S2_data, &B_n));
          PetscCall(VecRestoreArrayRead(v, (const PetscScalar **)&array));
          /* set p0 entry of AIIm1_ones to zero */
          PetscCall(ISGetLocalSize(is_p_r[i], &nz));
          PetscCall(ISGetIndices(is_p_r[i], &idxs));
          for (j = 0; j < benign_n; j++) AIIm1_data[idxs[nz - 1] + sizeA * j] = 0.;
          PetscCall(ISRestoreIndices(is_p_r[i], &idxs));
          PetscCall(PetscFree(sums));
        }
        PetscCall(VecDestroy(&benign_AIIm1_ones));
        if (matl_dbg_viewer) {
          PetscCall(MatDenseRestoreArray(S2, &S2_data));
          PetscCall(MatDenseRestoreArray(S3, &S3_data));
        }
        if (!S_lower_triangular) { /* I need to expand the upper triangular data (column-oriented) */
          PetscInt k, j;
          for (k = 0; k < size_schur; k++) {
            for (j = k; j < size_schur; j++) S_data[j * size_schur + k] = PetscConj(S_data[k * size_schur + j]);
          }
        }

        /* restore defaults */
        PetscCall(MatMumpsSetIcntl(F, 26, -1));
#if defined(PETSC_HAVE_MKL_PARDISO)
        PetscCall(MatMkl_PardisoSetCntl(F, 70, 0));
#endif
        PetscCall(MatDenseRestoreArrayRead(cs_AIB_mat, &cs_AIB));
        PetscCall(MatDenseRestoreArray(benign_AIIm1_ones_mat, &AIIm1_data));
        PetscCall(VecDestroy(&v));
        PetscCall(MatDenseRestoreArray(S_all, &S_data));
        if (matl_dbg_viewer) {
          Mat S;

          PetscCall(MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED));
          PetscCall(MatFactorCreateSchurComplement(F, &S, NULL));
          PetscCall(PetscObjectSetName((PetscObject)S, "Sb"));
          PetscCall(MatView(S, matl_dbg_viewer));
          PetscCall(MatDestroy(&S));
          PetscCall(PetscObjectSetName((PetscObject)S2, "S2P"));
          PetscCall(MatView(S2, matl_dbg_viewer));
          PetscCall(PetscObjectSetName((PetscObject)S3, "S3P"));
          PetscCall(MatView(S3, matl_dbg_viewer));
          PetscCall(PetscObjectSetName((PetscObject)cs_AIB_mat, "cs"));
          PetscCall(MatView(cs_AIB_mat, matl_dbg_viewer));
          PetscCall(MatFactorGetSchurComplement(F, &S_all, NULL));
        }
        PetscCall(MatDestroy(&S2));
        PetscCall(MatDestroy(&S3));
      }
      if (!reuse_solvers) {
        for (i = 0; i < benign_n; i++) PetscCall(ISDestroy(&is_p_r[i]));
        PetscCall(PetscFree(is_p_r));
        PetscCall(MatDestroy(&cs_AIB_mat));
        PetscCall(MatDestroy(&benign_AIIm1_ones_mat));
      }
    } else if (multi_element) { /* MUMPS does not support sparse Schur complements. Loop over local subs */
      PetscInt       *nnz;
      const PetscInt *idxs;
      PetscInt        size_schur_sub;

      PetscCall(PetscCalloc1(size_schur, &nnz));
      for (PetscInt sub = 0; sub < n_local_subs; sub++) {
        PetscCall(ISGetLocalSize(is_sub_schur[sub], &size_schur_sub));
        PetscCall(ISGetIndices(is_sub_schur[sub], &idxs));
        for (PetscInt j = 0; j < size_schur_sub; j++) nnz[idxs[j]] = size_schur_sub;
        PetscCall(ISRestoreIndices(is_sub_schur[sub], &idxs));
      }
      PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, size_schur, size_schur, 0, nnz, &S_all));
      PetscCall(MatSetOption(S_all, MAT_ROW_ORIENTED, sub_schurs->is_hermitian));
      PetscCall(PetscFree(nnz));

      for (PetscInt sub = 0; sub < n_local_subs; sub++) {
        Mat                Asub, Ssub;
        const PetscScalar *vals;
        PetscInt           size_all_sub;

        F = NULL;
        PetscCall(ISGetLocalSize(is_sub_schur[sub], &size_schur_sub));
        PetscCall(ISGetLocalSize(is_sub_all[sub], &size_all_sub));
        PetscCall(MatCreateSubMatrix(A, is_sub_all[sub], is_sub_all[sub], MAT_INITIAL_MATRIX, &Asub));
        if (size_schur_sub == size_all_sub) {
          /* we can't use MatFactor when size_schur == size_of_the_problem */
          PetscCall(MatConvert(Asub, MATDENSE, MAT_INITIAL_MATRIX, &Ssub));
        } else {
          PetscCall(MatGetFactor(Asub, sub_schurs->mat_solver_type, sub_schurs->mat_factor_type, &F));
          PetscCheck(F, PetscObjectComm((PetscObject)Asub), PETSC_ERR_SUP, "MatGetFactor not supported by matrix instance of type %s. Rerun with \"-info :mat | grep MatGetFactor_\" for additional information", ((PetscObject)Asub)->type_name);
          PetscCall(MatSetErrorIfFailure(Asub, PETSC_TRUE));
#if defined(PETSC_HAVE_MKL_PARDISO)
          if (benign_trick) PetscCall(MatMkl_PardisoSetCntl(F, 10, 10));
#endif
          /* subsets ordered last */
          PetscCall(MatFactorSetSchurIS(F, is_sub_schur_all[sub]));

          /* factorization step */
          switch (sub_schurs->mat_factor_type) {
          case MAT_FACTOR_CHOLESKY:
            PetscCall(MatCholeskyFactorSymbolic(F, Asub, NULL, NULL));
            /* be sure that icntl 19 is not set by command line */
            PetscCall(MatMumpsSetIcntl(F, 19, 2));
            PetscCall(MatCholeskyFactorNumeric(F, Asub, NULL));
            S_lower_triangular = PETSC_TRUE;
            break;
          case MAT_FACTOR_LU:
            PetscCall(MatLUFactorSymbolic(F, Asub, NULL, NULL, NULL));
            /* be sure that icntl 19 is not set by command line */
            PetscCall(MatMumpsSetIcntl(F, 19, 3));
            PetscCall(MatLUFactorNumeric(F, Asub, NULL));
            break;
          default:
            SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
          }
          PetscCall(MatFactorCreateSchurComplement(F, &Ssub, NULL));
        }
        PetscCall(MatDestroy(&Asub));
        PetscCall(MatDenseGetArrayRead(Ssub, &vals));
        PetscCall(ISGetIndices(is_sub_schur[sub], &idxs));
        PetscCall(MatSetValues(S_all, size_schur_sub, idxs, size_schur_sub, idxs, vals, INSERT_VALUES));
        PetscCall(ISRestoreIndices(is_sub_schur[sub], &idxs));
        PetscCall(MatDenseRestoreArrayRead(Ssub, &vals));
        PetscCall(MatDestroy(&Ssub));
        PetscCall(MatDestroy(&F));
      }
      PetscCall(MatAssemblyBegin(S_all, MAT_FINAL_ASSEMBLY));
      PetscCall(MatAssemblyEnd(S_all, MAT_FINAL_ASSEMBLY));
      PetscCall(MatSetOption(S_all, MAT_SPD, sub_schurs->is_posdef));
      PetscCall(MatSetOption(S_all, MAT_HERMITIAN, sub_schurs->is_hermitian));
      Stype             = MATDENSE;
      reuse_solvers     = PETSC_FALSE;
      factor_workaround = PETSC_FALSE;
      solver_S          = PETSC_FALSE;
    } else { /* we can't use MatFactor when size_schur == size_of_the_problem */
      PetscCall(MatConvert(A, MATSEQDENSE, MAT_INITIAL_MATRIX, &S_all));
      PetscCall(MatGetType(S_all, &Stype));
      reuse_solvers     = PETSC_FALSE; /* TODO: why we can't reuse the solvers here? */
      factor_workaround = PETSC_FALSE;
      solver_S          = PETSC_FALSE;
    }

    if (reuse_solvers) {
      Mat                A_II, pA_II, Afake;
      Vec                vec1_B;
      PCBDDCReuseSolvers msolv_ctx;
      PetscInt           n_R;

      if (sub_schurs->reuse_solver) {
        PetscCall(PCBDDCReuseSolversReset(sub_schurs->reuse_solver));
      } else {
        PetscCall(PetscNew(&sub_schurs->reuse_solver));
      }
      msolv_ctx = sub_schurs->reuse_solver;
      PetscCall(MatSchurComplementGetSubMatrices(sub_schurs->S, &A_II, &pA_II, NULL, NULL, NULL));
      PetscCall(PetscObjectReference((PetscObject)F));
      msolv_ctx->F = F;
      PetscCall(MatCreateVecs(F, &msolv_ctx->sol, NULL));
      /* currently PETSc has no support for MatSolve(F,x,x), so cheat and let rhs and sol share the same memory */
      {
        PetscScalar *array;
        PetscInt     n;

        PetscCall(VecGetLocalSize(msolv_ctx->sol, &n));
        PetscCall(VecGetArray(msolv_ctx->sol, &array));
        PetscCall(VecCreateSeqWithArray(PetscObjectComm((PetscObject)msolv_ctx->sol), 1, n, array, &msolv_ctx->rhs));
        PetscCall(VecRestoreArray(msolv_ctx->sol, &array));
      }
      msolv_ctx->has_vertices = schur_has_vertices;

      /* interior solver */
      PetscCall(PCCreate(PetscObjectComm((PetscObject)A_II), &msolv_ctx->interior_solver));
      PetscCall(PCSetOperators(msolv_ctx->interior_solver, A_II, pA_II));
      PetscCall(PCSetType(msolv_ctx->interior_solver, PCSHELL));
      PetscCall(PCShellSetName(msolv_ctx->interior_solver, "Interior solver (w/o Schur factorization)"));
      PetscCall(PCShellSetContext(msolv_ctx->interior_solver, msolv_ctx));
      PetscCall(PCShellSetView(msolv_ctx->interior_solver, PCBDDCReuseSolvers_View));
      PetscCall(PCShellSetApply(msolv_ctx->interior_solver, PCBDDCReuseSolvers_Interior));
      PetscCall(PCShellSetApplyTranspose(msolv_ctx->interior_solver, PCBDDCReuseSolvers_InteriorTranspose));
      if (sub_schurs->gdsw) PetscCall(PCShellSetDestroy(msolv_ctx->interior_solver, PCBDDCReuseSolvers_Destroy));

      /* correction solver */
      if (!sub_schurs->gdsw) {
        PetscCall(PCCreate(PetscObjectComm((PetscObject)A_II), &msolv_ctx->correction_solver));
        PetscCall(PCSetType(msolv_ctx->correction_solver, PCSHELL));
        PetscCall(PCShellSetName(msolv_ctx->correction_solver, "Correction solver (with Schur factorization)"));
        PetscCall(PCShellSetContext(msolv_ctx->correction_solver, msolv_ctx));
        PetscCall(PCShellSetView(msolv_ctx->interior_solver, PCBDDCReuseSolvers_View));
        PetscCall(PCShellSetApply(msolv_ctx->correction_solver, PCBDDCReuseSolvers_Correction));
        PetscCall(PCShellSetApplyTranspose(msolv_ctx->correction_solver, PCBDDCReuseSolvers_CorrectionTranspose));

        /* scatter and vecs for Schur complement solver */
        PetscCall(MatCreateVecs(S_all, &msolv_ctx->sol_B, &msolv_ctx->rhs_B));
        PetscCall(MatCreateVecs(sub_schurs->S, &vec1_B, NULL));
        if (!schur_has_vertices) {
          PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, is_A_all, &msolv_ctx->is_B));
          PetscCall(VecScatterCreate(vec1_B, msolv_ctx->is_B, msolv_ctx->sol_B, NULL, &msolv_ctx->correction_scatter_B));
          PetscCall(PetscObjectReference((PetscObject)is_A_all));
          msolv_ctx->is_R = is_A_all;
        } else {
          IS              is_B_all;
          const PetscInt *idxs;
          PetscInt        dual, n_v, n;

          PetscCall(ISGetLocalSize(sub_schurs->is_vertices, &n_v));
          dual = size_schur - n_v;
          PetscCall(ISGetLocalSize(is_A_all, &n));
          PetscCall(ISGetIndices(is_A_all, &idxs));
          PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)is_A_all), dual, idxs + n_I, PETSC_COPY_VALUES, &is_B_all));
          PetscCall(ISGlobalToLocalMappingApplyIS(sub_schurs->BtoNmap, IS_GTOLM_DROP, is_B_all, &msolv_ctx->is_B));
          PetscCall(ISDestroy(&is_B_all));
          PetscCall(ISCreateStride(PetscObjectComm((PetscObject)is_A_all), dual, 0, 1, &is_B_all));
          PetscCall(VecScatterCreate(vec1_B, msolv_ctx->is_B, msolv_ctx->sol_B, is_B_all, &msolv_ctx->correction_scatter_B));
          PetscCall(ISDestroy(&is_B_all));
          PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)is_A_all), n - n_v, idxs, PETSC_COPY_VALUES, &msolv_ctx->is_R));
          PetscCall(ISRestoreIndices(is_A_all, &idxs));
        }
        PetscCall(ISGetLocalSize(msolv_ctx->is_R, &n_R));
        PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, n_R, n_R, 0, NULL, &Afake));
        PetscCall(MatAssemblyBegin(Afake, MAT_FINAL_ASSEMBLY));
        PetscCall(MatAssemblyEnd(Afake, MAT_FINAL_ASSEMBLY));
        PetscCall(PCSetOperators(msolv_ctx->correction_solver, Afake, Afake));
        PetscCall(MatDestroy(&Afake));
        PetscCall(VecDestroy(&vec1_B));
      }
      /* communicate benign info to solver context */
      if (benign_n) {
        PetscScalar *array;

        msolv_ctx->benign_n             = benign_n;
        msolv_ctx->benign_zerodiag_subs = is_p_r;
        PetscCall(PetscMalloc1(benign_n, &msolv_ctx->benign_save_vals));
        msolv_ctx->benign_csAIB = cs_AIB_mat;
        PetscCall(MatCreateVecs(cs_AIB_mat, &msolv_ctx->benign_corr_work, NULL));
        PetscCall(VecGetArray(msolv_ctx->benign_corr_work, &array));
        PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, size_schur, array, &msolv_ctx->benign_dummy_schur_vec));
        PetscCall(VecRestoreArray(msolv_ctx->benign_corr_work, &array));
        msolv_ctx->benign_AIIm1ones = benign_AIIm1_ones_mat;
      }
    } else {
      if (sub_schurs->reuse_solver) PetscCall(PCBDDCReuseSolversReset(sub_schurs->reuse_solver));
      PetscCall(PetscFree(sub_schurs->reuse_solver));
    }
    PetscCall(MatDestroy(&A));
    PetscCall(ISDestroy(&is_A_all));

    /* Work arrays */
    PetscCall(PetscMalloc1(max_subset_size * max_subset_size, &work));

    /* S_Ej_all */
    PetscInt *idx_work = NULL;
    cum = cum2 = 0;
    if (!multi_element) PetscCall(MatDenseGetArrayRead(S_all, &rS_data));
    else PetscCall(PetscMalloc1(max_subset_size, &idx_work));
    PetscCall(MatSeqAIJGetArray(sub_schurs->S_Ej_all, &SEj_arr));
    if (sub_schurs->sum_S_Ej_inv_all) PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_inv_all, &SEjinv_arr));
    if (sub_schurs->gdsw) PetscCall(MatCreateSubMatrices(sub_schurs->A, sub_schurs->n_subs, sub_schurs->is_subs, sub_schurs->is_subs, MAT_INITIAL_MATRIX, &gdswA));
    for (i = 0; i < sub_schurs->n_subs; i++) {
      /* get S_E (or K^i_EE for GDSW) */
      PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
      if (sub_schurs->gdsw) {
        Mat T;

        PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &T));
        PetscCall(MatConvert(gdswA[i], MATDENSE, MAT_REUSE_MATRIX, &T));
        PetscCall(MatDestroy(&T));
      } else {
        if (multi_element) { /* transpose copy to workspace */
          // XXX CSR directly?
          for (PetscInt j = 0; j < subset_size; j++) idx_work[j] = cum + j;
          PetscCall(MatGetValues(S_all, subset_size, idx_work, subset_size, idx_work, work));
          if (!sub_schurs->is_hermitian) {
            for (PetscInt k = 0; k < subset_size; k++) {
              for (PetscInt j = k; j < subset_size; j++) {
                PetscScalar t             = work[k * subset_size + j];
                work[k * subset_size + j] = work[j * subset_size + k];
                work[j * subset_size + k] = t;
              }
            }
          }
        } else if (S_lower_triangular) { /* I need to expand the upper triangular data (column-oriented) */
          for (PetscInt k = 0; k < subset_size; k++) {
            for (PetscInt j = k; j < subset_size; j++) {
              work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
              work[j * subset_size + k] = PetscConj(rS_data[cum2 + k * size_schur + j]);
            }
          }
        } else { /* just copy to workspace */
          for (PetscInt k = 0; k < subset_size; k++) {
            for (PetscInt j = 0; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
          }
        }
      }
      /* insert S_E values */
      if (sub_schurs->change) {
        Mat change_sub, SEj, T;

        /* change basis */
        PetscCall(KSPGetOperators(sub_schurs->change[i], &change_sub, NULL));
        PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &SEj));
        if (!sub_schurs->change_with_qr) { /* currently there's no support for PtAP with P SeqAIJ */
          Mat T2;
          PetscCall(MatTransposeMatMult(change_sub, SEj, MAT_INITIAL_MATRIX, 1.0, &T2));
          PetscCall(MatMatMult(T2, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
          PetscCall(MatConvert(T, MATSEQDENSE, MAT_INPLACE_MATRIX, &T));
          PetscCall(MatDestroy(&T2));
        } else {
          PetscCall(MatPtAP(SEj, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
        }
        PetscCall(MatCopy(T, SEj, SAME_NONZERO_PATTERN));
        PetscCall(MatDestroy(&T));
        PetscCall(MatZeroRowsColumnsIS(SEj, sub_schurs->change_primal_sub[i], 1.0, NULL, NULL));
        PetscCall(MatDestroy(&SEj));
      }
      PetscCall(PetscArraycpy(SEj_arr, work, subset_size * subset_size));
      if (compute_Stilda) {
        if (deluxe) { /* if adaptivity is requested, invert S_E blocks */
          Mat                M;
          const PetscScalar *vals;
          PetscBool          isdense, isdensecuda;

          PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &M));
          PetscCall(MatSetOption(M, MAT_SPD, sub_schurs->is_posdef));
          PetscCall(MatSetOption(M, MAT_HERMITIAN, sub_schurs->is_hermitian));
          if (!PetscBTLookup(sub_schurs->is_edge, i)) PetscCall(MatSetType(M, Stype));
          PetscCall(PetscObjectTypeCompare((PetscObject)M, MATSEQDENSE, &isdense));
          PetscCall(PetscObjectTypeCompare((PetscObject)M, MATSEQDENSECUDA, &isdensecuda));
          switch (sub_schurs->mat_factor_type) {
          case MAT_FACTOR_CHOLESKY:
            PetscCall(MatCholeskyFactor(M, NULL, NULL));
            break;
          case MAT_FACTOR_LU:
            PetscCall(MatLUFactor(M, NULL, NULL, NULL));
            break;
          default:
            SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
          }
          if (isdense) {
            PetscCall(MatSeqDenseInvertFactors_Private(M));
#if defined(PETSC_HAVE_CUDA)
          } else if (isdensecuda) {
            PetscCall(MatSeqDenseCUDAInvertFactors_Internal(M));
#endif
          } else SETERRQ(PetscObjectComm((PetscObject)M), PETSC_ERR_SUP, "Not implemented for type %s", Stype);
          PetscCall(MatDenseGetArrayRead(M, &vals));
          PetscCall(PetscArraycpy(SEjinv_arr, vals, subset_size * subset_size));
          PetscCall(MatDenseRestoreArrayRead(M, &vals));
          PetscCall(MatDestroy(&M));
        } else if (scaling) { /* not using deluxe */
          Mat          SEj;
          Vec          D;
          PetscScalar *array;

          PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, work, &SEj));
          PetscCall(VecGetArray(Dall, &array));
          PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, subset_size, array + cum, &D));
          PetscCall(VecRestoreArray(Dall, &array));
          PetscCall(VecShift(D, -1.));
          PetscCall(MatDiagonalScale(SEj, D, D));
          PetscCall(MatDestroy(&SEj));
          PetscCall(VecDestroy(&D));
          PetscCall(PetscArraycpy(SEj_arr, work, subset_size * subset_size));
        }
      }
      cum += subset_size;
      cum2 += subset_size * (size_schur + 1);
      SEj_arr += subset_size * subset_size;
      if (SEjinv_arr) SEjinv_arr += subset_size * subset_size;
    }
    if (sub_schurs->gdsw) PetscCall(MatDestroySubMatrices(sub_schurs->n_subs, &gdswA));
    if (!multi_element) PetscCall(MatDenseRestoreArrayRead(S_all, &rS_data));
    PetscCall(MatSeqAIJRestoreArray(sub_schurs->S_Ej_all, &SEj_arr));
    if (sub_schurs->sum_S_Ej_inv_all) PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_inv_all, &SEjinv_arr));
    if (solver_S) PetscCall(MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED));

    /* may prevent from unneeded copies, since MUMPS or MKL_Pardiso always use CPU memory
       however, preliminary tests indicate using GPUs is still faster in the solve phase */
#if defined(PETSC_HAVE_VIENNACL) || defined(PETSC_HAVE_CUDA)
    if (reuse_solvers) {
      Mat                  St;
      MatFactorSchurStatus st;

      flg = PETSC_FALSE;
      PetscCall(PetscOptionsGetBool(NULL, sub_schurs->prefix, "-sub_schurs_schur_pin_to_cpu", &flg, NULL));
      PetscCall(MatFactorGetSchurComplement(F, &St, &st));
      PetscCall(MatBindToCPU(St, flg));
      PetscCall(MatFactorRestoreSchurComplement(F, &St, st));
    }
#endif

    schur_factor = NULL;
    if (compute_Stilda && size_active_schur) {
      if (sub_schurs->n_subs == 1 && size_schur == size_active_schur && deluxe) { /* we already computed the inverse */
        PetscCall(MatSeqAIJGetArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
        PetscCall(PetscArraycpy(SEjinv_arr, work, size_schur * size_schur));
        PetscCall(MatSeqAIJRestoreArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
      } else {
        Mat S_all_inv = NULL;

        if (solver_S && !sub_schurs->gdsw) {
          /* for adaptive selection we need S^-1; for solver reusage we need S_\Delta\Delta^-1.
             The latter is not the principal subminor for S^-1. However, the factors can be reused since S_\Delta\Delta is the leading principal submatrix of S */
          if (factor_workaround) { /* invert without calling MatFactorInvertSchurComplement, since we are hacking */
            PetscScalar *data;
            PetscInt     nd = 0;

            PetscCheck(use_potr, PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor update not yet implemented for non SPD matrices");
            PetscCall(MatFactorGetSchurComplement(F, &S_all_inv, NULL));
            PetscCall(MatDenseGetArray(S_all_inv, &data));
            if (sub_schurs->is_dir) { /* dirichlet dofs could have different scalings */
              PetscCall(ISGetLocalSize(sub_schurs->is_dir, &nd));
            }

            /* factor and invert activedofs and vertices (dirichlet dofs does not contribute) */
            if (schur_has_vertices) {
              Mat          M;
              PetscScalar *tdata;
              PetscInt     nv = 0, news;

              PetscCall(ISGetLocalSize(sub_schurs->is_vertices, &nv));
              news = size_active_schur + nv;
              PetscCall(PetscCalloc1(news * news, &tdata));
              for (i = 0; i < size_active_schur; i++) {
                PetscCall(PetscArraycpy(tdata + i * (news + 1), data + i * (size_schur + 1), size_active_schur - i));
                PetscCall(PetscArraycpy(tdata + i * (news + 1) + size_active_schur - i, data + i * size_schur + size_active_schur + nd, nv));
              }
              for (i = 0; i < nv; i++) {
                PetscInt k = i + size_active_schur;
                PetscCall(PetscArraycpy(tdata + k * (news + 1), data + (k + nd) * (size_schur + 1), nv - i));
              }

              PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, news, news, tdata, &M));
              PetscCall(MatSetOption(M, MAT_SPD, PETSC_TRUE));
              PetscCall(MatCholeskyFactor(M, NULL, NULL));
              /* save the factors */
              cum = 0;
              PetscCall(PetscMalloc1((size_active_schur * (size_active_schur + 1)) / 2 + nd, &schur_factor));
              for (i = 0; i < size_active_schur; i++) {
                PetscCall(PetscArraycpy(schur_factor + cum, tdata + i * (news + 1), size_active_schur - i));
                cum += size_active_schur - i;
              }
              for (i = 0; i < nd; i++) schur_factor[cum + i] = PetscSqrtReal(PetscRealPart(data[(i + size_active_schur) * (size_schur + 1)]));
              S_all_inv->ops->solve             = M->ops->solve;
              S_all_inv->ops->matsolve          = M->ops->matsolve;
              S_all_inv->ops->solvetranspose    = M->ops->solvetranspose;
              S_all_inv->ops->matsolvetranspose = M->ops->matsolvetranspose;
              S_all_inv->factortype             = MAT_FACTOR_CHOLESKY;
              PetscCall(MatSeqDenseInvertFactors_Private(M));
              /* move back just the active dofs to the Schur complement */
              for (i = 0; i < size_active_schur; i++) PetscCall(PetscArraycpy(data + i * size_schur, tdata + i * news, size_active_schur));
              PetscCall(PetscFree(tdata));
              PetscCall(MatDestroy(&M));
            } else { /* we can factorize and invert just the activedofs */
              Mat          M;
              PetscScalar *aux;

              PetscCall(PetscMalloc1(nd, &aux));
              for (i = 0; i < nd; i++) aux[i] = 1.0 / data[(i + size_active_schur) * (size_schur + 1)];
              PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, size_active_schur, size_active_schur, data, &M));
              PetscCall(MatDenseSetLDA(M, size_schur));
              PetscCall(MatSetOption(M, MAT_SPD, PETSC_TRUE));
              PetscCall(MatCholeskyFactor(M, NULL, NULL));
              PetscCall(MatSeqDenseInvertFactors_Private(M));
              PetscCall(MatDestroy(&M));
              PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, size_schur, nd, data + size_active_schur * size_schur, &M));
              PetscCall(MatZeroEntries(M));
              PetscCall(MatDestroy(&M));
              PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nd, size_schur, data + size_active_schur, &M));
              PetscCall(MatDenseSetLDA(M, size_schur));
              PetscCall(MatZeroEntries(M));
              PetscCall(MatDestroy(&M));
              for (i = 0; i < nd; i++) data[(i + size_active_schur) * (size_schur + 1)] = aux[i];
              PetscCall(PetscFree(aux));
            }
            PetscCall(MatDenseRestoreArray(S_all_inv, &data));
          } else { /* use MatFactor calls to invert S */
            PetscCall(MatFactorInvertSchurComplement(F));
            PetscCall(MatFactorGetSchurComplement(F, &S_all_inv, NULL));
          }
        } else if (!sub_schurs->gdsw) { /* we need to invert explicitly since we are not using MatFactor for S */
          if (multi_element) {
            PetscCall(MatDuplicate(S_all, MAT_DO_NOT_COPY_VALUES, &S_all_inv));
            PetscCall(MatSetOption(S_all_inv, MAT_ROW_ORIENTED, sub_schurs->is_hermitian));
            for (PetscInt sub = 0; sub < n_local_subs; sub++) {
              const PetscScalar *vals;
              const PetscInt    *idxs;
              PetscInt           size_schur_sub;
              Mat                M;

              PetscCall(MatCreateSubMatrix(S_all, is_sub_schur[sub], is_sub_schur[sub], MAT_INITIAL_MATRIX, &M));
              PetscCall(MatConvert(M, MATDENSE, MAT_INPLACE_MATRIX, &M));
              PetscCall(MatSetOption(M, MAT_SPD, sub_schurs->is_posdef));
              PetscCall(MatSetOption(M, MAT_HERMITIAN, sub_schurs->is_hermitian));
              switch (sub_schurs->mat_factor_type) {
              case MAT_FACTOR_CHOLESKY:
                PetscCall(MatCholeskyFactor(M, NULL, NULL));
                break;
              case MAT_FACTOR_LU:
                PetscCall(MatLUFactor(M, NULL, NULL, NULL));
                break;
              default:
                SETERRQ(PetscObjectComm((PetscObject)F), PETSC_ERR_SUP, "Unsupported factor type %s", MatFactorTypes[sub_schurs->mat_factor_type]);
              }
              PetscCall(MatSeqDenseInvertFactors_Private(M));
              PetscCall(MatDenseGetArrayRead(M, &vals));
              PetscCall(ISGetLocalSize(is_sub_schur[sub], &size_schur_sub));
              PetscCall(ISGetIndices(is_sub_schur[sub], &idxs));
              PetscCall(MatSetValues(S_all_inv, size_schur_sub, idxs, size_schur_sub, idxs, vals, INSERT_VALUES));
              PetscCall(ISRestoreIndices(is_sub_schur[sub], &idxs));
              PetscCall(MatDenseRestoreArrayRead(M, &vals));
              PetscCall(MatDestroy(&M));
            }
            PetscCall(MatAssemblyBegin(S_all_inv, MAT_FINAL_ASSEMBLY));
            PetscCall(MatAssemblyEnd(S_all_inv, MAT_FINAL_ASSEMBLY));
          } else {
            PetscCall(PetscObjectReference((PetscObject)S_all));
            S_all_inv = S_all;
            PetscCall(MatDenseGetArray(S_all_inv, &S_data));
            PetscCall(PetscBLASIntCast(size_schur, &B_N));
            PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
            if (use_potr) {
              PetscCallBLAS("LAPACKpotrf", LAPACKpotrf_("L", &B_N, S_data, &B_N, &B_ierr));
              PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
              PetscCallBLAS("LAPACKpotri", LAPACKpotri_("L", &B_N, S_data, &B_N, &B_ierr));
              PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
            } else if (use_sytr) {
              PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, S_data, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
              PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
              PetscCallBLAS("LAPACKsytri", LAPACKsytri_("L", &B_N, S_data, &B_N, pivots, Bwork, &B_ierr));
              PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
            } else {
              PetscCallBLAS("LAPACKgetrf", LAPACKgetrf_(&B_N, &B_N, S_data, &B_N, pivots, &B_ierr));
              PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
              PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, S_data, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
              PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
            }
            PetscCall(PetscLogFlops(1.0 * size_schur * size_schur * size_schur));
            PetscCall(PetscFPTrapPop());
            PetscCall(MatDenseRestoreArray(S_all_inv, &S_data));
          }
        } else if (sub_schurs->gdsw) {
          Mat      tS, tX, SEj, S_II, S_IE, S_EE;
          KSP      pS_II;
          PC       pS_II_pc;
          IS       EE, II;
          PetscInt nS;

          PetscCall(MatFactorCreateSchurComplement(F, &tS, NULL));
          PetscCall(MatGetSize(tS, &nS, NULL));
          PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
          for (i = 0, cum = 0; i < sub_schurs->n_subs; i++) { /* naive implementation */
            PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
            PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, SEjinv_arr, &SEj));

            PetscCall(ISCreateStride(PETSC_COMM_SELF, subset_size, cum, 1, &EE));
            PetscCall(ISComplement(EE, 0, nS, &II));
            PetscCall(MatCreateSubMatrix(tS, II, II, MAT_INITIAL_MATRIX, &S_II));
            PetscCall(MatCreateSubMatrix(tS, II, EE, MAT_INITIAL_MATRIX, &S_IE));
            PetscCall(MatCreateSubMatrix(tS, EE, EE, MAT_INITIAL_MATRIX, &S_EE));
            PetscCall(ISDestroy(&II));
            PetscCall(ISDestroy(&EE));

            PetscCall(KSPCreate(PETSC_COMM_SELF, &pS_II));
            PetscCall(KSPSetNestLevel(pS_II, 1)); /* do not have direct access to a PC to provide the level of nesting of the KSP */
            PetscCall(KSPSetType(pS_II, KSPPREONLY));
            PetscCall(KSPGetPC(pS_II, &pS_II_pc));
            PetscCall(PCSetType(pS_II_pc, PCSVD));
            PetscCall(KSPSetOptionsPrefix(pS_II, sub_schurs->prefix));
            PetscCall(KSPAppendOptionsPrefix(pS_II, "pseudo_"));
            PetscCall(KSPSetOperators(pS_II, S_II, S_II));
            PetscCall(MatDestroy(&S_II));
            PetscCall(KSPSetFromOptions(pS_II));
            PetscCall(KSPSetUp(pS_II));
            PetscCall(MatDuplicate(S_IE, MAT_DO_NOT_COPY_VALUES, &tX));
            PetscCall(KSPMatSolve(pS_II, S_IE, tX));
            PetscCall(KSPDestroy(&pS_II));

            PetscCall(MatTransposeMatMult(S_IE, tX, MAT_REUSE_MATRIX, PETSC_DETERMINE, &SEj));
            PetscCall(MatDestroy(&S_IE));
            PetscCall(MatDestroy(&tX));
            PetscCall(MatAYPX(SEj, -1, S_EE, SAME_NONZERO_PATTERN));
            PetscCall(MatDestroy(&S_EE));

            PetscCall(MatDestroy(&SEj));
            cum += subset_size;
            SEjinv_arr += subset_size * subset_size;
          }
          PetscCall(MatDestroy(&tS));
          PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
        }
        /* S_Ej_tilda_all */
        cum = cum2 = 0;
        rS_data    = NULL;
        if (S_all_inv && !multi_element) PetscCall(MatDenseGetArrayRead(S_all_inv, &rS_data));
        PetscCall(MatSeqAIJGetArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
        for (i = 0; i < sub_schurs->n_subs; i++) {
          PetscInt j;

          PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
          /* get (St^-1)_E */
          /* Unless we are changing the variables, I don't need to expand to upper triangular since St^-1
             will be properly accessed later during adaptive selection */
          if (multi_element) { /* transpose copy to workspace */
            // XXX CSR directly?
            for (PetscInt j = 0; j < subset_size; j++) idx_work[j] = cum + j;
            PetscCall(MatGetValues(S_all_inv, subset_size, idx_work, subset_size, idx_work, work));
            if (!sub_schurs->is_hermitian) {
              for (PetscInt k = 0; k < subset_size; k++) {
                for (PetscInt j = k; j < subset_size; j++) {
                  PetscScalar t             = work[k * subset_size + j];
                  work[k * subset_size + j] = work[j * subset_size + k];
                  work[j * subset_size + k] = t;
                }
              }
            }
          } else if (rS_data) {
            if (S_lower_triangular) {
              PetscInt k;
              if (sub_schurs->change) {
                for (k = 0; k < subset_size; k++) {
                  for (j = k; j < subset_size; j++) {
                    work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
                    work[j * subset_size + k] = work[k * subset_size + j];
                  }
                }
              } else {
                for (k = 0; k < subset_size; k++) {
                  for (j = k; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
                }
              }
            } else {
              PetscInt k;
              for (k = 0; k < subset_size; k++) {
                for (j = 0; j < subset_size; j++) work[k * subset_size + j] = rS_data[cum2 + k * size_schur + j];
              }
            }
          }
          if (sub_schurs->change) {
            Mat         change_sub, SEj, T;
            PetscScalar val = sub_schurs->gdsw ? PETSC_SMALL : 1. / PETSC_SMALL;

            /* change basis */
            PetscCall(KSPGetOperators(sub_schurs->change[i], &change_sub, NULL));
            PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, subset_size, subset_size, (rS_data || multi_element) ? work : SEjinv_arr, &SEj));
            if (!sub_schurs->change_with_qr) { /* currently there's no support for PtAP with P SeqAIJ */
              Mat T2;
              PetscCall(MatTransposeMatMult(change_sub, SEj, MAT_INITIAL_MATRIX, 1.0, &T2));
              PetscCall(MatMatMult(T2, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
              PetscCall(MatDestroy(&T2));
              PetscCall(MatConvert(T, MATSEQDENSE, MAT_INPLACE_MATRIX, &T));
            } else {
              PetscCall(MatPtAP(SEj, change_sub, MAT_INITIAL_MATRIX, 1.0, &T));
            }
            PetscCall(MatCopy(T, SEj, SAME_NONZERO_PATTERN));
            PetscCall(MatDestroy(&T));
            PetscCall(MatZeroRowsColumnsIS(SEj, sub_schurs->change_primal_sub[i], val, NULL, NULL));
            PetscCall(MatDestroy(&SEj));
          }
          if (rS_data || multi_element) PetscCall(PetscArraycpy(SEjinv_arr, work, subset_size * subset_size));
          cum += subset_size;
          cum2 += subset_size * (size_schur + 1);
          SEjinv_arr += subset_size * subset_size;
        }
        PetscCall(MatSeqAIJRestoreArrayWrite(sub_schurs->sum_S_Ej_tilda_all, &SEjinv_arr));
        if (S_all_inv) {
          if (!multi_element) PetscCall(MatDenseRestoreArrayRead(S_all_inv, &rS_data));
          if (solver_S) {
            if (schur_has_vertices) {
              PetscCall(MatFactorRestoreSchurComplement(F, &S_all_inv, MAT_FACTOR_SCHUR_FACTORED));
            } else {
              PetscCall(MatFactorRestoreSchurComplement(F, &S_all_inv, MAT_FACTOR_SCHUR_INVERTED));
            }
          }
        }
        PetscCall(MatDestroy(&S_all_inv));
      }

      /* move back factors if needed */
      if (schur_has_vertices && factor_workaround && !sub_schurs->gdsw) {
        Mat          S_tmp;
        PetscInt     nd = 0;
        PetscScalar *data;

        PetscCheck(use_potr, PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor update not yet implemented for non SPD matrices");
        PetscCheck(solver_S, PETSC_COMM_SELF, PETSC_ERR_PLIB, "This should not happen");
        PetscCall(MatFactorGetSchurComplement(F, &S_tmp, NULL));
        PetscCall(MatDenseGetArray(S_tmp, &data));
        PetscCall(PetscArrayzero(data, size_schur * size_schur));

        if (S_lower_triangular) {
          cum = 0;
          for (i = 0; i < size_active_schur; i++) {
            PetscCall(PetscArraycpy(data + i * (size_schur + 1), schur_factor + cum, size_active_schur - i));
            cum += size_active_schur - i;
          }
        } else {
          PetscCall(PetscArraycpy(data, schur_factor, size_schur * size_schur));
        }
        if (sub_schurs->is_dir) {
          PetscCall(ISGetLocalSize(sub_schurs->is_dir, &nd));
          for (i = 0; i < nd; i++) data[(i + size_active_schur) * (size_schur + 1)] = schur_factor[cum + i];
        }
        /* workaround: since I cannot modify the matrices used inside the solvers for the forward and backward substitutions,
             set the diagonal entry of the Schur factor to a very large value */
        for (i = size_active_schur + nd; i < size_schur; i++) data[i * (size_schur + 1)] = infty;
        PetscCall(MatDenseRestoreArray(S_tmp, &data));
        PetscCall(MatFactorRestoreSchurComplement(F, &S_tmp, MAT_FACTOR_SCHUR_FACTORED));
      }
    } else if (factor_workaround && !sub_schurs->gdsw) { /* we need to eliminate any unneeded coupling */
      PetscScalar *data;
      PetscInt     nd = 0;

      if (sub_schurs->is_dir) { /* dirichlet dofs could have different scalings */
        PetscCall(ISGetLocalSize(sub_schurs->is_dir, &nd));
      }
      PetscCall(MatFactorGetSchurComplement(F, &S_all, NULL));
      PetscCall(MatDenseGetArray(S_all, &data));
      for (i = 0; i < size_active_schur; i++) PetscCall(PetscArrayzero(data + i * size_schur + size_active_schur, size_schur - size_active_schur));
      for (i = size_active_schur + nd; i < size_schur; i++) {
        PetscCall(PetscArrayzero(data + i * size_schur + size_active_schur, size_schur - size_active_schur));
        data[i * (size_schur + 1)] = infty;
      }
      PetscCall(MatDenseRestoreArray(S_all, &data));
      PetscCall(MatFactorRestoreSchurComplement(F, &S_all, MAT_FACTOR_SCHUR_UNFACTORED));
    }
    PetscCall(PetscFree(idx_work));
    PetscCall(PetscFree(work));
    PetscCall(PetscFree(schur_factor));
    PetscCall(VecDestroy(&Dall));
    PetscCall(ISDestroy(&is_schur));
    if (multi_element) {
      for (PetscInt sub = 0; sub < n_local_subs; sub++) {
        PetscCall(ISDestroy(&is_sub_all[sub]));
        PetscCall(ISDestroy(&is_sub_schur_all[sub]));
        PetscCall(ISDestroy(&is_sub_schur[sub]));
      }
      PetscCall(PetscFree3(is_sub_all, is_sub_schur_all, is_sub_schur));
    }
  }
  PetscCall(ISDestroy(&is_I_layer));
  PetscCall(MatDestroy(&S_all));
  PetscCall(MatDestroy(&A_BB));
  PetscCall(MatDestroy(&A_IB));
  PetscCall(MatDestroy(&A_BI));
  PetscCall(MatDestroy(&F));

  PetscCall(MatAssemblyBegin(sub_schurs->S_Ej_all, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(sub_schurs->S_Ej_all, MAT_FINAL_ASSEMBLY));
  if (compute_Stilda) {
    PetscCall(MatAssemblyBegin(sub_schurs->sum_S_Ej_tilda_all, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(sub_schurs->sum_S_Ej_tilda_all, MAT_FINAL_ASSEMBLY));
    if (deluxe) {
      PetscCall(MatAssemblyBegin(sub_schurs->sum_S_Ej_inv_all, MAT_FINAL_ASSEMBLY));
      PetscCall(MatAssemblyEnd(sub_schurs->sum_S_Ej_inv_all, MAT_FINAL_ASSEMBLY));
    }
  }

  /* Get local part of (\sum_j S_Ej) */
  if (!sub_schurs->sum_S_Ej_all) PetscCall(MatDuplicate(sub_schurs->S_Ej_all, MAT_DO_NOT_COPY_VALUES, &sub_schurs->sum_S_Ej_all));
  PetscCall(VecSet(gstash, 0.0));
  PetscCall(MatSeqAIJGetArray(sub_schurs->S_Ej_all, &stasharray));
  PetscCall(VecPlaceArray(lstash, stasharray));
  PetscCall(VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(MatSeqAIJRestoreArray(sub_schurs->S_Ej_all, &stasharray));
  PetscCall(VecResetArray(lstash));
  PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_all, &stasharray));
  PetscCall(VecPlaceArray(lstash, stasharray));
  PetscCall(VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
  PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_all, &stasharray));
  PetscCall(VecResetArray(lstash));

  /* Get local part of (\sum_j S^-1_Ej) (\sum_j St^-1_Ej) */
  if (compute_Stilda) {
    PetscCall(VecSet(gstash, 0.0));
    PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &stasharray));
    PetscCall(VecPlaceArray(lstash, stasharray));
    PetscCall(VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
    PetscCall(VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
    PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &stasharray));
    PetscCall(VecResetArray(lstash));
    if (deluxe) {
      PetscCall(VecSet(gstash, 0.0));
      PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_inv_all, &stasharray));
      PetscCall(VecPlaceArray(lstash, stasharray));
      PetscCall(VecScatterBegin(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(sstash, lstash, gstash, ADD_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterBegin(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
      PetscCall(VecScatterEnd(sstash, gstash, lstash, INSERT_VALUES, SCATTER_REVERSE));
      PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_inv_all, &stasharray));
      PetscCall(VecResetArray(lstash));
    } else if (!sub_schurs->gdsw) {
      PetscScalar *array;
      PetscInt     cum;

      PetscCall(MatSeqAIJGetArray(sub_schurs->sum_S_Ej_tilda_all, &array));
      cum = 0;
      for (i = 0; i < sub_schurs->n_subs; i++) {
        PetscCall(ISGetLocalSize(sub_schurs->is_subs[i], &subset_size));
        PetscCall(PetscBLASIntCast(subset_size, &B_N));
        PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
        if (use_potr) {
          PetscCallBLAS("LAPACKpotrf", LAPACKpotrf_("L", &B_N, array + cum, &B_N, &B_ierr));
          PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
          PetscCallBLAS("LAPACKpotri", LAPACKpotri_("L", &B_N, array + cum, &B_N, &B_ierr));
          PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in POTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
        } else if (use_sytr) {
          PetscCallBLAS("LAPACKsytrf", LAPACKsytrf_("L", &B_N, array + cum, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
          PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
          PetscCallBLAS("LAPACKsytri", LAPACKsytri_("L", &B_N, array + cum, &B_N, pivots, Bwork, &B_ierr));
          PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in SYTRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
        } else {
          PetscCallBLAS("LAPACKgetrf", LAPACKgetrf_(&B_N, &B_N, array + cum, &B_N, pivots, &B_ierr));
          PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRF Lapack routine %" PetscBLASInt_FMT, B_ierr);
          PetscCallBLAS("LAPACKgetri", LAPACKgetri_(&B_N, array + cum, &B_N, pivots, Bwork, &B_lwork, &B_ierr));
          PetscCheck(!B_ierr, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in GETRI Lapack routine %" PetscBLASInt_FMT, B_ierr);
        }
        PetscCall(PetscLogFlops(1.0 * subset_size * subset_size * subset_size));
        PetscCall(PetscFPTrapPop());
        cum += subset_size * subset_size;
      }
      PetscCall(MatSeqAIJRestoreArray(sub_schurs->sum_S_Ej_tilda_all, &array));
      PetscCall(PetscObjectReference((PetscObject)sub_schurs->sum_S_Ej_all));
      PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_inv_all));
      sub_schurs->sum_S_Ej_inv_all = sub_schurs->sum_S_Ej_all;
    }
  }
  PetscCall(VecDestroy(&lstash));
  PetscCall(VecDestroy(&gstash));
  PetscCall(VecScatterDestroy(&sstash));

  if (matl_dbg_viewer) {
    if (sub_schurs->S_Ej_all) {
      PetscCall(PetscObjectSetName((PetscObject)sub_schurs->S_Ej_all, "SE"));
      PetscCall(MatView(sub_schurs->S_Ej_all, matl_dbg_viewer));
    }
    if (sub_schurs->sum_S_Ej_all) {
      PetscCall(PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_all, "SSE"));
      PetscCall(MatView(sub_schurs->sum_S_Ej_all, matl_dbg_viewer));
    }
    if (sub_schurs->sum_S_Ej_inv_all) {
      PetscCall(PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_inv_all, "SSEm"));
      PetscCall(MatView(sub_schurs->sum_S_Ej_inv_all, matl_dbg_viewer));
    }
    if (sub_schurs->sum_S_Ej_tilda_all) {
      PetscCall(PetscObjectSetName((PetscObject)sub_schurs->sum_S_Ej_tilda_all, "SSEt"));
      PetscCall(MatView(sub_schurs->sum_S_Ej_tilda_all, matl_dbg_viewer));
    }
  }

  /* when not explicit, we need to set the factor type */
  if (sub_schurs->mat_factor_type == MAT_FACTOR_NONE) sub_schurs->mat_factor_type = sub_schurs->is_hermitian ? MAT_FACTOR_CHOLESKY : MAT_FACTOR_LU;

  /* free workspace */
  if (matl_dbg_viewer) PetscCall(PetscViewerFlush(matl_dbg_viewer));
  if (sub_schurs->debug) PetscCallMPI(MPI_Barrier(comm_n));
  PetscCall(PetscViewerDestroy(&matl_dbg_viewer));
  PetscCall(PetscFree2(Bwork, pivots));
  PetscCall(PetscCommDestroy(&comm_n));
  PetscCall(PetscFree(all_local_subid_N));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PCBDDCSubSchursInit(PCBDDCSubSchurs sub_schurs, const char *prefix, IS is_I, IS is_B, PCBDDCGraph graph, ISLocalToGlobalMapping BtoNmap, PetscBool copycc, PetscBool gdsw)
{
  IS       *faces, *edges, *all_cc, vertices;
  PetscInt  s, i, n_faces, n_edges, n_all_cc;
  PetscBool is_sorted, ispardiso, ismumps;

  PetscFunctionBegin;
  PetscCall(ISSorted(is_I, &is_sorted));
  PetscCheck(is_sorted, PetscObjectComm((PetscObject)is_I), PETSC_ERR_PLIB, "IS for I dofs should be shorted");
  PetscCall(ISSorted(is_B, &is_sorted));
  PetscCheck(is_sorted, PetscObjectComm((PetscObject)is_B), PETSC_ERR_PLIB, "IS for B dofs should be shorted");

  /* reset any previous data */
  PetscCall(PCBDDCSubSchursReset(sub_schurs));

  sub_schurs->gdsw  = gdsw;
  sub_schurs->graph = graph;

  /* get index sets for faces and edges (already sorted by global ordering) */
  PetscCall(PCBDDCGraphGetCandidatesIS(graph, &n_faces, &faces, &n_edges, &edges, &vertices));
  n_all_cc = n_faces + n_edges;
  PetscCall(PetscBTCreate(n_all_cc, &sub_schurs->is_edge));
  PetscCall(PetscMalloc1(n_all_cc, &all_cc));
  n_all_cc = 0;
  for (i = 0; i < n_faces; i++) {
    PetscCall(ISGetSize(faces[i], &s));
    if (!s) continue;
    if (copycc) {
      PetscCall(ISDuplicate(faces[i], &all_cc[n_all_cc]));
    } else {
      PetscCall(PetscObjectReference((PetscObject)faces[i]));
      all_cc[n_all_cc] = faces[i];
    }
    n_all_cc++;
  }
  for (i = 0; i < n_edges; i++) {
    PetscCall(ISGetSize(edges[i], &s));
    if (!s) continue;
    if (copycc) {
      PetscCall(ISDuplicate(edges[i], &all_cc[n_all_cc]));
    } else {
      PetscCall(PetscObjectReference((PetscObject)edges[i]));
      all_cc[n_all_cc] = edges[i];
    }
    PetscCall(PetscBTSet(sub_schurs->is_edge, n_all_cc));
    n_all_cc++;
  }
  PetscCall(PetscObjectReference((PetscObject)vertices));
  sub_schurs->is_vertices = vertices;
  PetscCall(PCBDDCGraphRestoreCandidatesIS(graph, &n_faces, &faces, &n_edges, &edges, &vertices));
  sub_schurs->is_dir = NULL;
  PetscCall(PCBDDCGraphGetDirichletDofsB(graph, &sub_schurs->is_dir));

  /* Determine if MatFactor can be used */
  PetscCall(PetscStrallocpy(prefix, &sub_schurs->prefix));
#if defined(PETSC_HAVE_MUMPS)
  PetscCall(PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERMUMPS, sizeof(sub_schurs->mat_solver_type)));
#elif defined(PETSC_HAVE_MKL_PARDISO)
  PetscCall(PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, sizeof(sub_schurs->mat_solver_type)));
#else
  PetscCall(PetscStrncpy(sub_schurs->mat_solver_type, MATSOLVERPETSC, sizeof(sub_schurs->mat_solver_type)));
#endif
  sub_schurs->mat_factor_type = MAT_FACTOR_NONE;
  sub_schurs->is_hermitian    = PetscDefined(USE_COMPLEX) ? PETSC_FALSE : PETSC_TRUE; /* Hermitian Cholesky is not supported by PETSc and external packages */
  sub_schurs->is_posdef       = PETSC_TRUE;
  sub_schurs->is_symmetric    = PETSC_TRUE;
  sub_schurs->debug           = PETSC_FALSE;
  sub_schurs->restrict_comm   = PETSC_FALSE;
  PetscOptionsBegin(PetscObjectComm((PetscObject)graph->l2gmap), sub_schurs->prefix, "BDDC sub_schurs options", "PC");
  PetscCall(PetscOptionsString("-sub_schurs_mat_solver_type", "Specific direct solver to use", NULL, sub_schurs->mat_solver_type, sub_schurs->mat_solver_type, sizeof(sub_schurs->mat_solver_type), NULL));
  PetscCall(PetscOptionsEnum("-sub_schurs_mat_factor_type", "Factor type to use. Use MAT_FACTOR_NONE for automatic selection", NULL, MatFactorTypes, (PetscEnum)sub_schurs->mat_factor_type, (PetscEnum *)&sub_schurs->mat_factor_type, NULL));
  PetscCall(PetscOptionsBool("-sub_schurs_symmetric", "Symmetric problem", NULL, sub_schurs->is_symmetric, &sub_schurs->is_symmetric, NULL));
  PetscCall(PetscOptionsBool("-sub_schurs_hermitian", "Hermitian problem", NULL, sub_schurs->is_hermitian, &sub_schurs->is_hermitian, NULL));
  PetscCall(PetscOptionsBool("-sub_schurs_posdef", "Positive definite problem", NULL, sub_schurs->is_posdef, &sub_schurs->is_posdef, NULL));
  PetscCall(PetscOptionsBool("-sub_schurs_restrictcomm", "Restrict communicator on active processes only", NULL, sub_schurs->restrict_comm, &sub_schurs->restrict_comm, NULL));
  PetscCall(PetscOptionsBool("-sub_schurs_debug", "Debug output", NULL, sub_schurs->debug, &sub_schurs->debug, NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMUMPS, &ismumps));
  PetscCall(PetscStrcmp(sub_schurs->mat_solver_type, MATSOLVERMKL_PARDISO, &ispardiso));
  sub_schurs->schur_explicit = (PetscBool)(ispardiso || ismumps);

  /* for reals, symmetric and Hermitian are synonyms */
#if !defined(PETSC_USE_COMPLEX)
  sub_schurs->is_symmetric = (PetscBool)(sub_schurs->is_symmetric && sub_schurs->is_hermitian);
  sub_schurs->is_hermitian = sub_schurs->is_symmetric;
#endif

  PetscCall(PetscObjectReference((PetscObject)is_I));
  sub_schurs->is_I = is_I;
  PetscCall(PetscObjectReference((PetscObject)is_B));
  sub_schurs->is_B = is_B;
  PetscCall(PetscObjectReference((PetscObject)graph->l2gmap));
  sub_schurs->l2gmap = graph->l2gmap;
  PetscCall(PetscObjectReference((PetscObject)BtoNmap));
  sub_schurs->BtoNmap            = BtoNmap;
  sub_schurs->n_subs             = n_all_cc;
  sub_schurs->is_subs            = all_cc;
  sub_schurs->S_Ej_all           = NULL;
  sub_schurs->sum_S_Ej_all       = NULL;
  sub_schurs->sum_S_Ej_inv_all   = NULL;
  sub_schurs->sum_S_Ej_tilda_all = NULL;
  sub_schurs->is_Ej_all          = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PCBDDCSubSchursCreate(PCBDDCSubSchurs *sub_schurs)
{
  PCBDDCSubSchurs schurs_ctx;

  PetscFunctionBegin;
  PetscCall(PetscNew(&schurs_ctx));
  schurs_ctx->n_subs = 0;
  *sub_schurs        = schurs_ctx;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PCBDDCSubSchursReset(PCBDDCSubSchurs sub_schurs)
{
  PetscFunctionBegin;
  if (!sub_schurs) PetscFunctionReturn(PETSC_SUCCESS);
  sub_schurs->graph = NULL;
  PetscCall(PetscFree(sub_schurs->prefix));
  PetscCall(MatDestroy(&sub_schurs->A));
  PetscCall(MatDestroy(&sub_schurs->S));
  PetscCall(ISDestroy(&sub_schurs->is_I));
  PetscCall(ISDestroy(&sub_schurs->is_B));
  PetscCall(ISLocalToGlobalMappingDestroy(&sub_schurs->l2gmap));
  PetscCall(ISLocalToGlobalMappingDestroy(&sub_schurs->BtoNmap));
  PetscCall(MatDestroy(&sub_schurs->S_Ej_all));
  PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_all));
  PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_inv_all));
  PetscCall(MatDestroy(&sub_schurs->sum_S_Ej_tilda_all));
  PetscCall(ISDestroy(&sub_schurs->is_Ej_all));
  PetscCall(ISDestroy(&sub_schurs->is_vertices));
  PetscCall(ISDestroy(&sub_schurs->is_dir));
  PetscCall(PetscBTDestroy(&sub_schurs->is_edge));
  for (PetscInt i = 0; i < sub_schurs->n_subs; i++) PetscCall(ISDestroy(&sub_schurs->is_subs[i]));
  if (sub_schurs->n_subs) PetscCall(PetscFree(sub_schurs->is_subs));
  if (sub_schurs->reuse_solver) PetscCall(PCBDDCReuseSolversReset(sub_schurs->reuse_solver));
  PetscCall(PetscFree(sub_schurs->reuse_solver));
  if (sub_schurs->change) {
    for (PetscInt i = 0; i < sub_schurs->n_subs; i++) {
      PetscCall(KSPDestroy(&sub_schurs->change[i]));
      PetscCall(ISDestroy(&sub_schurs->change_primal_sub[i]));
    }
  }
  PetscCall(PetscFree(sub_schurs->change));
  PetscCall(PetscFree(sub_schurs->change_primal_sub));
  sub_schurs->n_subs = 0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode PCBDDCSubSchursDestroy(PCBDDCSubSchurs *sub_schurs)
{
  PetscFunctionBegin;
  PetscCall(PCBDDCSubSchursReset(*sub_schurs));
  PetscCall(PetscFree(*sub_schurs));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static inline PetscErrorCode PCBDDCAdjGetNextLayer_Private(PetscInt *queue_tip, PetscInt n_prev, PetscBT touched, PetscInt *xadj, PetscInt *adjncy, PetscInt *n_added)
{
  PetscInt i, j, n;

  PetscFunctionBegin;
  n = 0;
  for (i = -n_prev; i < 0; i++) {
    PetscInt start_dof = queue_tip[i];
    for (j = xadj[start_dof]; j < xadj[start_dof + 1]; j++) {
      PetscInt dof = adjncy[j];
      if (!PetscBTLookup(touched, dof)) {
        PetscCall(PetscBTSet(touched, dof));
        queue_tip[n] = dof;
        n++;
      }
    }
  }
  *n_added = n;
  PetscFunctionReturn(PETSC_SUCCESS);
}
