/*

Author: Henry M. Tufo III

e-mail: hmt@cs.brown.edu

snail-mail:
Division of Applied Mathematics
Brown University
Providence, RI 02912

Last Modification:
6.21.97

File Description:
-----------------

*/

#include <../src/ksp/pc/impls/tfs/tfs.h>

/* default length of number of items via tree - doubles if exceeded */
#define TREE_BUF_SZ 2048
#define GS_VEC_SZ   1

/*
Type: struct gather_scatter_id
*/
typedef struct gather_scatter_id {
  PetscInt     id;
  PetscInt     nel_min;
  PetscInt     nel_max;
  PetscInt     nel_sum;
  PetscInt     negl;
  PetscInt     gl_max;
  PetscInt     gl_min;
  PetscInt     repeats;
  PetscInt     ordered;
  PetscInt     positive;
  PetscScalar *vals;

  /* bit mask info */
  PetscInt *my_proc_mask;
  PetscInt  mask_sz;
  PetscInt *ngh_buf;
  PetscInt  ngh_buf_sz;
  PetscInt *nghs;
  PetscInt  num_nghs;
  PetscInt  max_nghs;
  PetscInt *pw_nghs;
  PetscInt  num_pw_nghs;
  PetscInt *tree_nghs;
  PetscInt  num_tree_nghs;

  PetscInt num_loads;

  /* repeats == true -> local info */
  PetscInt  nel;  /* number of unique elements */
  PetscInt *elms; /* of size nel */
  PetscInt  nel_total;
  PetscInt *local_elms; /* of size nel_total */
  PetscInt *companion;  /* of size nel_total */

  /* local info */
  PetscInt   num_local_total;
  PetscInt   local_strength;
  PetscInt   num_local;
  PetscInt  *num_local_reduce;
  PetscInt **local_reduce;
  PetscInt   num_local_gop;
  PetscInt  *num_gop_local_reduce;
  PetscInt **gop_local_reduce;

  /* pairwise info */
  PetscInt     level;
  PetscInt     num_pairs;
  PetscInt     max_pairs;
  PetscInt     loc_node_pairs;
  PetscInt     max_node_pairs;
  PetscInt     min_node_pairs;
  PetscInt     avg_node_pairs;
  PetscInt    *pair_list;
  PetscInt    *msg_sizes;
  PetscInt   **node_list;
  PetscInt     len_pw_list;
  PetscInt    *pw_elm_list;
  PetscScalar *pw_vals;

  MPI_Request *msg_ids_in;
  MPI_Request *msg_ids_out;

  PetscScalar *out;
  PetscScalar *in;
  PetscInt     msg_total;

  /* tree - crystal accumulator info */
  PetscInt   max_left_over;
  PetscInt  *pre;
  PetscInt  *in_num;
  PetscInt  *out_num;
  PetscInt **in_list;
  PetscInt **out_list;

  /* new tree work*/
  PetscInt     tree_nel;
  PetscInt    *tree_elms;
  PetscScalar *tree_buf;
  PetscScalar *tree_work;

  PetscInt  tree_map_sz;
  PetscInt *tree_map_in;
  PetscInt *tree_map_out;

  /* current memory status */
  PetscInt gl_bss_min;
  PetscInt gl_perm_min;

  /* max segment size for PCTFS_gs_gop_vec() */
  PetscInt vec_sz;

  /* hack to make paul happy */
  MPI_Comm PCTFS_gs_comm;

} PCTFS_gs_id;

static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
static PCTFS_gs_id   *gsi_new(void);
static PetscErrorCode set_tree(PCTFS_gs_id *gs);

/* same for all but vector flavor */
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
/* vector flavor */
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

/* global vars */
/* from comm.c module */

static PetscInt num_gs_ids = 0;

/* should make this dynamic ... later */
static PetscInt  msg_buf     = MAX_MSG_BUF;
static PetscInt  vec_sz      = GS_VEC_SZ;
static PetscInt *tree_buf    = NULL;
static PetscInt  tree_buf_sz = 0;
static PetscInt  ntree       = 0;

/***************************************************************************/
PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
{
  PetscFunctionBegin;
  vec_sz = size;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
{
  PetscFunctionBegin;
  msg_buf = buf_size;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
{
  PCTFS_gs_id *gs;
  MPI_Group    PCTFS_gs_group;
  MPI_Comm     PCTFS_gs_comm;

  /* ensure that communication package has been initialized */
  PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());

  /* determines if we have enough dynamic/semi-static memory */
  /* checks input, allocs and sets gd_id template            */
  gs = gsi_check_args(elms, nel, level);

  /* only bit mask version up and working for the moment    */
  /* LATER :: get int list version working for sparse pblms */
  PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));

  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);

  gs->PCTFS_gs_comm = PCTFS_gs_comm;

  return gs;
}

/******************************************************************************/
static PCTFS_gs_id *gsi_new(void)
{
  PCTFS_gs_id *gs;
  gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
  PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
  return gs;
}

/******************************************************************************/
static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
{
  PetscInt     i, j, k, t2;
  PetscInt    *companion, *elms, *unique, *iptr;
  PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
  PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
  PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PCTFS_gs_id *gs;

  if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
  if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

  if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));

  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99                                            */
  /* caller can set global ids that don't participate to 0 */
  /* PCTFS_gs_init ignores all zeros in elm list                 */
  /* negative global ids are still invalid                 */
  for (i = j = 0; i < nel; i++) {
    if (in_elms[i] != 0) j++;
  }

  k   = nel;
  nel = j;

  /* copy over in_elms list and create inverse map */
  elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
  companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

  for (i = j = 0; i < k; i++) {
    if (in_elms[i] != 0) {
      elms[j]        = in_elms[i];
      companion[j++] = i;
    }
  }

  if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

  /* pre-pass ... check to see if sorted */
  elms[nel] = INT_MAX;
  iptr      = elms;
  unique    = elms + 1;
  j         = 0;
  while (*iptr != INT_MAX) {
    if (*iptr++ > *unique++) {
      j = 1;
      break;
    }
  }

  /* set up inverse map */
  if (j) {
    PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
    PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
  } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
  elms[nel] = INT_MIN;

  /* first pass */
  /* determine number of unique elements, check pd */
  for (i = k = 0; i < nel; i += j) {
    t2 = elms[i];
    j  = ++i;

    /* clump 'em for now */
    while (elms[j] == t2) j++;

    /* how many together and num local */
    if (j -= i) {
      num_local++;
      k += j;
    }
  }

  /* how many unique elements? */
  gs->repeats = k;
  gs->nel     = nel - k;

  /* number of repeats? */
  gs->num_local = num_local;
  num_local += 2;
  gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
  gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

  unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
  gs->elms       = unique;
  gs->nel_total  = nel;
  gs->local_elms = elms;
  gs->companion  = companion;

  /* compess map as well as keep track of local ops */
  for (num_local = i = j = 0; i < gs->nel; i++) {
    k  = j;
    t2 = unique[i] = elms[j];
    companion[i]   = companion[j];

    while (elms[j] == t2) j++;

    if ((t2 = (j - k)) > 1) {
      /* number together */
      num_to_reduce[num_local] = t2++;

      iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i;

      /* note that we're skipping the first one */
      while (++k < j) *(iptr++) = companion[k];
      *iptr = -1;
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local]   = 0;
  local_reduce[num_local]    = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel > 0) {
    vals[3] = unique[0];
    vals[4] = unique[gs->nel - 1];
  } else {
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
  if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4] - vals[3] + 1;

  if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

  /* LATER :: add level == -1 -> program selects level */
  if (vals[5] < 0) vals[5] = 0;
  else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
  gs->level = vals[5];

  return gs;
}

/******************************************************************************/
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
{
  PetscInt   i, nel, *elms;
  PetscInt   t1;
  PetscInt **reduce;
  PetscInt  *map;

  PetscFunctionBegin;
  /* totally local removes ... PCTFS_ct_bits == 0 */
  PetscCall(get_ngh_buf(gs));

  if (gs->level) PetscCall(set_pairwise(gs));
  if (gs->max_left_over) PetscCall(set_tree(gs));

  /* intersection local and pairwise/tree? */
  gs->num_local_total      = gs->num_local;
  gs->gop_local_reduce     = gs->local_reduce;
  gs->num_gop_local_reduce = gs->num_local_reduce;

  map = gs->companion;

  /* is there any local compression */
  if (!gs->num_local) {
    gs->local_strength = NONE;
    gs->num_local_gop  = 0;
  } else {
    /* ok find intersection */
    map    = gs->companion;
    reduce = gs->local_reduce;
    for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
      if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
        t1++;
        PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
        gs->num_local_reduce[i] *= -1;
      }
      **reduce = map[**reduce];
    }

    /* intersection is empty */
    if (!t1) {
      gs->local_strength = FULL;
      gs->num_local_gop  = 0;
    } else { /* intersection not empty */
      gs->local_strength = PARTIAL;

      PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

      gs->num_local_gop   = t1;
      gs->num_local_total = gs->num_local;
      gs->num_local -= t1;
      gs->gop_local_reduce     = gs->local_reduce;
      gs->num_gop_local_reduce = gs->num_local_reduce;

      for (i = 0; i < t1; i++) {
        PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
        gs->num_gop_local_reduce[i] *= -1;
        gs->local_reduce++;
        gs->num_local_reduce++;
      }
      gs->local_reduce++;
      gs->num_local_reduce++;
    }
  }

  elms = gs->pw_elm_list;
  nel  = gs->len_pw_list;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  elms = gs->tree_map_in;
  nel  = gs->tree_map_sz;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* clean up */
  free((void *)gs->local_elms);
  free((void *)gs->companion);
  free((void *)gs->elms);
  free((void *)gs->ngh_buf);
  gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode place_in_tree(PetscInt elm)
{
  PetscInt *tp, n;

  PetscFunctionBegin;
  if (ntree == tree_buf_sz) {
    if (tree_buf_sz) {
      tp = tree_buf;
      n  = tree_buf_sz;
      tree_buf_sz <<= 1;
      tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
      PCTFS_ivec_copy(tree_buf, tp, n);
      free(tp);
    } else {
      tree_buf_sz = TREE_BUF_SZ;
      tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
    }
  }

  tree_buf[ntree++] = elm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
{
  PetscInt  i, j, npw = 0, ntree_map = 0;
  PetscInt  p_mask_size, ngh_buf_size, buf_size;
  PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt *ngh_buf, *buf1, *buf2;
  PetscInt  offset, per_load, num_loads, or_ct, start, end;
  PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt  oper = GL_B_OR;
  PetscInt *ptr3, *t_mask, level, ct1, ct2;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
  p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* allocate space for masks and info bufs */
  gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
  t_mask                        = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */

  /* default is one pass */
  per_load = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i                         = p_mask_size * negl;

  /* possible overflow on buffer size */
  /* overflow hack                    */
  if (i < 0) i = INT_MAX;

  buf_size = PetscMin(msg_buf, i);

  /* can we do it? */
  PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);

  /* get PCTFS_giop buf space ... make *only* one malloc */
  buf1 = (PetscInt *)malloc(buf_size << 1);

  /* more than one gior exchange needed? */
  if (buf_size != i) {
    per_load      = buf_size / p_mask_size;
    buf_size      = per_load * p_mask_size;
    gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
  }

  /* convert buf sizes from #bytes to #ints - 32-bit only! */
  p_mask_size /= sizeof(PetscInt);
  ngh_buf_size /= sizeof(PetscInt);
  buf_size /= sizeof(PetscInt);

  /* find PCTFS_giop work space */
  buf2 = buf1 + buf_size;

  /* hold #ints needed for processor masks */
  gs->mask_sz = p_mask_size;

  /* init buffers */
  PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

  /* HACK reset tree info */
  tree_buf    = NULL;
  tree_buf_sz = ntree = 0;

  /* ok do it */
  for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    PetscCall(PCTFS_ivec_zero(buf1, buf_size));

    /* load msg buffer */
    for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
      offset = (offset - start) * p_mask_size;
      PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
    }

    /* GLOBAL: pass buffer */
    PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

    /* unload buffer into ngh_buf */
    ptr2 = (elms + i_start);
    for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
      /* I own it ... may have to pairwise it */
      if (j == *ptr2) {
        /* do i share it w/anyone? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
        /* guess not */
        if (ct1 < 2) {
          ptr2++;
          ptr1 += p_mask_size;
          continue;
        }

        /* i do ... so keep info and turn off my bit */
        PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
        PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
        PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

        /* is it to be done pairwise? */
        if (--ct1 <= level) {
          npw++;

          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
          ptr1 += p_mask_size;
          continue;
        }

        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++;
        ptr1 += p_mask_size;
        ntree_map++;
      } else { /* i don't but still might be involved in tree */

        /* shared by how many? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

        /* none! */
        if (ct1 < 2) continue;

        /* is it going to be done pairwise? but not by me of course!*/
        if (--ct1 <= level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      PetscCall(place_in_tree(j));
    }
  }

  free((void *)t_mask);
  free((void *)buf1);

  gs->len_pw_list = npw;
  gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));

  gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

  oper = GL_MAX;
  ct1  = gs->num_nghs;
  PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
  gs->max_nghs = ct1;

  gs->tree_map_sz   = ntree_map;
  gs->max_left_over = ntree;

  free((void *)p_mask);
  free((void *)sh_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
{
  PetscInt  i, j;
  PetscInt  p_mask_size;
  PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt *ngh_buf, *buf2;
  PetscInt  offset;
  PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt *pairwise_elm_list, len_pair_list = 0;
  PetscInt *iptr, t1, i_start, nel, *elms;
  PetscInt  ct;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks */
  p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
  p_mask        = (PetscInt *)malloc(p_mask_size);
  tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

  /* set mask to my PCTFS_my_id's bit mask */
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  p_mask_size /= sizeof(PetscInt);

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

  /* how many processors (nghs) do we have to exchange with? */
  nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* allocate space for PCTFS_gs_gop() info */
  gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

  /* init msg_size list */
  PetscCall(PCTFS_ivec_zero(msg_size, nprs));

  /* expand from bit mask list to int list */
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

  /* keep list of elements being handled pairwise */
  for (i = j = 0; i < nel; i++) {
    if (elms[i] & TOP_BIT) {
      elms[i] ^= TOP_BIT;
      pairwise_elm_list[j++] = i;
    }
  }
  pairwise_elm_list[j] = -1;

  gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

  /* find who goes to each processor */
  for (i_start = i = 0; i < nprs; i++) {
    /* processor i's mask */
    PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

    /* det # going to processor i */
    for (ct = j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start, ct);

    /*space to hold nodes in message to first neighbor */
    msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

    for (j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
    }
    *iptr = -1;
  }
  msg_nodes[nprs] = NULL;

  j = gs->loc_node_pairs = i_start;
  t1                     = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

  i_start = nprs;
  t1      = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
  gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
  gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

  /* reset malloc pool */
  free((void *)p_mask);
  free((void *)tmp_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* to do pruned tree just save ngh buf copy for each one and decode here!
******************************************************************************/
static PetscErrorCode set_tree(PCTFS_gs_id *gs)
{
  PetscInt  i, j, n, nel;
  PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

  PetscFunctionBegin;
  /* local work ptrs */
  elms = gs->elms;
  nel  = gs->nel;

  /* how many via tree */
  gs->tree_nel = n = ntree;
  gs->tree_elms = tree_elms = iptr_in = tree_buf;
  gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  j                                   = gs->tree_map_sz;
  gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
  gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

  /* search the longer of the two lists */
  /* note ... could save this info in get_ngh_buf and save searches */
  if (n <= nel) {
    /* bijective fct w/remap - search elm list */
    for (i = 0; i < n; i++) {
      if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
        *iptr_in++  = j;
        *iptr_out++ = i;
      }
    }
  } else {
    for (i = 0; i < nel; i++) {
      if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
        *iptr_in++  = i;
        *iptr_out++ = j;
      }
    }
  }

  /* sentinel */
  *iptr_in = *iptr_out = -1;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[1]] = vals[map[0]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = *(vals + *map++);
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      vals[map[1]] = vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = 0.0;
      while (*map >= 0) tmp += *(vals + *map++);

      map = *reduce++;
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      base = vals + *map++;
      while (*map >= 0) *base += *(vals + *map++);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
  if (gs->nghs) free((void *)gs->nghs);
  if (gs->pw_nghs) free((void *)gs->pw_nghs);

  /* tree */
  if (gs->max_left_over) {
    if (gs->tree_elms) free((void *)gs->tree_elms);
    if (gs->tree_buf) free((void *)gs->tree_buf);
    if (gs->tree_work) free((void *)gs->tree_work);
    if (gs->tree_map_in) free((void *)gs->tree_map_in);
    if (gs->tree_map_out) free((void *)gs->tree_map_out);
  }

  /* pairwise info */
  if (gs->num_pairs) {
    /* should be NULL already */
    if (gs->ngh_buf) free((void *)gs->ngh_buf);
    if (gs->elms) free((void *)gs->elms);
    if (gs->local_elms) free((void *)gs->local_elms);
    if (gs->companion) free((void *)gs->companion);

    /* only set if pairwise */
    if (gs->vals) free((void *)gs->vals);
    if (gs->in) free((void *)gs->in);
    if (gs->out) free((void *)gs->out);
    if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
    if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
    if (gs->pw_vals) free((void *)gs->pw_vals);
    if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
    if (gs->node_list) {
      for (i = 0; i < gs->num_pairs; i++) {
        if (gs->node_list[i]) free((void *)gs->node_list[i]);
      }
      free((void *)gs->node_list);
    }
    if (gs->msg_sizes) free((void *)gs->msg_sizes);
    if (gs->pair_list) free((void *)gs->pair_list);
  }

  /* local info */
  if (gs->num_local_total >= 0) {
    for (i = 0; i < gs->num_local_total + 1; i++) {
      if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
    }
  }

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
  if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);

  free((void *)gs);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
{
  PetscFunctionBegin;
  switch (*op) {
  case '+':
    PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
    break;
  default:
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
    PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscFunctionBegin;
  PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");

  /* local only operations!!! */
  if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->num_local_gop) {
    PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));

    /* pairwise */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

    /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));

    PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
  } else { /* if intersection tree/pairwise and local is empty */
    /* pairwise */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

    /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else { /* general case ... odd geoms ... 3D */
      num++;
      while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));

      map = *reduce;
      while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));

      reduce++;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
{
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscBLASInt i1 = 1, dstep;

  PetscFunctionBegin;
  /* strip and load s */
  msg_list = list = gs->pair_list;
  msg_size = size = gs->msg_sizes;
  msg_nodes = nodes = gs->node_list;
  iptr = pw = gs->pw_elm_list;
  dptr1 = dptr3 = gs->pw_vals;
  msg_ids_in = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2                 = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /*  msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
        second one *list and do list++ afterwards */
    PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
    list++;
    msg_ids_in++;
    in1 += *size++ * step;
  } while (*++msg_nodes);
  msg_nodes = nodes;

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) {
    PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
    dptr3 += step;
    iptr++;
  }

  /* load out buffers and post the sends */
  while ((iptr = *msg_nodes++)) {
    dptr3 = dptr2;
    while (*iptr >= 0) {
      PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
      dptr2 += step;
      iptr++;
    }
    PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
    msg_size++;
    msg_list++;
    msg_ids_out++;
  }

  /* tree */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));

  /* process the received data */
  msg_nodes = nodes;
  while ((iptr = *nodes++)) {
    PetscScalar d1 = 1.0;

    /* Should I check the return value of MPI_Wait() or status? */
    /* Can this loop be replaced by a call to MPI_Waitall()? */
    PetscCallMPI(MPI_Wait(ids_in, &status));
    ids_in++;
    while (*iptr >= 0) {
      PetscCall(PetscBLASIntCast(step, &dstep));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
      in2 += step;
      iptr++;
    }
  }

  /* replace vals */
  while (*pw >= 0) {
    PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
    dptr1 += step;
    pw++;
  }

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */

  /* Should I check the return value of MPI_Wait() or status? */
  /* Can this loop be replaced by a call to MPI_Waitall()? */
  while (*msg_nodes++) {
    PetscCallMPI(MPI_Wait(ids_out, &status));
    ids_out++;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt     size, *in, *out;
  PetscScalar *buf, *work;
  PetscInt     op[] = {GL_ADD, 0};
  PetscBLASInt i1   = 1;
  PetscBLASInt dstep;

  PetscFunctionBegin;
  /* copy over to local variables */
  in   = gs->tree_map_in;
  out  = gs->tree_map_out;
  buf  = gs->tree_buf;
  work = gs->tree_work;
  size = gs->tree_nel * step;

  /* zero out collection buffer */
  PetscCall(PCTFS_rvec_zero(buf, size));

  /* copy over my contributions */
  while (*in >= 0) {
    PetscCall(PetscBLASIntCast(step, &dstep));
    PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
  }

  /* perform fan in/out on full buffer */
  /* must change PCTFS_grop to handle the blas */
  PetscCall(PCTFS_grop(buf, work, size, op));

  /* reset */
  in  = gs->tree_map_in;
  out = gs->tree_map_out;

  /* get the portion of the results I need */
  while (*in >= 0) {
    PetscCall(PetscBLASIntCast(step, &dstep));
    PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
{
  PetscFunctionBegin;
  switch (*op) {
  case '+':
    PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
    break;
  default:
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
    PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
{
  PetscFunctionBegin;
  /* if there's nothing to do return */
  if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);

  /* can't do more dimensions then exist */
  dim = PetscMin(dim, PCTFS_i_log2_num_nodes);

  /* local only operations!!! */
  if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->num_local_gop) {
    PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));

    /* pairwise will do tree inside ... */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));

    PetscCall(PCTFS_gs_gop_local_out(gs, vals));
  } else { /* if intersection tree/pairwise and local is empty */
    /* pairwise will do tree inside */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
{
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscInt     i, mask = 1;

  PetscFunctionBegin;
  for (i = 1; i < dim; i++) {
    mask <<= 1;
    mask++;
  }

  /* strip and load s */
  msg_list = list = gs->pair_list;
  msg_size = size = gs->msg_sizes;
  msg_nodes = nodes = gs->node_list;
  iptr = pw = gs->pw_elm_list;
  dptr1 = dptr3 = gs->pw_vals;
  msg_ids_in = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2                 = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /*  msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
        second one *list and do list++ afterwards */
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
      list++;
      msg_ids_in++;
      in1 += *size++;
    } else {
      list++;
      size++;
    }
  } while (*++msg_nodes);

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

  /* load out buffers and post the sends */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *msg_nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      dptr3 = dptr2;
      while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
      /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
      /* is msg_ids_out++ correct? */
      PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
      msg_size++;
      list++;
      msg_ids_out++;
    } else {
      list++;
      msg_size++;
    }
  }

  /* do the tree while we're waiting */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));

  /* process the received data */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_in, &status));
      ids_in++;
      while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
    }
    list++;
  }

  /* replace vals */
  while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */
  while (*msg_nodes++) {
    if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_out, &status));
      ids_out++;
    }
    msg_list++;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
{
  PetscInt     size;
  PetscInt    *in, *out;
  PetscScalar *buf, *work;
  PetscInt     op[] = {GL_ADD, 0};

  PetscFunctionBegin;
  in   = gs->tree_map_in;
  out  = gs->tree_map_out;
  buf  = gs->tree_buf;
  work = gs->tree_work;
  size = gs->tree_nel;

  PetscCall(PCTFS_rvec_zero(buf, size));

  while (*in >= 0) *(buf + *out++) = *(vals + *in++);

  in  = gs->tree_map_in;
  out = gs->tree_map_out;

  PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));

  while (*in >= 0) *(vals + *in++) = *(buf + *out++);
  PetscFunctionReturn(PETSC_SUCCESS);
}
