xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision ef46b1a67e276116c83b5d4ce8efc2932ea4fc0a)
1 
2 /***********************************gs.c***************************************
3 
4 Author: Henry M. Tufo III
5 
6 e-mail: hmt@cs.brown.edu
7 
8 snail-mail:
9 Division of Applied Mathematics
10 Brown University
11 Providence, RI 02912
12 
13 Last Modification:
14 6.21.97
15 ************************************gs.c**************************************/
16 
17 /***********************************gs.c***************************************
18 File Description:
19 -----------------
20 
21 ************************************gs.c**************************************/
22 
23 #include <../src/ksp/pc/impls/tfs/tfs.h>
24 
25 /* default length of number of items via tree - doubles if exceeded */
26 #define TREE_BUF_SZ 2048;
27 #define GS_VEC_SZ   1
28 
29 /***********************************gs.c***************************************
30 Type: struct gather_scatter_id
31 ------------------------------
32 
33 ************************************gs.c**************************************/
34 typedef struct gather_scatter_id {
35   PetscInt    id;
36   PetscInt    nel_min;
37   PetscInt    nel_max;
38   PetscInt    nel_sum;
39   PetscInt    negl;
40   PetscInt    gl_max;
41   PetscInt    gl_min;
42   PetscInt    repeats;
43   PetscInt    ordered;
44   PetscInt    positive;
45   PetscScalar *vals;
46 
47   /* bit mask info */
48   PetscInt *my_proc_mask;
49   PetscInt mask_sz;
50   PetscInt *ngh_buf;
51   PetscInt ngh_buf_sz;
52   PetscInt *nghs;
53   PetscInt num_nghs;
54   PetscInt max_nghs;
55   PetscInt *pw_nghs;
56   PetscInt num_pw_nghs;
57   PetscInt *tree_nghs;
58   PetscInt num_tree_nghs;
59 
60   PetscInt num_loads;
61 
62   /* repeats == true -> local info */
63   PetscInt nel;         /* number of unique elememts */
64   PetscInt *elms;       /* of size nel */
65   PetscInt nel_total;
66   PetscInt *local_elms; /* of size nel_total */
67   PetscInt *companion;  /* of size nel_total */
68 
69   /* local info */
70   PetscInt num_local_total;
71   PetscInt local_strength;
72   PetscInt num_local;
73   PetscInt *num_local_reduce;
74   PetscInt **local_reduce;
75   PetscInt num_local_gop;
76   PetscInt *num_gop_local_reduce;
77   PetscInt **gop_local_reduce;
78 
79   /* pairwise info */
80   PetscInt    level;
81   PetscInt    num_pairs;
82   PetscInt    max_pairs;
83   PetscInt    loc_node_pairs;
84   PetscInt    max_node_pairs;
85   PetscInt    min_node_pairs;
86   PetscInt    avg_node_pairs;
87   PetscInt    *pair_list;
88   PetscInt    *msg_sizes;
89   PetscInt    **node_list;
90   PetscInt    len_pw_list;
91   PetscInt    *pw_elm_list;
92   PetscScalar *pw_vals;
93 
94   MPI_Request *msg_ids_in;
95   MPI_Request *msg_ids_out;
96 
97   PetscScalar *out;
98   PetscScalar *in;
99   PetscInt    msg_total;
100 
101   /* tree - crystal accumulator info */
102   PetscInt max_left_over;
103   PetscInt *pre;
104   PetscInt *in_num;
105   PetscInt *out_num;
106   PetscInt **in_list;
107   PetscInt **out_list;
108 
109   /* new tree work*/
110   PetscInt    tree_nel;
111   PetscInt    *tree_elms;
112   PetscScalar *tree_buf;
113   PetscScalar *tree_work;
114 
115   PetscInt tree_map_sz;
116   PetscInt *tree_map_in;
117   PetscInt *tree_map_out;
118 
119   /* current memory status */
120   PetscInt gl_bss_min;
121   PetscInt gl_perm_min;
122 
123   /* max segment size for PCTFS_gs_gop_vec() */
124   PetscInt vec_sz;
125 
126   /* hack to make paul happy */
127   MPI_Comm PCTFS_gs_comm;
128 
129 } PCTFS_gs_id;
130 
131 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135 static PCTFS_gs_id *gsi_new(void);
136 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
137 
138 /* same for all but vector flavor */
139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140 /* vector flavor */
141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142 
143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148 
149 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
151 
152 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
155 
156 /* global vars */
157 /* from comm.c module */
158 
159 static PetscInt num_gs_ids = 0;
160 
161 /* should make this dynamic ... later */
162 static PetscInt msg_buf    =MAX_MSG_BUF;
163 static PetscInt vec_sz     =GS_VEC_SZ;
164 static PetscInt *tree_buf  =NULL;
165 static PetscInt tree_buf_sz=0;
166 static PetscInt ntree      =0;
167 
168 /***************************************************************************/
169 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170 {
171   PetscFunctionBegin;
172   vec_sz = size;
173   PetscFunctionReturn(0);
174 }
175 
176 /******************************************************************************/
177 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
178 {
179   PetscFunctionBegin;
180   msg_buf = buf_size;
181   PetscFunctionReturn(0);
182 }
183 
184 /******************************************************************************/
185 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
186 {
187   PCTFS_gs_id    *gs;
188   MPI_Group      PCTFS_gs_group;
189   MPI_Comm       PCTFS_gs_comm;
190 
191   /* ensure that communication package has been initialized */
192   PCTFS_comm_init();
193 
194   /* determines if we have enough dynamic/semi-static memory */
195   /* checks input, allocs and sets gd_id template            */
196   gs = gsi_check_args(elms,nel,level);
197 
198   /* only bit mask version up and working for the moment    */
199   /* LATER :: get int list version working for sparse pblms */
200   PetscCallAbort(PETSC_COMM_WORLD,gsi_via_bit_mask(gs));
201 
202   PetscCallAbort(PETSC_COMM_WORLD,MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group));
203   PetscCallAbort(PETSC_COMM_WORLD,MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm));
204   PetscCallAbort(PETSC_COMM_WORLD,MPI_Group_free(&PCTFS_gs_group));
205 
206   gs->PCTFS_gs_comm=PCTFS_gs_comm;
207 
208   return(gs);
209 }
210 
211 /******************************************************************************/
212 static PCTFS_gs_id *gsi_new(void)
213 {
214   PCTFS_gs_id    *gs;
215   gs   = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
216   PetscCallAbort(PETSC_COMM_WORLD,PetscMemzero(gs,sizeof(PCTFS_gs_id)));
217   return(gs);
218 }
219 
220 /******************************************************************************/
221 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
222 {
223   PetscInt       i, j, k, t2;
224   PetscInt       *companion, *elms, *unique, *iptr;
225   PetscInt       num_local=0, *num_to_reduce, **local_reduce;
226   PetscInt       oprs[]   = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
227   PetscInt       vals[PETSC_STATIC_ARRAY_LENGTH(oprs)-1];
228   PetscInt       work[PETSC_STATIC_ARRAY_LENGTH(oprs)-1];
229   PCTFS_gs_id    *gs;
230 
231   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
232   if (nel<0)    SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
233 
234   if (nel==0) PetscCallAbort(PETSC_COMM_WORLD,PetscInfo(0,"I don't have any elements!!!\n"));
235 
236   /* get space for gs template */
237   gs     = gsi_new();
238   gs->id = ++num_gs_ids;
239 
240   /* hmt 6.4.99                                            */
241   /* caller can set global ids that don't participate to 0 */
242   /* PCTFS_gs_init ignores all zeros in elm list                 */
243   /* negative global ids are still invalid                 */
244   for (i=j=0; i<nel; i++) {
245     if (in_elms[i]!=0) j++;
246   }
247 
248   k=nel; nel=j;
249 
250   /* copy over in_elms list and create inverse map */
251   elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
252   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
253 
254   for (i=j=0; i<k; i++) {
255     if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
256   }
257 
258   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
259 
260   /* pre-pass ... check to see if sorted */
261   elms[nel] = INT_MAX;
262   iptr      = elms;
263   unique    = elms+1;
264   j         =0;
265   while (*iptr!=INT_MAX) {
266     if (*iptr++>*unique++) { j=1; break; }
267   }
268 
269   /* set up inverse map */
270   if (j) {
271     PetscCallAbort(PETSC_COMM_WORLD,PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n"));
272     PetscCallAbort(PETSC_COMM_WORLD,PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER));
273   } else PetscCallAbort(PETSC_COMM_WORLD,PetscInfo(0,"gsi_check_args() :: elm list sorted!\n"));
274   elms[nel] = INT_MIN;
275 
276   /* first pass */
277   /* determine number of unique elements, check pd */
278   for (i=k=0; i<nel; i+=j) {
279     t2 = elms[i];
280     j  = ++i;
281 
282     /* clump 'em for now */
283     while (elms[j]==t2) j++;
284 
285     /* how many together and num local */
286     if (j-=i) { num_local++; k+=j; }
287   }
288 
289   /* how many unique elements? */
290   gs->repeats = k;
291   gs->nel     = nel-k;
292 
293   /* number of repeats? */
294   gs->num_local        = num_local;
295   num_local           += 2;
296   gs->local_reduce     = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
297   gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
298 
299   unique         = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
300   gs->elms       = unique;
301   gs->nel_total  = nel;
302   gs->local_elms = elms;
303   gs->companion  = companion;
304 
305   /* compess map as well as keep track of local ops */
306   for (num_local=i=j=0; i<gs->nel; i++) {
307     k            = j;
308     t2           = unique[i] = elms[j];
309     companion[i] = companion[j];
310 
311     while (elms[j]==t2) j++;
312 
313     if ((t2=(j-k))>1) {
314       /* number together */
315       num_to_reduce[num_local] = t2++;
316 
317       iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
318 
319       /* to use binary searching don't remap until we check intersection */
320       *iptr++ = i;
321 
322       /* note that we're skipping the first one */
323       while (++k<j) *(iptr++) = companion[k];
324       *iptr = -1;
325     }
326   }
327 
328   /* sentinel for ngh_buf */
329   unique[gs->nel]=INT_MAX;
330 
331   /* for two partition sort hack */
332   num_to_reduce[num_local]   = 0;
333   local_reduce[num_local]    = NULL;
334   num_to_reduce[++num_local] = 0;
335   local_reduce[num_local]    = NULL;
336 
337   /* load 'em up */
338   /* note one extra to hold NON_UNIFORM flag!!! */
339   vals[2] = vals[1] = vals[0] = nel;
340   if (gs->nel>0) {
341     vals[3] = unique[0];
342     vals[4] = unique[gs->nel-1];
343   } else {
344     vals[3] = INT_MAX;
345     vals[4] = INT_MIN;
346   }
347   vals[5] = level;
348   vals[6] = num_gs_ids;
349 
350   /* GLOBAL: send 'em out */
351   PetscCallAbort(PETSC_COMM_WORLD,PCTFS_giop(vals,work,PETSC_STATIC_ARRAY_LENGTH(oprs)-1,oprs));
352 
353   /* must be semi-pos def - only pairwise depends on this */
354   /* LATER - remove this restriction */
355   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
356   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
357 
358   gs->nel_min = vals[0];
359   gs->nel_max = vals[1];
360   gs->nel_sum = vals[2];
361   gs->gl_min  = vals[3];
362   gs->gl_max  = vals[4];
363   gs->negl    = vals[4]-vals[3]+1;
364 
365   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
366 
367   /* LATER :: add level == -1 -> program selects level */
368   if (vals[5]<0) vals[5]=0;
369   else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
370   gs->level = vals[5];
371 
372   return(gs);
373 }
374 
375 /******************************************************************************/
376 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
377 {
378   PetscInt       i, nel, *elms;
379   PetscInt       t1;
380   PetscInt       **reduce;
381   PetscInt       *map;
382 
383   PetscFunctionBegin;
384   /* totally local removes ... PCTFS_ct_bits == 0 */
385   get_ngh_buf(gs);
386 
387   if (gs->level) set_pairwise(gs);
388   if (gs->max_left_over) set_tree(gs);
389 
390   /* intersection local and pairwise/tree? */
391   gs->num_local_total      = gs->num_local;
392   gs->gop_local_reduce     = gs->local_reduce;
393   gs->num_gop_local_reduce = gs->num_local_reduce;
394 
395   map = gs->companion;
396 
397   /* is there any local compression */
398   if (!gs->num_local) {
399     gs->local_strength = NONE;
400     gs->num_local_gop  = 0;
401   } else {
402     /* ok find intersection */
403     map    = gs->companion;
404     reduce = gs->local_reduce;
405     for (i=0, t1=0; i<gs->num_local; i++, reduce++) {
406       if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
407         t1++;
408         PetscCheck(gs->num_local_reduce[i]>0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
409         gs->num_local_reduce[i] *= -1;
410       }
411       **reduce=map[**reduce];
412     }
413 
414     /* intersection is empty */
415     if (!t1) {
416       gs->local_strength = FULL;
417       gs->num_local_gop  = 0;
418     } else { /* intersection not empty */
419       gs->local_strength = PARTIAL;
420 
421       PetscCall(PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));
422 
423       gs->num_local_gop        = t1;
424       gs->num_local_total      =  gs->num_local;
425       gs->num_local           -= t1;
426       gs->gop_local_reduce     = gs->local_reduce;
427       gs->num_gop_local_reduce = gs->num_local_reduce;
428 
429       for (i=0; i<t1; i++) {
430         PetscCheck(gs->num_gop_local_reduce[i]<0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
431         gs->num_gop_local_reduce[i] *= -1;
432         gs->local_reduce++;
433         gs->num_local_reduce++;
434       }
435       gs->local_reduce++;
436       gs->num_local_reduce++;
437     }
438   }
439 
440   elms = gs->pw_elm_list;
441   nel  = gs->len_pw_list;
442   for (i=0; i<nel; i++) elms[i] = map[elms[i]];
443 
444   elms = gs->tree_map_in;
445   nel  = gs->tree_map_sz;
446   for (i=0; i<nel; i++) elms[i] = map[elms[i]];
447 
448   /* clean up */
449   free((void*) gs->local_elms);
450   free((void*) gs->companion);
451   free((void*) gs->elms);
452   free((void*) gs->ngh_buf);
453   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
454   PetscFunctionReturn(0);
455 }
456 
457 /******************************************************************************/
458 static PetscErrorCode place_in_tree(PetscInt elm)
459 {
460   PetscInt *tp, n;
461 
462   PetscFunctionBegin;
463   if (ntree==tree_buf_sz) {
464     if (tree_buf_sz) {
465       tp           = tree_buf;
466       n            = tree_buf_sz;
467       tree_buf_sz<<=1;
468       tree_buf     = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
469       PCTFS_ivec_copy(tree_buf,tp,n);
470       free(tp);
471     } else {
472       tree_buf_sz = TREE_BUF_SZ;
473       tree_buf    = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
474     }
475   }
476 
477   tree_buf[ntree++] = elm;
478   PetscFunctionReturn(0);
479 }
480 
481 /******************************************************************************/
482 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
483 {
484   PetscInt       i, j, npw=0, ntree_map=0;
485   PetscInt       p_mask_size, ngh_buf_size, buf_size;
486   PetscInt       *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
487   PetscInt       *ngh_buf, *buf1, *buf2;
488   PetscInt       offset, per_load, num_loads, or_ct, start, end;
489   PetscInt       *ptr1, *ptr2, i_start, negl, nel, *elms;
490   PetscInt       oper=GL_B_OR;
491   PetscInt       *ptr3, *t_mask, level, ct1, ct2;
492 
493   PetscFunctionBegin;
494   /* to make life easier */
495   nel   = gs->nel;
496   elms  = gs->elms;
497   level = gs->level;
498 
499   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
500   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
501   PetscCall(PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id));
502 
503   /* allocate space for masks and info bufs */
504   gs->nghs       = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
505   gs->pw_nghs    = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
506   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
507   t_mask         = (PetscInt*) malloc(p_mask_size);
508   gs->ngh_buf    = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
509 
510   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
511   /* had thought I could exploit rendezvous threshold */
512 
513   /* default is one pass */
514   per_load      = negl  = gs->negl;
515   gs->num_loads = num_loads = 1;
516   i             = p_mask_size*negl;
517 
518   /* possible overflow on buffer size */
519   /* overflow hack                    */
520   if (i<0) i=INT_MAX;
521 
522   buf_size = PetscMin(msg_buf,i);
523 
524   /* can we do it? */
525   PetscCheck(p_mask_size<=buf_size,PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d",p_mask_size,buf_size);
526 
527   /* get PCTFS_giop buf space ... make *only* one malloc */
528   buf1 = (PetscInt*) malloc(buf_size<<1);
529 
530   /* more than one gior exchange needed? */
531   if (buf_size!=i) {
532     per_load      = buf_size/p_mask_size;
533     buf_size      = per_load*p_mask_size;
534     gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
535   }
536 
537   /* convert buf sizes from #bytes to #ints - 32 bit only! */
538   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
539 
540   /* find PCTFS_giop work space */
541   buf2 = buf1+buf_size;
542 
543   /* hold #ints needed for processor masks */
544   gs->mask_sz=p_mask_size;
545 
546   /* init buffers */
547   PetscCall(PCTFS_ivec_zero(sh_proc_mask,p_mask_size));
548   PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size));
549   PetscCall(PCTFS_ivec_zero(ngh_buf,ngh_buf_size));
550 
551   /* HACK reset tree info */
552   tree_buf    = NULL;
553   tree_buf_sz = ntree = 0;
554 
555   /* ok do it */
556   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
557     /* identity for bitwise or is 000...000 */
558     PCTFS_ivec_zero(buf1,buf_size);
559 
560     /* load msg buffer */
561     for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
562       offset = (offset-start)*p_mask_size;
563       PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
564     }
565 
566     /* GLOBAL: pass buffer */
567     PetscCall(PCTFS_giop(buf1,buf2,buf_size,&oper));
568 
569     /* unload buffer into ngh_buf */
570     ptr2=(elms+i_start);
571     for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
572       /* I own it ... may have to pairwise it */
573       if (j==*ptr2) {
574         /* do i share it w/anyone? */
575         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
576         /* guess not */
577         if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }
578 
579         /* i do ... so keep info and turn off my bit */
580         PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
581         PetscCall(PCTFS_ivec_xor(ptr1,p_mask,p_mask_size));
582         PetscCall(PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size));
583 
584         /* is it to be done pairwise? */
585         if (--ct1<=level) {
586           npw++;
587 
588           /* turn on high bit to indicate pw need to process */
589           *ptr2++ |= TOP_BIT;
590           PetscCall(PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size));
591           ptr1    += p_mask_size;
592           continue;
593         }
594 
595         /* get set for next and note that I have a tree contribution */
596         /* could save exact elm index for tree here -> save a search */
597         ptr2++; ptr1+=p_mask_size; ntree_map++;
598       } else { /* i don't but still might be involved in tree */
599 
600         /* shared by how many? */
601         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
602 
603         /* none! */
604         if (ct1<2) continue;
605 
606         /* is it going to be done pairwise? but not by me of course!*/
607         if (--ct1<=level) continue;
608       }
609       /* LATER we're going to have to process it NOW */
610       /* nope ... tree it */
611       PetscCall(place_in_tree(j));
612     }
613   }
614 
615   free((void*)t_mask);
616   free((void*)buf1);
617 
618   gs->len_pw_list = npw;
619   gs->num_nghs    = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
620 
621   /* expand from bit mask list to int list and save ngh list */
622   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
623   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
624 
625   gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
626 
627   oper         = GL_MAX;
628   ct1          = gs->num_nghs;
629   PetscCall(PCTFS_giop(&ct1,&ct2,1,&oper));
630   gs->max_nghs = ct1;
631 
632   gs->tree_map_sz  = ntree_map;
633   gs->max_left_over=ntree;
634 
635   free((void*)p_mask);
636   free((void*)sh_proc_mask);
637   PetscFunctionReturn(0);
638 }
639 
640 /******************************************************************************/
641 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
642 {
643   PetscInt       i, j;
644   PetscInt       p_mask_size;
645   PetscInt       *p_mask, *sh_proc_mask, *tmp_proc_mask;
646   PetscInt       *ngh_buf, *buf2;
647   PetscInt       offset;
648   PetscInt       *msg_list, *msg_size, **msg_nodes, nprs;
649   PetscInt       *pairwise_elm_list, len_pair_list=0;
650   PetscInt       *iptr, t1, i_start, nel, *elms;
651   PetscInt       ct;
652 
653   PetscFunctionBegin;
654   /* to make life easier */
655   nel          = gs->nel;
656   elms         = gs->elms;
657   ngh_buf      = gs->ngh_buf;
658   sh_proc_mask = gs->pw_nghs;
659 
660   /* need a few temp masks */
661   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
662   p_mask        = (PetscInt*) malloc(p_mask_size);
663   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
664 
665   /* set mask to my PCTFS_my_id's bit mask */
666   PetscCall(PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id));
667 
668   p_mask_size /= sizeof(PetscInt);
669 
670   len_pair_list   = gs->len_pw_list;
671   gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
672 
673   /* how many processors (nghs) do we have to exchange with? */
674   nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
675 
676   /* allocate space for PCTFS_gs_gop() info */
677   gs->pair_list = msg_list  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
678   gs->msg_sizes = msg_size  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
679   gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));
680 
681   /* init msg_size list */
682   PetscCall(PCTFS_ivec_zero(msg_size,nprs));
683 
684   /* expand from bit mask list to int list */
685   PetscCall(PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list));
686 
687   /* keep list of elements being handled pairwise */
688   for (i=j=0; i<nel; i++) {
689     if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
690   }
691   pairwise_elm_list[j] = -1;
692 
693   gs->msg_ids_out       = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
694   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
695   gs->msg_ids_in        = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
696   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
697   gs->pw_vals           = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
698 
699   /* find who goes to each processor */
700   for (i_start=i=0; i<nprs; i++) {
701     /* processor i's mask */
702     PetscCall(PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]));
703 
704     /* det # going to processor i */
705     for (ct=j=0; j<len_pair_list; j++) {
706       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
707       PetscCall(PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size));
708       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
709     }
710     msg_size[i] = ct;
711     i_start     = PetscMax(i_start,ct);
712 
713     /*space to hold nodes in message to first neighbor */
714     msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
715 
716     for (j=0;j<len_pair_list;j++) {
717       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
718       PetscCall(PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size));
719       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
720     }
721     *iptr = -1;
722   }
723   msg_nodes[nprs] = NULL;
724 
725   j                  = gs->loc_node_pairs=i_start;
726   t1                 = GL_MAX;
727   PetscCall(PCTFS_giop(&i_start,&offset,1,&t1));
728   gs->max_node_pairs = i_start;
729 
730   i_start            = j;
731   t1                 = GL_MIN;
732   PetscCall(PCTFS_giop(&i_start,&offset,1,&t1));
733   gs->min_node_pairs = i_start;
734 
735   i_start            = j;
736   t1                 = GL_ADD;
737   PetscCall(PCTFS_giop(&i_start,&offset,1,&t1));
738   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
739 
740   i_start = nprs;
741   t1      = GL_MAX;
742   PCTFS_giop(&i_start,&offset,1,&t1);
743   gs->max_pairs = i_start;
744 
745   /* remap pairwise in tail of gsi_via_bit_mask() */
746   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
747   gs->out       = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
748   gs->in        = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
749 
750   /* reset malloc pool */
751   free((void*)p_mask);
752   free((void*)tmp_proc_mask);
753   PetscFunctionReturn(0);
754 }
755 
756 /* to do pruned tree just save ngh buf copy for each one and decode here!
757 ******************************************************************************/
758 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
759 {
760   PetscInt i, j, n, nel;
761   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
762 
763   PetscFunctionBegin;
764   /* local work ptrs */
765   elms = gs->elms;
766   nel  = gs->nel;
767 
768   /* how many via tree */
769   gs->tree_nel     = n = ntree;
770   gs->tree_elms    = tree_elms = iptr_in = tree_buf;
771   gs->tree_buf     = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
772   gs->tree_work    = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
773   j                = gs->tree_map_sz;
774   gs->tree_map_in  = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
775   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
776 
777   /* search the longer of the two lists */
778   /* note ... could save this info in get_ngh_buf and save searches */
779   if (n<=nel) {
780     /* bijective fct w/remap - search elm list */
781     for (i=0; i<n; i++) {
782       if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
783     }
784   } else {
785     for (i=0; i<nel; i++) {
786       if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
787     }
788   }
789 
790   /* sentinel */
791   *iptr_in = *iptr_out = -1;
792   PetscFunctionReturn(0);
793 }
794 
795 /******************************************************************************/
796 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs,  PetscScalar *vals)
797 {
798   PetscInt    *num, *map, **reduce;
799   PetscScalar tmp;
800 
801   PetscFunctionBegin;
802   num    = gs->num_gop_local_reduce;
803   reduce = gs->gop_local_reduce;
804   while ((map = *reduce++)) {
805     /* wall */
806     if (*num == 2) {
807       num++;
808       vals[map[1]] = vals[map[0]];
809     } else if (*num == 3) { /* corner shared by three elements */
810       num++;
811       vals[map[2]] = vals[map[1]] = vals[map[0]];
812     } else if (*num == 4) { /* corner shared by four elements */
813       num++;
814       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
815     } else { /* general case ... odd geoms ... 3D*/
816       num++;
817       tmp = *(vals + *map++);
818       while (*map >= 0) *(vals + *map++) = tmp;
819     }
820   }
821   PetscFunctionReturn(0);
822 }
823 
824 /******************************************************************************/
825 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
826 {
827   PetscInt    *num, *map, **reduce;
828   PetscScalar tmp;
829 
830   PetscFunctionBegin;
831   num    = gs->num_local_reduce;
832   reduce = gs->local_reduce;
833   while ((map = *reduce)) {
834     /* wall */
835     if (*num == 2) {
836       num++; reduce++;
837       vals[map[1]] = vals[map[0]] += vals[map[1]];
838     } else if (*num == 3) { /* corner shared by three elements */
839       num++; reduce++;
840       vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
841     } else if (*num == 4) { /* corner shared by four elements */
842       num++; reduce++;
843       vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
844     } else { /* general case ... odd geoms ... 3D*/
845       num++;
846       tmp = 0.0;
847       while (*map >= 0) tmp += *(vals + *map++);
848 
849       map = *reduce++;
850       while (*map >= 0) *(vals + *map++) = tmp;
851     }
852   }
853   PetscFunctionReturn(0);
854 }
855 
856 /******************************************************************************/
857 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
858 {
859   PetscInt    *num, *map, **reduce;
860   PetscScalar *base;
861 
862   PetscFunctionBegin;
863   num    = gs->num_gop_local_reduce;
864   reduce = gs->gop_local_reduce;
865   while ((map = *reduce++)) {
866     /* wall */
867     if (*num == 2) {
868       num++;
869       vals[map[0]] += vals[map[1]];
870     } else if (*num == 3) { /* corner shared by three elements */
871       num++;
872       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
873     } else if (*num == 4) { /* corner shared by four elements */
874       num++;
875       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
876     } else { /* general case ... odd geoms ... 3D*/
877       num++;
878       base = vals + *map++;
879       while (*map >= 0) *base += *(vals + *map++);
880     }
881   }
882   PetscFunctionReturn(0);
883 }
884 
885 /******************************************************************************/
886 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
887 {
888   PetscInt       i;
889 
890   PetscFunctionBegin;
891   PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
892   if (gs->nghs) free((void*) gs->nghs);
893   if (gs->pw_nghs) free((void*) gs->pw_nghs);
894 
895   /* tree */
896   if (gs->max_left_over) {
897     if (gs->tree_elms) free((void*) gs->tree_elms);
898     if (gs->tree_buf) free((void*) gs->tree_buf);
899     if (gs->tree_work) free((void*) gs->tree_work);
900     if (gs->tree_map_in) free((void*) gs->tree_map_in);
901     if (gs->tree_map_out) free((void*) gs->tree_map_out);
902   }
903 
904   /* pairwise info */
905   if (gs->num_pairs) {
906     /* should be NULL already */
907     if (gs->ngh_buf) free((void*) gs->ngh_buf);
908     if (gs->elms) free((void*) gs->elms);
909     if (gs->local_elms) free((void*) gs->local_elms);
910     if (gs->companion) free((void*) gs->companion);
911 
912     /* only set if pairwise */
913     if (gs->vals) free((void*) gs->vals);
914     if (gs->in) free((void*) gs->in);
915     if (gs->out) free((void*) gs->out);
916     if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
917     if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
918     if (gs->pw_vals) free((void*) gs->pw_vals);
919     if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
920     if (gs->node_list) {
921       for (i=0;i<gs->num_pairs;i++) {
922         if (gs->node_list[i])  {
923           free((void*) gs->node_list[i]);
924         }
925       }
926       free((void*) gs->node_list);
927     }
928     if (gs->msg_sizes) free((void*) gs->msg_sizes);
929     if (gs->pair_list) free((void*) gs->pair_list);
930   }
931 
932   /* local info */
933   if (gs->num_local_total>=0) {
934     for (i=0;i<gs->num_local_total+1;i++) {
935       if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
936     }
937   }
938 
939   /* if intersection tree/pairwise and local isn't empty */
940   if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
941   if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);
942 
943   free((void*) gs);
944   PetscFunctionReturn(0);
945 }
946 
947 /******************************************************************************/
948 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
949 {
950   PetscFunctionBegin;
951   switch (*op) {
952   case '+':
953     PCTFS_gs_gop_vec_plus(gs,vals,step);
954     break;
955   default:
956     PetscCall(PetscInfo(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]));
957     PetscCall(PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n"));
958     PCTFS_gs_gop_vec_plus(gs,vals,step);
959     break;
960   }
961   PetscFunctionReturn(0);
962 }
963 
964 /******************************************************************************/
965 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
966 {
967   PetscFunctionBegin;
968   PetscCheck(gs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
969 
970   /* local only operations!!! */
971   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);
972 
973   /* if intersection tree/pairwise and local isn't empty */
974   if (gs->num_local_gop) {
975     PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
976 
977     /* pairwise */
978     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
979 
980     /* tree */
981     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
982 
983     PCTFS_gs_gop_vec_local_out(gs,vals,step);
984   } else { /* if intersection tree/pairwise and local is empty */
985     /* pairwise */
986     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
987 
988     /* tree */
989     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
990   }
991   PetscFunctionReturn(0);
992 }
993 
994 /******************************************************************************/
995 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
996 {
997   PetscInt    *num, *map, **reduce;
998   PetscScalar *base;
999 
1000   PetscFunctionBegin;
1001   num    = gs->num_local_reduce;
1002   reduce = gs->local_reduce;
1003   while ((map = *reduce)) {
1004     base = vals + map[0] * step;
1005 
1006     /* wall */
1007     if (*num == 2) {
1008       num++; reduce++;
1009       PCTFS_rvec_add (base,vals+map[1]*step,step);
1010       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1011     } else if (*num == 3) { /* corner shared by three elements */
1012       num++; reduce++;
1013       PCTFS_rvec_add (base,vals+map[1]*step,step);
1014       PCTFS_rvec_add (base,vals+map[2]*step,step);
1015       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1016       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1017     } else if (*num == 4) { /* corner shared by four elements */
1018       num++; reduce++;
1019       PCTFS_rvec_add (base,vals+map[1]*step,step);
1020       PCTFS_rvec_add (base,vals+map[2]*step,step);
1021       PCTFS_rvec_add (base,vals+map[3]*step,step);
1022       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1023       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1024       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1025     } else { /* general case ... odd geoms ... 3D */
1026       num++;
1027       while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);
1028 
1029       map = *reduce;
1030       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1031 
1032       reduce++;
1033     }
1034   }
1035   PetscFunctionReturn(0);
1036 }
1037 
1038 /******************************************************************************/
1039 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1040 {
1041   PetscInt    *num, *map, **reduce;
1042   PetscScalar *base;
1043 
1044   PetscFunctionBegin;
1045   num    = gs->num_gop_local_reduce;
1046   reduce = gs->gop_local_reduce;
1047   while ((map = *reduce++)) {
1048     base = vals + map[0] * step;
1049 
1050     /* wall */
1051     if (*num == 2) {
1052       num++;
1053       PCTFS_rvec_add(base,vals+map[1]*step,step);
1054     } else if (*num == 3) { /* corner shared by three elements */
1055       num++;
1056       PCTFS_rvec_add(base,vals+map[1]*step,step);
1057       PCTFS_rvec_add(base,vals+map[2]*step,step);
1058     } else if (*num == 4) { /* corner shared by four elements */
1059       num++;
1060       PCTFS_rvec_add(base,vals+map[1]*step,step);
1061       PCTFS_rvec_add(base,vals+map[2]*step,step);
1062       PCTFS_rvec_add(base,vals+map[3]*step,step);
1063     } else { /* general case ... odd geoms ... 3D*/
1064       num++;
1065       while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1066     }
1067   }
1068   PetscFunctionReturn(0);
1069 }
1070 
1071 /******************************************************************************/
1072 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1073 {
1074   PetscInt    *num, *map, **reduce;
1075   PetscScalar *base;
1076 
1077   PetscFunctionBegin;
1078   num    = gs->num_gop_local_reduce;
1079   reduce = gs->gop_local_reduce;
1080   while ((map = *reduce++)) {
1081     base = vals + map[0] * step;
1082 
1083     /* wall */
1084     if (*num == 2) {
1085       num++;
1086       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1087     } else if (*num == 3) { /* corner shared by three elements */
1088       num++;
1089       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1090       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1091     } else if (*num == 4) { /* corner shared by four elements */
1092       num++;
1093       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1094       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1095       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1096     } else { /* general case ... odd geoms ... 3D*/
1097       num++;
1098       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1099     }
1100   }
1101   PetscFunctionReturn(0);
1102 }
1103 
1104 /******************************************************************************/
1105 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1106 {
1107   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1108   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1109   PetscInt       *pw, *list, *size, **nodes;
1110   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1111   MPI_Status     status;
1112   PetscBLASInt   i1 = 1,dstep;
1113 
1114   PetscFunctionBegin;
1115   /* strip and load s */
1116   msg_list    = list     = gs->pair_list;
1117   msg_size    = size     = gs->msg_sizes;
1118   msg_nodes   = nodes    = gs->node_list;
1119   iptr        = pw       = gs->pw_elm_list;
1120   dptr1       = dptr3    = gs->pw_vals;
1121   msg_ids_in  = ids_in   = gs->msg_ids_in;
1122   msg_ids_out = ids_out  = gs->msg_ids_out;
1123   dptr2                  = gs->out;
1124   in1=in2                = gs->in;
1125 
1126   /* post the receives */
1127   /*  msg_nodes=nodes; */
1128   do {
1129     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1130         second one *list and do list++ afterwards */
1131     PetscCallMPI(MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1132     list++;msg_ids_in++;
1133     in1 += *size++ *step;
1134   } while (*++msg_nodes);
1135   msg_nodes=nodes;
1136 
1137   /* load gs values into in out gs buffers */
1138   while (*iptr >= 0) {
1139     PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1140     dptr3+=step;
1141     iptr++;
1142   }
1143 
1144   /* load out buffers and post the sends */
1145   while ((iptr = *msg_nodes++)) {
1146     dptr3 = dptr2;
1147     while (*iptr >= 0) {
1148       PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1149       dptr2+=step;
1150       iptr++;
1151     }
1152     PetscCallMPI(MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1153     msg_size++; msg_list++;msg_ids_out++;
1154   }
1155 
1156   /* tree */
1157   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);
1158 
1159   /* process the received data */
1160   msg_nodes=nodes;
1161   while ((iptr = *nodes++)) {
1162     PetscScalar d1 = 1.0;
1163 
1164     /* Should I check the return value of MPI_Wait() or status? */
1165     /* Can this loop be replaced by a call to MPI_Waitall()? */
1166     PetscCallMPI(MPI_Wait(ids_in, &status));
1167     ids_in++;
1168     while (*iptr >= 0) {
1169       PetscCall(PetscBLASIntCast(step,&dstep));
1170       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1171       in2+=step;
1172       iptr++;
1173     }
1174   }
1175 
1176   /* replace vals */
1177   while (*pw >= 0) {
1178     PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1179     dptr1+=step;
1180     pw++;
1181   }
1182 
1183   /* clear isend message handles */
1184   /* This changed for clarity though it could be the same */
1185 
1186   /* Should I check the return value of MPI_Wait() or status? */
1187   /* Can this loop be replaced by a call to MPI_Waitall()? */
1188   while (*msg_nodes++) {
1189     PetscCallMPI(MPI_Wait(ids_out, &status));
1190     ids_out++;
1191   }
1192   PetscFunctionReturn(0);
1193 }
1194 
1195 /******************************************************************************/
1196 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1197 {
1198   PetscInt       size, *in, *out;
1199   PetscScalar    *buf, *work;
1200   PetscInt       op[] = {GL_ADD,0};
1201   PetscBLASInt   i1   = 1;
1202   PetscBLASInt   dstep;
1203 
1204   PetscFunctionBegin;
1205   /* copy over to local variables */
1206   in   = gs->tree_map_in;
1207   out  = gs->tree_map_out;
1208   buf  = gs->tree_buf;
1209   work = gs->tree_work;
1210   size = gs->tree_nel*step;
1211 
1212   /* zero out collection buffer */
1213   PCTFS_rvec_zero(buf,size);
1214 
1215   /* copy over my contributions */
1216   while (*in >= 0) {
1217     PetscCall(PetscBLASIntCast(step,&dstep));
1218     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1219   }
1220 
1221   /* perform fan in/out on full buffer */
1222   /* must change PCTFS_grop to handle the blas */
1223   PCTFS_grop(buf,work,size,op);
1224 
1225   /* reset */
1226   in  = gs->tree_map_in;
1227   out = gs->tree_map_out;
1228 
1229   /* get the portion of the results I need */
1230   while (*in >= 0) {
1231     PetscCall(PetscBLASIntCast(step,&dstep));
1232     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1233   }
1234   PetscFunctionReturn(0);
1235 }
1236 
1237 /******************************************************************************/
1238 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1239 {
1240   PetscFunctionBegin;
1241   switch (*op) {
1242   case '+':
1243     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1244     break;
1245   default:
1246     PetscCall(PetscInfo(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]));
1247     PetscCall(PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n"));
1248     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1249     break;
1250   }
1251   PetscFunctionReturn(0);
1252 }
1253 
1254 /******************************************************************************/
1255 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1256 {
1257   PetscFunctionBegin;
1258   /* if there's nothing to do return */
1259   if (dim<=0) PetscFunctionReturn(0);
1260 
1261   /* can't do more dimensions then exist */
1262   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1263 
1264   /* local only operations!!! */
1265   if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);
1266 
1267   /* if intersection tree/pairwise and local isn't empty */
1268   if (gs->num_local_gop) {
1269     PCTFS_gs_gop_local_in_plus(gs,vals);
1270 
1271     /* pairwise will do tree inside ... */
1272     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1273     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1274 
1275     PCTFS_gs_gop_local_out(gs,vals);
1276   } else { /* if intersection tree/pairwise and local is empty */
1277     /* pairwise will do tree inside */
1278     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1279     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1280   }
1281   PetscFunctionReturn(0);
1282 }
1283 
1284 /******************************************************************************/
1285 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1286 {
1287   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1288   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1289   PetscInt       *pw, *list, *size, **nodes;
1290   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1291   MPI_Status     status;
1292   PetscInt       i, mask=1;
1293 
1294   PetscFunctionBegin;
1295   for (i=1; i<dim; i++) { mask<<=1; mask++; }
1296 
1297   /* strip and load s */
1298   msg_list    = list     = gs->pair_list;
1299   msg_size    = size     = gs->msg_sizes;
1300   msg_nodes   = nodes    = gs->node_list;
1301   iptr        = pw       = gs->pw_elm_list;
1302   dptr1       = dptr3    = gs->pw_vals;
1303   msg_ids_in  = ids_in   = gs->msg_ids_in;
1304   msg_ids_out = ids_out  = gs->msg_ids_out;
1305   dptr2       = gs->out;
1306   in1         = in2      = gs->in;
1307 
1308   /* post the receives */
1309   /*  msg_nodes=nodes; */
1310   do {
1311     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1312         second one *list and do list++ afterwards */
1313     if ((PCTFS_my_id|mask)==(*list|mask)) {
1314       PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1315       list++; msg_ids_in++;in1 += *size++;
1316     } else { list++; size++; }
1317   } while (*++msg_nodes);
1318 
1319   /* load gs values into in out gs buffers */
1320   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1321 
1322   /* load out buffers and post the sends */
1323   msg_nodes=nodes;
1324   list     = msg_list;
1325   while ((iptr = *msg_nodes++)) {
1326     if ((PCTFS_my_id|mask)==(*list|mask)) {
1327       dptr3 = dptr2;
1328       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1329       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1330       /* is msg_ids_out++ correct? */
1331       PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1332       msg_size++;list++;msg_ids_out++;
1333     } else {list++; msg_size++;}
1334   }
1335 
1336   /* do the tree while we're waiting */
1337   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);
1338 
1339   /* process the received data */
1340   msg_nodes=nodes;
1341   list     = msg_list;
1342   while ((iptr = *nodes++)) {
1343     if ((PCTFS_my_id|mask)==(*list|mask)) {
1344       /* Should I check the return value of MPI_Wait() or status? */
1345       /* Can this loop be replaced by a call to MPI_Waitall()? */
1346       PetscCallMPI(MPI_Wait(ids_in, &status));
1347       ids_in++;
1348       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1349     }
1350     list++;
1351   }
1352 
1353   /* replace vals */
1354   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1355 
1356   /* clear isend message handles */
1357   /* This changed for clarity though it could be the same */
1358   while (*msg_nodes++) {
1359     if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1360       /* Should I check the return value of MPI_Wait() or status? */
1361       /* Can this loop be replaced by a call to MPI_Waitall()? */
1362       PetscCallMPI(MPI_Wait(ids_out, &status));
1363       ids_out++;
1364     }
1365     msg_list++;
1366   }
1367   PetscFunctionReturn(0);
1368 }
1369 
1370 /******************************************************************************/
1371 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1372 {
1373   PetscInt    size;
1374   PetscInt    *in, *out;
1375   PetscScalar *buf, *work;
1376   PetscInt    op[] = {GL_ADD,0};
1377 
1378   PetscFunctionBegin;
1379   in   = gs->tree_map_in;
1380   out  = gs->tree_map_out;
1381   buf  = gs->tree_buf;
1382   work = gs->tree_work;
1383   size = gs->tree_nel;
1384 
1385   PCTFS_rvec_zero(buf,size);
1386 
1387   while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1388 
1389   in  = gs->tree_map_in;
1390   out = gs->tree_map_out;
1391 
1392   PCTFS_grop_hc(buf,work,size,op,dim);
1393 
1394   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1395   PetscFunctionReturn(0);
1396 }
1397