xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision a6dfd86ebdf75fa024070af26d51b62fd16650a3)
1 
2 /***********************************gs.c***************************************
3 
4 Author: Henry M. Tufo III
5 
6 e-mail: hmt@cs.brown.edu
7 
8 snail-mail:
9 Division of Applied Mathematics
10 Brown University
11 Providence, RI 02912
12 
13 Last Modification:
14 6.21.97
15 ************************************gs.c**************************************/
16 
17 /***********************************gs.c***************************************
18 File Description:
19 -----------------
20 
21 ************************************gs.c**************************************/
22 
23 #include <../src/ksp/pc/impls/tfs/tfs.h>
24 
25 /* default length of number of items via tree - doubles if exceeded */
26 #define TREE_BUF_SZ 2048;
27 #define GS_VEC_SZ   1
28 
29 
30 
31 /***********************************gs.c***************************************
32 Type: struct gather_scatter_id
33 ------------------------------
34 
35 ************************************gs.c**************************************/
36 typedef struct gather_scatter_id {
37   PetscInt id;
38   PetscInt nel_min;
39   PetscInt nel_max;
40   PetscInt nel_sum;
41   PetscInt negl;
42   PetscInt gl_max;
43   PetscInt gl_min;
44   PetscInt repeats;
45   PetscInt ordered;
46   PetscInt positive;
47   PetscScalar *vals;
48 
49   /* bit mask info */
50   PetscInt *my_proc_mask;
51   PetscInt mask_sz;
52   PetscInt *ngh_buf;
53   PetscInt ngh_buf_sz;
54   PetscInt *nghs;
55   PetscInt num_nghs;
56   PetscInt max_nghs;
57   PetscInt *pw_nghs;
58   PetscInt num_pw_nghs;
59   PetscInt *tree_nghs;
60   PetscInt num_tree_nghs;
61 
62   PetscInt num_loads;
63 
64   /* repeats == true -> local info */
65   PetscInt nel;         /* number of unique elememts */
66   PetscInt *elms;       /* of size nel */
67   PetscInt nel_total;
68   PetscInt *local_elms; /* of size nel_total */
69   PetscInt *companion;  /* of size nel_total */
70 
71   /* local info */
72   PetscInt num_local_total;
73   PetscInt local_strength;
74   PetscInt num_local;
75   PetscInt *num_local_reduce;
76   PetscInt **local_reduce;
77   PetscInt num_local_gop;
78   PetscInt *num_gop_local_reduce;
79   PetscInt **gop_local_reduce;
80 
81   /* pairwise info */
82   PetscInt level;
83   PetscInt num_pairs;
84   PetscInt max_pairs;
85   PetscInt loc_node_pairs;
86   PetscInt max_node_pairs;
87   PetscInt min_node_pairs;
88   PetscInt avg_node_pairs;
89   PetscInt *pair_list;
90   PetscInt *msg_sizes;
91   PetscInt **node_list;
92   PetscInt len_pw_list;
93   PetscInt *pw_elm_list;
94   PetscScalar *pw_vals;
95 
96   MPI_Request *msg_ids_in;
97   MPI_Request *msg_ids_out;
98 
99   PetscScalar *out;
100   PetscScalar *in;
101   PetscInt msg_total;
102 
103   /* tree - crystal accumulator info */
104   PetscInt max_left_over;
105   PetscInt *pre;
106   PetscInt *in_num;
107   PetscInt *out_num;
108   PetscInt **in_list;
109   PetscInt **out_list;
110 
111   /* new tree work*/
112   PetscInt  tree_nel;
113   PetscInt *tree_elms;
114   PetscScalar *tree_buf;
115   PetscScalar *tree_work;
116 
117   PetscInt  tree_map_sz;
118   PetscInt *tree_map_in;
119   PetscInt *tree_map_out;
120 
121   /* current memory status */
122   PetscInt gl_bss_min;
123   PetscInt gl_perm_min;
124 
125   /* max segment size for PCTFS_gs_gop_vec() */
126   PetscInt vec_sz;
127 
128   /* hack to make paul happy */
129   MPI_Comm PCTFS_gs_comm;
130 
131 } PCTFS_gs_id;
132 
133 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
135 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
136 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
137 static PCTFS_gs_id * gsi_new(void);
138 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
139 
140 /* same for all but vector flavor */
141 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
142 /* vector flavor */
143 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
144 
145 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
147 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
150 
151 
152 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
153 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
154 
155 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
158 
159 /* global vars */
160 /* from comm.c module */
161 
162 static PetscInt num_gs_ids = 0;
163 
164 /* should make this dynamic ... later */
165 static PetscInt msg_buf=MAX_MSG_BUF;
166 static PetscInt vec_sz=GS_VEC_SZ;
167 static PetscInt *tree_buf=NULL;
168 static PetscInt tree_buf_sz=0;
169 static PetscInt ntree=0;
170 
171 /***************************************************************************/
172 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
173 {
174   PetscFunctionBegin;
175   vec_sz = size;
176   PetscFunctionReturn(0);
177 }
178 
179 /******************************************************************************/
180 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
181 {
182   PetscFunctionBegin;
183   msg_buf = buf_size;
184   PetscFunctionReturn(0);
185 }
186 
187 /******************************************************************************/
188 PCTFS_gs_id *PCTFS_gs_init( PetscInt *elms, PetscInt nel, PetscInt level)
189 {
190   PCTFS_gs_id *gs;
191   MPI_Group PCTFS_gs_group;
192   MPI_Comm  PCTFS_gs_comm;
193   PetscErrorCode ierr;
194 
195   PetscFunctionBegin;
196   /* ensure that communication package has been initialized */
197   PCTFS_comm_init();
198 
199 
200   /* determines if we have enough dynamic/semi-static memory */
201   /* checks input, allocs and sets gd_id template            */
202   gs = gsi_check_args(elms,nel,level);
203 
204   /* only bit mask version up and working for the moment    */
205   /* LATER :: get int list version working for sparse pblms */
206   ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
207 
208 
209   ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210   ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211   gs->PCTFS_gs_comm=PCTFS_gs_comm;
212 
213   return(gs);
214 }
215 
216 /******************************************************************************/
217 static PCTFS_gs_id *gsi_new(void)
218 {
219   PetscErrorCode ierr;
220   PCTFS_gs_id *gs;
221   gs = (PCTFS_gs_id *) malloc(sizeof(PCTFS_gs_id));
222   ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
223   return(gs);
224 }
225 
226 /******************************************************************************/
227 static PCTFS_gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
228 {
229    PetscInt i, j, k, t2;
230   PetscInt *companion, *elms, *unique, *iptr;
231   PetscInt num_local=0, *num_to_reduce, **local_reduce;
232   PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
233   PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
234   PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
235   PCTFS_gs_id *gs;
236   PetscErrorCode ierr;
237 
238 
239   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
240   if (nel<0)    SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
241 
242   if (nel==0) { ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
243 
244   /* get space for gs template */
245   gs = gsi_new();
246   gs->id = ++num_gs_ids;
247 
248   /* hmt 6.4.99                                            */
249   /* caller can set global ids that don't participate to 0 */
250   /* PCTFS_gs_init ignores all zeros in elm list                 */
251   /* negative global ids are still invalid                 */
252   for (i=j=0;i<nel;i++) { if (in_elms[i]!=0) {j++;} }
253 
254   k=nel; nel=j;
255 
256   /* copy over in_elms list and create inverse map */
257   elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
258   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
259 
260   for (i=j=0;i<k;i++) {
261     if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
262   }
263 
264   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
265 
266   /* pre-pass ... check to see if sorted */
267   elms[nel] = INT_MAX;
268   iptr = elms;
269   unique = elms+1;
270   j=0;
271   while (*iptr!=INT_MAX) {
272     if (*iptr++>*unique++) { j=1; break; }
273   }
274 
275   /* set up inverse map */
276   if (j) {
277     ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
278     ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
279   }
280   else { ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
281   elms[nel] = INT_MIN;
282 
283   /* first pass */
284   /* determine number of unique elements, check pd */
285   for (i=k=0;i<nel;i+=j) {
286     t2 = elms[i];
287     j=++i;
288 
289     /* clump 'em for now */
290     while (elms[j]==t2) { j++; }
291 
292     /* how many together and num local */
293     if (j-=i) { num_local++; k+=j; }
294   }
295 
296   /* how many unique elements? */
297   gs->repeats=k;
298   gs->nel = nel-k;
299 
300 
301   /* number of repeats? */
302   gs->num_local = num_local;
303   num_local+=2;
304   gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*));
305   gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
306 
307   unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
308   gs->elms = unique;
309   gs->nel_total = nel;
310   gs->local_elms = elms;
311   gs->companion = companion;
312 
313   /* compess map as well as keep track of local ops */
314   for (num_local=i=j=0;i<gs->nel;i++) {
315     k=j;
316     t2 = unique[i] = elms[j];
317     companion[i] = companion[j];
318 
319     while (elms[j]==t2) { j++; }
320 
321     if ((t2=(j-k))>1) {
322       /* number together */
323       num_to_reduce[num_local] = t2++;
324       iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
325 
326       /* to use binary searching don't remap until we check intersection */
327       *iptr++ = i;
328 
329       /* note that we're skipping the first one */
330       while (++k<j) { *(iptr++) = companion[k]; }
331       *iptr = -1;
332     }
333   }
334 
335   /* sentinel for ngh_buf */
336   unique[gs->nel]=INT_MAX;
337 
338   /* for two partition sort hack */
339   num_to_reduce[num_local] = 0;
340   local_reduce[num_local] = NULL;
341   num_to_reduce[++num_local] = 0;
342   local_reduce[num_local] = NULL;
343 
344   /* load 'em up */
345   /* note one extra to hold NON_UNIFORM flag!!! */
346   vals[2] = vals[1] = vals[0] = nel;
347   if (gs->nel>0) {
348     vals[3] = unique[0];
349     vals[4] = unique[gs->nel-1];
350   } else {
351     vals[3] = INT_MAX;
352     vals[4] = INT_MIN;
353   }
354   vals[5] = level;
355   vals[6] = num_gs_ids;
356 
357   /* GLOBAL: send 'em out */
358   ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
359 
360   /* must be semi-pos def - only pairwise depends on this */
361   /* LATER - remove this restriction */
362   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
363   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
364 
365   gs->nel_min = vals[0];
366   gs->nel_max = vals[1];
367   gs->nel_sum = vals[2];
368   gs->gl_min  = vals[3];
369   gs->gl_max  = vals[4];
370   gs->negl    = vals[4]-vals[3]+1;
371 
372   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
373 
374   /* LATER :: add level == -1 -> program selects level */
375   if (vals[5]<0) { vals[5]=0; }
376   else if (vals[5]>PCTFS_num_nodes) { vals[5]=PCTFS_num_nodes; }
377   gs->level = vals[5];
378 
379   return(gs);
380 }
381 
382 /******************************************************************************/
383 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
384 {
385   PetscInt i, nel, *elms;
386   PetscInt t1;
387   PetscInt **reduce;
388   PetscInt *map;
389   PetscErrorCode ierr;
390 
391   PetscFunctionBegin;
392   /* totally local removes ... PCTFS_ct_bits == 0 */
393   get_ngh_buf(gs);
394 
395   if (gs->level) set_pairwise(gs);
396   if (gs->max_left_over) set_tree(gs);
397 
398   /* intersection local and pairwise/tree? */
399   gs->num_local_total = gs->num_local;
400   gs->gop_local_reduce = gs->local_reduce;
401   gs->num_gop_local_reduce = gs->num_local_reduce;
402 
403   map = gs->companion;
404 
405   /* is there any local compression */
406   if (!gs->num_local) {
407     gs->local_strength = NONE;
408     gs->num_local_gop = 0;
409   } else {
410     /* ok find intersection */
411     map = gs->companion;
412     reduce = gs->local_reduce;
413     for (i=0, t1=0; i<gs->num_local; i++, reduce++)
414     {
415       if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
416           ||
417           PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
418         t1++;
419         if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
420         gs->num_local_reduce[i] *= -1;
421       }
422       **reduce=map[**reduce];
423     }
424 
425     /* intersection is empty */
426     if (!t1) {
427       gs->local_strength = FULL;
428       gs->num_local_gop = 0;
429     } else { /* intersection not empty */
430       gs->local_strength = PARTIAL;
431       ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr);
432 
433       gs->num_local_gop = t1;
434       gs->num_local_total =  gs->num_local;
435       gs->num_local    -= t1;
436       gs->gop_local_reduce = gs->local_reduce;
437       gs->num_gop_local_reduce = gs->num_local_reduce;
438 
439       for (i=0; i<t1; i++)
440         {
441           if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
442           gs->num_gop_local_reduce[i] *= -1;
443           gs->local_reduce++;
444           gs->num_local_reduce++;
445         }
446       gs->local_reduce++;
447       gs->num_local_reduce++;
448     }
449   }
450 
451   elms = gs->pw_elm_list;
452   nel  = gs->len_pw_list;
453   for (i=0; i<nel; i++) { elms[i] = map[elms[i]]; }
454 
455   elms = gs->tree_map_in;
456   nel  = gs->tree_map_sz;
457   for (i=0; i<nel; i++) { elms[i] = map[elms[i]]; }
458 
459   /* clean up */
460   free((void*) gs->local_elms);
461   free((void*) gs->companion);
462   free((void*) gs->elms);
463   free((void*) gs->ngh_buf);
464   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
465   PetscFunctionReturn(0);
466 }
467 
468 /******************************************************************************/
469 static PetscErrorCode place_in_tree( PetscInt elm)
470 {
471   PetscInt *tp, n;
472 
473   PetscFunctionBegin;
474   if (ntree==tree_buf_sz)
475   {
476     if (tree_buf_sz) {
477       tp = tree_buf;
478       n = tree_buf_sz;
479       tree_buf_sz<<=1;
480       tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
481       PCTFS_ivec_copy(tree_buf,tp,n);
482       free(tp);
483     } else {
484       tree_buf_sz = TREE_BUF_SZ;
485       tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
486     }
487   }
488 
489   tree_buf[ntree++] = elm;
490   PetscFunctionReturn(0);
491 }
492 
493 /******************************************************************************/
494 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
495 {
496   PetscInt i, j, npw=0, ntree_map=0;
497   PetscInt p_mask_size, ngh_buf_size, buf_size;
498   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
499   PetscInt *ngh_buf, *buf1, *buf2;
500   PetscInt offset, per_load, num_loads, or_ct, start, end;
501   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
502   PetscInt oper=GL_B_OR;
503   PetscInt *ptr3, *t_mask, level, ct1, ct2;
504   PetscErrorCode ierr;
505 
506   PetscFunctionBegin;
507   /* to make life easier */
508   nel   = gs->nel;
509   elms  = gs->elms;
510   level = gs->level;
511 
512   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
513   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
514   ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
515 
516   /* allocate space for masks and info bufs */
517   gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
518   gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
519   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
520   t_mask = (PetscInt*) malloc(p_mask_size);
521   gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
522 
523   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
524   /* had thought I could exploit rendezvous threshold */
525 
526   /* default is one pass */
527   per_load = negl  = gs->negl;
528   gs->num_loads = num_loads = 1;
529   i=p_mask_size*negl;
530 
531   /* possible overflow on buffer size */
532   /* overflow hack                    */
533   if (i<0) {i=INT_MAX;}
534 
535   buf_size = PetscMin(msg_buf,i);
536 
537   /* can we do it? */
538   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
539 
540   /* get PCTFS_giop buf space ... make *only* one malloc */
541   buf1 = (PetscInt*) malloc(buf_size<<1);
542 
543   /* more than one gior exchange needed? */
544   if (buf_size!=i) {
545     per_load = buf_size/p_mask_size;
546     buf_size = per_load*p_mask_size;
547     gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
548   }
549 
550 
551   /* convert buf sizes from #bytes to #ints - 32 bit only! */
552   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
553 
554   /* find PCTFS_giop work space */
555   buf2 = buf1+buf_size;
556 
557   /* hold #ints needed for processor masks */
558   gs->mask_sz=p_mask_size;
559 
560   /* init buffers */
561   ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr);
562   ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr);
563   ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr);
564 
565   /* HACK reset tree info */
566   tree_buf=NULL;
567   tree_buf_sz=ntree=0;
568 
569   /* ok do it */
570   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
571     /* identity for bitwise or is 000...000 */
572     PCTFS_ivec_zero(buf1,buf_size);
573 
574     /* load msg buffer */
575     for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
576       offset = (offset-start)*p_mask_size;
577       PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
578     }
579 
580     /* GLOBAL: pass buffer */
581     ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr);
582 
583 
584     /* unload buffer into ngh_buf */
585     ptr2=(elms+i_start);
586     for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
587       /* I own it ... may have to pairwise it */
588       if (j==*ptr2) {
589         /* do i share it w/anyone? */
590         ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
591         /* guess not */
592         if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }
593 
594         /* i do ... so keep info and turn off my bit */
595         PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
596         ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr);
597         ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
598 
599         /* is it to be done pairwise? */
600         if (--ct1<=level) {
601           npw++;
602 
603           /* turn on high bit to indicate pw need to process */
604           *ptr2++ |= TOP_BIT;
605           ierr = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
606           ptr1+=p_mask_size;
607           continue;
608         }
609 
610         /* get set for next and note that I have a tree contribution */
611         /* could save exact elm index for tree here -> save a search */
612         ptr2++; ptr1+=p_mask_size; ntree_map++;
613       } else { /* i don't but still might be involved in tree */
614 
615         /* shared by how many? */
616         ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
617 
618         /* none! */
619         if (ct1<2) continue;
620 
621         /* is it going to be done pairwise? but not by me of course!*/
622         if (--ct1<=level) continue;
623       }
624       /* LATER we're going to have to process it NOW */
625       /* nope ... tree it */
626       ierr = place_in_tree(j);CHKERRQ(ierr);
627     }
628   }
629 
630   free((void*)t_mask);
631   free((void*)buf1);
632 
633   gs->len_pw_list=npw;
634   gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
635 
636   /* expand from bit mask list to int list and save ngh list */
637   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
638   PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
639 
640   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
641 
642   oper = GL_MAX;
643   ct1 = gs->num_nghs;
644   ierr = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr);
645   gs->max_nghs = ct1;
646 
647   gs->tree_map_sz  = ntree_map;
648   gs->max_left_over=ntree;
649 
650   free((void*)p_mask);
651   free((void*)sh_proc_mask);
652   PetscFunctionReturn(0);
653 }
654 
655 /******************************************************************************/
656 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
657 {
658    PetscInt i, j;
659   PetscInt p_mask_size;
660   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
661   PetscInt *ngh_buf, *buf2;
662   PetscInt offset;
663   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
664   PetscInt *pairwise_elm_list, len_pair_list=0;
665   PetscInt *iptr, t1, i_start, nel, *elms;
666   PetscInt ct;
667   PetscErrorCode ierr;
668 
669   PetscFunctionBegin;
670   /* to make life easier */
671   nel  = gs->nel;
672   elms = gs->elms;
673   ngh_buf = gs->ngh_buf;
674   sh_proc_mask  = gs->pw_nghs;
675 
676   /* need a few temp masks */
677   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
678   p_mask        = (PetscInt*) malloc(p_mask_size);
679   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
680 
681   /* set mask to my PCTFS_my_id's bit mask */
682   ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
683 
684   p_mask_size /= sizeof(PetscInt);
685 
686   len_pair_list=gs->len_pw_list;
687   gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
688 
689   /* how many processors (nghs) do we have to exchange with? */
690   nprs=gs->num_pairs=PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
691 
692 
693   /* allocate space for PCTFS_gs_gop() info */
694   gs->pair_list = msg_list = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
695   gs->msg_sizes = msg_size  = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
696   gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));
697 
698   /* init msg_size list */
699   ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr);
700 
701   /* expand from bit mask list to int list */
702   ierr = PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr);
703 
704   /* keep list of elements being handled pairwise */
705   for (i=j=0;i<nel;i++) {
706     if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
707   }
708   pairwise_elm_list[j] = -1;
709 
710   gs->msg_ids_out = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
711   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
712   gs->msg_ids_in = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
713   gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
714   gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
715 
716   /* find who goes to each processor */
717   for (i_start=i=0;i<nprs;i++) {
718     /* processor i's mask */
719     ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr);
720 
721     /* det # going to processor i */
722     for (ct=j=0;j<len_pair_list;j++) {
723       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
724       ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
725       if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) { ct++; }
726     }
727     msg_size[i] = ct;
728     i_start = PetscMax(i_start,ct);
729 
730     /*space to hold nodes in message to first neighbor */
731     msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
732 
733     for (j=0;j<len_pair_list;j++) {
734       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
735       ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
736       if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) { *iptr++ = j; }
737     }
738     *iptr = -1;
739   }
740   msg_nodes[nprs] = NULL;
741 
742   j=gs->loc_node_pairs=i_start;
743   t1 = GL_MAX;
744   ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
745   gs->max_node_pairs = i_start;
746 
747   i_start=j;
748   t1 = GL_MIN;
749   ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
750   gs->min_node_pairs = i_start;
751 
752   i_start=j;
753   t1 = GL_ADD;
754   ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
755   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
756 
757   i_start=nprs;
758   t1 = GL_MAX;
759   PCTFS_giop(&i_start,&offset,1,&t1);
760   gs->max_pairs = i_start;
761 
762 
763   /* remap pairwise in tail of gsi_via_bit_mask() */
764   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
765   gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
766   gs->in  = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
767 
768   /* reset malloc pool */
769   free((void*)p_mask);
770   free((void*)tmp_proc_mask);
771   PetscFunctionReturn(0);
772 }
773 
774 /* to do pruned tree just save ngh buf copy for each one and decode here!
775 ******************************************************************************/
776 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
777 {
778   PetscInt i, j, n, nel;
779   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
780 
781   PetscFunctionBegin;
782   /* local work ptrs */
783   elms = gs->elms;
784   nel     = gs->nel;
785 
786   /* how many via tree */
787   gs->tree_nel  = n = ntree;
788   gs->tree_elms = tree_elms = iptr_in = tree_buf;
789   gs->tree_buf  = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
790   gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
791   j=gs->tree_map_sz;
792   gs->tree_map_in = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
793   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
794 
795   /* search the longer of the two lists */
796   /* note ... could save this info in get_ngh_buf and save searches */
797   if (n<=nel) {
798     /* bijective fct w/remap - search elm list */
799     for (i=0; i<n; i++) {
800       if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
801     }
802   } else {
803     for (i=0; i<nel; i++) {
804       if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
805     }
806   }
807 
808   /* sentinel */
809   *iptr_in = *iptr_out = -1;
810   PetscFunctionReturn(0);
811 }
812 
813 /******************************************************************************/
814 static PetscErrorCode PCTFS_gs_gop_local_out( PCTFS_gs_id *gs,  PetscScalar *vals)
815 {
816   PetscInt *num, *map, **reduce;
817   PetscScalar tmp;
818 
819   PetscFunctionBegin;
820   num    = gs->num_gop_local_reduce;
821   reduce = gs->gop_local_reduce;
822   while ((map = *reduce++)) {
823     /* wall */
824     if (*num == 2) {
825       num ++;
826       vals[map[1]] = vals[map[0]];
827     } else if (*num == 3) { /* corner shared by three elements */
828       num ++;
829       vals[map[2]] = vals[map[1]] = vals[map[0]];
830     } else if (*num == 4) { /* corner shared by four elements */
831       num ++;
832       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
833     } else { /* general case ... odd geoms ... 3D*/
834       num++;
835       tmp = *(vals + *map++);
836       while (*map >= 0) { *(vals + *map++) = tmp; }
837     }
838   }
839   PetscFunctionReturn(0);
840 }
841 
842 /******************************************************************************/
843 static PetscErrorCode PCTFS_gs_gop_local_plus( PCTFS_gs_id *gs,  PetscScalar *vals)
844 {
845   PetscInt *num, *map, **reduce;
846   PetscScalar tmp;
847 
848   PetscFunctionBegin;
849   num    = gs->num_local_reduce;
850   reduce = gs->local_reduce;
851   while ((map = *reduce)) {
852     /* wall */
853     if (*num == 2) {
854       num ++; reduce++;
855       vals[map[1]] = vals[map[0]] += vals[map[1]];
856     } else if (*num == 3) { /* corner shared by three elements */
857       num ++; reduce++;
858       vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
859     } else if (*num == 4) { /* corner shared by four elements */
860       num ++; reduce++;
861       vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
862                               (vals[map[1]] + vals[map[2]] + vals[map[3]]);
863     } else { /* general case ... odd geoms ... 3D*/
864       num ++;
865       tmp = 0.0;
866       while (*map >= 0) {tmp += *(vals + *map++);}
867 
868       map = *reduce++;
869       while (*map >= 0) {*(vals + *map++) = tmp;}
870     }
871   }
872   PetscFunctionReturn(0);
873 }
874 
875 /******************************************************************************/
876 static PetscErrorCode PCTFS_gs_gop_local_in_plus( PCTFS_gs_id *gs,  PetscScalar *vals)
877 {
878   PetscInt *num, *map, **reduce;
879   PetscScalar *base;
880 
881   PetscFunctionBegin;
882   num    = gs->num_gop_local_reduce;
883   reduce = gs->gop_local_reduce;
884   while ((map = *reduce++)) {
885     /* wall */
886     if (*num == 2) {
887       num ++;
888       vals[map[0]] += vals[map[1]];
889     } else if (*num == 3) { /* corner shared by three elements */
890       num ++;
891       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
892     } else if (*num == 4) { /* corner shared by four elements */
893       num ++;
894       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
895     } else { /* general case ... odd geoms ... 3D*/
896       num++;
897       base = vals + *map++;
898       while (*map >= 0) {*base += *(vals + *map++);}
899     }
900   }
901   PetscFunctionReturn(0);
902 }
903 
904 /******************************************************************************/
905 PetscErrorCode PCTFS_gs_free( PCTFS_gs_id *gs)
906 {
907   PetscInt i;
908 
909   PetscFunctionBegin;
910   if (gs->nghs) { free((void*) gs->nghs); }
911   if (gs->pw_nghs) { free((void*) gs->pw_nghs); }
912 
913   /* tree */
914   if (gs->max_left_over)
915   {
916     if (gs->tree_elms) { free((void*) gs->tree_elms); }
917     if (gs->tree_buf) { free((void*) gs->tree_buf); }
918     if (gs->tree_work) { free((void*) gs->tree_work); }
919     if (gs->tree_map_in) { free((void*) gs->tree_map_in); }
920     if (gs->tree_map_out) { free((void*) gs->tree_map_out); }
921   }
922 
923   /* pairwise info */
924   if (gs->num_pairs)
925   {
926     /* should be NULL already */
927     if (gs->ngh_buf) { free((void*) gs->ngh_buf); }
928     if (gs->elms) { free((void*) gs->elms); }
929     if (gs->local_elms) { free((void*) gs->local_elms); }
930     if (gs->companion) { free((void*) gs->companion); }
931 
932     /* only set if pairwise */
933     if (gs->vals) { free((void*) gs->vals); }
934     if (gs->in) { free((void*) gs->in); }
935     if (gs->out) { free((void*) gs->out); }
936     if (gs->msg_ids_in) { free((void*) gs->msg_ids_in); }
937     if (gs->msg_ids_out) { free((void*) gs->msg_ids_out); }
938     if (gs->pw_vals) { free((void*) gs->pw_vals); }
939     if (gs->pw_elm_list) { free((void*) gs->pw_elm_list); }
940     if (gs->node_list) {
941       for (i=0;i<gs->num_pairs;i++) {
942         if (gs->node_list[i])  {
943           free((void*) gs->node_list[i]);
944         }
945       }
946       free((void*) gs->node_list);
947     }
948     if (gs->msg_sizes) { free((void*) gs->msg_sizes); }
949     if (gs->pair_list) { free((void*) gs->pair_list); }
950   }
951 
952   /* local info */
953   if (gs->num_local_total>=0) {
954     /*      for (i=0;i<gs->num_local_total;i++) */
955     for (i=0;i<gs->num_local_total+1;i++) {
956       if (gs->num_gop_local_reduce[i]) { free((void*) gs->gop_local_reduce[i]); }
957     }
958   }
959 
960   /* if intersection tree/pairwise and local isn't empty */
961   if (gs->gop_local_reduce) { free((void*) gs->gop_local_reduce); }
962   if (gs->num_gop_local_reduce) { free((void*) gs->num_gop_local_reduce); }
963 
964   free((void*) gs);
965   PetscFunctionReturn(0);
966 }
967 
968 /******************************************************************************/
969 PetscErrorCode PCTFS_gs_gop_vec( PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
970 {
971   PetscErrorCode ierr;
972 
973   PetscFunctionBegin;
974   switch (*op) {
975   case '+':
976     PCTFS_gs_gop_vec_plus(gs,vals,step);
977     break;
978   default:
979     ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op",op[0]);CHKERRQ(ierr);
980     ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus");CHKERRQ(ierr);
981     PCTFS_gs_gop_vec_plus(gs,vals,step);
982     break;
983   }
984   PetscFunctionReturn(0);
985 }
986 
987 /******************************************************************************/
988 static PetscErrorCode PCTFS_gs_gop_vec_plus( PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
989 {
990   PetscFunctionBegin;
991   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
992 
993   /* local only operations!!! */
994   if (gs->num_local) { PCTFS_gs_gop_vec_local_plus(gs,vals,step); }
995 
996   /* if intersection tree/pairwise and local isn't empty */
997   if (gs->num_local_gop)
998   {
999     PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
1000 
1001     /* pairwise */
1002     if (gs->num_pairs) { PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); }
1003 
1004     /* tree */
1005     else if (gs->max_left_over) { PCTFS_gs_gop_vec_tree_plus(gs,vals,step); }
1006 
1007     PCTFS_gs_gop_vec_local_out(gs,vals,step);
1008   } else { /* if intersection tree/pairwise and local is empty */
1009     /* pairwise */
1010     if (gs->num_pairs) { PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); }
1011 
1012     /* tree */
1013     else if (gs->max_left_over) { PCTFS_gs_gop_vec_tree_plus(gs,vals,step); }
1014   }
1015   PetscFunctionReturn(0);
1016 }
1017 
1018 /******************************************************************************/
1019 static PetscErrorCode PCTFS_gs_gop_vec_local_plus( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1020 {
1021   PetscInt *num, *map, **reduce;
1022   PetscScalar *base;
1023 
1024   PetscFunctionBegin;
1025   num    = gs->num_local_reduce;
1026   reduce = gs->local_reduce;
1027   while ((map = *reduce)) {
1028     base = vals + map[0] * step;
1029 
1030     /* wall */
1031     if (*num == 2) {
1032       num++; reduce++;
1033       PCTFS_rvec_add (base,vals+map[1]*step,step);
1034       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1035     } else if (*num == 3) { /* corner shared by three elements */
1036       num++; reduce++;
1037       PCTFS_rvec_add (base,vals+map[1]*step,step);
1038       PCTFS_rvec_add (base,vals+map[2]*step,step);
1039       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1040       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1041     } else if (*num == 4) { /* corner shared by four elements */
1042       num++; reduce++;
1043       PCTFS_rvec_add (base,vals+map[1]*step,step);
1044       PCTFS_rvec_add (base,vals+map[2]*step,step);
1045       PCTFS_rvec_add (base,vals+map[3]*step,step);
1046       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1047       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1048       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1049     } else { /* general case ... odd geoms ... 3D */
1050       num++;
1051       while (*++map >= 0) {PCTFS_rvec_add (base,vals+*map*step,step);}
1052 
1053       map = *reduce;
1054       while (*++map >= 0) {PCTFS_rvec_copy(vals+*map*step,base,step);}
1055 
1056       reduce++;
1057     }
1058   }
1059   PetscFunctionReturn(0);
1060 }
1061 
1062 /******************************************************************************/
1063 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1064 {
1065   PetscInt  *num, *map, **reduce;
1066   PetscScalar *base;
1067 
1068   PetscFunctionBegin;
1069   num    = gs->num_gop_local_reduce;
1070   reduce = gs->gop_local_reduce;
1071   while ((map = *reduce++)) {
1072     base = vals + map[0] * step;
1073 
1074     /* wall */
1075     if (*num == 2) {
1076       num ++;
1077       PCTFS_rvec_add(base,vals+map[1]*step,step);
1078     } else if (*num == 3) { /* corner shared by three elements */
1079       num ++;
1080       PCTFS_rvec_add(base,vals+map[1]*step,step);
1081       PCTFS_rvec_add(base,vals+map[2]*step,step);
1082     } else if (*num == 4) { /* corner shared by four elements */
1083       num ++;
1084       PCTFS_rvec_add(base,vals+map[1]*step,step);
1085       PCTFS_rvec_add(base,vals+map[2]*step,step);
1086       PCTFS_rvec_add(base,vals+map[3]*step,step);
1087     } else { /* general case ... odd geoms ... 3D*/
1088       num++;
1089       while (*++map >= 0) {PCTFS_rvec_add(base,vals+*map*step,step);}
1090     }
1091   }
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 /******************************************************************************/
1096 static PetscErrorCode PCTFS_gs_gop_vec_local_out( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1097 {
1098   PetscInt *num, *map, **reduce;
1099   PetscScalar *base;
1100 
1101   PetscFunctionBegin;
1102   num    = gs->num_gop_local_reduce;
1103   reduce = gs->gop_local_reduce;
1104   while ((map = *reduce++)) {
1105     base = vals + map[0] * step;
1106 
1107     /* wall */
1108     if (*num == 2) {
1109       num ++;
1110       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1111     } else if (*num == 3) { /* corner shared by three elements */
1112       num ++;
1113       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1114       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1115     } else if (*num == 4) { /* corner shared by four elements */
1116       num ++;
1117       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1118       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1119       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1120     } else { /* general case ... odd geoms ... 3D*/
1121       num++;
1122       while (*++map >= 0) {PCTFS_rvec_copy(vals+*map*step,base,step);}
1123     }
1124   }
1125   PetscFunctionReturn(0);
1126 }
1127 
1128 /******************************************************************************/
1129 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus( PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1130 {
1131   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1132   PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1133   PetscInt *pw, *list, *size, **nodes;
1134   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1135   MPI_Status status;
1136   PetscBLASInt i1 = 1,dstep;
1137   PetscErrorCode ierr;
1138 
1139   PetscFunctionBegin;
1140   /* strip and load s */
1141   msg_list =list         = gs->pair_list;
1142   msg_size =size         = gs->msg_sizes;
1143   msg_nodes=nodes        = gs->node_list;
1144   iptr=pw                = gs->pw_elm_list;
1145   dptr1=dptr3            = gs->pw_vals;
1146   msg_ids_in  = ids_in   = gs->msg_ids_in;
1147   msg_ids_out = ids_out  = gs->msg_ids_out;
1148   dptr2                  = gs->out;
1149   in1=in2                = gs->in;
1150 
1151   /* post the receives */
1152   /*  msg_nodes=nodes; */
1153   do {
1154     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1155         second one *list and do list++ afterwards */
1156     ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr);
1157     list++;msg_ids_in++;
1158     in1 += *size++ *step;
1159   }
1160   while (*++msg_nodes);
1161   msg_nodes=nodes;
1162 
1163   /* load gs values into in out gs buffers */
1164   while (*iptr >= 0) {
1165     PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1166     dptr3+=step;
1167     iptr++;
1168   }
1169 
1170   /* load out buffers and post the sends */
1171   while ((iptr = *msg_nodes++)) {
1172     dptr3 = dptr2;
1173     while (*iptr >= 0) {
1174       PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1175       dptr2+=step;
1176       iptr++;
1177     }
1178     ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr);
1179     msg_size++; msg_list++;msg_ids_out++;
1180   }
1181 
1182   /* tree */
1183   if (gs->max_left_over) { PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step); }
1184 
1185   /* process the received data */
1186   msg_nodes=nodes;
1187   while ((iptr = *nodes++)) {
1188     PetscScalar d1 = 1.0;
1189 
1190     /* Should I check the return value of MPI_Wait() or status? */
1191     /* Can this loop be replaced by a call to MPI_Waitall()? */
1192     ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr);
1193     ids_in++;
1194     while (*iptr >= 0) {
1195       dstep = PetscBLASIntCast(step);
1196       BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1);
1197       in2+=step;
1198       iptr++;
1199     }
1200   }
1201 
1202   /* replace vals */
1203   while (*pw >= 0) {
1204     PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1205     dptr1+=step;
1206     pw++;
1207   }
1208 
1209   /* clear isend message handles */
1210   /* This changed for clarity though it could be the same */
1211 
1212   /* Should I check the return value of MPI_Wait() or status? */
1213   /* Can this loop be replaced by a call to MPI_Waitall()? */
1214   while (*msg_nodes++) {ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);ids_out++;}
1215   PetscFunctionReturn(0);
1216 }
1217 
1218 /******************************************************************************/
1219 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus( PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1220 {
1221   PetscInt size, *in, *out;
1222   PetscScalar *buf, *work;
1223   PetscInt op[] = {GL_ADD,0};
1224   PetscBLASInt i1 = 1;
1225 
1226   PetscFunctionBegin;
1227   /* copy over to local variables */
1228   in   = gs->tree_map_in;
1229   out  = gs->tree_map_out;
1230   buf  = gs->tree_buf;
1231   work = gs->tree_work;
1232   size = gs->tree_nel*step;
1233 
1234   /* zero out collection buffer */
1235   PCTFS_rvec_zero(buf,size);
1236 
1237 
1238   /* copy over my contributions */
1239   while (*in >= 0) {
1240     PetscBLASInt dstep = PetscBLASIntCast(step);
1241     BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1);
1242   }
1243 
1244   /* perform fan in/out on full buffer */
1245   /* must change PCTFS_grop to handle the blas */
1246   PCTFS_grop(buf,work,size,op);
1247 
1248   /* reset */
1249   in   = gs->tree_map_in;
1250   out  = gs->tree_map_out;
1251 
1252   /* get the portion of the results I need */
1253   while (*in >= 0) {
1254     PetscBLASInt dstep = PetscBLASIntCast(step);
1255     BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1);
1256   }
1257   PetscFunctionReturn(0);
1258 }
1259 
1260 /******************************************************************************/
1261 PetscErrorCode PCTFS_gs_gop_hc( PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1262 {
1263   PetscErrorCode ierr;
1264 
1265   PetscFunctionBegin;
1266   switch (*op) {
1267     case '+':
1268       PCTFS_gs_gop_plus_hc(gs,vals,dim);
1269       break;
1270     default:
1271       ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op",op[0]);CHKERRQ(ierr);
1272       ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr);
1273       PCTFS_gs_gop_plus_hc(gs,vals,dim);
1274       break;
1275   }
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 /******************************************************************************/
1280 static PetscErrorCode PCTFS_gs_gop_plus_hc( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1281 {
1282   PetscFunctionBegin;
1283   /* if there's nothing to do return */
1284   if (dim<=0) { PetscFunctionReturn(0); }
1285 
1286   /* can't do more dimensions then exist */
1287   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1288 
1289   /* local only operations!!! */
1290   if (gs->num_local) {PCTFS_gs_gop_local_plus(gs,vals);}
1291 
1292   /* if intersection tree/pairwise and local isn't empty */
1293   if (gs->num_local_gop) {
1294     PCTFS_gs_gop_local_in_plus(gs,vals);
1295 
1296     /* pairwise will do tree inside ... */
1297     if (gs->num_pairs) { PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); }
1298     /* tree only */
1299     else if (gs->max_left_over) { PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); }
1300 
1301     PCTFS_gs_gop_local_out(gs,vals);
1302   } else { /* if intersection tree/pairwise and local is empty */
1303     /* pairwise will do tree inside */
1304     if (gs->num_pairs) { PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); }
1305     /* tree */
1306     else if (gs->max_left_over) { PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); }
1307   }
1308   PetscFunctionReturn(0);
1309 }
1310 
1311 /******************************************************************************/
1312 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc( PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1313 {
1314   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1315   PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1316   PetscInt *pw, *list, *size, **nodes;
1317   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1318   MPI_Status status;
1319   PetscInt i, mask=1;
1320   PetscErrorCode ierr;
1321 
1322   PetscFunctionBegin;
1323   for (i=1; i<dim; i++) { mask<<=1; mask++; }
1324 
1325   /* strip and load s */
1326   msg_list =list         = gs->pair_list;
1327   msg_size =size         = gs->msg_sizes;
1328   msg_nodes=nodes        = gs->node_list;
1329   iptr=pw                = gs->pw_elm_list;
1330   dptr1=dptr3            = gs->pw_vals;
1331   msg_ids_in  = ids_in   = gs->msg_ids_in;
1332   msg_ids_out = ids_out  = gs->msg_ids_out;
1333   dptr2                  = gs->out;
1334   in1=in2                = gs->in;
1335 
1336   /* post the receives */
1337   /*  msg_nodes=nodes; */
1338   do {
1339     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1340         second one *list and do list++ afterwards */
1341     if ((PCTFS_my_id|mask)==(*list|mask)) {
1342       ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr);
1343       list++; msg_ids_in++;in1 += *size++;
1344     } else { list++; size++; }
1345   }
1346   while (*++msg_nodes);
1347 
1348   /* load gs values into in out gs buffers */
1349   while (*iptr >= 0) { *dptr3++ = *(in_vals + *iptr++); }
1350 
1351   /* load out buffers and post the sends */
1352   msg_nodes=nodes;
1353   list = msg_list;
1354   while ((iptr = *msg_nodes++)) {
1355     if ((PCTFS_my_id|mask)==(*list|mask)) {
1356       dptr3 = dptr2;
1357       while (*iptr >= 0) {*dptr2++ = *(dptr1 + *iptr++);}
1358       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1359       /* is msg_ids_out++ correct? */
1360       ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr);
1361       msg_size++;list++;msg_ids_out++;
1362     } else {list++; msg_size++;}
1363   }
1364 
1365   /* do the tree while we're waiting */
1366   if (gs->max_left_over) { PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim); }
1367 
1368   /* process the received data */
1369   msg_nodes=nodes;
1370   list = msg_list;
1371   while ((iptr = *nodes++)) {
1372     if ((PCTFS_my_id|mask)==(*list|mask)) {
1373       /* Should I check the return value of MPI_Wait() or status? */
1374       /* Can this loop be replaced by a call to MPI_Waitall()? */
1375       ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr);
1376       ids_in++;
1377       while (*iptr >= 0) {*(dptr1 + *iptr++) += *in2++;}
1378     }
1379     list++;
1380   }
1381 
1382   /* replace vals */
1383   while (*pw >= 0) { *(in_vals + *pw++) = *dptr1++; }
1384 
1385   /* clear isend message handles */
1386   /* This changed for clarity though it could be the same */
1387   while (*msg_nodes++) {
1388     if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1389       /* Should I check the return value of MPI_Wait() or status? */
1390       /* Can this loop be replaced by a call to MPI_Waitall()? */
1391       ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);
1392       ids_out++;
1393     }
1394     msg_list++;
1395   }
1396 
1397   PetscFunctionReturn(0);
1398 }
1399 
1400 /******************************************************************************/
1401 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1402 {
1403   PetscInt size;
1404   PetscInt *in, *out;
1405   PetscScalar *buf, *work;
1406   PetscInt op[] = {GL_ADD,0};
1407 
1408   PetscFunctionBegin;
1409   in   = gs->tree_map_in;
1410   out  = gs->tree_map_out;
1411   buf  = gs->tree_buf;
1412   work = gs->tree_work;
1413   size = gs->tree_nel;
1414 
1415   PCTFS_rvec_zero(buf,size);
1416 
1417   while (*in >= 0) {*(buf + *out++) = *(vals + *in++);}
1418 
1419   in   = gs->tree_map_in;
1420   out  = gs->tree_map_out;
1421 
1422   PCTFS_grop_hc(buf,work,size,op,dim);
1423 
1424   while (*in >= 0) {*(vals + *in++) = *(buf + *out++);}
1425   PetscFunctionReturn(0);
1426 }
1427 
1428 
1429 
1430