xref: /petsc/src/ksp/pc/impls/tfs/comm.c (revision 827bd09bbb551290dd2e51d0997dc3a77eecb2e6)
1 
2 /***********************************comm.c*************************************
SPARSE GATHER-SCATTER PACKAGE: bss_malloc ivec error comm gs queue
4 
5 Author: Henry M. Tufo III
6 
7 e-mail: hmt@cs.brown.edu
8 
9 snail-mail:
10 Division of Applied Mathematics
11 Brown University
12 Providence, RI 02912
13 
14 Last Modification:
15 11.21.97
16 ***********************************comm.c*************************************/
17 
18 /***********************************comm.c*************************************
19 File Description:
20 -----------------
21 
22 ***********************************comm.c*************************************/
23 #include <stdio.h>
24 
25 #if   defined NXSRC
26 #ifndef DELTA
27 #include <nx.h>
28 #endif
29 
30 #elif defined MPISRC
31 #include <mpi.h>
32 
33 #endif
34 
35 
36 #include "const.h"
37 #include "types.h"
38 #include "error.h"
39 #include "ivec.h"
40 #include "bss_malloc.h"
41 #include "comm.h"
42 
43 
/* global program control variables - explicitly exported */
int my_id            = 0;   /* this process' rank in MPI_COMM_WORLD (set by comm_init) */
int num_nodes        = 1;   /* total number of ranks */
int floor_num_nodes  = 0;   /* largest power of two <= num_nodes */
int i_log2_num_nodes = 0;   /* log2(floor_num_nodes) */

/* global program control variables */
static int p_init = 0;          /* nonzero once comm_init() has run (guards re-init) */
static int modfl_num_nodes;     /* num_nodes - floor_num_nodes (ranks beyond the power of 2) */
static int edge_not_pow_2;      /* partner rank for collapsing/expanding the partial
                                   hypercube dimension; 0 when this rank has no partner */

/* edge_node[k] :: this rank's hypercube partner across dimension k (my_id ^ 2^k) */
static unsigned int edge_node[INT_LEN*32];

/* forward declarations -- NOTE(review): definitions not visible in this chunk; confirm they exist */
static void sgl_iadd(int    *vals, int level);
static void sgl_radd(double *vals, int level);
static void hmt_concat(REAL *vals, REAL *work, int size);
60 
61 
62 /***********************************comm.c*************************************
Function: comm_init()
64 
65 Input :
66 Output:
67 Return:
68 Description:
69 ***********************************comm.c*************************************/
void
comm_init (void)
{
  /* Initialize the TFS communication package: query MPI for rank/size and
     precompute the hypercube partner table and partial-dimension pairing
     used by the gather/scatter reductions below.  Idempotent: subsequent
     calls return immediately via the p_init guard. */
#ifdef DEBUG
  error_msg_warning("c_init() :: start\n");
#endif

  if (p_init++) return;

#if PETSC_SIZEOF_INT != 4
  error_msg_warning("Int != 4 Bytes!");
#endif


  MPI_Comm_size(MPI_COMM_WORLD,&num_nodes);
  MPI_Comm_rank(MPI_COMM_WORLD,&my_id);

  /* message tags are formed as MSGTAGx+my_id, so bound the rank count */
  if (num_nodes> (INT_MAX >> 1))
  {error_msg_fatal("Can't have more then MAX_INT/2 nodes!!!");}

  ivec_zero((int *)edge_node,INT_LEN*32);

  /* edge_node[k] = my_id ^ 2^k :: partner across hypercube dimension k.
     On exit floor_num_nodes is the largest power of two <= num_nodes and
     i_log2_num_nodes is its base-2 log. */
  floor_num_nodes = 1;
  i_log2_num_nodes = modfl_num_nodes = 0;
  while (floor_num_nodes <= num_nodes)
    {
      edge_node[i_log2_num_nodes] = my_id ^ floor_num_nodes;
      floor_num_nodes <<= 1;
      i_log2_num_nodes++;
    }

  i_log2_num_nodes--;
  floor_num_nodes >>= 1;
  modfl_num_nodes = (num_nodes - floor_num_nodes);  /* ranks left over beyond the power of 2 */

  /* Pair each leftover rank (>= floor_num_nodes) with a rank inside the
     hypercube so non-power-of-2 runs can collapse/expand the partial dim:
     rank r in [1,modfl] pairs with (r|floor)-1; rank r >= floor pairs with
     (r^floor)+1; everyone else has no partner (edge_not_pow_2 == 0). */
  if ((my_id > 0) && (my_id <= modfl_num_nodes))
    {edge_not_pow_2=((my_id|floor_num_nodes)-1);}
  else if (my_id >= floor_num_nodes)
    {edge_not_pow_2=((my_id^floor_num_nodes)+1);
    }
  else
    {edge_not_pow_2 = 0;}

#ifdef DEBUG
  error_msg_warning("c_init() done :: my_id=%d, num_nodes=%d",my_id,num_nodes);
#endif
}
117 
118 
119 
120 /***********************************comm.c*************************************
121 Function: giop()
122 
123 Input :
124 Output:
125 Return:
126 Description: fan-in/out version
127 ***********************************comm.c*************************************/
void
giop(int *vals, int *work, int n, int *oprs)
{
  /* Global integer reduction over all ranks (fan-in toward rank 0 along
     hypercube edges, then fan-out broadcast of the combined result).
     vals[0..n-1] :: in/out data, combined in place on every rank.
     work         :: caller-supplied scratch of at least n ints.
     oprs[0]      :: operator code resolved via ivec_fct_addr(), or
                     NON_UNIFORM followed by n per-entry codes; if the code
                     is unknown, oprs itself is reinterpreted as a function
                     pointer (see "major league hack" below). */
  register int mask, edge;
  int type, dest;
  vfp fp;
#if defined MPISRC
  MPI_Status  status;
#elif defined NXSRC
  int len;
#endif


#ifdef SAFE
  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!oprs)
    {error_msg_fatal("giop() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}

  /* non-uniform should have at least two entries */
  if ((oprs[0] == NON_UNIFORM)&&(n<2))
    {error_msg_fatal("giop() :: non_uniform and n=0,1?");}

  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(!n))
    {
#ifdef DEBUG
      error_msg_warning("giop() :: n=%d num_nodes=%d",n,num_nodes);
#endif
      return;
    }

  /* a negative number of items to send ==> fatal */
  if (n<0)
    {error_msg_fatal("giop() :: n=%d<0?",n);}

  /* advance to list of n operations for custom */
  if ((type=oprs[0])==NON_UNIFORM)
    {oprs++;}

  /* major league hack :: unknown op code means the caller passed a
     function pointer in place of the operator list */
  if ((fp = (vfp) ivec_fct_addr(type)) == NULL)
    {
      error_msg_warning("giop() :: hope you passed in a rbfp!\n");
      fp = (vfp) oprs;
    }

#if  defined NXSRC
  /* all msgs will be of the same length */
  len = n*INT_LEN;

  /* if not a hypercube must colapse partial dim :: leftover ranks ship
     their data to their in-cube partner, which folds it in */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
      else
        {crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,oprs);}
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in :: a rank drops out (break) once it has sent upstream */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
          else
            {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
        }

      /* fan-out :: rank 0 broadcasts the result back down the tree */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
          else
            {crecv(MSGTAG4+dest,(char *)vals,len);}
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
      else
        {csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
    }

#elif defined MPISRC
  /* all msgs will be of the same length */
  /* if not a hypercube must colapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG0+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
          (*fp)(vals,work,n,oprs);
        }
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in toward rank 0.  NOTE(review): unlike the NXSRC branch there
         is no break after the send, so a rank keeps exchanging at higher
         dimensions; the fan-out below overwrites vals with the rank-0
         result, so correctness appears preserved -- confirm intended */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG2+dest,
                       MPI_COMM_WORLD, &status);
              (*fp)(vals, work, n, oprs);
            }
        }

      /* fan-out :: broadcast combined result back down the hypercube */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG4+dest,
                       MPI_COMM_WORLD, &status);
            }
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {
          MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
        }
      else
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG5+my_id,MPI_COMM_WORLD);}
    }
#else
  return;
#endif
}
290 
291 /***********************************comm.c*************************************
292 Function: grop()
293 
294 Input :
295 Output:
296 Return:
297 Description: fan-in/out version
298 ***********************************comm.c*************************************/
299 void
300 grop(REAL *vals, REAL *work, int n, int *oprs)
301 {
302   register int mask, edge;
303   int type, dest;
304   vfp fp;
305 #if defined MPISRC
306   MPI_Status  status;
307 #elif defined NXSRC
308   int len;
309 #endif
310 
311 
312 #ifdef SAFE
313   /* ok ... should have some data, work, and operator(s) */
314   if (!vals||!work||!oprs)
315     {error_msg_fatal("grop() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}
316 
317   /* non-uniform should have at least two entries */
318   if ((oprs[0] == NON_UNIFORM)&&(n<2))
319     {error_msg_fatal("grop() :: non_uniform and n=0,1?");}
320 
321   /* check to make sure comm package has been initialized */
322   if (!p_init)
323     {comm_init();}
324 #endif
325 
326   /* if there's nothing to do return */
327   if ((num_nodes<2)||(!n))
328     {return;}
329 
330   /* a negative number of items to send ==> fatal */
331   if (n<0)
332     {error_msg_fatal("gdop() :: n=%d<0?",n);}
333 
334   /* advance to list of n operations for custom */
335   if ((type=oprs[0])==NON_UNIFORM)
336     {oprs++;}
337 
338   if ((fp = (vfp) rvec_fct_addr(type)) == NULL)
339     {
340       error_msg_warning("grop() :: hope you passed in a rbfp!\n");
341       fp = (vfp) oprs;
342     }
343 
344 #if  defined NXSRC
345   /* all msgs will be of the same length */
346   len = n*REAL_LEN;
347 
348   /* if not a hypercube must colapse partial dim */
349   if (edge_not_pow_2)
350     {
351       if (my_id >= floor_num_nodes)
352 	{csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
353       else
354 	{crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,oprs);}
355     }
356 
357   /* implement the mesh fan in/out exchange algorithm */
358   if (my_id<floor_num_nodes)
359     {
360       for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
361 	{
362 	  dest = my_id^mask;
363 	  if (my_id > dest)
364 	    {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
365 	  else
366 	    {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
367 	}
368 
369       mask=floor_num_nodes>>1;
370       for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
371 	{
372 	  if (my_id%mask)
373 	    {continue;}
374 
375 	  dest = my_id^mask;
376 	  if (my_id < dest)
377 	    {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
378 	  else
379 	    {crecv(MSGTAG4+dest,(char *)vals,len);}
380 	}
381     }
382 
383   /* if not a hypercube must expand to partial dim */
384   if (edge_not_pow_2)
385     {
386       if (my_id >= floor_num_nodes)
387 	{crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
388       else
389 	{csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
390     }
391 
392 #elif defined MPISRC
393   /* all msgs will be of the same length */
394   /* if not a hypercube must colapse partial dim */
395   if (edge_not_pow_2)
396     {
397       if (my_id >= floor_num_nodes)
398 	{MPI_Send(vals,n,REAL_TYPE,edge_not_pow_2,MSGTAG0+my_id,
399 		  MPI_COMM_WORLD);}
400       else
401 	{
402 	  MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
403 		   MPI_COMM_WORLD,&status);
404 	  (*fp)(vals,work,n,oprs);
405 	}
406     }
407 
408   /* implement the mesh fan in/out exchange algorithm */
409   if (my_id<floor_num_nodes)
410     {
411       for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
412 	{
413 	  dest = my_id^mask;
414 	  if (my_id > dest)
415 	    {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
416 	  else
417 	    {
418 	      MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,
419 		       MPI_COMM_WORLD, &status);
420 	      (*fp)(vals, work, n, oprs);
421 	    }
422 	}
423 
424       mask=floor_num_nodes>>1;
425       for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
426 	{
427 	  if (my_id%mask)
428 	    {continue;}
429 
430 	  dest = my_id^mask;
431 	  if (my_id < dest)
432 	    {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
433 	  else
434 	    {
435 	      MPI_Recv(vals,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG4+dest,
436 		       MPI_COMM_WORLD, &status);
437 	    }
438 	}
439     }
440 
441   /* if not a hypercube must expand to partial dim */
442   if (edge_not_pow_2)
443     {
444       if (my_id >= floor_num_nodes)
445 	{
446 	  MPI_Recv(vals,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
447 		   MPI_COMM_WORLD,&status);
448 	}
449       else
450 	{MPI_Send(vals,n,REAL_TYPE,edge_not_pow_2,MSGTAG5+my_id,
451 		  MPI_COMM_WORLD);}
452     }
453 #else
454   return;
455 #endif
456 }
457 
458 
459 /***********************************comm.c*************************************
460 Function: grop()
461 
462 Input :
463 Output:
464 Return:
465 Description: fan-in/out version
466 
467 note good only for num_nodes=2^k!!!
468 
469 ***********************************comm.c*************************************/
void
grop_hc(REAL *vals, REAL *work, int n, int *oprs, int dim)
{
  /* Global REAL reduction restricted to the first `dim` hypercube
     dimensions (i.e. over subcubes of 2^dim ranks).  Valid ONLY when
     num_nodes is a power of two (fatal otherwise).  Arguments are as in
     grop(); dim is clamped to i_log2_num_nodes. */
  register int mask, edge;
  int type, dest;
  vfp fp;
#if defined MPISRC
  MPI_Status  status;
#elif defined NXSRC
  int len;
#endif


#ifdef SAFE
  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!oprs)
    {error_msg_fatal("grop_hc() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}

  /* non-uniform should have at least two entries */
  if ((oprs[0] == NON_UNIFORM)&&(n<2))
    {error_msg_fatal("grop_hc() :: non_uniform and n=0,1?");}

  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(!n)||(dim<=0))
    {return;}

  /* the error msg says it all!!! */
  if (modfl_num_nodes)
    {error_msg_fatal("grop_hc() :: num_nodes not a power of 2!?!");}

  /* a negative number of items to send ==> fatal */
  if (n<0)
    {error_msg_fatal("grop_hc() :: n=%d<0?",n);}

  /* can't do more dimensions then exist */
  dim = MIN(dim,i_log2_num_nodes);

  /* advance to list of n operations for custom */
  if ((type=oprs[0])==NON_UNIFORM)
    {oprs++;}

  /* unknown op code :: caller passed a function pointer instead */
  if ((fp = (vfp) rvec_fct_addr(type)) == NULL)
    {
      error_msg_warning("grop_hc() :: hope you passed in a rbfp!\n");
      fp = (vfp) oprs;
    }

#if  defined NXSRC
  /* all msgs will be of the same length */
  len = n*REAL_LEN;

  /* fan-in over the first dim dimensions (break once sent upstream) */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      dest = my_id^mask;
      if (my_id > dest)
        {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
      else
        {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
    }

  /* restore mask to 2^(dim-1) whether or not we broke out early */
  /* for (mask=1, edge=1; edge<dim; edge++,mask<<=1) {;} */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out :: broadcast the result back down the subcube */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}

      dest = my_id^mask;
      if (my_id < dest)
        {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
      else
        {crecv(MSGTAG4+dest,(char *)vals,len);}
    }

#elif defined MPISRC
  /* fan-in.  NOTE(review): no break after the send (unlike NXSRC), so the
     loop always runs to edge==dim here; the fan-out overwrites vals */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      dest = my_id^mask;
      if (my_id > dest)
        {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,MPI_COMM_WORLD,
                   &status);
          (*fp)(vals, work, n, oprs);
        }
    }

  /* restore mask to 2^(dim-1) */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out :: broadcast the result back down the subcube */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}

      dest = my_id^mask;
      if (my_id < dest)
        {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(vals,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG4+dest,MPI_COMM_WORLD,
                   &status);
        }
    }
#else
  return;
#endif
}
591 
592 
593 /***********************************comm.c*************************************
594 Function: gop()
595 
596 Input :
597 Output:
598 Return:
599 Description: fan-in/out version
600 ***********************************comm.c*************************************/
void
gfop(void *vals, void *work, int n, vbfp fp, DATA_TYPE dt, int comm_type)
{
  /* Generic global reduction with caller-supplied combining function fp and
     datatype dt.  Under MPISRC with comm_type==MPI the work is delegated to
     MPI_Allreduce with a user-created (commutative) MPI_Op; otherwise the
     hand-rolled hypercube fan-in/fan-out below is used.
     NOTE(review): the NXSRC branch invokes (*fp)(vals,work,n,dt) by value
     while the MPISRC manual branch invokes (*fp)(vals,work,&n,&dt) -- the
     MPI user-function calling convention.  A given fp only works with the
     branch it was written for; confirm callers agree. */
  register int mask, edge;
  int dest;
#if defined MPISRC
  MPI_Status  status;
  MPI_Op op;
#elif defined NXSRC
  int len;
#endif


#ifdef SAFE
  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}

  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!fp)
    {error_msg_fatal("gop() :: v=%d, w=%d, f=%d",vals,work,fp);}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(!n))
    {return;}

  /* a negative number of items to send ==> fatal */
  if (n<0)
    {error_msg_fatal("gop() :: n=%d<0?",n);}

#if  defined NXSRC
  /* message length in bytes depends on the element type */
  switch (dt) {
  case REAL_TYPE:
    len = n*REAL_LEN;
    break;
  case INT_TYPE:
    len = n*INT_LEN;
    break;
  default:
    error_msg_fatal("gop() :: unrecognized datatype!!!\n");
  }

  /* if not a hypercube must colapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
      else
        {crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,dt);}
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in :: a rank drops out (break) once it has sent upstream */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
          else
            {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals,work,n,dt);}
        }

      /* fan-out :: broadcast the result back down the tree */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
          else
            {crecv(MSGTAG4+dest,(char *)vals,len);}
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
      else
        {csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
    }

#elif defined MPISRC

  /* fast path :: let MPI do the reduction with a user-defined op.
     NOTE(review): Allreduce leaves the result in work, not vals */
  if (comm_type==MPI)
    {
      MPI_Op_create(fp,TRUE,&op);
      MPI_Allreduce (vals, work, n, dt, op, MPI_COMM_WORLD);
      MPI_Op_free(&op);
      return;
    }

  /* if not a hypercube must colapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {MPI_Send(vals,n,dt,edge_not_pow_2,MSGTAG0+my_id,
                  MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,dt,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
          (*fp)(vals,work,&n,&dt);
        }
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in toward rank 0 (no break after send -- see giop()) */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {MPI_Send(vals,n,dt,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(work,n,dt,MPI_ANY_SOURCE,MSGTAG2+dest,
                       MPI_COMM_WORLD, &status);
              (*fp)(vals, work, &n, &dt);
            }
        }

      /* fan-out :: broadcast combined result back down the hypercube */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {MPI_Send(vals,n,dt,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(vals,n,dt,MPI_ANY_SOURCE,MSGTAG4+dest,
                       MPI_COMM_WORLD, &status);
            }
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {
          MPI_Recv(vals,n,dt,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
        }
      else
        {MPI_Send(vals,n,dt,edge_not_pow_2,MSGTAG5+my_id,
                  MPI_COMM_WORLD);}
    }
#else
  return;
#endif
}
761 
762 
763 
764 
765 
766 
767 /******************************************************************************
Function: ssgl_radd()
769 
770 Input :
771 Output:
772 Return:
773 Description:
774 
775 ii+1 entries in seg :: 0 .. ii
776 
777 ******************************************************************************/
void
ssgl_radd(register REAL *vals, register REAL *work, register int level,
          register int *segs)
{
  /* Staged segmented global REAL add over `level` hypercube dimensions.
     segs[] holds level+1 segment offsets into vals (entries 0..level, per
     the banner note "ii+1 entries in seg"); at stage `edge` only the tail
     vals[segs[edge]..segs[level]-1] is exchanged and summed, so deeper
     stages move progressively less data.  work is scratch for the fan-in
     receives.  Partner ranks come from the edge_node[] table built by
     comm_init(). */
  register int edge, type, dest, mask;
  register int stage_n;
#if defined MPISRC
  MPI_Status  status;
#endif


#ifdef DEBUG
  if (level > i_log2_num_nodes)
    {error_msg_fatal("sgl_add() :: level > log_2(P)!!!");}
#endif

#ifdef SAFE
  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif


#if defined NXSRC
  /* all msgs are *NOT* the same length */
  /* fan-in :: mask grows as 2^edge - 1, so a rank participates at stage
     edge only if its low edge bits are zero */
  for (mask=0, edge=0; edge<level; edge++, mask++)
    {
      stage_n = (segs[level] - segs[edge]);   /* REALs exchanged this stage */
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[edge];
          type = MSGTAG3 + my_id + (num_nodes*edge);
          if (my_id>dest)
            {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);}
          else
            {
              type =  type - my_id + dest;   /* tag as the sender computed it */
              crecv(type,work,stage_n*REAL_LEN);
              rvec_add(vals+segs[edge], work, stage_n);
/*            daxpy(vals+segs[edge], work, stage_n); */
            }
        }
      mask <<= 1;
    }
  /* fan-out :: walk the stages in reverse, shipping results back down */
  mask>>=1;
  for (edge=0; edge<level; edge++)
    {
      stage_n = (segs[level] - segs[level-1-edge]);
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[level-edge-1];
          type = MSGTAG6 + my_id + (num_nodes*edge);
          if (my_id<dest)
            {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);}
          else
            {
              type =  type - my_id + dest;
              crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN);
            }
        }
      mask >>= 1;
    }

#elif defined MPISRC
  /* all msgs are *NOT* the same length */
  /* fan-in :: same staging as the NXSRC branch, MPI transport */
  for (mask=0, edge=0; edge<level; edge++, mask++)
    {
      stage_n = (segs[level] - segs[edge]);
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[edge];
          type = MSGTAG3 + my_id + (num_nodes*edge);
          if (my_id>dest)
/*          {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);} */
          {MPI_Send(vals+segs[edge],stage_n,REAL_TYPE,dest,type,
                      MPI_COMM_WORLD);}
          else
            {
              type =  type - my_id + dest;   /* tag as the sender computed it */
/*            crecv(type,work,stage_n*REAL_LEN); */
              MPI_Recv(work,stage_n,REAL_TYPE,MPI_ANY_SOURCE,type,
                       MPI_COMM_WORLD,&status);
              rvec_add(vals+segs[edge], work, stage_n);
/*            daxpy(vals+segs[edge], work, stage_n); */
            }
        }
      mask <<= 1;
    }
  /* fan-out :: reverse the stages, shipping results back down */
  mask>>=1;
  for (edge=0; edge<level; edge++)
    {
      stage_n = (segs[level] - segs[level-1-edge]);
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[level-edge-1];
          type = MSGTAG6 + my_id + (num_nodes*edge);
          if (my_id<dest)
/*          {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);} */
            {MPI_Send(vals+segs[level-1-edge],stage_n,REAL_TYPE,dest,type,
                      MPI_COMM_WORLD);}
          else
            {
              type =  type - my_id + dest;
/*            crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN); */
              MPI_Recv(vals+segs[level-1-edge],stage_n,REAL_TYPE,
                       MPI_ANY_SOURCE,type,MPI_COMM_WORLD,&status);
            }
        }
      mask >>= 1;
    }
#else
  return;
#endif
}
894 
895 
896 
897 /***********************************comm.c*************************************
898 Function: grop_hc_vvl()
899 
900 Input :
901 Output:
902 Return:
903 Description: fan-in/out version
904 
905 note good only for num_nodes=2^k!!!
906 
907 ***********************************comm.c*************************************/
void
grop_hc_vvl(REAL *vals, REAL *work, int *segs, int *oprs, int dim)
{
  /* Variable-vector-length variant of grop_hc(): at hypercube stage `edge`
     only vals[segs[edge]..segs[dim]-1] participates.  Power-of-2 rank
     counts only.
     *** DISABLED :: the unconditional error_msg_fatal() below aborts every
     call -- the author marked this routine as not working.  The code after
     it is kept for reference only. */
  register int mask, edge, n;
  int type, dest;
  vfp fp;
#if defined MPISRC
  MPI_Status  status;
#elif defined NXSRC
  int len;
#endif


  error_msg_fatal("grop_hc_vvl() :: is not working!\n");

#ifdef SAFE
  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!oprs||!segs)
    {error_msg_fatal("grop_hc() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}

  /* non-uniform should have at least two entries */
#if defined(not_used)
  if ((oprs[0] == NON_UNIFORM)&&(n<2))
    {error_msg_fatal("grop_hc() :: non_uniform and n=0,1?");}
#endif

  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(dim<=0))
    {return;}

  /* the error msg says it all!!! */
  if (modfl_num_nodes)
    {error_msg_fatal("grop_hc() :: num_nodes not a power of 2!?!");}

  /* can't do more dimensions then exist */
  dim = MIN(dim,i_log2_num_nodes);

  /* advance to list of n operations for custom */
  if ((type=oprs[0])==NON_UNIFORM)
    {oprs++;}

  /* unknown op code :: caller passed a function pointer instead */
  if ((fp = (vfp) rvec_fct_addr(type)) == NULL)
    {
      error_msg_warning("grop_hc() :: hope you passed in a rbfp!\n");
      fp = (vfp) oprs;
    }

#if  defined NXSRC
  /* all msgs are *NOT* the same length */
  /* fan-in :: stage edge exchanges the tail starting at segs[edge] */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      n = segs[dim]-segs[edge];
      /*
      error_msg_warning("n=%d, segs[%d]=%d, segs[%d]=%d\n",n,dim,segs[dim],
                        edge,segs[edge]);
      */
      len = n*REAL_LEN;
      dest = my_id^mask;
      if (my_id > dest)
        {csend(MSGTAG2+my_id,(char *)vals+segs[edge],len,dest,0); break;}
      else
        {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals+segs[edge], work, n, oprs);}
    }

  /* restore mask to 2^(dim-1) whether or not we broke out early */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out :: reverse the stages */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}
      len = (segs[dim]-segs[dim-1-edge])*REAL_LEN;
      /*
      error_msg_warning("n=%d, segs[%d]=%d, segs[%d]=%d\n",n,dim,segs[dim],
                         dim-1-edge,segs[dim-1-edge]);
      */
      dest = my_id^mask;
      if (my_id < dest)
        {csend(MSGTAG4+my_id,(char *)vals+segs[dim-1-edge],len,dest,0);}
      else
        {crecv(MSGTAG4+dest,(char *)vals+segs[dim-1-edge],len);}
    }

#elif defined MPISRC
  /* fan-in (no break after send -- see giop()) */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      n = segs[dim]-segs[edge];
      dest = my_id^mask;
      if (my_id > dest)
        {MPI_Send(vals+segs[edge],n,REAL_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,
                   MPI_COMM_WORLD, &status);
          (*fp)(vals+segs[edge], work, n, oprs);
        }
    }

  /* restore mask to 2^(dim-1) */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out :: reverse the stages */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}

      n = (segs[dim]-segs[dim-1-edge]);

      dest = my_id^mask;
      if (my_id < dest)
        {MPI_Send(vals+segs[dim-1-edge],n,REAL_TYPE,dest,MSGTAG4+my_id,
                  MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(vals+segs[dim-1-edge],n,REAL_TYPE,MPI_ANY_SOURCE,
                   MSGTAG4+dest,MPI_COMM_WORLD, &status);
        }
    }
#else
  return;
#endif
}
1040 
1041 
1042 #ifdef INPROG
1043 
1044 /***********************************comm.c*************************************
1045 Function: proc_sync()
1046 
1047 Input :
1048 Output:
1049 Return:
1050 Description: hc bassed version
1051 ***********************************comm.c*************************************/
void
proc_sync()
{
  /* INPROG :: barrier-style synchronization skeleton, compiled out unless
     INPROG is defined.  NOTE(review): this body is INCOMPLETE -- it still
     references vals, work, len, n, fp and oprs, none of which are declared
     here (copied from giop() and never adapted to zero-length messages),
     so it will not compile if INPROG is ever enabled.  Kept verbatim. */
  register int mask, edge;
  int type, dest;   /* type is currently unused */
#if defined MPISRC
  MPI_Status  status;
#endif


#ifdef DEBUG
  error_msg_warning("begin proc_sync()\n");
#endif

#ifdef SAFE
  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

#if  defined NXSRC
  /* all msgs will be of zero length */

  /* if not a hypercube must colapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
      else
        {crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,oprs);}
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
          else
            {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
        }

      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
          else
            {crecv(MSGTAG4+dest,(char *)vals,len);}
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
      else
        {csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
    }

#elif defined MPISRC
  /* all msgs will be of the same length */
  /* if not a hypercube must colapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG0+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
          (*fp)(vals,work,n,oprs);
        }
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG2+dest,
                       MPI_COMM_WORLD, &status);
              (*fp)(vals, work, n, oprs);
            }
        }

      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG4+dest,
                       MPI_COMM_WORLD, &status);
            }
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {
          MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
        }
      else
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG5+my_id,MPI_COMM_WORLD);}
    }
#else
  return;
#endif
}
1182 #endif
1183 
1184 
/* Fortran-callable helper: bitwise XOR of two integers passed by
   reference (Fortran passes all arguments by address). */
int
hmt_xor_ (register int *i1, register int *i2)
{
  int a = *i1;
  int b = *i2;

  return (a ^ b);
}
1191 
1192 
1193 /******************************************************************************
Function: staged_gs_()
1195 
1196 Input :
1197 Output:
1198 Return:
1199 Description:
1200 
1201 ii+1 entries in seg :: 0 .. ii
1202 
1203 ******************************************************************************/
1204 void
1205 staged_gs_ (register REAL *vals, register REAL *work, register int *level,
1206 	    register int *segs)
1207 {
1208   ssgl_radd(vals, work, *level, segs);
1209 }
1210 
1211 /******************************************************************************
Function: staged_iadd_()
1213 
1214 Input :
1215 Output:
1216 Return:
1217 Description:
1218 ******************************************************************************/
/* Fortran interface shim: staged global integer sum.  Dereferences the
   by-reference level argument and forwards to sgl_iadd(). */
void
staged_iadd_ (int *gl_num, int *level)
{
  sgl_iadd(gl_num, *level);
}
1224 
1225 
1226 
1227 /******************************************************************************
Function: sgl_iadd()
1229 
1230 Input :
1231 Output:
1232 Return:
1233 Description:
1234 ******************************************************************************/
1235 static void
1236 sgl_iadd(int *vals, int level)
1237 {
1238   int edge, type, dest, source, len, mask = -1;
1239   int tmp, *work;
1240 
1241 
1242 #ifdef SAFE
1243   /* check to make sure comm package has been initialized */
1244   if (!p_init)
1245     {comm_init();}
1246 #endif
1247 
1248 
1249   /* all msgs will be of the same length */
1250   work = &tmp;
1251   len = INT_LEN;
1252 
1253   if (level > i_log2_num_nodes)
1254     {error_msg_fatal("sgl_add() :: level too big?");}
1255 
1256   if (level<=0)
1257     {return;}
1258 
1259 #if defined NXSRC
1260   {
1261     long msg_id;
1262     /* implement the mesh fan in/out exchange algorithm */
1263     if (my_id<floor_num_nodes)
1264       {
1265 	mask = 0;
1266 	for (edge = 0; edge < level; edge++)
1267 	  {
1268 	    if (!(my_id & mask))
1269 	      {
1270 		source = dest = edge_node[edge];
1271 		type = MSGTAG1 + my_id + (num_nodes*edge);
1272 		if (my_id > dest)
1273 		  {
1274 		    msg_id = isend(type,vals,len,dest,0);
1275 		    msgwait(msg_id);
1276 		  }
1277 		else
1278 		  {
1279 		    type =  type - my_id + source;
1280 		    msg_id = irecv(type,work,len);
1281 		    msgwait(msg_id);
1282 		    vals[0] += work[0];
1283 		  }
1284 	      }
1285 	    mask <<= 1;
1286 	    mask += 1;
1287 	  }
1288       }
1289 
1290     if (my_id<floor_num_nodes)
1291       {
1292 	mask >>= 1;
1293 	for (edge = 0; edge < level; edge++)
1294 	  {
1295 	    if (!(my_id & mask))
1296 	      {
1297 		source = dest = edge_node[level-edge-1];
1298 		type = MSGTAG1 + my_id + (num_nodes*edge);
1299 		if (my_id < dest)
1300 		  {
1301 		    msg_id = isend(type,vals,len,dest,0);
1302 		    msgwait(msg_id);
1303 		  }
1304 		else
1305 		  {
1306 		    type =  type - my_id + source;
1307 		    msg_id = irecv(type,work,len);
1308 		    msgwait(msg_id);
1309 		    vals[0] = work[0];
1310 		  }
1311 	      }
1312 	    mask >>= 1;
1313 	  }
1314       }
1315   }
1316 #elif defined MPISRC
1317   {
1318     MPI_Request msg_id;
1319     MPI_Status status;
1320 
1321     /* implement the mesh fan in/out exchange algorithm */
1322     if (my_id<floor_num_nodes)
1323       {
1324 	mask = 0;
1325 	for (edge = 0; edge < level; edge++)
1326 	  {
1327 	    if (!(my_id & mask))
1328 	      {
1329 		source = dest = edge_node[edge];
1330 		type = MSGTAG1 + my_id + (num_nodes*edge);
1331 		if (my_id > dest)
1332 		  {
1333 		    MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1334 			      &msg_id);
1335 		    MPI_Wait(&msg_id,&status);
1336 		    /* msg_id = isend(type,vals,len,dest,0);
1337 		       msgwait(msg_id); */
1338 		  }
1339 		else
1340 		  {
1341 		    type =  type - my_id + source;
1342 		    MPI_Irecv(work,len,INT_TYPE,MPI_ANY_SOURCE,
1343 			      type,MPI_COMM_WORLD,&msg_id);
1344 		    MPI_Wait(&msg_id,&status);
1345 		    /* msg_id = irecv(type,work,len);
1346 		       msgwait(msg_id); */
1347 		    vals[0] += work[0];
1348 		  }
1349 	      }
1350 	    mask <<= 1;
1351 	    mask += 1;
1352 	  }
1353       }
1354 
1355     if (my_id<floor_num_nodes)
1356       {
1357 	mask >>= 1;
1358 	for (edge = 0; edge < level; edge++)
1359 	  {
1360 	    if (!(my_id & mask))
1361 	      {
1362 		source = dest = edge_node[level-edge-1];
1363 		type = MSGTAG1 + my_id + (num_nodes*edge);
1364 		if (my_id < dest)
1365 		  {
1366 		    MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1367 			      &msg_id);
1368 		    MPI_Wait(&msg_id,&status);
1369 		    /* msg_id = isend(type,vals,len,dest,0);
1370 		    msgwait(msg_id);*/
1371 		  }
1372 		else
1373 		  {
1374 		    type =  type - my_id + source;
1375 		    MPI_Irecv(work,len,INT_TYPE,MPI_ANY_SOURCE,
1376 			      type,MPI_COMM_WORLD,&msg_id);
1377 		    MPI_Wait(&msg_id,&status);
1378 		    /* msg_id = irecv(type,work,len);
1379 		    msgwait(msg_id); */
1380 		    vals[0] = work[0];
1381 		  }
1382 	      }
1383 	    mask >>= 1;
1384 	  }
1385       }
1386   }
1387 #else
1388   return;
1389 #endif
1390 
1391 }
1392 
1393 
1394 
1395 /******************************************************************************
Function: staged_radd_()
1397 
1398 Input :
1399 Output:
1400 Return:
1401 Description:
1402 ******************************************************************************/
/* Fortran interface shim: staged global double-precision sum.
   Dereferences the by-reference level argument and forwards to
   sgl_radd(). */
void
staged_radd_ (double *gl_num, int *level)
{
  sgl_radd(gl_num, *level);
}
1408 
1409 /******************************************************************************
Function: sgl_radd()
1411 
1412 Input :
1413 Output:
1414 Return:
1415 Description:
1416 ******************************************************************************/
/* sgl_radd() : staged global double-precision sum (REAL analogue of
   sgl_iadd()).  Fan-in/fan-out over the first 'level' hypercube
   dimensions using edge_node[]; only vals[0] participates.

   Fixes relative to the original: the MPI branch was guarded by the
   misspelled macro MRISRC (so it silently compiled to "return;" under
   MPI), its locals were only declared under NXSRC, and it exchanged
   doubles as INT_TYPE with a byte count.  The MPI branch now sends one
   REAL_TYPE element per message. */
static void
sgl_radd(double *vals, int level)
{
#if defined NXSRC || defined MPISRC
  int edge, type, dest, source, len, mask;
  double tmp, *work;
#endif

#ifdef SAFE
  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif


#if defined NXSRC
  {
    long msg_id;

    /* all msgs will be of the same length (NX takes a byte count) */
    work = &tmp;
    len = REAL_LEN;

    if (level > i_log2_num_nodes)
      {error_msg_fatal("sgl_radd() :: level too big?");}

    if (level<=0)
      {return;}

    /* fan in: partial sums climb toward the low-id end of each edge */
    if (my_id<floor_num_nodes)
      {
	mask = 0;
	for (edge = 0; edge < level; edge++)
	  {
	    if (!(my_id & mask))
	      {
		source = dest = edge_node[edge];
		type = MSGTAG3 + my_id + (num_nodes*edge);
		if (my_id > dest)
		  {
		    msg_id = isend(type,vals,len,dest,0);
		    msgwait(msg_id);
		  }
		else
		  {
		    /* receiver recomputes the sender's tag */
		    type =  type - my_id + source;
		    msg_id = irecv(type,work,len);
		    msgwait(msg_id);
		    vals[0] += work[0];
		  }
	      }
	    mask <<= 1;
	    mask += 1;
	  }
      }

    /* fan out: the completed sum propagates back down the edges */
    if (my_id<floor_num_nodes)
      {
	mask >>= 1;
	for (edge = 0; edge < level; edge++)
	  {
	    if (!(my_id & mask))
	      {
		source = dest = edge_node[level-edge-1];
		type = MSGTAG3 + my_id + (num_nodes*edge);
		if (my_id < dest)
		  {
		    msg_id = isend(type,vals,len,dest,0);
		    msgwait(msg_id);
		  }
		else
		  {
		    type =  type - my_id + source;
		    msg_id = irecv(type,work,len);
		    msgwait(msg_id);
		    vals[0] = work[0];
		  }
	      }
	    mask >>= 1;
	  }
      }
  }
#elif defined MPISRC
  {
    MPI_Request msg_id;
    MPI_Status status;

    /* exactly one double per message: MPI counts elements, not bytes */
    work = &tmp;
    len = 1;

    if (level > i_log2_num_nodes)
      {error_msg_fatal("sgl_radd() :: level too big?");}

    if (level<=0)
      {return;}

    /* fan in: partial sums climb toward the low-id end of each edge */
    if (my_id<floor_num_nodes)
      {
	mask = 0;
	for (edge = 0; edge < level; edge++)
	  {
	    if (!(my_id & mask))
	      {
		source = dest = edge_node[edge];
		type = MSGTAG3 + my_id + (num_nodes*edge);
		if (my_id > dest)
		  {
		    MPI_Isend(vals,len,REAL_TYPE,dest,type,MPI_COMM_WORLD,
			      &msg_id);
		    MPI_Wait(&msg_id,&status);
		  }
		else
		  {
		    /* receiver recomputes the sender's tag */
		    type =  type - my_id + source;
		    MPI_Irecv(work,len,REAL_TYPE,MPI_ANY_SOURCE,
			      type,MPI_COMM_WORLD,&msg_id);
		    MPI_Wait(&msg_id,&status);
		    vals[0] += work[0];
		  }
	      }
	    mask <<= 1;
	    mask += 1;
	  }
      }

    /* fan out: the completed sum propagates back down the edges */
    if (my_id<floor_num_nodes)
      {
	mask >>= 1;
	for (edge = 0; edge < level; edge++)
	  {
	    if (!(my_id & mask))
	      {
		source = dest = edge_node[level-edge-1];
		type = MSGTAG3 + my_id + (num_nodes*edge);
		if (my_id < dest)
		  {
		    MPI_Isend(vals,len,REAL_TYPE,dest,type,MPI_COMM_WORLD,
			      &msg_id);
		    MPI_Wait(&msg_id,&status);
		  }
		else
		  {
		    type =  type - my_id + source;
		    MPI_Irecv(work,len,REAL_TYPE,MPI_ANY_SOURCE,
			      type,MPI_COMM_WORLD,&msg_id);
		    MPI_Wait(&msg_id,&status);
		    vals[0] = work[0];
		  }
	      }
	    mask >>= 1;
	  }
      }
  }
#else
  return;
#endif
}
1585 
1586 
1587 
1588 /******************************************************************************
Function: hmt_concat_()
1590 
1591 Input :
1592 Output:
1593 Return:
1594 Description:
1595 
1596 ii+1 entries in seg :: 0 .. ii
1597 ******************************************************************************/
1598 void
1599 hmt_concat_ (REAL *vals, REAL *work, int *size)
1600 {
1601   hmt_concat(vals, work, *size);
1602 }
1603 
1604 
1605 
1606 /******************************************************************************
Function: hmt_concat()
1608 
1609 Input :
1610 Output:
1611 Return:
1612 Description:
1613 
1614 ii+1 entries in seg :: 0 .. ii
1615 
1616 ******************************************************************************/
1617 static void
1618 hmt_concat(REAL *vals, REAL *work, int size)
1619 {
1620 #if defined NXSRC
1621   int  mask,stage_n;
1622   int edge, type, dest, source, len;
1623   int offset;
1624   double *dptr;
1625 #endif
1626 
1627 #ifdef SAFE
1628   /* check to make sure comm package has been initialized */
1629   if (!p_init)
1630     {comm_init();}
1631 #endif
1632 
1633   /* all msgs are *NOT* the same length */
1634   /* implement the mesh fan in/out exchange algorithm */
1635   rvec_copy(work,vals,size);
1636 
1637 #if defined NXSRC
1638   mask = 0;
1639   stage_n = size;
1640   {
1641     long msg_id;
1642 
1643     dptr  = work+size;
1644     for (edge = 0; edge < i_log2_num_nodes; edge++)
1645       {
1646 	len = stage_n * REAL_LEN;
1647 
1648 	if (!(my_id & mask))
1649 	  {
1650 	    source = dest = edge_node[edge];
1651 	    type = MSGTAG3 + my_id + (num_nodes*edge);
1652 	    if (my_id > dest)
1653 	      {
1654 		msg_id = isend(type, work, len,dest,0);
1655 		msgwait(msg_id);
1656 	      }
1657 	    else
1658 	      {
1659 		type =  type - my_id + source;
1660 		msg_id = irecv(type, dptr,len);
1661 		msgwait(msg_id);
1662 	      }
1663 	  }
1664 
1665 #ifdef DEBUG_1
1666 	ierror_msg_warning_n0("stage_n = ",stage_n);
1667 #endif
1668 
1669 	dptr += stage_n;
1670 	stage_n <<=1;
1671 	mask <<= 1;
1672 	mask += 1;
1673       }
1674 
1675     size = stage_n;
1676     stage_n >>=1;
1677     dptr -= stage_n;
1678 
1679     mask >>= 1;
1680 
1681     for (edge = 0; edge < i_log2_num_nodes; edge++)
1682       {
1683 	len = (size-stage_n) * REAL_LEN;
1684 
1685 	if (!(my_id & mask) && stage_n)
1686 	  {
1687 	    source = dest = edge_node[i_log2_num_nodes-edge-1];
1688 	    type = MSGTAG6 + my_id + (num_nodes*edge);
1689 	    if (my_id < dest)
1690 	      {
1691 		msg_id = isend(type,dptr,len,dest,0);
1692 		msgwait(msg_id);
1693 	      }
1694 	    else
1695 	      {
1696 		type =  type - my_id + source;
1697 		msg_id = irecv(type,dptr,len);
1698 		msgwait(msg_id);
1699 	      }
1700 	  }
1701 
1702 #ifdef DEBUG_1
1703 	ierror_msg_warning_n0("size-stage_n = ",size-stage_n);
1704 #endif
1705 
1706 	stage_n >>= 1;
1707 	dptr -= stage_n;
1708 	mask >>= 1;
1709       }
1710   }
1711 #elif defined MRISRC
1712   {
1713     MPI_Request msg_id;
1714     MPI_Status status;
1715 
1716     dptr  = work+size;
1717     for (edge = 0; edge < i_log2_num_nodes; edge++)
1718       {
1719 	len = stage_n * REAL_LEN;
1720 
1721 	if (!(my_id & mask))
1722 	  {
1723 	    source = dest = edge_node[edge];
1724 	    type = MSGTAG3 + my_id + (num_nodes*edge);
1725 	    if (my_id > dest)
1726 	      {
1727 		MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1728 			  &msg_id);
1729 		MPI_Wait(&msg_id,&status);
1730 		/*msg_id = isend(type, work, len,dest,0);
1731 		  msgwait(msg_id);*/
1732 	      }
1733 	    else
1734 	      {
1735 		type =  type - my_id + source;
1736 		MPI_Irecv(dptr,len,INT_TYPE,MPI_ANY_SOURCE,
1737 			  type,MPI_COMM_WORLD,&msg_id);
1738 		MPI_Wait(&msg_id,&status);
1739 		/* msg_id = irecv(type, dptr,len);
1740 		msgwait(msg_id);*/
1741 	      }
1742 	  }
1743 
1744 #ifdef DEBUG_1
1745 	ierror_msg_warning_n0("stage_n = ",stage_n);
1746 #endif
1747 
1748 	dptr += stage_n;
1749 	stage_n <<=1;
1750 	mask <<= 1;
1751 	mask += 1;
1752       }
1753 
1754     size = stage_n;
1755     stage_n >>=1;
1756     dptr -= stage_n;
1757 
1758     mask >>= 1;
1759 
1760     for (edge = 0; edge < i_log2_num_nodes; edge++)
1761       {
1762 	len = (size-stage_n) * REAL_LEN;
1763 
1764 	if (!(my_id & mask) && stage_n)
1765 	  {
1766 	    source = dest = edge_node[i_log2_num_nodes-edge-1];
1767 	    type = MSGTAG6 + my_id + (num_nodes*edge);
1768 	    if (my_id < dest)
1769 	      {
1770 		MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1771 			  &msg_id);
1772 		MPI_Wait(&msg_id,&status);
1773 		/*msg_id = isend(type, work, len,dest,0);
1774 		msg_id = isend(type,dptr,len,dest,0); */
1775 		msgwait(msg_id);
1776 	      }
1777 	    else
1778 	      {
1779 		type =  type - my_id + source;
1780 		MPI_Irecv(dptr,len,INT_TYPE,MPI_ANY_SOURCE,
1781 			  type,MPI_COMM_WORLD,&msg_id);
1782 		MPI_Wait(&msg_id,&status);
1783 		/*msg_id = irecv(type,dptr,len);
1784 		msgwait(msg_id);*/
1785 	      }
1786 	  }
1787 
1788 #ifdef DEBUG_1
1789 	ierror_msg_warning_n0("size-stage_n = ",size-stage_n);
1790 #endif
1791 
1792 	stage_n >>= 1;
1793 	dptr -= stage_n;
1794 	mask >>= 1;
1795       }
1796   }
1797 #else
1798   return;
1799 #endif
1800 }
1801 
1802 
1803 
1804 /******************************************************************************
Function: new_ssgl_radd()
1806 
1807 Input :
1808 Output:
1809 Return:
1810 Description:
1811 
1812 ii+1 entries in seg :: 0 .. ii
1813 
1814 ******************************************************************************/
1815 void
1816 new_ssgl_radd(register REAL *vals, register REAL *work, register int level,
1817 #if defined MPISRC
1818 	  register int *segs, MPI_Comm ssgl_comm)
1819 #else
1820 	  register int *segs)
1821 #endif
1822 {
1823   register int edge, type, dest, mask;
1824   register int stage_n;
1825 #if defined MPISRC
1826   MPI_Status  status;
1827 #endif
1828 
1829 
1830 #ifdef DEBUG
1831   if (level > i_log2_num_nodes)
1832     {error_msg_fatal("sgl_add() :: level > log_2(P)!!!");}
1833 #endif
1834 
1835 #ifdef SAFE
1836   /* check to make sure comm package has been initialized */
1837   if (!p_init)
1838     {comm_init();}
1839 #endif
1840 
1841 
1842 #if defined NXSRC
1843   /* all msgs are *NOT* the same length */
1844   /* implement the mesh fan in/out exchange algorithm */
1845   for (mask=0, edge=0; edge<level; edge++, mask++)
1846     {
1847       stage_n = (segs[level] - segs[edge]);
1848       if (stage_n && !(my_id & mask))
1849 	{
1850 	  dest = edge_node[edge];
1851 	  type = MSGTAG3 + my_id + (num_nodes*edge);
1852 	  if (my_id>dest)
1853 	    {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);}
1854 	  else
1855 	    {
1856 	      type =  type - my_id + dest;
1857 	      crecv(type,work,stage_n*REAL_LEN);
1858 	      rvec_add(vals+segs[edge], work, stage_n);
1859 /*            daxpy(vals+segs[edge], work, stage_n); */
1860 	    }
1861 	}
1862       mask <<= 1;
1863     }
1864   mask>>=1;
1865   for (edge=0; edge<level; edge++)
1866     {
1867       stage_n = (segs[level] - segs[level-1-edge]);
1868       if (stage_n && !(my_id & mask))
1869 	{
1870 	  dest = edge_node[level-edge-1];
1871 	  type = MSGTAG6 + my_id + (num_nodes*edge);
1872 	  if (my_id<dest)
1873 	    {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);}
1874 	  else
1875 	    {
1876 	      type =  type - my_id + dest;
1877 	      crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN);
1878 	    }
1879 	}
1880       mask >>= 1;
1881     }
1882 
1883 #elif defined MPISRC
1884   /* all msgs are *NOT* the same length */
1885   /* implement the mesh fan in/out exchange algorithm */
1886   for (mask=0, edge=0; edge<level; edge++, mask++)
1887     {
1888       stage_n = (segs[level] - segs[edge]);
1889       if (stage_n && !(my_id & mask))
1890 	{
1891 	  dest = edge_node[edge];
1892 	  type = MSGTAG3 + my_id + (num_nodes*edge);
1893 	  if (my_id>dest)
1894 /*	    {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);} */
1895           {MPI_Send(vals+segs[edge],stage_n,REAL_TYPE,dest,type,
1896                       ssgl_comm);}
1897 	  else
1898 	    {
1899 	      type =  type - my_id + dest;
1900 /*	      crecv(type,work,stage_n*REAL_LEN); */
1901               MPI_Recv(work,stage_n,REAL_TYPE,MPI_ANY_SOURCE,type,
1902                        ssgl_comm,&status);
1903 	      rvec_add(vals+segs[edge], work, stage_n);
1904 /*            daxpy(vals+segs[edge], work, stage_n); */
1905 	    }
1906 	}
1907       mask <<= 1;
1908     }
1909   mask>>=1;
1910   for (edge=0; edge<level; edge++)
1911     {
1912       stage_n = (segs[level] - segs[level-1-edge]);
1913       if (stage_n && !(my_id & mask))
1914 	{
1915 	  dest = edge_node[level-edge-1];
1916 	  type = MSGTAG6 + my_id + (num_nodes*edge);
1917 	  if (my_id<dest)
1918 /*	    {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);} */
1919             {MPI_Send(vals+segs[level-1-edge],stage_n,REAL_TYPE,dest,type,
1920                       ssgl_comm);}
1921 	  else
1922 	    {
1923 	      type =  type - my_id + dest;
1924 /* 	      crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN); */
1925               MPI_Recv(vals+segs[level-1-edge],stage_n,REAL_TYPE,
1926                        MPI_ANY_SOURCE,type,ssgl_comm,&status);
1927 	    }
1928 	}
1929       mask >>= 1;
1930     }
1931 #else
1932   return;
1933 #endif
1934 }
1935 
1936 
1937 
1938 /***********************************comm.c*************************************
Function: giop_hc()
1940 
1941 Input :
1942 Output:
1943 Return:
1944 Description: fan-in/out version
1945 
1946 note good only for num_nodes=2^k!!!
1947 
1948 ***********************************comm.c*************************************/
1949 void
1950 giop_hc(int *vals, int *work, int n, int *oprs, int dim)
1951 {
1952   register int mask, edge;
1953   int type, dest;
1954   vfp fp;
1955 #if defined MPISRC
1956   MPI_Status  status;
1957 #elif defined NXSRC
1958   int len;
1959 #endif
1960 
1961 
1962 #ifdef SAFE
1963   /* ok ... should have some data, work, and operator(s) */
1964   if (!vals||!work||!oprs)
1965     {error_msg_fatal("giop_hc() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}
1966 
1967   /* non-uniform should have at least two entries */
1968   if ((oprs[0] == NON_UNIFORM)&&(n<2))
1969     {error_msg_fatal("giop_hc() :: non_uniform and n=0,1?");}
1970 
1971   /* check to make sure comm package has been initialized */
1972   if (!p_init)
1973     {comm_init();}
1974 #endif
1975 
1976   /* if there's nothing to do return */
1977   if ((num_nodes<2)||(!n)||(dim<=0))
1978     {return;}
1979 
1980   /* the error msg says it all!!! */
1981   if (modfl_num_nodes)
1982     {error_msg_fatal("giop_hc() :: num_nodes not a power of 2!?!");}
1983 
1984   /* a negative number of items to send ==> fatal */
1985   if (n<0)
1986     {error_msg_fatal("giop_hc() :: n=%d<0?",n);}
1987 
1988   /* can't do more dimensions then exist */
1989   dim = MIN(dim,i_log2_num_nodes);
1990 
1991   /* advance to list of n operations for custom */
1992   if ((type=oprs[0])==NON_UNIFORM)
1993     {oprs++;}
1994 
1995   if ((fp = (vfp) ivec_fct_addr(type)) == NULL)
1996     {
1997       error_msg_warning("giop_hc() :: hope you passed in a rbfp!\n");
1998       fp = (vfp) oprs;
1999     }
2000 
2001 #if  defined NXSRC
2002   /* all msgs will be of the same length */
2003   len = n*INT_LEN;
2004 
2005   /* implement the mesh fan in/out exchange algorithm */
2006   for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
2007     {
2008       dest = my_id^mask;
2009       if (my_id > dest)
2010 	{csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
2011       else
2012 	{crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
2013     }
2014 
2015   /* for (mask=1, edge=1; edge<dim; edge++,mask<<=1) {;} */
2016   if (edge==dim)
2017     {mask>>=1;}
2018   else
2019     {while (++edge<dim) {mask<<=1;}}
2020 
2021   for (edge=0; edge<dim; edge++,mask>>=1)
2022     {
2023       if (my_id%mask)
2024 	{continue;}
2025 
2026       dest = my_id^mask;
2027       if (my_id < dest)
2028 	{csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
2029       else
2030 	{crecv(MSGTAG4+dest,(char *)vals,len);}
2031     }
2032 
2033 #elif defined MPISRC
2034   for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
2035     {
2036       dest = my_id^mask;
2037       if (my_id > dest)
2038 	{MPI_Send(vals,n,INT_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
2039       else
2040 	{
2041 	  MPI_Recv(work,n,INT_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,MPI_COMM_WORLD,
2042 		   &status);
2043 	  (*fp)(vals, work, n, oprs);
2044 	}
2045     }
2046 
2047   if (edge==dim)
2048     {mask>>=1;}
2049   else
2050     {while (++edge<dim) {mask<<=1;}}
2051 
2052   for (edge=0; edge<dim; edge++,mask>>=1)
2053     {
2054       if (my_id%mask)
2055 	{continue;}
2056 
2057       dest = my_id^mask;
2058       if (my_id < dest)
2059 	{MPI_Send(vals,n,INT_TYPE,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
2060       else
2061 	{
2062 	  MPI_Recv(vals,n,INT_TYPE,MPI_ANY_SOURCE,MSGTAG4+dest,MPI_COMM_WORLD,
2063 		   &status);
2064 	}
2065     }
2066 #else
2067   return;
2068 #endif
2069 }
2070