1 2 /***********************************gs.c*************************************** 3 4 Author: Henry M. Tufo III 5 6 e-mail: hmt@cs.brown.edu 7 8 snail-mail: 9 Division of Applied Mathematics 10 Brown University 11 Providence, RI 02912 12 13 Last Modification: 14 6.21.97 15 ************************************gs.c**************************************/ 16 17 /***********************************gs.c*************************************** 18 File Description: 19 ----------------- 20 21 ************************************gs.c**************************************/ 22 23 #include <../src/ksp/pc/impls/tfs/tfs.h> 24 25 /* default length of number of items via tree - doubles if exceeded */ 26 #define TREE_BUF_SZ 2048; 27 #define GS_VEC_SZ 1 28 29 30 31 /***********************************gs.c*************************************** 32 Type: struct gather_scatter_id 33 ------------------------------ 34 35 ************************************gs.c**************************************/ 36 typedef struct gather_scatter_id { 37 PetscInt id; 38 PetscInt nel_min; 39 PetscInt nel_max; 40 PetscInt nel_sum; 41 PetscInt negl; 42 PetscInt gl_max; 43 PetscInt gl_min; 44 PetscInt repeats; 45 PetscInt ordered; 46 PetscInt positive; 47 PetscScalar *vals; 48 49 /* bit mask info */ 50 PetscInt *my_proc_mask; 51 PetscInt mask_sz; 52 PetscInt *ngh_buf; 53 PetscInt ngh_buf_sz; 54 PetscInt *nghs; 55 PetscInt num_nghs; 56 PetscInt max_nghs; 57 PetscInt *pw_nghs; 58 PetscInt num_pw_nghs; 59 PetscInt *tree_nghs; 60 PetscInt num_tree_nghs; 61 62 PetscInt num_loads; 63 64 /* repeats == true -> local info */ 65 PetscInt nel; /* number of unique elememts */ 66 PetscInt *elms; /* of size nel */ 67 PetscInt nel_total; 68 PetscInt *local_elms; /* of size nel_total */ 69 PetscInt *companion; /* of size nel_total */ 70 71 /* local info */ 72 PetscInt num_local_total; 73 PetscInt local_strength; 74 PetscInt num_local; 75 PetscInt *num_local_reduce; 76 PetscInt **local_reduce; 77 PetscInt num_local_gop; 78 PetscInt *num_gop_local_reduce; 79 PetscInt **gop_local_reduce; 80 81 /* pairwise info */ 82 PetscInt level; 83 PetscInt num_pairs; 84 PetscInt max_pairs; 85 PetscInt loc_node_pairs; 86 PetscInt max_node_pairs; 87 PetscInt min_node_pairs; 88 PetscInt avg_node_pairs; 89 PetscInt *pair_list; 90 PetscInt *msg_sizes; 91 PetscInt **node_list; 92 PetscInt len_pw_list; 93 PetscInt *pw_elm_list; 94 PetscScalar *pw_vals; 95 96 MPI_Request *msg_ids_in; 97 MPI_Request *msg_ids_out; 98 99 PetscScalar *out; 100 PetscScalar *in; 101 PetscInt msg_total; 102 103 /* tree - crystal accumulator info */ 104 PetscInt max_left_over; 105 PetscInt *pre; 106 PetscInt *in_num; 107 PetscInt *out_num; 108 PetscInt **in_list; 109 PetscInt **out_list; 110 111 /* new tree work*/ 112 PetscInt tree_nel; 113 PetscInt *tree_elms; 114 PetscScalar *tree_buf; 115 PetscScalar *tree_work; 116 117 PetscInt tree_map_sz; 118 PetscInt *tree_map_in; 119 PetscInt *tree_map_out; 120 121 /* current memory status */ 122 PetscInt gl_bss_min; 123 PetscInt gl_perm_min; 124 125 /* max segment size for PCTFS_gs_gop_vec() */ 126 PetscInt vec_sz; 127 128 /* hack to make paul happy */ 129 MPI_Comm PCTFS_gs_comm; 130 131 } PCTFS_gs_id; 132 133 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level); 134 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs); 135 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs); 136 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs); 137 static PCTFS_gs_id * gsi_new(void); 138 static PetscErrorCode set_tree(PCTFS_gs_id *gs); 139 140 /* same for all but vector flavor */ 141 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals); 142 /* vector flavor */ 143 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 144 145 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 146 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 147 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 148 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 149 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 150 151 152 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals); 153 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals); 154 155 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 156 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 157 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim); 158 159 /* global vars */ 160 /* from comm.c module */ 161 162 static PetscInt num_gs_ids = 0; 163 164 /* should make this dynamic ... later */ 165 static PetscInt msg_buf=MAX_MSG_BUF; 166 static PetscInt vec_sz=GS_VEC_SZ; 167 static PetscInt *tree_buf=NULL; 168 static PetscInt tree_buf_sz=0; 169 static PetscInt ntree=0; 170 171 /***************************************************************************/ 172 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) 173 { 174 PetscFunctionBegin; 175 vec_sz = size; 176 PetscFunctionReturn(0); 177 } 178 179 /******************************************************************************/ 180 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) 181 { 182 PetscFunctionBegin; 183 msg_buf = buf_size; 184 PetscFunctionReturn(0); 185 } 186 187 /******************************************************************************/ 188 PCTFS_gs_id *PCTFS_gs_init( PetscInt *elms, PetscInt nel, PetscInt level) 189 { 190 PCTFS_gs_id *gs; 191 MPI_Group PCTFS_gs_group; 192 MPI_Comm PCTFS_gs_comm; 193 PetscErrorCode ierr; 194 195 PetscFunctionBegin; 196 /* ensure that communication package has been initialized */ 197 PCTFS_comm_init(); 198 199 200 /* determines if we have enough dynamic/semi-static memory */ 201 /* checks input, allocs and sets gd_id template */ 202 gs = gsi_check_args(elms,nel,level); 203 204 /* only bit mask version up and working for the moment */ 205 /* LATER :: get int list version working for sparse pblms */ 206 ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 207 208 209 ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr); 210 ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr); 211 gs->PCTFS_gs_comm=PCTFS_gs_comm; 212 213 return(gs); 214 } 215 216 /******************************************************************************/ 217 static PCTFS_gs_id *gsi_new(void) 218 { 219 PetscErrorCode ierr; 220 PCTFS_gs_id *gs; 221 gs = (PCTFS_gs_id *) malloc(sizeof(PCTFS_gs_id)); 222 ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr); 223 return(gs); 224 } 225 226 /******************************************************************************/ 227 static PCTFS_gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) 228 { 229 PetscInt i, j, k, t2; 230 PetscInt *companion, *elms, *unique, *iptr; 231 PetscInt num_local=0, *num_to_reduce, **local_reduce; 232 PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND}; 233 PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1]; 234 PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1]; 235 PCTFS_gs_id *gs; 236 PetscErrorCode ierr; 237 238 239 if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n"); 240 if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n"); 241 242 if (nel==0) { ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 243 244 /* get space for gs template */ 245 gs = gsi_new(); 246 gs->id = ++num_gs_ids; 247 248 /* hmt 6.4.99 */ 249 /* caller can set global ids that don't participate to 0 */ 250 /* PCTFS_gs_init ignores all zeros in elm list */ 251 /* negative global ids are still invalid */ 252 for (i=j=0;i<nel;i++) { if (in_elms[i]!=0) {j++;} } 253 254 k=nel; nel=j; 255 256 /* copy over in_elms list and create inverse map */ 257 elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt)); 258 companion = (PetscInt*) malloc(nel*sizeof(PetscInt)); 259 260 for (i=j=0;i<k;i++) { 261 if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; } 262 } 263 264 if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n"); 265 266 /* pre-pass ... check to see if sorted */ 267 elms[nel] = INT_MAX; 268 iptr = elms; 269 unique = elms+1; 270 j=0; 271 while (*iptr!=INT_MAX) { 272 if (*iptr++>*unique++) { j=1; break; } 273 } 274 275 /* set up inverse map */ 276 if (j) { 277 ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); 278 ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr); 279 } 280 else { ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 281 elms[nel] = INT_MIN; 282 283 /* first pass */ 284 /* determine number of unique elements, check pd */ 285 for (i=k=0;i<nel;i+=j) { 286 t2 = elms[i]; 287 j=++i; 288 289 /* clump 'em for now */ 290 while (elms[j]==t2) { j++; } 291 292 /* how many together and num local */ 293 if (j-=i) { num_local++; k+=j; } 294 } 295 296 /* how many unique elements? */ 297 gs->repeats=k; 298 gs->nel = nel-k; 299 300 301 /* number of repeats? */ 302 gs->num_local = num_local; 303 num_local+=2; 304 gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*)); 305 gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt)); 306 307 unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt)); 308 gs->elms = unique; 309 gs->nel_total = nel; 310 gs->local_elms = elms; 311 gs->companion = companion; 312 313 /* compess map as well as keep track of local ops */ 314 for (num_local=i=j=0;i<gs->nel;i++) { 315 k=j; 316 t2 = unique[i] = elms[j]; 317 companion[i] = companion[j]; 318 319 while (elms[j]==t2) { j++; } 320 321 if ((t2=(j-k))>1) { 322 /* number together */ 323 num_to_reduce[num_local] = t2++; 324 iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt)); 325 326 /* to use binary searching don't remap until we check intersection */ 327 *iptr++ = i; 328 329 /* note that we're skipping the first one */ 330 while (++k<j) { *(iptr++) = companion[k]; } 331 *iptr = -1; 332 } 333 } 334 335 /* sentinel for ngh_buf */ 336 unique[gs->nel]=INT_MAX; 337 338 /* for two partition sort hack */ 339 num_to_reduce[num_local] = 0; 340 local_reduce[num_local] = NULL; 341 num_to_reduce[++num_local] = 0; 342 local_reduce[num_local] = NULL; 343 344 /* load 'em up */ 345 /* note one extra to hold NON_UNIFORM flag!!! */ 346 vals[2] = vals[1] = vals[0] = nel; 347 if (gs->nel>0) { 348 vals[3] = unique[0]; 349 vals[4] = unique[gs->nel-1]; 350 } else { 351 vals[3] = INT_MAX; 352 vals[4] = INT_MIN; 353 } 354 vals[5] = level; 355 vals[6] = num_gs_ids; 356 357 /* GLOBAL: send 'em out */ 358 ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 359 360 /* must be semi-pos def - only pairwise depends on this */ 361 /* LATER - remove this restriction */ 362 if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n"); 363 if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n"); 364 365 gs->nel_min = vals[0]; 366 gs->nel_max = vals[1]; 367 gs->nel_sum = vals[2]; 368 gs->gl_min = vals[3]; 369 gs->gl_max = vals[4]; 370 gs->negl = vals[4]-vals[3]+1; 371 372 if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n"); 373 374 /* LATER :: add level == -1 -> program selects level */ 375 if (vals[5]<0) { vals[5]=0; } 376 else if (vals[5]>PCTFS_num_nodes) { vals[5]=PCTFS_num_nodes; } 377 gs->level = vals[5]; 378 379 return(gs); 380 } 381 382 /******************************************************************************/ 383 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) 384 { 385 PetscInt i, nel, *elms; 386 PetscInt t1; 387 PetscInt **reduce; 388 PetscInt *map; 389 PetscErrorCode ierr; 390 391 PetscFunctionBegin; 392 /* totally local removes ... PCTFS_ct_bits == 0 */ 393 get_ngh_buf(gs); 394 395 if (gs->level) set_pairwise(gs); 396 if (gs->max_left_over) set_tree(gs); 397 398 /* intersection local and pairwise/tree? */ 399 gs->num_local_total = gs->num_local; 400 gs->gop_local_reduce = gs->local_reduce; 401 gs->num_gop_local_reduce = gs->num_local_reduce; 402 403 map = gs->companion; 404 405 /* is there any local compression */ 406 if (!gs->num_local) { 407 gs->local_strength = NONE; 408 gs->num_local_gop = 0; 409 } else { 410 /* ok find intersection */ 411 map = gs->companion; 412 reduce = gs->local_reduce; 413 for (i=0, t1=0; i<gs->num_local; i++, reduce++) 414 { 415 if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) 416 || 417 PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) { 418 t1++; 419 if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?"); 420 gs->num_local_reduce[i] *= -1; 421 } 422 **reduce=map[**reduce]; 423 } 424 425 /* intersection is empty */ 426 if (!t1) { 427 gs->local_strength = FULL; 428 gs->num_local_gop = 0; 429 } else { /* intersection not empty */ 430 gs->local_strength = PARTIAL; 431 ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr); 432 433 gs->num_local_gop = t1; 434 gs->num_local_total = gs->num_local; 435 gs->num_local -= t1; 436 gs->gop_local_reduce = gs->local_reduce; 437 gs->num_gop_local_reduce = gs->num_local_reduce; 438 439 for (i=0; i<t1; i++) 440 { 441 if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?"); 442 gs->num_gop_local_reduce[i] *= -1; 443 gs->local_reduce++; 444 gs->num_local_reduce++; 445 } 446 gs->local_reduce++; 447 gs->num_local_reduce++; 448 } 449 } 450 451 elms = gs->pw_elm_list; 452 nel = gs->len_pw_list; 453 for (i=0; i<nel; i++) { elms[i] = map[elms[i]]; } 454 455 elms = gs->tree_map_in; 456 nel = gs->tree_map_sz; 457 for (i=0; i<nel; i++) { elms[i] = map[elms[i]]; } 458 459 /* clean up */ 460 free((void*) gs->local_elms); 461 free((void*) gs->companion); 462 free((void*) gs->elms); 463 free((void*) gs->ngh_buf); 464 gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL; 465 PetscFunctionReturn(0); 466 } 467 468 /******************************************************************************/ 469 static PetscErrorCode place_in_tree( PetscInt elm) 470 { 471 PetscInt *tp, n; 472 473 PetscFunctionBegin; 474 if (ntree==tree_buf_sz) 475 { 476 if (tree_buf_sz) { 477 tp = tree_buf; 478 n = tree_buf_sz; 479 tree_buf_sz<<=1; 480 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 481 PCTFS_ivec_copy(tree_buf,tp,n); 482 free(tp); 483 } else { 484 tree_buf_sz = TREE_BUF_SZ; 485 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 486 } 487 } 488 489 tree_buf[ntree++] = elm; 490 PetscFunctionReturn(0); 491 } 492 493 /******************************************************************************/ 494 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) 495 { 496 PetscInt i, j, npw=0, ntree_map=0; 497 PetscInt p_mask_size, ngh_buf_size, buf_size; 498 PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask; 499 PetscInt *ngh_buf, *buf1, *buf2; 500 PetscInt offset, per_load, num_loads, or_ct, start, end; 501 PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms; 502 PetscInt oper=GL_B_OR; 503 PetscInt *ptr3, *t_mask, level, ct1, ct2; 504 PetscErrorCode ierr; 505 506 PetscFunctionBegin; 507 /* to make life easier */ 508 nel = gs->nel; 509 elms = gs->elms; 510 level = gs->level; 511 512 /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */ 513 p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes)); 514 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 515 516 /* allocate space for masks and info bufs */ 517 gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size); 518 gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size); 519 gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel; 520 t_mask = (PetscInt*) malloc(p_mask_size); 521 gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size); 522 523 /* comm buffer size ... memory usage bounded by ~2*msg_buf */ 524 /* had thought I could exploit rendezvous threshold */ 525 526 /* default is one pass */ 527 per_load = negl = gs->negl; 528 gs->num_loads = num_loads = 1; 529 i=p_mask_size*negl; 530 531 /* possible overflow on buffer size */ 532 /* overflow hack */ 533 if (i<0) {i=INT_MAX;} 534 535 buf_size = PetscMin(msg_buf,i); 536 537 /* can we do it? */ 538 if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size); 539 540 /* get PCTFS_giop buf space ... make *only* one malloc */ 541 buf1 = (PetscInt*) malloc(buf_size<<1); 542 543 /* more than one gior exchange needed? */ 544 if (buf_size!=i) { 545 per_load = buf_size/p_mask_size; 546 buf_size = per_load*p_mask_size; 547 gs->num_loads = num_loads = negl/per_load + (negl%per_load>0); 548 } 549 550 551 /* convert buf sizes from #bytes to #ints - 32 bit only! */ 552 p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt); 553 554 /* find PCTFS_giop work space */ 555 buf2 = buf1+buf_size; 556 557 /* hold #ints needed for processor masks */ 558 gs->mask_sz=p_mask_size; 559 560 /* init buffers */ 561 ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr); 562 ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr); 563 ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr); 564 565 /* HACK reset tree info */ 566 tree_buf=NULL; 567 tree_buf_sz=ntree=0; 568 569 /* ok do it */ 570 for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) { 571 /* identity for bitwise or is 000...000 */ 572 PCTFS_ivec_zero(buf1,buf_size); 573 574 /* load msg buffer */ 575 for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) { 576 offset = (offset-start)*p_mask_size; 577 PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size); 578 } 579 580 /* GLOBAL: pass buffer */ 581 ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr); 582 583 584 /* unload buffer into ngh_buf */ 585 ptr2=(elms+i_start); 586 for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) { 587 /* I own it ... may have to pairwise it */ 588 if (j==*ptr2) { 589 /* do i share it w/anyone? */ 590 ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt)); 591 /* guess not */ 592 if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; } 593 594 /* i do ... so keep info and turn off my bit */ 595 PCTFS_ivec_copy(ptr1,ptr3,p_mask_size); 596 ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr); 597 ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 598 599 /* is it to be done pairwise? */ 600 if (--ct1<=level) { 601 npw++; 602 603 /* turn on high bit to indicate pw need to process */ 604 *ptr2++ |= TOP_BIT; 605 ierr = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 606 ptr1+=p_mask_size; 607 continue; 608 } 609 610 /* get set for next and note that I have a tree contribution */ 611 /* could save exact elm index for tree here -> save a search */ 612 ptr2++; ptr1+=p_mask_size; ntree_map++; 613 } else { /* i don't but still might be involved in tree */ 614 615 /* shared by how many? */ 616 ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt)); 617 618 /* none! */ 619 if (ct1<2) continue; 620 621 /* is it going to be done pairwise? but not by me of course!*/ 622 if (--ct1<=level) continue; 623 } 624 /* LATER we're going to have to process it NOW */ 625 /* nope ... tree it */ 626 ierr = place_in_tree(j);CHKERRQ(ierr); 627 } 628 } 629 630 free((void*)t_mask); 631 free((void*)buf1); 632 633 gs->len_pw_list=npw; 634 gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 635 636 /* expand from bit mask list to int list and save ngh list */ 637 gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt)); 638 PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs); 639 640 gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt)); 641 642 oper = GL_MAX; 643 ct1 = gs->num_nghs; 644 ierr = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr); 645 gs->max_nghs = ct1; 646 647 gs->tree_map_sz = ntree_map; 648 gs->max_left_over=ntree; 649 650 free((void*)p_mask); 651 free((void*)sh_proc_mask); 652 PetscFunctionReturn(0); 653 } 654 655 /******************************************************************************/ 656 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) 657 { 658 PetscInt i, j; 659 PetscInt p_mask_size; 660 PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask; 661 PetscInt *ngh_buf, *buf2; 662 PetscInt offset; 663 PetscInt *msg_list, *msg_size, **msg_nodes, nprs; 664 PetscInt *pairwise_elm_list, len_pair_list=0; 665 PetscInt *iptr, t1, i_start, nel, *elms; 666 PetscInt ct; 667 PetscErrorCode ierr; 668 669 PetscFunctionBegin; 670 /* to make life easier */ 671 nel = gs->nel; 672 elms = gs->elms; 673 ngh_buf = gs->ngh_buf; 674 sh_proc_mask = gs->pw_nghs; 675 676 /* need a few temp masks */ 677 p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes); 678 p_mask = (PetscInt*) malloc(p_mask_size); 679 tmp_proc_mask = (PetscInt*) malloc(p_mask_size); 680 681 /* set mask to my PCTFS_my_id's bit mask */ 682 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 683 684 p_mask_size /= sizeof(PetscInt); 685 686 len_pair_list=gs->len_pw_list; 687 gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt)); 688 689 /* how many processors (nghs) do we have to exchange with? */ 690 nprs=gs->num_pairs=PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 691 692 693 /* allocate space for PCTFS_gs_gop() info */ 694 gs->pair_list = msg_list = (PetscInt *) malloc(sizeof(PetscInt)*nprs); 695 gs->msg_sizes = msg_size = (PetscInt *) malloc(sizeof(PetscInt)*nprs); 696 gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1)); 697 698 /* init msg_size list */ 699 ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr); 700 701 /* expand from bit mask list to int list */ 702 ierr = PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr); 703 704 /* keep list of elements being handled pairwise */ 705 for (i=j=0;i<nel;i++) { 706 if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; } 707 } 708 pairwise_elm_list[j] = -1; 709 710 gs->msg_ids_out = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1)); 711 gs->msg_ids_out[nprs] = MPI_REQUEST_NULL; 712 gs->msg_ids_in = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1)); 713 gs->msg_ids_in[nprs] = MPI_REQUEST_NULL; 714 gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz); 715 716 /* find who goes to each processor */ 717 for (i_start=i=0;i<nprs;i++) { 718 /* processor i's mask */ 719 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr); 720 721 /* det # going to processor i */ 722 for (ct=j=0;j<len_pair_list;j++) { 723 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 724 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 725 if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) { ct++; } 726 } 727 msg_size[i] = ct; 728 i_start = PetscMax(i_start,ct); 729 730 /*space to hold nodes in message to first neighbor */ 731 msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1)); 732 733 for (j=0;j<len_pair_list;j++) { 734 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 735 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 736 if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) { *iptr++ = j; } 737 } 738 *iptr = -1; 739 } 740 msg_nodes[nprs] = NULL; 741 742 j=gs->loc_node_pairs=i_start; 743 t1 = GL_MAX; 744 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 745 gs->max_node_pairs = i_start; 746 747 i_start=j; 748 t1 = GL_MIN; 749 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 750 gs->min_node_pairs = i_start; 751 752 i_start=j; 753 t1 = GL_ADD; 754 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 755 gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1; 756 757 i_start=nprs; 758 t1 = GL_MAX; 759 PCTFS_giop(&i_start,&offset,1,&t1); 760 gs->max_pairs = i_start; 761 762 763 /* remap pairwise in tail of gsi_via_bit_mask() */ 764 gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs); 765 gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 766 gs->in = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 767 768 /* reset malloc pool */ 769 free((void*)p_mask); 770 free((void*)tmp_proc_mask); 771 PetscFunctionReturn(0); 772 } 773 774 /* to do pruned tree just save ngh buf copy for each one and decode here! 775 ******************************************************************************/ 776 static PetscErrorCode set_tree(PCTFS_gs_id *gs) 777 { 778 PetscInt i, j, n, nel; 779 PetscInt *iptr_in, *iptr_out, *tree_elms, *elms; 780 781 PetscFunctionBegin; 782 /* local work ptrs */ 783 elms = gs->elms; 784 nel = gs->nel; 785 786 /* how many via tree */ 787 gs->tree_nel = n = ntree; 788 gs->tree_elms = tree_elms = iptr_in = tree_buf; 789 gs->tree_buf = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz); 790 gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz); 791 j=gs->tree_map_sz; 792 gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 793 gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 794 795 /* search the longer of the two lists */ 796 /* note ... could save this info in get_ngh_buf and save searches */ 797 if (n<=nel) { 798 /* bijective fct w/remap - search elm list */ 799 for (i=0; i<n; i++) { 800 if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;} 801 } 802 } else { 803 for (i=0; i<nel; i++) { 804 if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;} 805 } 806 } 807 808 /* sentinel */ 809 *iptr_in = *iptr_out = -1; 810 PetscFunctionReturn(0); 811 } 812 813 /******************************************************************************/ 814 static PetscErrorCode PCTFS_gs_gop_local_out( PCTFS_gs_id *gs, PetscScalar *vals) 815 { 816 PetscInt *num, *map, **reduce; 817 PetscScalar tmp; 818 819 PetscFunctionBegin; 820 num = gs->num_gop_local_reduce; 821 reduce = gs->gop_local_reduce; 822 while ((map = *reduce++)) { 823 /* wall */ 824 if (*num == 2) { 825 num ++; 826 vals[map[1]] = vals[map[0]]; 827 } else if (*num == 3) { /* corner shared by three elements */ 828 num ++; 829 vals[map[2]] = vals[map[1]] = vals[map[0]]; 830 } else if (*num == 4) { /* corner shared by four elements */ 831 num ++; 832 vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]]; 833 } else { /* general case ... odd geoms ... 3D*/ 834 num++; 835 tmp = *(vals + *map++); 836 while (*map >= 0) { *(vals + *map++) = tmp; } 837 } 838 } 839 PetscFunctionReturn(0); 840 } 841 842 /******************************************************************************/ 843 static PetscErrorCode PCTFS_gs_gop_local_plus( PCTFS_gs_id *gs, PetscScalar *vals) 844 { 845 PetscInt *num, *map, **reduce; 846 PetscScalar tmp; 847 848 PetscFunctionBegin; 849 num = gs->num_local_reduce; 850 reduce = gs->local_reduce; 851 while ((map = *reduce)) { 852 /* wall */ 853 if (*num == 2) { 854 num ++; reduce++; 855 vals[map[1]] = vals[map[0]] += vals[map[1]]; 856 } else if (*num == 3) { /* corner shared by three elements */ 857 num ++; reduce++; 858 vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]); 859 } else if (*num == 4) { /* corner shared by four elements */ 860 num ++; reduce++; 861 vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += 862 (vals[map[1]] + vals[map[2]] + vals[map[3]]); 863 } else { /* general case ... odd geoms ... 3D*/ 864 num ++; 865 tmp = 0.0; 866 while (*map >= 0) {tmp += *(vals + *map++);} 867 868 map = *reduce++; 869 while (*map >= 0) {*(vals + *map++) = tmp;} 870 } 871 } 872 PetscFunctionReturn(0); 873 } 874 875 /******************************************************************************/ 876 static PetscErrorCode PCTFS_gs_gop_local_in_plus( PCTFS_gs_id *gs, PetscScalar *vals) 877 { 878 PetscInt *num, *map, **reduce; 879 PetscScalar *base; 880 881 PetscFunctionBegin; 882 num = gs->num_gop_local_reduce; 883 reduce = gs->gop_local_reduce; 884 while ((map = *reduce++)) { 885 /* wall */ 886 if (*num == 2) { 887 num ++; 888 vals[map[0]] += vals[map[1]]; 889 } else if (*num == 3) { /* corner shared by three elements */ 890 num ++; 891 vals[map[0]] += (vals[map[1]] + vals[map[2]]); 892 } else if (*num == 4) { /* corner shared by four elements */ 893 num ++; 894 vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 895 } else { /* general case ... odd geoms ... 3D*/ 896 num++; 897 base = vals + *map++; 898 while (*map >= 0) {*base += *(vals + *map++);} 899 } 900 } 901 PetscFunctionReturn(0); 902 } 903 904 /******************************************************************************/ 905 PetscErrorCode PCTFS_gs_free( PCTFS_gs_id *gs) 906 { 907 PetscInt i; 908 909 PetscFunctionBegin; 910 if (gs->nghs) { free((void*) gs->nghs); } 911 if (gs->pw_nghs) { free((void*) gs->pw_nghs); } 912 913 /* tree */ 914 if (gs->max_left_over) 915 { 916 if (gs->tree_elms) { free((void*) gs->tree_elms); } 917 if (gs->tree_buf) { free((void*) gs->tree_buf); } 918 if (gs->tree_work) { free((void*) gs->tree_work); } 919 if (gs->tree_map_in) { free((void*) gs->tree_map_in); } 920 if (gs->tree_map_out) { free((void*) gs->tree_map_out); } 921 } 922 923 /* pairwise info */ 924 if (gs->num_pairs) 925 { 926 /* should be NULL already */ 927 if (gs->ngh_buf) { free((void*) gs->ngh_buf); } 928 if (gs->elms) { free((void*) gs->elms); } 929 if (gs->local_elms) { free((void*) gs->local_elms); } 930 if (gs->companion) { free((void*) gs->companion); } 931 932 /* only set if pairwise */ 933 if (gs->vals) { free((void*) gs->vals); } 934 if (gs->in) { free((void*) gs->in); } 935 if (gs->out) { free((void*) gs->out); } 936 if (gs->msg_ids_in) { free((void*) gs->msg_ids_in); } 937 if (gs->msg_ids_out) { free((void*) gs->msg_ids_out); } 938 if (gs->pw_vals) { free((void*) gs->pw_vals); } 939 if (gs->pw_elm_list) { free((void*) gs->pw_elm_list); } 940 if (gs->node_list) { 941 for (i=0;i<gs->num_pairs;i++) { 942 if (gs->node_list[i]) { 943 free((void*) gs->node_list[i]); 944 } 945 } 946 free((void*) gs->node_list); 947 } 948 if (gs->msg_sizes) { free((void*) gs->msg_sizes); } 949 if (gs->pair_list) { free((void*) gs->pair_list); } 950 } 951 952 /* local info */ 953 if (gs->num_local_total>=0) { 954 /* for (i=0;i<gs->num_local_total;i++) */ 955 for (i=0;i<gs->num_local_total+1;i++) { 956 if (gs->num_gop_local_reduce[i]) { free((void*) gs->gop_local_reduce[i]); } 957 } 958 } 959 960 /* if intersection tree/pairwise and local isn't empty */ 961 if (gs->gop_local_reduce) { free((void*) gs->gop_local_reduce); } 962 if (gs->num_gop_local_reduce) { free((void*) gs->num_gop_local_reduce); } 963 964 free((void*) gs); 965 PetscFunctionReturn(0); 966 } 967 968 /******************************************************************************/ 969 PetscErrorCode PCTFS_gs_gop_vec( PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) 970 { 971 PetscErrorCode ierr; 972 973 PetscFunctionBegin; 974 switch (*op) { 975 case '+': 976 PCTFS_gs_gop_vec_plus(gs,vals,step); 977 break; 978 default: 979 ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op",op[0]);CHKERRQ(ierr); 980 ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus");CHKERRQ(ierr); 981 PCTFS_gs_gop_vec_plus(gs,vals,step); 982 break; 983 } 984 PetscFunctionReturn(0); 985 } 986 987 /******************************************************************************/ 988 static PetscErrorCode PCTFS_gs_gop_vec_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 989 { 990 PetscFunctionBegin; 991 if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!"); 992 993 /* local only operations!!! */ 994 if (gs->num_local) { PCTFS_gs_gop_vec_local_plus(gs,vals,step); } 995 996 /* if intersection tree/pairwise and local isn't empty */ 997 if (gs->num_local_gop) 998 { 999 PCTFS_gs_gop_vec_local_in_plus(gs,vals,step); 1000 1001 /* pairwise */ 1002 if (gs->num_pairs) { PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); } 1003 1004 /* tree */ 1005 else if (gs->max_left_over) { PCTFS_gs_gop_vec_tree_plus(gs,vals,step); } 1006 1007 PCTFS_gs_gop_vec_local_out(gs,vals,step); 1008 } else { /* if intersection tree/pairwise and local is empty */ 1009 /* pairwise */ 1010 if (gs->num_pairs) { PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); } 1011 1012 /* tree */ 1013 else if (gs->max_left_over) { PCTFS_gs_gop_vec_tree_plus(gs,vals,step); } 1014 } 1015 PetscFunctionReturn(0); 1016 } 1017 1018 /******************************************************************************/ 1019 static PetscErrorCode PCTFS_gs_gop_vec_local_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1020 { 1021 PetscInt *num, *map, **reduce; 1022 PetscScalar *base; 1023 1024 PetscFunctionBegin; 1025 num = gs->num_local_reduce; 1026 reduce = gs->local_reduce; 1027 while ((map = *reduce)) { 1028 base = vals + map[0] * step; 1029 1030 /* wall */ 1031 if (*num == 2) { 1032 num++; reduce++; 1033 PCTFS_rvec_add (base,vals+map[1]*step,step); 1034 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1035 } else if (*num == 3) { /* corner shared by three elements */ 1036 num++; reduce++; 1037 PCTFS_rvec_add (base,vals+map[1]*step,step); 1038 PCTFS_rvec_add (base,vals+map[2]*step,step); 1039 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1040 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1041 } else if (*num == 4) { /* corner shared by four elements */ 1042 num++; reduce++; 1043 PCTFS_rvec_add (base,vals+map[1]*step,step); 1044 PCTFS_rvec_add (base,vals+map[2]*step,step); 1045 PCTFS_rvec_add (base,vals+map[3]*step,step); 1046 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1047 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1048 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1049 } else { /* general case ... odd geoms ... 3D */ 1050 num++; 1051 while (*++map >= 0) {PCTFS_rvec_add (base,vals+*map*step,step);} 1052 1053 map = *reduce; 1054 while (*++map >= 0) {PCTFS_rvec_copy(vals+*map*step,base,step);} 1055 1056 reduce++; 1057 } 1058 } 1059 PetscFunctionReturn(0); 1060 } 1061 1062 /******************************************************************************/ 1063 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1064 { 1065 PetscInt *num, *map, **reduce; 1066 PetscScalar *base; 1067 1068 PetscFunctionBegin; 1069 num = gs->num_gop_local_reduce; 1070 reduce = gs->gop_local_reduce; 1071 while ((map = *reduce++)) { 1072 base = vals + map[0] * step; 1073 1074 /* wall */ 1075 if (*num == 2) { 1076 num ++; 1077 PCTFS_rvec_add(base,vals+map[1]*step,step); 1078 } else if (*num == 3) { /* corner shared by three elements */ 1079 num ++; 1080 PCTFS_rvec_add(base,vals+map[1]*step,step); 1081 PCTFS_rvec_add(base,vals+map[2]*step,step); 1082 } else if (*num == 4) { /* corner shared by four elements */ 1083 num ++; 1084 PCTFS_rvec_add(base,vals+map[1]*step,step); 1085 PCTFS_rvec_add(base,vals+map[2]*step,step); 1086 PCTFS_rvec_add(base,vals+map[3]*step,step); 1087 } else { /* general case ... odd geoms ... 3D*/ 1088 num++; 1089 while (*++map >= 0) {PCTFS_rvec_add(base,vals+*map*step,step);} 1090 } 1091 } 1092 PetscFunctionReturn(0); 1093 } 1094 1095 /******************************************************************************/ 1096 static PetscErrorCode PCTFS_gs_gop_vec_local_out( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1097 { 1098 PetscInt *num, *map, **reduce; 1099 PetscScalar *base; 1100 1101 PetscFunctionBegin; 1102 num = gs->num_gop_local_reduce; 1103 reduce = gs->gop_local_reduce; 1104 while ((map = *reduce++)) { 1105 base = vals + map[0] * step; 1106 1107 /* wall */ 1108 if (*num == 2) { 1109 num ++; 1110 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1111 } else if (*num == 3) { /* corner shared by three elements */ 1112 num ++; 1113 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1114 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1115 } else if (*num == 4) { /* corner shared by four elements */ 1116 num ++; 1117 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1118 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1119 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1120 } else { /* general case ... odd geoms ... 3D*/ 1121 num++; 1122 while (*++map >= 0) {PCTFS_rvec_copy(vals+*map*step,base,step);} 1123 } 1124 } 1125 PetscFunctionReturn(0); 1126 } 1127 1128 /******************************************************************************/ 1129 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus( PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) 1130 { 1131 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1132 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1133 PetscInt *pw, *list, *size, **nodes; 1134 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1135 MPI_Status status; 1136 PetscBLASInt i1 = 1,dstep; 1137 PetscErrorCode ierr; 1138 1139 PetscFunctionBegin; 1140 /* strip and load s */ 1141 msg_list =list = gs->pair_list; 1142 msg_size =size = gs->msg_sizes; 1143 msg_nodes=nodes = gs->node_list; 1144 iptr=pw = gs->pw_elm_list; 1145 dptr1=dptr3 = gs->pw_vals; 1146 msg_ids_in = ids_in = gs->msg_ids_in; 1147 msg_ids_out = ids_out = gs->msg_ids_out; 1148 dptr2 = gs->out; 1149 in1=in2 = gs->in; 1150 1151 /* post the receives */ 1152 /* msg_nodes=nodes; */ 1153 do { 1154 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1155 second one *list and do list++ afterwards */ 1156 ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr); 1157 list++;msg_ids_in++; 1158 in1 += *size++ *step; 1159 } 1160 while (*++msg_nodes); 1161 msg_nodes=nodes; 1162 1163 /* load gs values into in out gs buffers */ 1164 while (*iptr >= 0) { 1165 PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step); 1166 dptr3+=step; 1167 iptr++; 1168 } 1169 1170 /* load out buffers and post the sends */ 1171 while ((iptr = *msg_nodes++)) { 1172 dptr3 = dptr2; 1173 while (*iptr >= 0) { 1174 PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step); 1175 dptr2+=step; 1176 iptr++; 1177 } 1178 ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr); 1179 msg_size++; msg_list++;msg_ids_out++; 1180 } 1181 1182 /* tree */ 1183 if (gs->max_left_over) { PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step); } 1184 1185 /* process the received data */ 1186 msg_nodes=nodes; 1187 while ((iptr = *nodes++)) { 1188 PetscScalar d1 = 1.0; 1189 1190 /* Should I check the return value of MPI_Wait() or status? */ 1191 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1192 ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr); 1193 ids_in++; 1194 while (*iptr >= 0) { 1195 dstep = PetscBLASIntCast(step); 1196 BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1); 1197 in2+=step; 1198 iptr++; 1199 } 1200 } 1201 1202 /* replace vals */ 1203 while (*pw >= 0) { 1204 PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step); 1205 dptr1+=step; 1206 pw++; 1207 } 1208 1209 /* clear isend message handles */ 1210 /* This changed for clarity though it could be the same */ 1211 1212 /* Should I check the return value of MPI_Wait() or status? */ 1213 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1214 while (*msg_nodes++) {ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);ids_out++;} 1215 PetscFunctionReturn(0); 1216 } 1217 1218 /******************************************************************************/ 1219 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1220 { 1221 PetscInt size, *in, *out; 1222 PetscScalar *buf, *work; 1223 PetscInt op[] = {GL_ADD,0}; 1224 PetscBLASInt i1 = 1; 1225 1226 PetscFunctionBegin; 1227 /* copy over to local variables */ 1228 in = gs->tree_map_in; 1229 out = gs->tree_map_out; 1230 buf = gs->tree_buf; 1231 work = gs->tree_work; 1232 size = gs->tree_nel*step; 1233 1234 /* zero out collection buffer */ 1235 PCTFS_rvec_zero(buf,size); 1236 1237 1238 /* copy over my contributions */ 1239 while (*in >= 0) { 1240 PetscBLASInt dstep = PetscBLASIntCast(step); 1241 BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1); 1242 } 1243 1244 /* perform fan in/out on full buffer */ 1245 /* must change PCTFS_grop to handle the blas */ 1246 PCTFS_grop(buf,work,size,op); 1247 1248 /* reset */ 1249 in = gs->tree_map_in; 1250 out = gs->tree_map_out; 1251 1252 /* get the portion of the results I need */ 1253 while (*in >= 0) { 1254 PetscBLASInt dstep = PetscBLASIntCast(step); 1255 BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1); 1256 } 1257 PetscFunctionReturn(0); 1258 } 1259 1260 /******************************************************************************/ 1261 PetscErrorCode PCTFS_gs_gop_hc( PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) 1262 { 1263 PetscErrorCode ierr; 1264 1265 PetscFunctionBegin; 1266 switch (*op) { 1267 case '+': 1268 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1269 break; 1270 default: 1271 ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op",op[0]);CHKERRQ(ierr); 1272 ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr); 1273 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1274 break; 1275 } 1276 PetscFunctionReturn(0); 1277 } 1278 1279 /******************************************************************************/ 1280 static PetscErrorCode PCTFS_gs_gop_plus_hc( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1281 { 1282 PetscFunctionBegin; 1283 /* if there's nothing to do return */ 1284 if (dim<=0) { PetscFunctionReturn(0); } 1285 1286 /* can't do more dimensions then exist */ 1287 dim = PetscMin(dim,PCTFS_i_log2_num_nodes); 1288 1289 /* local only operations!!! */ 1290 if (gs->num_local) {PCTFS_gs_gop_local_plus(gs,vals);} 1291 1292 /* if intersection tree/pairwise and local isn't empty */ 1293 if (gs->num_local_gop) { 1294 PCTFS_gs_gop_local_in_plus(gs,vals); 1295 1296 /* pairwise will do tree inside ... */ 1297 if (gs->num_pairs) { PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); } 1298 /* tree only */ 1299 else if (gs->max_left_over) { PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); } 1300 1301 PCTFS_gs_gop_local_out(gs,vals); 1302 } else { /* if intersection tree/pairwise and local is empty */ 1303 /* pairwise will do tree inside */ 1304 if (gs->num_pairs) { PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); } 1305 /* tree */ 1306 else if (gs->max_left_over) { PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); } 1307 } 1308 PetscFunctionReturn(0); 1309 } 1310 1311 /******************************************************************************/ 1312 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc( PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) 1313 { 1314 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1315 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1316 PetscInt *pw, *list, *size, **nodes; 1317 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1318 MPI_Status status; 1319 PetscInt i, mask=1; 1320 PetscErrorCode ierr; 1321 1322 PetscFunctionBegin; 1323 for (i=1; i<dim; i++) { mask<<=1; mask++; } 1324 1325 /* strip and load s */ 1326 msg_list =list = gs->pair_list; 1327 msg_size =size = gs->msg_sizes; 1328 msg_nodes=nodes = gs->node_list; 1329 iptr=pw = gs->pw_elm_list; 1330 dptr1=dptr3 = gs->pw_vals; 1331 msg_ids_in = ids_in = gs->msg_ids_in; 1332 msg_ids_out = ids_out = gs->msg_ids_out; 1333 dptr2 = gs->out; 1334 in1=in2 = gs->in; 1335 1336 /* post the receives */ 1337 /* msg_nodes=nodes; */ 1338 do { 1339 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1340 second one *list and do list++ afterwards */ 1341 if ((PCTFS_my_id|mask)==(*list|mask)) { 1342 ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr); 1343 list++; msg_ids_in++;in1 += *size++; 1344 } else { list++; size++; } 1345 } 1346 while (*++msg_nodes); 1347 1348 /* load gs values into in out gs buffers */ 1349 while (*iptr >= 0) { *dptr3++ = *(in_vals + *iptr++); } 1350 1351 /* load out buffers and post the sends */ 1352 msg_nodes=nodes; 1353 list = msg_list; 1354 while ((iptr = *msg_nodes++)) { 1355 if ((PCTFS_my_id|mask)==(*list|mask)) { 1356 dptr3 = dptr2; 1357 while (*iptr >= 0) {*dptr2++ = *(dptr1 + *iptr++);} 1358 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1359 /* is msg_ids_out++ correct? */ 1360 ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr); 1361 msg_size++;list++;msg_ids_out++; 1362 } else {list++; msg_size++;} 1363 } 1364 1365 /* do the tree while we're waiting */ 1366 if (gs->max_left_over) { PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim); } 1367 1368 /* process the received data */ 1369 msg_nodes=nodes; 1370 list = msg_list; 1371 while ((iptr = *nodes++)) { 1372 if ((PCTFS_my_id|mask)==(*list|mask)) { 1373 /* Should I check the return value of MPI_Wait() or status? */ 1374 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1375 ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr); 1376 ids_in++; 1377 while (*iptr >= 0) {*(dptr1 + *iptr++) += *in2++;} 1378 } 1379 list++; 1380 } 1381 1382 /* replace vals */ 1383 while (*pw >= 0) { *(in_vals + *pw++) = *dptr1++; } 1384 1385 /* clear isend message handles */ 1386 /* This changed for clarity though it could be the same */ 1387 while (*msg_nodes++) { 1388 if ((PCTFS_my_id|mask)==(*msg_list|mask)) { 1389 /* Should I check the return value of MPI_Wait() or status? */ 1390 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1391 ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr); 1392 ids_out++; 1393 } 1394 msg_list++; 1395 } 1396 1397 PetscFunctionReturn(0); 1398 } 1399 1400 /******************************************************************************/ 1401 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1402 { 1403 PetscInt size; 1404 PetscInt *in, *out; 1405 PetscScalar *buf, *work; 1406 PetscInt op[] = {GL_ADD,0}; 1407 1408 PetscFunctionBegin; 1409 in = gs->tree_map_in; 1410 out = gs->tree_map_out; 1411 buf = gs->tree_buf; 1412 work = gs->tree_work; 1413 size = gs->tree_nel; 1414 1415 PCTFS_rvec_zero(buf,size); 1416 1417 while (*in >= 0) {*(buf + *out++) = *(vals + *in++);} 1418 1419 in = gs->tree_map_in; 1420 out = gs->tree_map_out; 1421 1422 PCTFS_grop_hc(buf,work,size,op,dim); 1423 1424 while (*in >= 0) {*(vals + *in++) = *(buf + *out++);} 1425 PetscFunctionReturn(0); 1426 } 1427 1428 1429 1430