/***********************************gs.c***************************************

Author: Henry M. Tufo III

e-mail: hmt@cs.brown.edu

snail-mail:
Division of Applied Mathematics
Brown University
Providence, RI 02912

Last Modification:
6.21.97
************************************gs.c**************************************/

/***********************************gs.c***************************************
File Description:
-----------------

************************************gs.c**************************************/

#include <../src/ksp/pc/impls/tfs/tfs.h>

/* default length of number of items via tree - doubles if exceeded */
#define TREE_BUF_SZ 2048
#define GS_VEC_SZ   1

/***********************************gs.c***************************************
Type: struct gather_scatter_id
------------------------------

************************************gs.c**************************************/
typedef struct gather_scatter_id {
  PetscInt    id;
  PetscInt    nel_min;
  PetscInt    nel_max;
  PetscInt    nel_sum;
  PetscInt    negl;
  PetscInt    gl_max;
  PetscInt    gl_min;
  PetscInt    repeats;
  PetscInt    ordered;
  PetscInt    positive;
  PetscScalar *vals;

  /* bit mask info */
  PetscInt *my_proc_mask;
  PetscInt mask_sz;
  PetscInt *ngh_buf;
  PetscInt ngh_buf_sz;
  PetscInt *nghs;
  PetscInt num_nghs;
  PetscInt max_nghs;
  PetscInt *pw_nghs;
  PetscInt num_pw_nghs;
  PetscInt *tree_nghs;
  PetscInt num_tree_nghs;

  PetscInt num_loads;

  /* repeats == true -> local info */
  PetscInt nel;         /* number of unique elements */
  PetscInt *elms;       /* of size nel */
  PetscInt nel_total;
  PetscInt *local_elms; /* of size nel_total */
  PetscInt *companion;  /* of size nel_total */

  /* local info */
  PetscInt num_local_total;
  PetscInt local_strength;
  PetscInt num_local;
  PetscInt *num_local_reduce;
  PetscInt **local_reduce;
  PetscInt num_local_gop;
  PetscInt *num_gop_local_reduce;
  PetscInt **gop_local_reduce;

  /* pairwise info */
  PetscInt level;
  PetscInt num_pairs;
  PetscInt max_pairs;
  PetscInt loc_node_pairs;
  PetscInt max_node_pairs;
  PetscInt min_node_pairs;
  PetscInt avg_node_pairs;
  PetscInt *pair_list;
  PetscInt *msg_sizes;
  PetscInt **node_list;
  PetscInt len_pw_list;
  PetscInt *pw_elm_list;
  PetscScalar *pw_vals;

  MPI_Request *msg_ids_in;
  MPI_Request *msg_ids_out;

  PetscScalar *out;
  PetscScalar *in;
  PetscInt msg_total;

  /* tree - crystal accumulator info */
  PetscInt max_left_over;
  PetscInt *pre;
  PetscInt *in_num;
  PetscInt *out_num;
  PetscInt **in_list;
  PetscInt **out_list;

  /* new tree work */
  PetscInt tree_nel;
  PetscInt *tree_elms;
  PetscScalar *tree_buf;
  PetscScalar *tree_work;

  PetscInt tree_map_sz;
  PetscInt *tree_map_in;
  PetscInt *tree_map_out;

  /* current memory status */
  PetscInt gl_bss_min;
  PetscInt gl_perm_min;

  /* max segment size for PCTFS_gs_gop_vec() */
  PetscInt vec_sz;

  /* hack to make paul happy */
  MPI_Comm PCTFS_gs_comm;
} PCTFS_gs_id;
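/* Overview (descriptive note): a PCTFS_gs_id combines three communication
   strategies that the setup code below selects among, element by element:
   purely local reductions over repeated ids (num_local_reduce/local_reduce),
   point-to-point exchange with neighbors that share an element with few
   others (pair_list/node_list/pw_vals), and a tree (crystal accumulator)
   fan-in/fan-out for elements shared too widely for pairwise exchange
   (tree_map_in/tree_map_out/tree_buf). */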
static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
static PCTFS_gs_id *gsi_new(void);
static PetscErrorCode set_tree(PCTFS_gs_id *gs);

/* same for all but vector flavor */
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
/* vector flavor */
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

/* global vars */
/* from comm.c module */

static PetscInt num_gs_ids = 0;

/* should make this dynamic ... later */
static PetscInt msg_buf     = MAX_MSG_BUF;
static PetscInt vec_sz      = GS_VEC_SZ;
static PetscInt *tree_buf   = NULL;
static PetscInt tree_buf_sz = 0;
static PetscInt ntree       = 0;

/***************************************************************************/
PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
{
  PetscFunctionBegin;
  vec_sz = size;
  PetscFunctionReturn(0);
}

/******************************************************************************/
PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
{
  PetscFunctionBegin;
  msg_buf = buf_size;
  PetscFunctionReturn(0);
}

/******************************************************************************/
PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
{
  PCTFS_gs_id *gs;
  MPI_Group   PCTFS_gs_group;
  MPI_Comm    PCTFS_gs_comm;

  /* ensure that communication package has been initialized */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_comm_init());

  /* determines if we have enough dynamic/semi-static memory */
  /* checks input, allocs and sets gs_id template */
  gs = gsi_check_args(elms, nel, level);

  /* only bit mask version up and working for the moment */
  /* LATER :: get int list version working for sparse problems */
  PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));

  PetscCallMPIAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group));
  PetscCallMPIAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm));
  PetscCallMPIAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group));

  gs->PCTFS_gs_comm = PCTFS_gs_comm;

  return gs;
}

/******************************************************************************/
static PCTFS_gs_id *gsi_new(void)
{
  PCTFS_gs_id *gs;
  gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
  PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
  return gs;
}
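/* Illustrative usage (a sketch, not code from this library): sum a value
   across every processor sharing a global id.  Assumes all ranks call
   collectively; ids equal to 0 are ignored and negative ids are invalid
   (see gsi_check_args() below).

     PetscInt    elms[2] = {10, 27};      // hypothetical global ids
     PetscScalar vals[2] = {1.0, 2.0};
     PCTFS_gs_id *gs     = PCTFS_gs_init(elms, 2, 1);

     PCTFS_gs_gop_vec(gs, vals, "+", 1);  // vals[i] now holds the shared sum
     PCTFS_gs_free(gs);
*/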
/******************************************************************************/
static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
{
  PetscInt    i, j, k, t2;
  PetscInt    *companion, *elms, *unique, *iptr;
  PetscInt    num_local = 0, *num_to_reduce, **local_reduce;
  PetscInt    oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
  PetscInt    vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PetscInt    work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PCTFS_gs_id *gs;

  if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
  if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

  if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));

  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99 */
  /* caller can set global ids that don't participate to 0 */
  /* PCTFS_gs_init ignores all zeros in elm list */
  /* negative global ids are still invalid */
  for (i = j = 0; i < nel; i++) {
    if (in_elms[i] != 0) j++;
  }

  k   = nel;
  nel = j;

  /* copy over in_elms list and create inverse map */
  elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
  companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

  for (i = j = 0; i < k; i++) {
    if (in_elms[i] != 0) {
      elms[j]        = in_elms[i];
      companion[j++] = i;
    }
  }

  if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

  /* pre-pass ... check to see if sorted */
  elms[nel] = INT_MAX;
  iptr      = elms;
  unique    = elms + 1;
  j         = 0;
  while (*iptr != INT_MAX) {
    if (*iptr++ > *unique++) {
      j = 1;
      break;
    }
  }

  /* set up inverse map */
  if (j) {
    PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
    PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
  } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
  elms[nel] = INT_MIN;

  /* first pass */
  /* determine number of unique elements, check pd */
  for (i = k = 0; i < nel; i += j) {
    t2 = elms[i];
    j  = ++i;

    /* clump 'em for now */
    while (elms[j] == t2) j++;

    /* how many together and num local */
    if (j -= i) {
      num_local++;
      k += j;
    }
  }

  /* how many unique elements? */
  gs->repeats = k;
  gs->nel     = nel - k;

  /* number of repeats? */
  gs->num_local        = num_local;
  num_local           += 2;
  gs->local_reduce     = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
  gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

  unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
  gs->elms       = unique;
  gs->nel_total  = nel;
  gs->local_elms = elms;
  gs->companion  = companion;

  /* compress map as well as keep track of local ops */
  for (num_local = i = j = 0; i < gs->nel; i++) {
    k            = j;
    t2           = unique[i] = elms[j];
    companion[i] = companion[j];

    while (elms[j] == t2) j++;

    if ((t2 = (j - k)) > 1) {
      /* number together */
      num_to_reduce[num_local] = t2++;

      iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i;

      /* note that we're skipping the first one */
      while (++k < j) *(iptr++) = companion[k];
      *iptr = -1;
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local]   = 0;
  local_reduce[num_local]    = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel > 0) {
    vals[3] = unique[0];
    vals[4] = unique[gs->nel - 1];
  } else {
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
  if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4] - vals[3] + 1;

  if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %d\n", gs->negl);

  /* LATER :: add level == -1 -> program selects level */
  if (vals[5] < 0) vals[5] = 0;
  else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
  gs->level = vals[5];

  return gs;
}
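/* Note on `level` (derived from the code, stated here for clarity): the
   reduction above clamps level to [0, PCTFS_num_nodes].  In get_ngh_buf() an
   element shared by ct processors goes pairwise when ct - 1 <= level and to
   the tree otherwise, so level == 0 sends every shared element through the
   tree while level == PCTFS_num_nodes makes the exchange purely pairwise. */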
/******************************************************************************/
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
{
  PetscInt i, nel, *elms;
  PetscInt t1;
  PetscInt **reduce;
  PetscInt *map;

  PetscFunctionBegin;
  /* totally local removes ... PCTFS_ct_bits == 0 */
  PetscCall(get_ngh_buf(gs));

  if (gs->level) PetscCall(set_pairwise(gs));
  if (gs->max_left_over) PetscCall(set_tree(gs));

  /* intersection local and pairwise/tree? */
  gs->num_local_total      = gs->num_local;
  gs->gop_local_reduce     = gs->local_reduce;
  gs->num_gop_local_reduce = gs->num_local_reduce;

  map = gs->companion;

  /* is there any local compression? */
  if (!gs->num_local) {
    gs->local_strength = NONE;
    gs->num_local_gop  = 0;
  } else {
    /* ok find intersection */
    map    = gs->companion;
    reduce = gs->local_reduce;
    for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
      if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
        t1++;
        PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
        gs->num_local_reduce[i] *= -1;
      }
      **reduce = map[**reduce];
    }

    /* intersection is empty */
    if (!t1) {
      gs->local_strength = FULL;
      gs->num_local_gop  = 0;
    } else { /* intersection not empty */
      gs->local_strength = PARTIAL;

      PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

      gs->num_local_gop        = t1;
      gs->num_local_total      = gs->num_local;
      gs->num_local           -= t1;
      gs->gop_local_reduce     = gs->local_reduce;
      gs->num_gop_local_reduce = gs->num_local_reduce;

      for (i = 0; i < t1; i++) {
        PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
        gs->num_gop_local_reduce[i] *= -1;
        gs->local_reduce++;
        gs->num_local_reduce++;
      }
      gs->local_reduce++;
      gs->num_local_reduce++;
    }
  }

  elms = gs->pw_elm_list;
  nel  = gs->len_pw_list;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  elms = gs->tree_map_in;
  nel  = gs->tree_map_sz;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* clean up */
  free((void *)gs->local_elms);
  free((void *)gs->companion);
  free((void *)gs->elms);
  free((void *)gs->ngh_buf);
  gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode place_in_tree(PetscInt elm)
{
  PetscInt *tp, n;

  PetscFunctionBegin;
  if (ntree == tree_buf_sz) {
    if (tree_buf_sz) {
      tp            = tree_buf;
      n             = tree_buf_sz;
      tree_buf_sz <<= 1;
      tree_buf      = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
      PCTFS_ivec_copy(tree_buf, tp, n);
      free(tp);
    } else {
      tree_buf_sz = TREE_BUF_SZ;
      tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
    }
  }

  tree_buf[ntree++] = elm;
  PetscFunctionReturn(0);
}
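/* Growth illustration for place_in_tree() above: with TREE_BUF_SZ = 2048 the
   buffer doubles on each overflow (2048 -> 4096 -> 8192 -> ...), so appending
   n elements costs O(n) copies in total. */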
/******************************************************************************/
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
{
  PetscInt i, j, npw = 0, ntree_map = 0;
  PetscInt p_mask_size, ngh_buf_size, buf_size;
  PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt *ngh_buf, *buf1, *buf2;
  PetscInt offset, per_load, num_loads, or_ct, start, end;
  PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt oper = GL_B_OR;
  PetscInt *ptr3, *t_mask, level, ct1, ct2;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask corresponding to PCTFS_my_id */
  p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* allocate space for masks and info bufs */
  gs->nghs       = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->pw_nghs    = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
  t_mask         = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf    = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */

  /* default is one pass */
  per_load      = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i             = p_mask_size * negl;

  /* possible overflow on buffer size */
  /* overflow hack */
  if (i < 0) i = INT_MAX;

  buf_size = PetscMin(msg_buf, i);

  /* can we do it? */
  PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %d>%d", p_mask_size, buf_size);

  /* get PCTFS_giop buf space ... make *only* one malloc */
  buf1 = (PetscInt *)malloc(buf_size << 1);

  /* more than one giop exchange needed? */
  if (buf_size != i) {
    per_load      = buf_size / p_mask_size;
    buf_size      = per_load * p_mask_size;
    gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
  }

  /* convert buf sizes from #bytes to #ints - 32 bit only! */
  p_mask_size /= sizeof(PetscInt);
  ngh_buf_size /= sizeof(PetscInt);
  buf_size /= sizeof(PetscInt);

  /* find PCTFS_giop work space */
  buf2 = buf1 + buf_size;

  /* hold #ints needed for processor masks */
  gs->mask_sz = p_mask_size;

  /* init buffers */
  PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

  /* HACK reset tree info */
  tree_buf    = NULL;
  tree_buf_sz = ntree = 0;

  /* ok do it */
  for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    PetscCall(PCTFS_ivec_zero(buf1, buf_size));

    /* load msg buffer */
    for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
      offset = (offset - start) * p_mask_size;
      PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
    }

    /* GLOBAL: pass buffer */
    PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

    /* unload buffer into ngh_buf */
    ptr2 = (elms + i_start);
    for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
      /* I own it ... may have to pairwise it */
      if (j == *ptr2) {
        /* do i share it w/anyone? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
        /* guess not */
        if (ct1 < 2) {
          ptr2++;
          ptr1 += p_mask_size;
          continue;
        }

        /* i do ... so keep info and turn off my bit */
        PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
        PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
        PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

        /* is it to be done pairwise? */
        if (--ct1 <= level) {
          npw++;

          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
          ptr1 += p_mask_size;
          continue;
        }
        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++;
        ptr1 += p_mask_size;
        ntree_map++;
      } else { /* i don't but still might be involved in tree */
        /* shared by how many? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

        /* none! */
        if (ct1 < 2) continue;

        /* is it going to be done pairwise? but not by me of course! */
        if (--ct1 <= level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      PetscCall(place_in_tree(j));
    }
  }

  free((void *)t_mask);
  free((void *)buf1);

  gs->len_pw_list = npw;
  gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));

  gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

  oper = GL_MAX;
  ct1  = gs->num_nghs;
  PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
  gs->max_nghs = ct1;

  gs->tree_map_sz   = ntree_map;
  gs->max_left_over = ntree;

  free((void *)p_mask);
  free((void *)sh_proc_mask);
  PetscFunctionReturn(0);
}
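/* Bit-mask bookkeeping (illustration): each element's row in ngh_buf is a
   processor bit mask; PCTFS_ct_bits() counts the set bits and
   PCTFS_bm_to_proc() expands a mask into a processor id list, e.g. a mask
   with bits 0 and 2 set decodes to the neighbor list {0, 2}. */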
/******************************************************************************/
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
{
  PetscInt i, j;
  PetscInt p_mask_size;
  PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt *ngh_buf, *buf2;
  PetscInt offset;
  PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt *pairwise_elm_list, len_pair_list = 0;
  PetscInt *iptr, t1, i_start, nel, *elms;
  PetscInt ct;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks */
  p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
  p_mask        = (PetscInt *)malloc(p_mask_size);
  tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

  /* set mask to my PCTFS_my_id's bit mask */
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  p_mask_size /= sizeof(PetscInt);

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

  /* how many processors (nghs) do we have to exchange with? */
  nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* allocate space for PCTFS_gs_gop() info */
  gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

  /* init msg_size list */
  PetscCall(PCTFS_ivec_zero(msg_size, nprs));

  /* expand from bit mask list to int list */
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

  /* keep list of elements being handled pairwise */
  for (i = j = 0; i < nel; i++) {
    if (elms[i] & TOP_BIT) {
      elms[i] ^= TOP_BIT;
      pairwise_elm_list[j++] = i;
    }
  }
  pairwise_elm_list[j] = -1;

  gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

  /* find who goes to each processor */
  for (i_start = i = 0; i < nprs; i++) {
    /* processor i's mask */
    PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

    /* det # going to processor i */
    for (ct = j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start, ct);

    /* space to hold nodes in message to processor i */
    msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

    for (j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
    }
    *iptr = -1;
  }
  msg_nodes[nprs] = NULL;

  j  = gs->loc_node_pairs = i_start;
  t1 = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

  i_start = nprs;
  t1      = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
  gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
  gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

  /* reset malloc pool */
  free((void *)p_mask);
  free((void *)tmp_proc_mask);
  PetscFunctionReturn(0);
}
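/* Message layout example (hypothetical numbers): after set_pairwise(),
   neighbor msg_list[i] is sent msg_size[i] values and msg_nodes[i] lists
   which entries of pw_elm_list they are, terminated by -1.  With
   msg_list = {3, 7}, msg_size = {2, 1}, msg_nodes[0] = {0, 4, -1}, processor
   3 receives pw_vals entries 0 and 4. */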
/* to do pruned tree just save ngh buf copy for each one and decode here!
******************************************************************************/
static PetscErrorCode set_tree(PCTFS_gs_id *gs)
{
  PetscInt i, j, n, nel;
  PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

  PetscFunctionBegin;
  /* local work ptrs */
  elms = gs->elms;
  nel  = gs->nel;

  /* how many via tree */
  gs->tree_nel  = n = ntree;
  gs->tree_elms = tree_elms = iptr_in = tree_buf;
  gs->tree_buf  = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  j             = gs->tree_map_sz;
  gs->tree_map_in  = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
  gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

  /* search the longer of the two lists */
  /* note ... could save this info in get_ngh_buf and save searches */
  if (n <= nel) {
    /* bijective fct w/remap - search elm list */
    for (i = 0; i < n; i++) {
      if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
        *iptr_in++  = j;
        *iptr_out++ = i;
      }
    }
  } else {
    for (i = 0; i < nel; i++) {
      if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
        *iptr_in++  = i;
        *iptr_out++ = j;
      }
    }
  }

  /* sentinel */
  *iptr_in = *iptr_out = -1;
  PetscFunctionReturn(0);
}
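/* Tree map semantics (descriptive note): tree_map_in[k] indexes the caller's
   vals array and tree_map_out[k] the dense tree buffer, both lists ending in
   -1, so the tree stage is effectively

     buf[tree_map_out[k]] = vals[tree_map_in[k]];   // scatter in
     ... global fan-in/fan-out via PCTFS_grop() ...
     vals[tree_map_in[k]] = buf[tree_map_out[k]];   // gather back

   as carried out in PCTFS_gs_gop_vec_tree_plus() below. */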
/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[1]] = vals[map[0]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else { /* general case ... odd geoms ... 3D */
      num++;
      tmp = *(vals + *map++);
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      vals[map[1]] = vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D */
      num++;
      tmp = 0.0;
      while (*map >= 0) tmp += *(vals + *map++);

      map = *reduce++;
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D */
      num++;
      base = vals + *map++;
      while (*map >= 0) *base += *(vals + *map++);
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
  if (gs->nghs) free((void *)gs->nghs);
  if (gs->pw_nghs) free((void *)gs->pw_nghs);

  /* tree */
  if (gs->max_left_over) {
    if (gs->tree_elms) free((void *)gs->tree_elms);
    if (gs->tree_buf) free((void *)gs->tree_buf);
    if (gs->tree_work) free((void *)gs->tree_work);
    if (gs->tree_map_in) free((void *)gs->tree_map_in);
    if (gs->tree_map_out) free((void *)gs->tree_map_out);
  }

  /* pairwise info */
  if (gs->num_pairs) {
    /* should be NULL already */
    if (gs->ngh_buf) free((void *)gs->ngh_buf);
    if (gs->elms) free((void *)gs->elms);
    if (gs->local_elms) free((void *)gs->local_elms);
    if (gs->companion) free((void *)gs->companion);

    /* only set if pairwise */
    if (gs->vals) free((void *)gs->vals);
    if (gs->in) free((void *)gs->in);
    if (gs->out) free((void *)gs->out);
    if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
    if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
    if (gs->pw_vals) free((void *)gs->pw_vals);
    if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
    if (gs->node_list) {
      for (i = 0; i < gs->num_pairs; i++) {
        if (gs->node_list[i]) free((void *)gs->node_list[i]);
      }
      free((void *)gs->node_list);
    }
    if (gs->msg_sizes) free((void *)gs->msg_sizes);
    if (gs->pair_list) free((void *)gs->pair_list);
  }

  /* local info */
  if (gs->num_local_total >= 0) {
    for (i = 0; i < gs->num_local_total + 1; i++) {
      if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
    }
  }

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
  if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);

  free((void *)gs);
  PetscFunctionReturn(0);
}
/******************************************************************************/
PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
{
  PetscFunctionBegin;
  switch (*op) {
  case '+':
    PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
    break;
  default:
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
    PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
    break;
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscFunctionBegin;
  PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");

  /* local only operations!!! */
  if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->num_local_gop) {
    PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));

    /* pairwise */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

    /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));

    PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
  } else { /* if intersection tree/pairwise and local is empty */
    /* pairwise */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

    /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
  }
  PetscFunctionReturn(0);
}
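/* Stage order in PCTFS_gs_gop_vec_plus() above (descriptive summary):
     1. local_plus    - fold values repeated purely on this processor;
     2. local_in_plus - pre-sum repeats that also appear off-processor;
     3. pairwise or tree exchange of the shared sums;
     4. local_out     - copy each result back to its local repeats.
   Stages 2 and 4 run only when the local and off-processor element sets
   intersect (gs->num_local_gop != 0). */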
/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_add(base, vals + map[2] * step, step);
      PCTFS_rvec_copy(vals + map[2] * step, base, step);
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_add(base, vals + map[2] * step, step);
      PCTFS_rvec_add(base, vals + map[3] * step, step);
      PCTFS_rvec_copy(vals + map[3] * step, base, step);
      PCTFS_rvec_copy(vals + map[2] * step, base, step);
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
    } else { /* general case ... odd geoms ... 3D */
      num++;
      while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);

      map = *reduce;
      while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);

      reduce++;
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_add(base, vals + map[2] * step, step);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_add(base, vals + map[2] * step, step);
      PCTFS_rvec_add(base, vals + map[3] * step, step);
    } else { /* general case ... odd geoms ... 3D */
      num++;
      while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
      PCTFS_rvec_copy(vals + map[2] * step, base, step);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
      PCTFS_rvec_copy(vals + map[2] * step, base, step);
      PCTFS_rvec_copy(vals + map[3] * step, base, step);
    } else { /* general case ... odd geoms ... 3D */
      num++;
      while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);
    }
  }
  PetscFunctionReturn(0);
}
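/* The pairwise exchange below overlaps communication and computation
   (descriptive summary): post one MPI_Irecv per neighbor, pack pw_vals into
   gs->out and MPI_Isend each neighbor its slice, run the tree stage while
   messages are in flight, then accumulate each received segment with a BLAS
   axpy as its MPI_Wait completes, and finally copy pw_vals back into the
   caller's array. */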
/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
{
  PetscScalar  *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt     *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt     *pw, *list, *size, **nodes;
  MPI_Request  *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscBLASInt i1 = 1, dstep;

  PetscFunctionBegin;
  /* strip and load local copies */
  msg_list    = list = gs->pair_list;
  msg_size    = size = gs->msg_sizes;
  msg_nodes   = nodes = gs->node_list;
  iptr        = pw = gs->pw_elm_list;
  dptr1       = dptr3 = gs->pw_vals;
  msg_ids_in  = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2       = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /* msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ?  In that case do the
       second one *list and do list++ afterwards */
    PetscCallMPI(MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
    list++;
    msg_ids_in++;
    in1 += *size++ * step;
  } while (*++msg_nodes);
  msg_nodes = nodes;

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) {
    PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step);
    dptr3 += step;
    iptr++;
  }

  /* load out buffers and post the sends */
  while ((iptr = *msg_nodes++)) {
    dptr3 = dptr2;
    while (*iptr >= 0) {
      PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step);
      dptr2 += step;
      iptr++;
    }
    PetscCallMPI(MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
    msg_size++;
    msg_list++;
    msg_ids_out++;
  }

  /* tree */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));

  /* process the received data */
  msg_nodes = nodes;
  while ((iptr = *nodes++)) {
    PetscScalar d1 = 1.0;

    /* Should I check the return value of MPI_Wait() or status? */
    /* Can this loop be replaced by a call to MPI_Waitall()? */
    PetscCallMPI(MPI_Wait(ids_in, &status));
    ids_in++;
    while (*iptr >= 0) {
      PetscCall(PetscBLASIntCast(step, &dstep));
      PetscStackCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
      in2 += step;
      iptr++;
    }
  }

  /* replace vals */
  while (*pw >= 0) {
    PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step);
    dptr1 += step;
    pw++;
  }

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */

  /* Should I check the return value of MPI_Wait() or status? */
  /* Can this loop be replaced by a call to MPI_Waitall()? */
  while (*msg_nodes++) {
    PetscCallMPI(MPI_Wait(ids_out, &status));
    ids_out++;
  }
  PetscFunctionReturn(0);
}
/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt     size, *in, *out;
  PetscScalar  *buf, *work;
  PetscInt     op[] = {GL_ADD, 0};
  PetscBLASInt i1   = 1;
  PetscBLASInt dstep;

  PetscFunctionBegin;
  /* copy over to local variables */
  in   = gs->tree_map_in;
  out  = gs->tree_map_out;
  buf  = gs->tree_buf;
  work = gs->tree_work;
  size = gs->tree_nel * step;

  /* zero out collection buffer */
  PCTFS_rvec_zero(buf, size);

  /* copy over my contributions */
  while (*in >= 0) {
    PetscCall(PetscBLASIntCast(step, &dstep));
    PetscStackCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
  }

  /* perform fan in/out on full buffer */
  /* must change PCTFS_grop to handle the blas */
  PetscCall(PCTFS_grop(buf, work, size, op));

  /* reset */
  in  = gs->tree_map_in;
  out = gs->tree_map_out;

  /* get the portion of the results I need */
  while (*in >= 0) {
    PetscCall(PetscBLASIntCast(step, &dstep));
    PetscStackCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
{
  PetscFunctionBegin;
  switch (*op) {
  case '+':
    PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
    break;
  default:
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
    PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
    break;
  }
  PetscFunctionReturn(0);
}
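/* Hypercube mask example (illustrative): PCTFS_gs_gop_plus_hc() below builds
   mask = 2^dim - 1 and exchanges only with neighbors satisfying
   (PCTFS_my_id | mask) == (*list | mask), i.e. ids agreeing above the low dim
   bits.  With dim = 2 (mask = 3), rank 5 (binary 101) exchanges with ranks 4,
   6, and 7 but not rank 1, restricting each call to one dim-dimensional
   subcube. */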
/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
{
  PetscFunctionBegin;
  /* if there's nothing to do return */
  if (dim <= 0) PetscFunctionReturn(0);

  /* can't do more dimensions than exist */
  dim = PetscMin(dim, PCTFS_i_log2_num_nodes);

  /* local only operations!!! */
  if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->num_local_gop) {
    PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));

    /* pairwise will do tree inside ... */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim));
    /* tree only */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));

    PetscCall(PCTFS_gs_gop_local_out(gs, vals));
  } else { /* if intersection tree/pairwise and local is empty */
    /* pairwise will do tree inside */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim));
    /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
{
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status  status;
  PetscInt    i, mask = 1;

  PetscFunctionBegin;
  for (i = 1; i < dim; i++) {
    mask <<= 1;
    mask++;
  }

  /* strip and load local copies */
  msg_list    = list = gs->pair_list;
  msg_size    = size = gs->msg_sizes;
  msg_nodes   = nodes = gs->node_list;
  iptr        = pw = gs->pw_elm_list;
  dptr1       = dptr3 = gs->pw_vals;
  msg_ids_in  = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2       = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /* msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ?  In that case do the
       second one *list and do list++ afterwards */
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
      list++;
      msg_ids_in++;
      in1 += *size++;
    } else {
      list++;
      size++;
    }
  } while (*++msg_nodes);

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

  /* load out buffers and post the sends */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *msg_nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      dptr3 = dptr2;
      while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
      /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
      /* is msg_ids_out++ correct? */
      PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
      msg_size++;
      list++;
      msg_ids_out++;
    } else {
      list++;
      msg_size++;
    }
  }

  /* do the tree while we're waiting */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));

  /* process the received data */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_in, &status));
      ids_in++;
      while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
    }
    list++;
  }

  /* replace vals */
  while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */
  while (*msg_nodes++) {
    if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_out, &status));
      ids_out++;
    }
    msg_list++;
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
{
  PetscInt    size;
  PetscInt    *in, *out;
  PetscScalar *buf, *work;
  PetscInt    op[] = {GL_ADD, 0};

  PetscFunctionBegin;
  in   = gs->tree_map_in;
  out  = gs->tree_map_out;
  buf  = gs->tree_buf;
  work = gs->tree_work;
  size = gs->tree_nel;

  PCTFS_rvec_zero(buf, size);

  while (*in >= 0) *(buf + *out++) = *(vals + *in++);

  in  = gs->tree_map_in;
  out = gs->tree_map_out;

  PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));

  while (*in >= 0) *(vals + *in++) = *(buf + *out++);
  PetscFunctionReturn(0);
}