1 2 /*************************************xyt.c************************************ 3 Module Name: xyt 4 Module Info: 5 6 author: Henry M. Tufo III 7 e-mail: hmt@asci.uchicago.edu 8 contact: 9 +--------------------------------+--------------------------------+ 10 |MCS Division - Building 221 |Department of Computer Science | 11 |Argonne National Laboratory |Ryerson 152 | 12 |9700 S. Cass Avenue |The University of Chicago | 13 |Argonne, IL 60439 |Chicago, IL 60637 | 14 |(630) 252-5354/5986 ph/fx |(773) 702-6019/8487 ph/fx | 15 +--------------------------------+--------------------------------+ 16 17 Last Modification: 3.20.01 18 **************************************xyt.c***********************************/ 19 20 21 /*************************************xyt.c************************************ 22 NOTES ON USAGE: 23 24 **************************************xyt.c***********************************/ 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <limits.h> 28 #include <float.h> 29 #include <math.h> 30 31 #include "petsc.h" 32 33 #include "const.h" 34 #include "types.h" 35 #include "comm.h" 36 #include "error.h" 37 #include "ivec.h" 38 #include "queue.h" 39 #include "gs.h" 40 #include "xyt.h" 41 42 #define LEFT -1 43 #define RIGHT 1 44 #define BOTH 0 45 #define MAX_FORTRAN_HANDLES 10 46 47 typedef struct xyt_solver_info { 48 int n, m, n_global, m_global; 49 int nnz, max_nnz, msg_buf_sz; 50 int *nsep, *lnsep, *fo, nfo, *stages; 51 int *xcol_sz, *xcol_indices; 52 PetscScalar **xcol_vals, *x, *solve_uu, *solve_w; 53 int *ycol_sz, *ycol_indices; 54 PetscScalar **ycol_vals, *y; 55 int nsolves; 56 PetscScalar tot_solve_time; 57 } xyt_info; 58 59 60 typedef struct matvec_info { 61 int n, m, n_global, m_global; 62 int *local2global; 63 gs_ADT gs_handle; 64 PetscErrorCode (*matvec)(struct matvec_info*,PetscScalar*,PetscScalar*); 65 void *grid_data; 66 } mv_info; 67 68 struct xyt_CDT{ 69 int id; 70 int ns; 71 int level; 72 xyt_info *info; 73 mv_info *mvi; 74 }; 75 76 static int n_xyt=0; 77 static int n_xyt_handles=0; 78 79 /* prototypes */ 80 static void do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *rhs); 81 static void check_init(void); 82 static void check_handle(xyt_ADT xyt_handle); 83 static void det_separators(xyt_ADT xyt_handle); 84 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u); 85 static int xyt_generate(xyt_ADT xyt_handle); 86 static int do_xyt_factor(xyt_ADT xyt_handle); 87 static mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data); 88 89 90 /*************************************xyt.c************************************ 91 Function: XYT_new() 92 93 Input : 94 Output: 95 Return: 96 Description: 97 **************************************xyt.c***********************************/ 98 xyt_ADT 99 XYT_new(void) 100 { 101 xyt_ADT xyt_handle; 102 103 104 105 /* rolling count on n_xyt ... pot. problem here */ 106 n_xyt_handles++; 107 xyt_handle = (xyt_ADT)malloc(sizeof(struct xyt_CDT)); 108 xyt_handle->id = ++n_xyt; 109 xyt_handle->info = NULL; 110 xyt_handle->mvi = NULL; 111 112 return(xyt_handle); 113 } 114 115 116 /*************************************xyt.c************************************ 117 Function: XYT_factor() 118 119 Input : 120 Output: 121 Return: 122 Description: 123 **************************************xyt.c***********************************/ 124 int 125 XYT_factor(xyt_ADT xyt_handle, /* prev. allocated xyt handle */ 126 int *local2global, /* global column mapping */ 127 int n, /* local num rows */ 128 int m, /* local num cols */ 129 void *matvec, /* b_loc=A_local.x_loc */ 130 void *grid_data /* grid data for matvec */ 131 ) 132 { 133 134 check_init(); 135 check_handle(xyt_handle); 136 137 /* only 2^k for now and all nodes participating */ 138 if ((1<<(xyt_handle->level=i_log2_num_nodes))!=num_nodes) 139 {error_msg_fatal("only 2^k for now and MPI_COMM_WORLD!!! %d != %d\n",1<<i_log2_num_nodes,num_nodes);} 140 141 /* space for X info */ 142 xyt_handle->info = (xyt_info*)malloc(sizeof(xyt_info)); 143 144 /* set up matvec handles */ 145 xyt_handle->mvi = set_mvi(local2global, n, m, matvec, grid_data); 146 147 /* matrix is assumed to be of full rank */ 148 /* LATER we can reset to indicate rank def. */ 149 xyt_handle->ns=0; 150 151 /* determine separators and generate firing order - NB xyt info set here */ 152 det_separators(xyt_handle); 153 154 return(do_xyt_factor(xyt_handle)); 155 } 156 157 158 /*************************************xyt.c************************************ 159 Function: XYT_solve 160 161 Input : 162 Output: 163 Return: 164 Description: 165 **************************************xyt.c***********************************/ 166 int 167 XYT_solve(xyt_ADT xyt_handle, double *x, double *b) 168 { 169 check_init(); 170 check_handle(xyt_handle); 171 172 /* need to copy b into x? */ 173 if (b) 174 {rvec_copy(x,b,xyt_handle->mvi->n);} 175 do_xyt_solve(xyt_handle,x); 176 177 return(0); 178 } 179 180 181 /*************************************xyt.c************************************ 182 Function: XYT_free() 183 184 Input : 185 Output: 186 Return: 187 Description: 188 **************************************xyt.c***********************************/ 189 int 190 XYT_free(xyt_ADT xyt_handle) 191 { 192 check_init(); 193 check_handle(xyt_handle); 194 n_xyt_handles--; 195 196 free(xyt_handle->info->nsep); 197 free(xyt_handle->info->lnsep); 198 free(xyt_handle->info->fo); 199 free(xyt_handle->info->stages); 200 free(xyt_handle->info->solve_uu); 201 free(xyt_handle->info->solve_w); 202 free(xyt_handle->info->x); 203 free(xyt_handle->info->xcol_vals); 204 free(xyt_handle->info->xcol_sz); 205 free(xyt_handle->info->xcol_indices); 206 free(xyt_handle->info->y); 207 free(xyt_handle->info->ycol_vals); 208 free(xyt_handle->info->ycol_sz); 209 free(xyt_handle->info->ycol_indices); 210 free(xyt_handle->info); 211 free(xyt_handle->mvi->local2global); 212 gs_free(xyt_handle->mvi->gs_handle); 213 free(xyt_handle->mvi); 214 free(xyt_handle); 215 216 217 /* if the check fails we nuke */ 218 /* if NULL pointer passed to free we nuke */ 219 /* if the calls to free fail that's not my problem */ 220 return(0); 221 } 222 223 224 225 /*************************************xyt.c************************************ 226 Function: 227 228 Input : 229 Output: 230 Return: 231 Description: 232 **************************************xyt.c***********************************/ 233 int 234 XYT_stats(xyt_ADT xyt_handle) 235 { 236 int op[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD}; 237 int fop[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD}; 238 int vals[9], work[9]; 239 PetscScalar fvals[3], fwork[3]; 240 241 242 check_init(); 243 check_handle(xyt_handle); 244 245 /* if factorization not done there are no stats */ 246 if (!xyt_handle->info||!xyt_handle->mvi) 247 { 248 if (!my_id) 249 {printf("XYT_stats() :: no stats available!\n");} 250 return 1; 251 } 252 253 vals[0]=vals[1]=vals[2]=xyt_handle->info->nnz; 254 vals[3]=vals[4]=vals[5]=xyt_handle->mvi->n; 255 vals[6]=vals[7]=vals[8]=xyt_handle->info->msg_buf_sz; 256 giop(vals,work,sizeof(op)/sizeof(op[0])-1,op); 257 258 fvals[0]=fvals[1]=fvals[2] 259 =xyt_handle->info->tot_solve_time/xyt_handle->info->nsolves++; 260 grop(fvals,fwork,sizeof(fop)/sizeof(fop[0])-1,fop); 261 262 if (!my_id) 263 { 264 printf("%d :: min xyt_nnz=%d\n",my_id,vals[0]); 265 printf("%d :: max xyt_nnz=%d\n",my_id,vals[1]); 266 printf("%d :: avg xyt_nnz=%g\n",my_id,1.0*vals[2]/num_nodes); 267 printf("%d :: tot xyt_nnz=%d\n",my_id,vals[2]); 268 printf("%d :: xyt C(2d) =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.5))); 269 printf("%d :: xyt C(3d) =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.6667))); 270 printf("%d :: min xyt_n =%d\n",my_id,vals[3]); 271 printf("%d :: max xyt_n =%d\n",my_id,vals[4]); 272 printf("%d :: avg xyt_n =%g\n",my_id,1.0*vals[5]/num_nodes); 273 printf("%d :: tot xyt_n =%d\n",my_id,vals[5]); 274 printf("%d :: min xyt_buf=%d\n",my_id,vals[6]); 275 printf("%d :: max xyt_buf=%d\n",my_id,vals[7]); 276 printf("%d :: avg xyt_buf=%g\n",my_id,1.0*vals[8]/num_nodes); 277 printf("%d :: min xyt_slv=%g\n",my_id,fvals[0]); 278 printf("%d :: max xyt_slv=%g\n",my_id,fvals[1]); 279 printf("%d :: avg xyt_slv=%g\n",my_id,fvals[2]/num_nodes); 280 } 281 282 return(0); 283 } 284 285 286 /*************************************xyt.c************************************ 287 Function: do_xyt_factor 288 289 Input : 290 Output: 291 Return: 292 Description: get A_local, local portion of global coarse matrix which 293 is a row dist. nxm matrix w/ n<m. 294 o my_ml holds address of ML struct associated w/A_local and coarse grid 295 o local2global holds global number of column i (i=0,...,m-1) 296 o local2global holds global number of row i (i=0,...,n-1) 297 o mylocmatvec performs A_local . vec_local (note that gs is performed using 298 gs_init/gop). 299 300 mylocmatvec = my_ml->Amat[grid_tag].matvec->external; 301 mylocmatvec (void :: void *data, double *in, double *out) 302 **************************************xyt.c***********************************/ 303 static 304 int 305 do_xyt_factor(xyt_ADT xyt_handle) 306 { 307 int flag; 308 309 310 flag=xyt_generate(xyt_handle); 311 return(flag); 312 } 313 314 315 /*************************************xyt.c************************************ 316 Function: 317 318 Input : 319 Output: 320 Return: 321 Description: 322 **************************************xyt.c***********************************/ 323 static 324 int 325 xyt_generate(xyt_ADT xyt_handle) 326 { 327 int i,j,k,idx; 328 int dim, col; 329 PetscScalar *u, *uu, *v, *z, *w, alpha, alpha_w; 330 int *segs; 331 int op[] = {GL_ADD,0}; 332 int off, len; 333 PetscScalar *x_ptr, *y_ptr; 334 int *iptr, flag; 335 int start=0, end, work; 336 int op2[] = {GL_MIN,0}; 337 gs_ADT gs_handle; 338 int *nsep, *lnsep, *fo; 339 int a_n=xyt_handle->mvi->n; 340 int a_m=xyt_handle->mvi->m; 341 int *a_local2global=xyt_handle->mvi->local2global; 342 int level; 343 int n, m; 344 int *xcol_sz, *xcol_indices, *stages; 345 PetscScalar **xcol_vals, *x; 346 int *ycol_sz, *ycol_indices; 347 PetscScalar **ycol_vals, *y; 348 int n_global; 349 int xt_nnz=0, xt_max_nnz=0; 350 int yt_nnz=0, yt_max_nnz=0; 351 int xt_zero_nnz =0; 352 int xt_zero_nnz_0=0; 353 int yt_zero_nnz =0; 354 int yt_zero_nnz_0=0; 355 PetscBLASInt i1 = 1; 356 PetscScalar dm1 = -1.0; 357 358 n=xyt_handle->mvi->n; 359 nsep=xyt_handle->info->nsep; 360 lnsep=xyt_handle->info->lnsep; 361 fo=xyt_handle->info->fo; 362 end=lnsep[0]; 363 level=xyt_handle->level; 364 gs_handle=xyt_handle->mvi->gs_handle; 365 366 /* is there a null space? */ 367 /* LATER add in ability to detect null space by checking alpha */ 368 for (i=0, j=0; i<=level; i++) 369 {j+=nsep[i];} 370 371 m = j-xyt_handle->ns; 372 if (m!=j) 373 {printf("xyt_generate() :: null space exists %d %d %d\n",m,j,xyt_handle->ns);} 374 375 error_msg_warning("xyt_generate() :: X(%d,%d)\n",n,m); 376 377 /* get and initialize storage for x local */ 378 /* note that x local is nxm and stored by columns */ 379 xcol_sz = (int*) malloc(m*sizeof(PetscInt)); 380 xcol_indices = (int*) malloc((2*m+1)*sizeof(int)); 381 xcol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *)); 382 for (i=j=0; i<m; i++, j+=2) 383 { 384 xcol_indices[j]=xcol_indices[j+1]=xcol_sz[i]=-1; 385 xcol_vals[i] = NULL; 386 } 387 xcol_indices[j]=-1; 388 389 /* get and initialize storage for y local */ 390 /* note that y local is nxm and stored by columns */ 391 ycol_sz = (int*) malloc(m*sizeof(PetscInt)); 392 ycol_indices = (int*) malloc((2*m+1)*sizeof(int)); 393 ycol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *)); 394 for (i=j=0; i<m; i++, j+=2) 395 { 396 ycol_indices[j]=ycol_indices[j+1]=ycol_sz[i]=-1; 397 ycol_vals[i] = NULL; 398 } 399 ycol_indices[j]=-1; 400 401 /* size of separators for each sub-hc working from bottom of tree to top */ 402 /* this looks like nsep[]=segments */ 403 stages = (int*) malloc((level+1)*sizeof(PetscInt)); 404 segs = (int*) malloc((level+1)*sizeof(PetscInt)); 405 ivec_zero(stages,level+1); 406 ivec_copy(segs,nsep,level+1); 407 for (i=0; i<level; i++) 408 {segs[i+1] += segs[i];} 409 stages[0] = segs[0]; 410 411 /* temporary vectors */ 412 u = (PetscScalar *) malloc(n*sizeof(PetscScalar)); 413 z = (PetscScalar *) malloc(n*sizeof(PetscScalar)); 414 v = (PetscScalar *) malloc(a_m*sizeof(PetscScalar)); 415 uu = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 416 w = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 417 418 /* extra nnz due to replication of vertices across separators */ 419 for (i=1, j=0; i<=level; i++) 420 {j+=nsep[i];} 421 422 /* storage for sparse x values */ 423 n_global = xyt_handle->info->n_global; 424 xt_max_nnz = yt_max_nnz = (int)(2.5*pow(1.0*n_global,1.6667) + j*n/2)/num_nodes; 425 x = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar)); 426 y = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar)); 427 428 /* LATER - can embed next sep to fire in gs */ 429 /* time to make the donuts - generate X factor */ 430 for (dim=i=j=0;i<m;i++) 431 { 432 /* time to move to the next level? */ 433 while (i==segs[dim]) 434 { 435 #ifdef SAFE 436 if (dim==level) 437 {error_msg_fatal("dim about to exceed level\n"); break;} 438 #endif 439 440 stages[dim++]=i; 441 end+=lnsep[dim]; 442 } 443 stages[dim]=i; 444 445 /* which column are we firing? */ 446 /* i.e. set v_l */ 447 /* use new seps and do global min across hc to determine which one to fire */ 448 (start<end) ? (col=fo[start]) : (col=INT_MAX); 449 giop_hc(&col,&work,1,op2,dim); 450 451 /* shouldn't need this */ 452 if (col==INT_MAX) 453 { 454 error_msg_warning("hey ... col==INT_MAX??\n"); 455 continue; 456 } 457 458 /* do I own it? I should */ 459 rvec_zero(v ,a_m); 460 if (col==fo[start]) 461 { 462 start++; 463 idx=ivec_linear_search(col, a_local2global, a_n); 464 if (idx!=-1) 465 {v[idx] = 1.0; j++;} 466 else 467 {error_msg_fatal("NOT FOUND!\n");} 468 } 469 else 470 { 471 idx=ivec_linear_search(col, a_local2global, a_m); 472 if (idx!=-1) 473 {v[idx] = 1.0;} 474 } 475 476 /* perform u = A.v_l */ 477 rvec_zero(u,n); 478 do_matvec(xyt_handle->mvi,v,u); 479 480 /* uu = X^T.u_l (local portion) */ 481 /* technically only need to zero out first i entries */ 482 /* later turn this into an XYT_solve call ? */ 483 rvec_zero(uu,m); 484 y_ptr=y; 485 iptr = ycol_indices; 486 for (k=0; k<i; k++) 487 { 488 off = *iptr++; 489 len = *iptr++; 490 491 uu[k] = BLdot_(&len,u+off,&i1,y_ptr,&i1); 492 y_ptr+=len; 493 } 494 495 /* uu = X^T.u_l (comm portion) */ 496 ssgl_radd (uu, w, dim, stages); 497 498 /* z = X.uu */ 499 rvec_zero(z,n); 500 x_ptr=x; 501 iptr = xcol_indices; 502 for (k=0; k<i; k++) 503 { 504 off = *iptr++; 505 len = *iptr++; 506 507 BLaxpy_(&len,&uu[k],x_ptr,&i1,z+off,&i1); 508 x_ptr+=len; 509 } 510 511 /* compute v_l = v_l - z */ 512 rvec_zero(v+a_n,a_m-a_n); 513 BLaxpy_(&n,&dm1,z,&i1,v,&i1); 514 515 /* compute u_l = A.v_l */ 516 if (a_n!=a_m) 517 {gs_gop_hc(gs_handle,v,"+\0",dim);} 518 rvec_zero(u,n); 519 do_matvec(xyt_handle->mvi,v,u); 520 521 /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - local portion */ 522 alpha = BLdot_(&n,u,&i1,u,&i1); 523 /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - comm portion */ 524 grop_hc(&alpha, &alpha_w, 1, op, dim); 525 526 alpha = (PetscScalar) sqrt((double)alpha); 527 528 /* check for small alpha */ 529 /* LATER use this to detect and determine null space */ 530 if (fabs(alpha)<1.0e-14) 531 {error_msg_fatal("bad alpha! %g\n",alpha);} 532 533 /* compute v_l = v_l/sqrt(alpha) */ 534 rvec_scale(v,1.0/alpha,n); 535 rvec_scale(u,1.0/alpha,n); 536 537 /* add newly generated column, v_l, to X */ 538 flag = 1; 539 off=len=0; 540 for (k=0; k<n; k++) 541 { 542 if (v[k]!=0.0) 543 { 544 len=k; 545 if (flag) 546 {off=k; flag=0;} 547 } 548 } 549 550 len -= (off-1); 551 552 if (len>0) 553 { 554 if ((xt_nnz+len)>xt_max_nnz) 555 { 556 error_msg_warning("increasing space for X by 2x!\n"); 557 xt_max_nnz *= 2; 558 x_ptr = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar)); 559 rvec_copy(x_ptr,x,xt_nnz); 560 free(x); 561 x = x_ptr; 562 x_ptr+=xt_nnz; 563 } 564 xt_nnz += len; 565 rvec_copy(x_ptr,v+off,len); 566 567 /* keep track of number of zeros */ 568 if (dim) 569 { 570 for (k=0; k<len; k++) 571 { 572 if (x_ptr[k]==0.0) 573 {xt_zero_nnz++;} 574 } 575 } 576 else 577 { 578 for (k=0; k<len; k++) 579 { 580 if (x_ptr[k]==0.0) 581 {xt_zero_nnz_0++;} 582 } 583 } 584 xcol_indices[2*i] = off; 585 xcol_sz[i] = xcol_indices[2*i+1] = len; 586 xcol_vals[i] = x_ptr; 587 } 588 else 589 { 590 xcol_indices[2*i] = 0; 591 xcol_sz[i] = xcol_indices[2*i+1] = 0; 592 xcol_vals[i] = x_ptr; 593 } 594 595 596 /* add newly generated column, u_l, to Y */ 597 flag = 1; 598 off=len=0; 599 for (k=0; k<n; k++) 600 { 601 if (u[k]!=0.0) 602 { 603 len=k; 604 if (flag) 605 {off=k; flag=0;} 606 } 607 } 608 609 len -= (off-1); 610 611 if (len>0) 612 { 613 if ((yt_nnz+len)>yt_max_nnz) 614 { 615 error_msg_warning("increasing space for Y by 2x!\n"); 616 yt_max_nnz *= 2; 617 y_ptr = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar)); 618 rvec_copy(y_ptr,y,yt_nnz); 619 free(y); 620 y = y_ptr; 621 y_ptr+=yt_nnz; 622 } 623 yt_nnz += len; 624 rvec_copy(y_ptr,u+off,len); 625 626 /* keep track of number of zeros */ 627 if (dim) 628 { 629 for (k=0; k<len; k++) 630 { 631 if (y_ptr[k]==0.0) 632 {yt_zero_nnz++;} 633 } 634 } 635 else 636 { 637 for (k=0; k<len; k++) 638 { 639 if (y_ptr[k]==0.0) 640 {yt_zero_nnz_0++;} 641 } 642 } 643 ycol_indices[2*i] = off; 644 ycol_sz[i] = ycol_indices[2*i+1] = len; 645 ycol_vals[i] = y_ptr; 646 } 647 else 648 { 649 ycol_indices[2*i] = 0; 650 ycol_sz[i] = ycol_indices[2*i+1] = 0; 651 ycol_vals[i] = y_ptr; 652 } 653 } 654 655 /* close off stages for execution phase */ 656 while (dim!=level) 657 { 658 stages[dim++]=i; 659 error_msg_warning("disconnected!!! dim(%d)!=level(%d)\n",dim,level); 660 } 661 stages[dim]=i; 662 663 xyt_handle->info->n=xyt_handle->mvi->n; 664 xyt_handle->info->m=m; 665 xyt_handle->info->nnz=xt_nnz + yt_nnz; 666 xyt_handle->info->max_nnz=xt_max_nnz + yt_max_nnz; 667 xyt_handle->info->msg_buf_sz=stages[level]-stages[0]; 668 xyt_handle->info->solve_uu = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 669 xyt_handle->info->solve_w = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 670 xyt_handle->info->x=x; 671 xyt_handle->info->xcol_vals=xcol_vals; 672 xyt_handle->info->xcol_sz=xcol_sz; 673 xyt_handle->info->xcol_indices=xcol_indices; 674 xyt_handle->info->stages=stages; 675 xyt_handle->info->y=y; 676 xyt_handle->info->ycol_vals=ycol_vals; 677 xyt_handle->info->ycol_sz=ycol_sz; 678 xyt_handle->info->ycol_indices=ycol_indices; 679 680 free(segs); 681 free(u); 682 free(v); 683 free(uu); 684 free(z); 685 free(w); 686 687 return(0); 688 } 689 690 691 /*************************************xyt.c************************************ 692 Function: 693 694 Input : 695 Output: 696 Return: 697 Description: 698 **************************************xyt.c***********************************/ 699 static 700 void 701 do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *uc) 702 { 703 int off, len, *iptr; 704 int level =xyt_handle->level; 705 int n =xyt_handle->info->n; 706 int m =xyt_handle->info->m; 707 int *stages =xyt_handle->info->stages; 708 int *xcol_indices=xyt_handle->info->xcol_indices; 709 int *ycol_indices=xyt_handle->info->ycol_indices; 710 PetscScalar *x_ptr, *y_ptr, *uu_ptr; 711 PetscScalar *solve_uu=xyt_handle->info->solve_uu; 712 PetscScalar *solve_w =xyt_handle->info->solve_w; 713 PetscScalar *x =xyt_handle->info->x; 714 PetscScalar *y =xyt_handle->info->y; 715 PetscBLASInt i1 = 1; 716 717 718 uu_ptr=solve_uu; 719 rvec_zero(uu_ptr,m); 720 721 /* x = X.Y^T.b */ 722 /* uu = Y^T.b */ 723 for (y_ptr=y,iptr=ycol_indices; *iptr!=-1; y_ptr+=len) 724 { 725 off=*iptr++; len=*iptr++; 726 *uu_ptr++ = BLdot_(&len,uc+off,&i1,y_ptr,&i1); 727 } 728 729 /* comunication of beta */ 730 uu_ptr=solve_uu; 731 if (level) {ssgl_radd(uu_ptr, solve_w, level, stages);} 732 733 rvec_zero(uc,n); 734 735 /* x = X.uu */ 736 for (x_ptr=x,iptr=xcol_indices; *iptr!=-1; x_ptr+=len) 737 { 738 off=*iptr++; len=*iptr++; 739 BLaxpy_(&len,uu_ptr++,x_ptr,&i1,uc+off,&i1); 740 } 741 742 } 743 744 745 /*************************************Xyt.c************************************ 746 Function: check_init 747 748 Input : 749 Output: 750 Return: 751 Description: 752 **************************************xyt.c***********************************/ 753 static 754 void 755 check_init(void) 756 { 757 comm_init(); 758 759 760 } 761 762 763 /*************************************xyt.c************************************ 764 Function: check_handle() 765 766 Input : 767 Output: 768 Return: 769 Description: 770 **************************************xyt.c***********************************/ 771 static 772 void 773 check_handle(xyt_ADT xyt_handle) 774 { 775 #ifdef SAFE 776 int vals[2], work[2], op[] = {NON_UNIFORM,GL_MIN,GL_MAX}; 777 #endif 778 779 if (xyt_handle==NULL) 780 {error_msg_fatal("check_handle() :: bad handle :: NULL %d\n",xyt_handle);} 781 782 #ifdef SAFE 783 vals[0]=vals[1]=xyt_handle->id; 784 giop(vals,work,sizeof(op)/sizeof(op[0])-1,op); 785 if ((vals[0]!=vals[1])||(xyt_handle->id<=0)) 786 {error_msg_fatal("check_handle() :: bad handle :: id mismatch min/max %d/%d %d\n", 787 vals[0],vals[1], xyt_handle->id);} 788 #endif 789 790 } 791 792 793 /*************************************xyt.c************************************ 794 Function: det_separators 795 796 Input : 797 Output: 798 Return: 799 Description: 800 det_separators(xyt_handle, local2global, n, m, mylocmatvec, grid_data); 801 **************************************xyt.c***********************************/ 802 static 803 void 804 det_separators(xyt_ADT xyt_handle) 805 { 806 int i, ct, id; 807 int mask, edge, *iptr; 808 int *dir, *used; 809 int sum[4], w[4]; 810 PetscScalar rsum[4], rw[4]; 811 int op[] = {GL_ADD,0}; 812 PetscScalar *lhs, *rhs; 813 int *nsep, *lnsep, *fo, nfo=0; 814 gs_ADT gs_handle=xyt_handle->mvi->gs_handle; 815 int *local2global=xyt_handle->mvi->local2global; 816 int n=xyt_handle->mvi->n; 817 int m=xyt_handle->mvi->m; 818 int level=xyt_handle->level; 819 int shared=FALSE; 820 821 dir = (int*)malloc(sizeof(PetscInt)*(level+1)); 822 nsep = (int*)malloc(sizeof(PetscInt)*(level+1)); 823 lnsep= (int*)malloc(sizeof(PetscInt)*(level+1)); 824 fo = (int*)malloc(sizeof(PetscInt)*(n+1)); 825 used = (int*)malloc(sizeof(PetscInt)*n); 826 827 ivec_zero(dir ,level+1); 828 ivec_zero(nsep ,level+1); 829 ivec_zero(lnsep,level+1); 830 ivec_set (fo ,-1,n+1); 831 ivec_zero(used,n); 832 833 lhs = (double*)malloc(sizeof(PetscScalar)*m); 834 rhs = (double*)malloc(sizeof(PetscScalar)*m); 835 836 /* determine the # of unique dof */ 837 rvec_zero(lhs,m); 838 rvec_set(lhs,1.0,n); 839 gs_gop_hc(gs_handle,lhs,"+\0",level); 840 error_msg_warning("done first gs_gop_hc\n"); 841 rvec_zero(rsum,2); 842 for (ct=i=0;i<n;i++) 843 { 844 if (lhs[i]!=0.0) 845 {rsum[0]+=1.0/lhs[i]; rsum[1]+=lhs[i];} 846 847 if (lhs[i]!=1.0) 848 { 849 shared=TRUE; 850 } 851 } 852 853 grop_hc(rsum,rw,2,op,level); 854 rsum[0]+=0.1; 855 rsum[1]+=0.1; 856 857 /* 858 if (!my_id) 859 { 860 printf("xyt n unique = %d (%g)\n",(int) rsum[0], rsum[0]); 861 printf("xyt n shared = %d (%g)\n",(int) rsum[1], rsum[1]); 862 } 863 */ 864 865 xyt_handle->info->n_global=xyt_handle->info->m_global=(int) rsum[0]; 866 xyt_handle->mvi->n_global =xyt_handle->mvi->m_global =(int) rsum[0]; 867 868 /* determine separator sets top down */ 869 if (shared) 870 { 871 /* solution is to do as in the symmetric shared case but then */ 872 /* pick the sub-hc with the most free dofs and do a mat-vec */ 873 /* and pick up the responses on the other sub-hc from the */ 874 /* initial separator set obtained from the symm. shared case */ 875 error_msg_fatal("shared dof separator determination not ready ... see hmt!!!\n"); 876 for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1) 877 { 878 /* set rsh of hc, fire, and collect lhs responses */ 879 (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m); 880 gs_gop_hc(gs_handle,lhs,"+\0",edge); 881 882 /* set lsh of hc, fire, and collect rhs responses */ 883 (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m); 884 gs_gop_hc(gs_handle,rhs,"+\0",edge); 885 886 for (i=0;i<n;i++) 887 { 888 if (id< mask) 889 { 890 if (lhs[i]!=0.0) 891 {lhs[i]=1.0;} 892 } 893 if (id>=mask) 894 { 895 if (rhs[i]!=0.0) 896 {rhs[i]=1.0;} 897 } 898 } 899 900 if (id< mask) 901 {gs_gop_hc(gs_handle,lhs,"+\0",edge-1);} 902 else 903 {gs_gop_hc(gs_handle,rhs,"+\0",edge-1);} 904 905 /* count number of dofs I own that have signal and not in sep set */ 906 rvec_zero(rsum,4); 907 for (ivec_zero(sum,4),ct=i=0;i<n;i++) 908 { 909 if (!used[i]) 910 { 911 /* number of unmarked dofs on node */ 912 ct++; 913 /* number of dofs to be marked on lhs hc */ 914 if (id< mask) 915 { 916 if (lhs[i]!=0.0) 917 {sum[0]++; rsum[0]+=1.0/lhs[i];} 918 } 919 /* number of dofs to be marked on rhs hc */ 920 if (id>=mask) 921 { 922 if (rhs[i]!=0.0) 923 {sum[1]++; rsum[1]+=1.0/rhs[i];} 924 } 925 } 926 } 927 928 /* go for load balance - choose half with most unmarked dofs, bias LHS */ 929 (id<mask) ? (sum[2]=ct) : (sum[3]=ct); 930 (id<mask) ? (rsum[2]=ct) : (rsum[3]=ct); 931 giop_hc(sum,w,4,op,edge); 932 grop_hc(rsum,rw,4,op,edge); 933 rsum[0]+=0.1; rsum[1]+=0.1; rsum[2]+=0.1; rsum[3]+=0.1; 934 935 if (id<mask) 936 { 937 /* mark dofs I own that have signal and not in sep set */ 938 for (ct=i=0;i<n;i++) 939 { 940 if ((!used[i])&&(lhs[i]!=0.0)) 941 { 942 ct++; nfo++; 943 944 if (nfo>n) 945 {error_msg_fatal("nfo about to exceed n\n");} 946 947 *--iptr = local2global[i]; 948 used[i]=edge; 949 } 950 } 951 if (ct>1) {ivec_sort(iptr,ct);} 952 953 lnsep[edge]=ct; 954 nsep[edge]=(int) rsum[0]; 955 dir [edge]=LEFT; 956 } 957 958 if (id>=mask) 959 { 960 /* mark dofs I own that have signal and not in sep set */ 961 for (ct=i=0;i<n;i++) 962 { 963 if ((!used[i])&&(rhs[i]!=0.0)) 964 { 965 ct++; nfo++; 966 967 if (nfo>n) 968 {error_msg_fatal("nfo about to exceed n\n");} 969 970 *--iptr = local2global[i]; 971 used[i]=edge; 972 } 973 } 974 if (ct>1) {ivec_sort(iptr,ct);} 975 976 lnsep[edge]=ct; 977 nsep[edge]= (int) rsum[1]; 978 dir [edge]=RIGHT; 979 } 980 981 /* LATER or we can recur on these to order seps at this level */ 982 /* do we need full set of separators for this? */ 983 984 /* fold rhs hc into lower */ 985 if (id>=mask) 986 {id-=mask;} 987 } 988 } 989 else 990 { 991 for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1) 992 { 993 /* set rsh of hc, fire, and collect lhs responses */ 994 (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m); 995 gs_gop_hc(gs_handle,lhs,"+\0",edge); 996 997 /* set lsh of hc, fire, and collect rhs responses */ 998 (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m); 999 gs_gop_hc(gs_handle,rhs,"+\0",edge); 1000 1001 /* count number of dofs I own that have signal and not in sep set */ 1002 for (ivec_zero(sum,4),ct=i=0;i<n;i++) 1003 { 1004 if (!used[i]) 1005 { 1006 /* number of unmarked dofs on node */ 1007 ct++; 1008 /* number of dofs to be marked on lhs hc */ 1009 if ((id< mask)&&(lhs[i]!=0.0)) {sum[0]++;} 1010 /* number of dofs to be marked on rhs hc */ 1011 if ((id>=mask)&&(rhs[i]!=0.0)) {sum[1]++;} 1012 } 1013 } 1014 1015 /* for the non-symmetric case we need separators of width 2 */ 1016 /* so take both sides */ 1017 (id<mask) ? (sum[2]=ct) : (sum[3]=ct); 1018 giop_hc(sum,w,4,op,edge); 1019 1020 ct=0; 1021 if (id<mask) 1022 { 1023 /* mark dofs I own that have signal and not in sep set */ 1024 for (i=0;i<n;i++) 1025 { 1026 if ((!used[i])&&(lhs[i]!=0.0)) 1027 { 1028 ct++; nfo++; 1029 *--iptr = local2global[i]; 1030 used[i]=edge; 1031 } 1032 } 1033 /* LSH hc summation of ct should be sum[0] */ 1034 } 1035 else 1036 { 1037 /* mark dofs I own that have signal and not in sep set */ 1038 for (i=0;i<n;i++) 1039 { 1040 if ((!used[i])&&(rhs[i]!=0.0)) 1041 { 1042 ct++; nfo++; 1043 *--iptr = local2global[i]; 1044 used[i]=edge; 1045 } 1046 } 1047 /* RSH hc summation of ct should be sum[1] */ 1048 } 1049 1050 if (ct>1) {ivec_sort(iptr,ct);} 1051 lnsep[edge]=ct; 1052 nsep[edge]=sum[0]+sum[1]; 1053 dir [edge]=BOTH; 1054 1055 /* LATER or we can recur on these to order seps at this level */ 1056 /* do we need full set of separators for this? */ 1057 1058 /* fold rhs hc into lower */ 1059 if (id>=mask) 1060 {id-=mask;} 1061 } 1062 } 1063 1064 /* level 0 is on processor case - so mark the remainder */ 1065 for (ct=i=0;i<n;i++) 1066 { 1067 if (!used[i]) 1068 { 1069 ct++; nfo++; 1070 *--iptr = local2global[i]; 1071 used[i]=edge; 1072 } 1073 } 1074 if (ct>1) {ivec_sort(iptr,ct);} 1075 lnsep[edge]=ct; 1076 nsep [edge]=ct; 1077 dir [edge]=BOTH; 1078 1079 xyt_handle->info->nsep=nsep; 1080 xyt_handle->info->lnsep=lnsep; 1081 xyt_handle->info->fo=fo; 1082 xyt_handle->info->nfo=nfo; 1083 1084 free(dir); 1085 free(lhs); 1086 free(rhs); 1087 free(used); 1088 1089 } 1090 1091 1092 /*************************************xyt.c************************************ 1093 Function: set_mvi 1094 1095 Input : 1096 Output: 1097 Return: 1098 Description: 1099 **************************************xyt.c***********************************/ 1100 static 1101 mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data) 1102 { 1103 mv_info *mvi; 1104 1105 1106 mvi = (mv_info*)malloc(sizeof(mv_info)); 1107 mvi->n=n; 1108 mvi->m=m; 1109 mvi->n_global=-1; 1110 mvi->m_global=-1; 1111 mvi->local2global=(int*)malloc((m+1)*sizeof(PetscInt)); 1112 ivec_copy(mvi->local2global,local2global,m); 1113 mvi->local2global[m] = INT_MAX; 1114 mvi->matvec=(PetscErrorCode (*)(mv_info*,PetscScalar*,PetscScalar*))matvec; 1115 mvi->grid_data=grid_data; 1116 1117 /* set xyt communication handle to perform restricted matvec */ 1118 mvi->gs_handle = gs_init(local2global, m, num_nodes); 1119 1120 return(mvi); 1121 } 1122 1123 1124 /*************************************xyt.c************************************ 1125 Function: set_mvi 1126 1127 Input : 1128 Output: 1129 Return: 1130 Description: 1131 1132 computes u = A.v 1133 do_matvec(xyt_handle->mvi,v,u); 1134 **************************************xyt.c***********************************/ 1135 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u) 1136 { 1137 A->matvec((mv_info*)A->grid_data,v,u); 1138 } 1139 1140 1141 1142