1 2 /* 3 Module Name: xyt 4 Module Info: 5 6 author: Henry M. Tufo III 7 e-mail: hmt@asci.uchicago.edu 8 contact: 9 +--------------------------------+--------------------------------+ 10 |MCS Division - Building 221 |Department of Computer Science | 11 |Argonne National Laboratory |Ryerson 152 | 12 |9700 S. Cass Avenue |The University of Chicago | 13 |Argonne, IL 60439 |Chicago, IL 60637 | 14 |(630) 252-5354/5986 ph/fx |(773) 702-6019/8487 ph/fx | 15 +--------------------------------+--------------------------------+ 16 17 Last Modification: 3.20.01 18 */ 19 #include <../src/ksp/pc/impls/tfs/tfs.h> 20 21 #define LEFT -1 22 #define RIGHT 1 23 #define BOTH 0 24 25 typedef struct xyt_solver_info { 26 PetscInt n, m, n_global, m_global; 27 PetscInt nnz, max_nnz, msg_buf_sz; 28 PetscInt *nsep, *lnsep, *fo, nfo, *stages; 29 PetscInt *xcol_sz, *xcol_indices; 30 PetscScalar **xcol_vals, *x, *solve_uu, *solve_w; 31 PetscInt *ycol_sz, *ycol_indices; 32 PetscScalar **ycol_vals, *y; 33 PetscInt nsolves; 34 PetscScalar tot_solve_time; 35 } xyt_info; 36 37 typedef struct matvec_info { 38 PetscInt n, m, n_global, m_global; 39 PetscInt *local2global; 40 PCTFS_gs_ADT PCTFS_gs_handle; 41 PetscErrorCode (*matvec)(struct matvec_info *, PetscScalar *, PetscScalar *); 42 void *grid_data; 43 } mv_info; 44 45 struct xyt_CDT { 46 PetscInt id; 47 PetscInt ns; 48 PetscInt level; 49 xyt_info *info; 50 mv_info *mvi; 51 }; 52 53 static PetscInt n_xyt = 0; 54 static PetscInt n_xyt_handles = 0; 55 56 /* prototypes */ 57 static PetscErrorCode do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *rhs); 58 static PetscErrorCode check_handle(xyt_ADT xyt_handle); 59 static PetscErrorCode det_separators(xyt_ADT xyt_handle); 60 static PetscErrorCode do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u); 61 static PetscErrorCode xyt_generate(xyt_ADT xyt_handle); 62 static PetscErrorCode do_xyt_factor(xyt_ADT xyt_handle); 63 static mv_info *set_mvi(PetscInt *local2global, PetscInt n, PetscInt m, PetscErrorCode (*matvec)(mv_info *, PetscScalar *, PetscScalar *), void *grid_data); 64 65 xyt_ADT XYT_new(void) 66 { 67 xyt_ADT xyt_handle; 68 69 /* rolling count on n_xyt ... pot. problem here */ 70 n_xyt_handles++; 71 xyt_handle = (xyt_ADT)malloc(sizeof(struct xyt_CDT)); 72 xyt_handle->id = ++n_xyt; 73 xyt_handle->info = NULL; 74 xyt_handle->mvi = NULL; 75 76 return (xyt_handle); 77 } 78 79 PetscErrorCode XYT_factor(xyt_ADT xyt_handle, /* prev. allocated xyt handle */ 80 PetscInt *local2global, /* global column mapping */ 81 PetscInt n, /* local num rows */ 82 PetscInt m, /* local num cols */ 83 PetscErrorCode (*matvec)(void *, PetscScalar *, PetscScalar *), /* b_loc=A_local.x_loc */ 84 void *grid_data) /* grid data for matvec */ 85 { 86 PetscFunctionBegin; 87 PetscCall(PCTFS_comm_init()); 88 PetscCall(check_handle(xyt_handle)); 89 90 /* only 2^k for now and all nodes participating */ 91 PetscCheck((1 << (xyt_handle->level = PCTFS_i_log2_num_nodes)) == PCTFS_num_nodes, PETSC_COMM_SELF, PETSC_ERR_PLIB, "only 2^k for now and MPI_COMM_WORLD!!! %d != %d", 1 << PCTFS_i_log2_num_nodes, PCTFS_num_nodes); 92 93 /* space for X info */ 94 xyt_handle->info = (xyt_info *)malloc(sizeof(xyt_info)); 95 96 /* set up matvec handles */ 97 xyt_handle->mvi = set_mvi(local2global, n, m, (PetscErrorCode(*)(mv_info *, PetscScalar *, PetscScalar *))matvec, grid_data); 98 99 /* matrix is assumed to be of full rank */ 100 /* LATER we can reset to indicate rank def. */ 101 xyt_handle->ns = 0; 102 103 /* determine separators and generate firing order - NB xyt info set here */ 104 PetscCall(det_separators(xyt_handle)); 105 106 PetscCall(do_xyt_factor(xyt_handle)); 107 PetscFunctionReturn(PETSC_SUCCESS); 108 } 109 110 PetscErrorCode XYT_solve(xyt_ADT xyt_handle, PetscScalar *x, PetscScalar *b) 111 { 112 PetscFunctionBegin; 113 PetscCall(PCTFS_comm_init()); 114 PetscCall(check_handle(xyt_handle)); 115 116 /* need to copy b into x? */ 117 if (b) PetscCall(PCTFS_rvec_copy(x, b, xyt_handle->mvi->n)); 118 PetscCall(do_xyt_solve(xyt_handle, x)); 119 PetscFunctionReturn(PETSC_SUCCESS); 120 } 121 122 PetscErrorCode XYT_free(xyt_ADT xyt_handle) 123 { 124 PetscFunctionBegin; 125 PetscCall(PCTFS_comm_init()); 126 PetscCall(check_handle(xyt_handle)); 127 n_xyt_handles--; 128 129 free(xyt_handle->info->nsep); 130 free(xyt_handle->info->lnsep); 131 free(xyt_handle->info->fo); 132 free(xyt_handle->info->stages); 133 free(xyt_handle->info->solve_uu); 134 free(xyt_handle->info->solve_w); 135 free(xyt_handle->info->x); 136 free(xyt_handle->info->xcol_vals); 137 free(xyt_handle->info->xcol_sz); 138 free(xyt_handle->info->xcol_indices); 139 free(xyt_handle->info->y); 140 free(xyt_handle->info->ycol_vals); 141 free(xyt_handle->info->ycol_sz); 142 free(xyt_handle->info->ycol_indices); 143 free(xyt_handle->info); 144 free(xyt_handle->mvi->local2global); 145 PetscCall(PCTFS_gs_free(xyt_handle->mvi->PCTFS_gs_handle)); 146 free(xyt_handle->mvi); 147 free(xyt_handle); 148 149 /* if the check fails we nuke */ 150 /* if NULL pointer passed to free we nuke */ 151 /* if the calls to free fail that's not my problem */ 152 PetscFunctionReturn(PETSC_SUCCESS); 153 } 154 155 /* This function is currently not used */ 156 PetscErrorCode XYT_stats(xyt_ADT xyt_handle) 157 { 158 PetscInt op[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_ADD}; 159 PetscInt fop[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD}; 160 PetscInt vals[9], work[9]; 161 PetscScalar fvals[3], fwork[3]; 162 163 PetscFunctionBegin; 164 PetscCall(PCTFS_comm_init()); 165 PetscCall(check_handle(xyt_handle)); 166 167 /* if factorization not done there are no stats */ 168 if (!xyt_handle->info || !xyt_handle->mvi) { 169 if (!PCTFS_my_id) PetscCall(PetscPrintf(PETSC_COMM_WORLD, "XYT_stats() :: no stats available!\n")); 170 PetscFunctionReturn(PETSC_SUCCESS); 171 } 172 173 vals[0] = vals[1] = vals[2] = xyt_handle->info->nnz; 174 vals[3] = vals[4] = vals[5] = xyt_handle->mvi->n; 175 vals[6] = vals[7] = vals[8] = xyt_handle->info->msg_buf_sz; 176 PetscCall(PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(op) - 1, op)); 177 178 fvals[0] = fvals[1] = fvals[2] = xyt_handle->info->tot_solve_time / xyt_handle->info->nsolves++; 179 PetscCall(PCTFS_grop(fvals, fwork, PETSC_STATIC_ARRAY_LENGTH(fop) - 1, fop)); 180 181 if (!PCTFS_my_id) { 182 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min xyt_nnz=%" PetscInt_FMT "\n", PCTFS_my_id, vals[0])); 183 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max xyt_nnz=%" PetscInt_FMT "\n", PCTFS_my_id, vals[1])); 184 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg xyt_nnz=%g\n", PCTFS_my_id, (double)(1.0 * vals[2] / PCTFS_num_nodes))); 185 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: tot xyt_nnz=%" PetscInt_FMT "\n", PCTFS_my_id, vals[2])); 186 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: xyt C(2d) =%g\n", PCTFS_my_id, (double)(vals[2] / (PetscPowReal(1.0 * vals[5], 1.5))))); 187 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: xyt C(3d) =%g\n", PCTFS_my_id, (double)(vals[2] / (PetscPowReal(1.0 * vals[5], 1.6667))))); 188 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min xyt_n =%" PetscInt_FMT "\n", PCTFS_my_id, vals[3])); 189 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max xyt_n =%" PetscInt_FMT "\n", PCTFS_my_id, vals[4])); 190 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg xyt_n =%g\n", PCTFS_my_id, (double)(1.0 * vals[5] / PCTFS_num_nodes))); 191 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: tot xyt_n =%" PetscInt_FMT "\n", PCTFS_my_id, vals[5])); 192 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min xyt_buf=%" PetscInt_FMT "\n", PCTFS_my_id, vals[6])); 193 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max xyt_buf=%" PetscInt_FMT "\n", PCTFS_my_id, vals[7])); 194 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg xyt_buf=%g\n", PCTFS_my_id, (double)(1.0 * vals[8] / PCTFS_num_nodes))); 195 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min xyt_slv=%g\n", PCTFS_my_id, (double)PetscRealPart(fvals[0]))); 196 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max xyt_slv=%g\n", PCTFS_my_id, (double)PetscRealPart(fvals[1]))); 197 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg xyt_slv=%g\n", PCTFS_my_id, (double)PetscRealPart(fvals[2] / PCTFS_num_nodes))); 198 } 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 /* 203 204 Description: get A_local, local portion of global coarse matrix which 205 is a row dist. nxm matrix w/ n<m. 206 o my_ml holds address of ML struct associated w/A_local and coarse grid 207 o local2global holds global number of column i (i=0,...,m-1) 208 o local2global holds global number of row i (i=0,...,n-1) 209 o mylocmatvec performs A_local . vec_local (note that gs is performed using 210 PCTFS_gs_init/gop). 211 212 mylocmatvec = my_ml->Amat[grid_tag].matvec->external; 213 mylocmatvec (void :: void *data, double *in, double *out) 214 */ 215 static PetscErrorCode do_xyt_factor(xyt_ADT xyt_handle) 216 { 217 return xyt_generate(xyt_handle); 218 } 219 220 static PetscErrorCode xyt_generate(xyt_ADT xyt_handle) 221 { 222 PetscInt i, j, k, idx; 223 PetscInt dim, col; 224 PetscScalar *u, *uu, *v, *z, *w, alpha, alpha_w; 225 PetscInt *segs; 226 PetscInt op[] = {GL_ADD, 0}; 227 PetscInt off, len; 228 PetscScalar *x_ptr, *y_ptr; 229 PetscInt *iptr, flag; 230 PetscInt start = 0, end, work; 231 PetscInt op2[] = {GL_MIN, 0}; 232 PCTFS_gs_ADT PCTFS_gs_handle; 233 PetscInt *nsep, *lnsep, *fo; 234 PetscInt a_n = xyt_handle->mvi->n; 235 PetscInt a_m = xyt_handle->mvi->m; 236 PetscInt *a_local2global = xyt_handle->mvi->local2global; 237 PetscInt level; 238 PetscInt n, m; 239 PetscInt *xcol_sz, *xcol_indices, *stages; 240 PetscScalar **xcol_vals, *x; 241 PetscInt *ycol_sz, *ycol_indices; 242 PetscScalar **ycol_vals, *y; 243 PetscInt n_global; 244 PetscInt xt_nnz = 0, xt_max_nnz = 0; 245 PetscInt yt_nnz = 0, yt_max_nnz = 0; 246 PetscBLASInt i1 = 1, dlen; 247 PetscScalar dm1 = -1.0; 248 249 n = xyt_handle->mvi->n; 250 nsep = xyt_handle->info->nsep; 251 lnsep = xyt_handle->info->lnsep; 252 fo = xyt_handle->info->fo; 253 end = lnsep[0]; 254 level = xyt_handle->level; 255 PCTFS_gs_handle = xyt_handle->mvi->PCTFS_gs_handle; 256 257 /* is there a null space? */ 258 /* LATER add in ability to detect null space by checking alpha */ 259 for (i = 0, j = 0; i <= level; i++) j += nsep[i]; 260 261 m = j - xyt_handle->ns; 262 if (m != j) PetscCall(PetscPrintf(PETSC_COMM_WORLD, "xyt_generate() :: null space exists %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, j, xyt_handle->ns)); 263 264 PetscCall(PetscInfo(0, "xyt_generate() :: X(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", n, m)); 265 266 /* get and initialize storage for x local */ 267 /* note that x local is nxm and stored by columns */ 268 xcol_sz = (PetscInt *)malloc(m * sizeof(PetscInt)); 269 xcol_indices = (PetscInt *)malloc((2 * m + 1) * sizeof(PetscInt)); 270 xcol_vals = (PetscScalar **)malloc(m * sizeof(PetscScalar *)); 271 for (i = j = 0; i < m; i++, j += 2) { 272 xcol_indices[j] = xcol_indices[j + 1] = xcol_sz[i] = -1; 273 xcol_vals[i] = NULL; 274 } 275 xcol_indices[j] = -1; 276 277 /* get and initialize storage for y local */ 278 /* note that y local is nxm and stored by columns */ 279 ycol_sz = (PetscInt *)malloc(m * sizeof(PetscInt)); 280 ycol_indices = (PetscInt *)malloc((2 * m + 1) * sizeof(PetscInt)); 281 ycol_vals = (PetscScalar **)malloc(m * sizeof(PetscScalar *)); 282 for (i = j = 0; i < m; i++, j += 2) { 283 ycol_indices[j] = ycol_indices[j + 1] = ycol_sz[i] = -1; 284 ycol_vals[i] = NULL; 285 } 286 ycol_indices[j] = -1; 287 288 /* size of separators for each sub-hc working from bottom of tree to top */ 289 /* this looks like nsep[]=segments */ 290 stages = (PetscInt *)malloc((level + 1) * sizeof(PetscInt)); 291 segs = (PetscInt *)malloc((level + 1) * sizeof(PetscInt)); 292 PetscCall(PCTFS_ivec_zero(stages, level + 1)); 293 PCTFS_ivec_copy(segs, nsep, level + 1); 294 for (i = 0; i < level; i++) segs[i + 1] += segs[i]; 295 stages[0] = segs[0]; 296 297 /* temporary vectors */ 298 u = (PetscScalar *)malloc(n * sizeof(PetscScalar)); 299 z = (PetscScalar *)malloc(n * sizeof(PetscScalar)); 300 v = (PetscScalar *)malloc(a_m * sizeof(PetscScalar)); 301 uu = (PetscScalar *)malloc(m * sizeof(PetscScalar)); 302 w = (PetscScalar *)malloc(m * sizeof(PetscScalar)); 303 304 /* extra nnz due to replication of vertices across separators */ 305 for (i = 1, j = 0; i <= level; i++) j += nsep[i]; 306 307 /* storage for sparse x values */ 308 n_global = xyt_handle->info->n_global; 309 xt_max_nnz = yt_max_nnz = (PetscInt)(2.5 * PetscPowReal(1.0 * n_global, 1.6667) + j * n / 2) / PCTFS_num_nodes; 310 x = (PetscScalar *)malloc(xt_max_nnz * sizeof(PetscScalar)); 311 y = (PetscScalar *)malloc(yt_max_nnz * sizeof(PetscScalar)); 312 313 /* LATER - can embed next sep to fire in gs */ 314 /* time to make the donuts - generate X factor */ 315 for (dim = i = j = 0; i < m; i++) { 316 /* time to move to the next level? */ 317 while (i == segs[dim]) { 318 PetscCheck(dim != level, PETSC_COMM_SELF, PETSC_ERR_PLIB, "dim about to exceed level"); 319 stages[dim++] = i; 320 end += lnsep[dim]; 321 } 322 stages[dim] = i; 323 324 /* which column are we firing? */ 325 /* i.e. set v_l */ 326 /* use new seps and do global min across hc to determine which one to fire */ 327 (start < end) ? (col = fo[start]) : (col = INT_MAX); 328 PetscCall(PCTFS_giop_hc(&col, &work, 1, op2, dim)); 329 330 /* shouldn't need this */ 331 if (col == INT_MAX) { 332 PetscCall(PetscInfo(0, "hey ... col==INT_MAX??\n")); 333 continue; 334 } 335 336 /* do I own it? I should */ 337 PetscCall(PCTFS_rvec_zero(v, a_m)); 338 if (col == fo[start]) { 339 start++; 340 idx = PCTFS_ivec_linear_search(col, a_local2global, a_n); 341 if (idx != -1) { 342 v[idx] = 1.0; 343 j++; 344 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "NOT FOUND!"); 345 } else { 346 idx = PCTFS_ivec_linear_search(col, a_local2global, a_m); 347 if (idx != -1) v[idx] = 1.0; 348 } 349 350 /* perform u = A.v_l */ 351 PetscCall(PCTFS_rvec_zero(u, n)); 352 PetscCall(do_matvec(xyt_handle->mvi, v, u)); 353 354 /* uu = X^T.u_l (local portion) */ 355 /* technically only need to zero out first i entries */ 356 /* later turn this into an XYT_solve call ? */ 357 PetscCall(PCTFS_rvec_zero(uu, m)); 358 y_ptr = y; 359 iptr = ycol_indices; 360 for (k = 0; k < i; k++) { 361 off = *iptr++; 362 len = *iptr++; 363 PetscCall(PetscBLASIntCast(len, &dlen)); 364 PetscCallBLAS("BLASdot", uu[k] = BLASdot_(&dlen, u + off, &i1, y_ptr, &i1)); 365 y_ptr += len; 366 } 367 368 /* uu = X^T.u_l (comm portion) */ 369 PetscCall(PCTFS_ssgl_radd(uu, w, dim, stages)); 370 371 /* z = X.uu */ 372 PetscCall(PCTFS_rvec_zero(z, n)); 373 x_ptr = x; 374 iptr = xcol_indices; 375 for (k = 0; k < i; k++) { 376 off = *iptr++; 377 len = *iptr++; 378 PetscCall(PetscBLASIntCast(len, &dlen)); 379 PetscCallBLAS("BLASaxpy", BLASaxpy_(&dlen, &uu[k], x_ptr, &i1, z + off, &i1)); 380 x_ptr += len; 381 } 382 383 /* compute v_l = v_l - z */ 384 PetscCall(PCTFS_rvec_zero(v + a_n, a_m - a_n)); 385 PetscCall(PetscBLASIntCast(n, &dlen)); 386 PetscCallBLAS("BLASaxpy", BLASaxpy_(&dlen, &dm1, z, &i1, v, &i1)); 387 388 /* compute u_l = A.v_l */ 389 if (a_n != a_m) PetscCall(PCTFS_gs_gop_hc(PCTFS_gs_handle, v, "+\0", dim)); 390 PetscCall(PCTFS_rvec_zero(u, n)); 391 PetscCall(do_matvec(xyt_handle->mvi, v, u)); 392 393 /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - local portion */ 394 PetscCall(PetscBLASIntCast(n, &dlen)); 395 PetscCallBLAS("BLASdot", alpha = BLASdot_(&dlen, u, &i1, u, &i1)); 396 /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - comm portion */ 397 PetscCall(PCTFS_grop_hc(&alpha, &alpha_w, 1, op, dim)); 398 399 alpha = (PetscScalar)PetscSqrtReal((PetscReal)alpha); 400 401 /* check for small alpha */ 402 /* LATER use this to detect and determine null space */ 403 PetscCheck(PetscAbsScalar(alpha) >= 1.0e-14, PETSC_COMM_SELF, PETSC_ERR_PLIB, "bad alpha! %g", (double)PetscAbsScalar(alpha)); 404 405 /* compute v_l = v_l/sqrt(alpha) */ 406 PetscCall(PCTFS_rvec_scale(v, 1.0 / alpha, n)); 407 PetscCall(PCTFS_rvec_scale(u, 1.0 / alpha, n)); 408 409 /* add newly generated column, v_l, to X */ 410 flag = 1; 411 off = len = 0; 412 for (k = 0; k < n; k++) { 413 if (v[k] != 0.0) { 414 len = k; 415 if (flag) { 416 off = k; 417 flag = 0; 418 } 419 } 420 } 421 422 len -= (off - 1); 423 424 if (len > 0) { 425 if ((xt_nnz + len) > xt_max_nnz) { 426 PetscCall(PetscInfo(0, "increasing space for X by 2x!\n")); 427 xt_max_nnz *= 2; 428 x_ptr = (PetscScalar *)malloc(xt_max_nnz * sizeof(PetscScalar)); 429 PetscCall(PCTFS_rvec_copy(x_ptr, x, xt_nnz)); 430 free(x); 431 x = x_ptr; 432 x_ptr += xt_nnz; 433 } 434 xt_nnz += len; 435 PetscCall(PCTFS_rvec_copy(x_ptr, v + off, len)); 436 437 xcol_indices[2 * i] = off; 438 xcol_sz[i] = xcol_indices[2 * i + 1] = len; 439 xcol_vals[i] = x_ptr; 440 } else { 441 xcol_indices[2 * i] = 0; 442 xcol_sz[i] = xcol_indices[2 * i + 1] = 0; 443 xcol_vals[i] = x_ptr; 444 } 445 446 /* add newly generated column, u_l, to Y */ 447 flag = 1; 448 off = len = 0; 449 for (k = 0; k < n; k++) { 450 if (u[k] != 0.0) { 451 len = k; 452 if (flag) { 453 off = k; 454 flag = 0; 455 } 456 } 457 } 458 459 len -= (off - 1); 460 461 if (len > 0) { 462 if ((yt_nnz + len) > yt_max_nnz) { 463 PetscCall(PetscInfo(0, "increasing space for Y by 2x!\n")); 464 yt_max_nnz *= 2; 465 y_ptr = (PetscScalar *)malloc(yt_max_nnz * sizeof(PetscScalar)); 466 PetscCall(PCTFS_rvec_copy(y_ptr, y, yt_nnz)); 467 free(y); 468 y = y_ptr; 469 y_ptr += yt_nnz; 470 } 471 yt_nnz += len; 472 PetscCall(PCTFS_rvec_copy(y_ptr, u + off, len)); 473 474 ycol_indices[2 * i] = off; 475 ycol_sz[i] = ycol_indices[2 * i + 1] = len; 476 ycol_vals[i] = y_ptr; 477 } else { 478 ycol_indices[2 * i] = 0; 479 ycol_sz[i] = ycol_indices[2 * i + 1] = 0; 480 ycol_vals[i] = y_ptr; 481 } 482 } 483 484 /* close off stages for execution phase */ 485 while (dim != level) { 486 stages[dim++] = i; 487 PetscCall(PetscInfo(0, "disconnected!!! dim(%" PetscInt_FMT ")!=level(%" PetscInt_FMT ")\n", dim, level)); 488 } 489 stages[dim] = i; 490 491 xyt_handle->info->n = xyt_handle->mvi->n; 492 xyt_handle->info->m = m; 493 xyt_handle->info->nnz = xt_nnz + yt_nnz; 494 xyt_handle->info->max_nnz = xt_max_nnz + yt_max_nnz; 495 xyt_handle->info->msg_buf_sz = stages[level] - stages[0]; 496 xyt_handle->info->solve_uu = (PetscScalar *)malloc(m * sizeof(PetscScalar)); 497 xyt_handle->info->solve_w = (PetscScalar *)malloc(m * sizeof(PetscScalar)); 498 xyt_handle->info->x = x; 499 xyt_handle->info->xcol_vals = xcol_vals; 500 xyt_handle->info->xcol_sz = xcol_sz; 501 xyt_handle->info->xcol_indices = xcol_indices; 502 xyt_handle->info->stages = stages; 503 xyt_handle->info->y = y; 504 xyt_handle->info->ycol_vals = ycol_vals; 505 xyt_handle->info->ycol_sz = ycol_sz; 506 xyt_handle->info->ycol_indices = ycol_indices; 507 508 free(segs); 509 free(u); 510 free(v); 511 free(uu); 512 free(z); 513 free(w); 514 515 return PETSC_SUCCESS; 516 } 517 518 static PetscErrorCode do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *uc) 519 { 520 PetscInt off, len, *iptr; 521 PetscInt level = xyt_handle->level; 522 PetscInt n = xyt_handle->info->n; 523 PetscInt m = xyt_handle->info->m; 524 PetscInt *stages = xyt_handle->info->stages; 525 PetscInt *xcol_indices = xyt_handle->info->xcol_indices; 526 PetscInt *ycol_indices = xyt_handle->info->ycol_indices; 527 PetscScalar *x_ptr, *y_ptr, *uu_ptr; 528 PetscScalar *solve_uu = xyt_handle->info->solve_uu; 529 PetscScalar *solve_w = xyt_handle->info->solve_w; 530 PetscScalar *x = xyt_handle->info->x; 531 PetscScalar *y = xyt_handle->info->y; 532 PetscBLASInt i1 = 1, dlen; 533 534 PetscFunctionBegin; 535 uu_ptr = solve_uu; 536 PetscCall(PCTFS_rvec_zero(uu_ptr, m)); 537 538 /* x = X.Y^T.b */ 539 /* uu = Y^T.b */ 540 for (y_ptr = y, iptr = ycol_indices; *iptr != -1; y_ptr += len) { 541 off = *iptr++; 542 len = *iptr++; 543 PetscCall(PetscBLASIntCast(len, &dlen)); 544 PetscCallBLAS("BLASdot", *uu_ptr++ = BLASdot_(&dlen, uc + off, &i1, y_ptr, &i1)); 545 } 546 547 /* communication of beta */ 548 uu_ptr = solve_uu; 549 if (level) PetscCall(PCTFS_ssgl_radd(uu_ptr, solve_w, level, stages)); 550 PetscCall(PCTFS_rvec_zero(uc, n)); 551 552 /* x = X.uu */ 553 for (x_ptr = x, iptr = xcol_indices; *iptr != -1; x_ptr += len) { 554 off = *iptr++; 555 len = *iptr++; 556 PetscCall(PetscBLASIntCast(len, &dlen)); 557 PetscCallBLAS("BLASaxpy", BLASaxpy_(&dlen, uu_ptr++, x_ptr, &i1, uc + off, &i1)); 558 } 559 PetscFunctionReturn(PETSC_SUCCESS); 560 } 561 562 static PetscErrorCode check_handle(xyt_ADT xyt_handle) 563 { 564 PetscInt vals[2], work[2], op[] = {NON_UNIFORM, GL_MIN, GL_MAX}; 565 566 PetscFunctionBegin; 567 PetscCheck(xyt_handle, PETSC_COMM_SELF, PETSC_ERR_PLIB, "check_handle() :: bad handle :: NULL %p", (void *)xyt_handle); 568 569 vals[0] = vals[1] = xyt_handle->id; 570 PetscCall(PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(op) - 1, op)); 571 PetscCheck(!(vals[0] != vals[1]) && !(xyt_handle->id <= 0), PETSC_COMM_SELF, PETSC_ERR_PLIB, "check_handle() :: bad handle :: id mismatch min/max %" PetscInt_FMT "/%" PetscInt_FMT " %" PetscInt_FMT, vals[0], vals[1], xyt_handle->id); 572 PetscFunctionReturn(PETSC_SUCCESS); 573 } 574 575 static PetscErrorCode det_separators(xyt_ADT xyt_handle) 576 { 577 PetscInt i, ct, id; 578 PetscInt mask, edge, *iptr; 579 PetscInt *dir, *used; 580 PetscInt sum[4], w[4]; 581 PetscScalar rsum[4], rw[4]; 582 PetscInt op[] = {GL_ADD, 0}; 583 PetscScalar *lhs, *rhs; 584 PetscInt *nsep, *lnsep, *fo, nfo = 0; 585 PCTFS_gs_ADT PCTFS_gs_handle = xyt_handle->mvi->PCTFS_gs_handle; 586 PetscInt *local2global = xyt_handle->mvi->local2global; 587 PetscInt n = xyt_handle->mvi->n; 588 PetscInt m = xyt_handle->mvi->m; 589 PetscInt level = xyt_handle->level; 590 PetscInt shared = 0; 591 592 PetscFunctionBegin; 593 dir = (PetscInt *)malloc(sizeof(PetscInt) * (level + 1)); 594 nsep = (PetscInt *)malloc(sizeof(PetscInt) * (level + 1)); 595 lnsep = (PetscInt *)malloc(sizeof(PetscInt) * (level + 1)); 596 fo = (PetscInt *)malloc(sizeof(PetscInt) * (n + 1)); 597 used = (PetscInt *)malloc(sizeof(PetscInt) * n); 598 599 PetscCall(PCTFS_ivec_zero(dir, level + 1)); 600 PetscCall(PCTFS_ivec_zero(nsep, level + 1)); 601 PetscCall(PCTFS_ivec_zero(lnsep, level + 1)); 602 PetscCall(PCTFS_ivec_set(fo, -1, n + 1)); 603 PetscCall(PCTFS_ivec_zero(used, n)); 604 605 lhs = (PetscScalar *)malloc(sizeof(PetscScalar) * m); 606 rhs = (PetscScalar *)malloc(sizeof(PetscScalar) * m); 607 608 /* determine the # of unique dof */ 609 PetscCall(PCTFS_rvec_zero(lhs, m)); 610 PetscCall(PCTFS_rvec_set(lhs, 1.0, n)); 611 PetscCall(PCTFS_gs_gop_hc(PCTFS_gs_handle, lhs, "+\0", level)); 612 PetscCall(PetscInfo(0, "done first PCTFS_gs_gop_hc\n")); 613 PetscCall(PCTFS_rvec_zero(rsum, 2)); 614 for (i = 0; i < n; i++) { 615 if (lhs[i] != 0.0) { 616 rsum[0] += 1.0 / lhs[i]; 617 rsum[1] += lhs[i]; 618 } 619 if (lhs[i] != 1.0) shared = 1; 620 } 621 622 PetscCall(PCTFS_grop_hc(rsum, rw, 2, op, level)); 623 rsum[0] += 0.1; 624 rsum[1] += 0.1; 625 626 xyt_handle->info->n_global = xyt_handle->info->m_global = (PetscInt)rsum[0]; 627 xyt_handle->mvi->n_global = xyt_handle->mvi->m_global = (PetscInt)rsum[0]; 628 629 /* determine separator sets top down */ 630 PetscCheck(!shared, PETSC_COMM_SELF, PETSC_ERR_PLIB, "shared dof separator determination not ready ... see hmt!!!"); 631 /* solution is to do as in the symmetric shared case but then */ 632 /* pick the sub-hc with the most free dofs and do a mat-vec */ 633 /* and pick up the responses on the other sub-hc from the */ 634 /* initial separator set obtained from the symm. shared case */ 635 /* [dead code deleted since it is unlikely to be completed] */ 636 for (iptr = fo + n, id = PCTFS_my_id, mask = PCTFS_num_nodes >> 1, edge = level; edge > 0; edge--, mask >>= 1) { 637 /* set rsh of hc, fire, and collect lhs responses */ 638 PetscCall((id < mask) ? PCTFS_rvec_zero(lhs, m) : PCTFS_rvec_set(lhs, 1.0, m)); 639 PetscCall(PCTFS_gs_gop_hc(PCTFS_gs_handle, lhs, "+\0", edge)); 640 641 /* set lsh of hc, fire, and collect rhs responses */ 642 PetscCall((id < mask) ? PCTFS_rvec_set(rhs, 1.0, m) : PCTFS_rvec_zero(rhs, m)); 643 PetscCall(PCTFS_gs_gop_hc(PCTFS_gs_handle, rhs, "+\0", edge)); 644 645 /* count number of dofs I own that have signal and not in sep set */ 646 PetscCall(PCTFS_ivec_zero(sum, 4)); 647 for (ct = i = 0; i < n; i++) { 648 if (!used[i]) { 649 /* number of unmarked dofs on node */ 650 ct++; 651 /* number of dofs to be marked on lhs hc */ 652 if ((id < mask) && (lhs[i] != 0.0)) sum[0]++; 653 /* number of dofs to be marked on rhs hc */ 654 if ((id >= mask) && (rhs[i] != 0.0)) sum[1]++; 655 } 656 } 657 658 /* for the non-symmetric case we need separators of width 2 */ 659 /* so take both sides */ 660 (id < mask) ? (sum[2] = ct) : (sum[3] = ct); 661 PetscCall(PCTFS_giop_hc(sum, w, 4, op, edge)); 662 663 ct = 0; 664 if (id < mask) { 665 /* mark dofs I own that have signal and not in sep set */ 666 for (i = 0; i < n; i++) { 667 if ((!used[i]) && (lhs[i] != 0.0)) { 668 ct++; 669 nfo++; 670 *--iptr = local2global[i]; 671 used[i] = edge; 672 } 673 } 674 /* LSH hc summation of ct should be sum[0] */ 675 } else { 676 /* mark dofs I own that have signal and not in sep set */ 677 for (i = 0; i < n; i++) { 678 if ((!used[i]) && (rhs[i] != 0.0)) { 679 ct++; 680 nfo++; 681 *--iptr = local2global[i]; 682 used[i] = edge; 683 } 684 } 685 /* RSH hc summation of ct should be sum[1] */ 686 } 687 688 if (ct > 1) PetscCall(PCTFS_ivec_sort(iptr, ct)); 689 lnsep[edge] = ct; 690 nsep[edge] = sum[0] + sum[1]; 691 dir[edge] = BOTH; 692 693 /* LATER or we can recur on these to order seps at this level */ 694 /* do we need full set of separators for this? */ 695 696 /* fold rhs hc into lower */ 697 if (id >= mask) id -= mask; 698 } 699 700 /* level 0 is on processor case - so mark the remainder */ 701 for (ct = i = 0; i < n; i++) { 702 if (!used[i]) { 703 ct++; 704 nfo++; 705 *--iptr = local2global[i]; 706 used[i] = edge; 707 } 708 } 709 if (ct > 1) PetscCall(PCTFS_ivec_sort(iptr, ct)); 710 lnsep[edge] = ct; 711 nsep[edge] = ct; 712 dir[edge] = BOTH; 713 714 xyt_handle->info->nsep = nsep; 715 xyt_handle->info->lnsep = lnsep; 716 xyt_handle->info->fo = fo; 717 xyt_handle->info->nfo = nfo; 718 719 free(dir); 720 free(lhs); 721 free(rhs); 722 free(used); 723 PetscFunctionReturn(PETSC_SUCCESS); 724 } 725 726 static mv_info *set_mvi(PetscInt *local2global, PetscInt n, PetscInt m, PetscErrorCode (*matvec)(mv_info *, PetscScalar *, PetscScalar *), void *grid_data) 727 { 728 mv_info *mvi; 729 730 mvi = (mv_info *)malloc(sizeof(mv_info)); 731 mvi->n = n; 732 mvi->m = m; 733 mvi->n_global = -1; 734 mvi->m_global = -1; 735 mvi->local2global = (PetscInt *)malloc((m + 1) * sizeof(PetscInt)); 736 737 PCTFS_ivec_copy(mvi->local2global, local2global, m); 738 mvi->local2global[m] = INT_MAX; 739 mvi->matvec = matvec; 740 mvi->grid_data = grid_data; 741 742 /* set xyt communication handle to perform restricted matvec */ 743 mvi->PCTFS_gs_handle = PCTFS_gs_init(local2global, m, PCTFS_num_nodes); 744 745 return (mvi); 746 } 747 748 static PetscErrorCode do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u) 749 { 750 PetscFunctionBegin; 751 PetscCall(A->matvec((mv_info *)A->grid_data, v, u)); 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754