xref: /petsc/src/ksp/pc/impls/tfs/xyt.c (revision be1d678a52e6eff2808b2fa31ae986cdbf03c9fe)
1 
2 /*************************************xyt.c************************************
3 Module Name: xyt
4 Module Info:
5 
6 author:  Henry M. Tufo III
7 e-mail:  hmt@asci.uchicago.edu
8 contact:
9 +--------------------------------+--------------------------------+
10 |MCS Division - Building 221     |Department of Computer Science  |
11 |Argonne National Laboratory     |Ryerson 152                     |
12 |9700 S. Cass Avenue             |The University of Chicago       |
13 |Argonne, IL  60439              |Chicago, IL  60637              |
14 |(630) 252-5354/5986 ph/fx       |(773) 702-6019/8487 ph/fx       |
15 +--------------------------------+--------------------------------+
16 
17 Last Modification: 3.20.01
18 **************************************xyt.c***********************************/
19 
20 
21 /*************************************xyt.c************************************
22 NOTES ON USAGE:
23 
24 **************************************xyt.c***********************************/
25 #include "src/ksp/pc/impls/tfs/tfs.h"
26 
27 #define LEFT  -1
28 #define RIGHT  1
29 #define BOTH   0
30 #define MAX_FORTRAN_HANDLES  10
31 
32 typedef struct xyt_solver_info {
33   int n, m, n_global, m_global;
34   int nnz, max_nnz, msg_buf_sz;
35   int *nsep, *lnsep, *fo, nfo, *stages;
36   int *xcol_sz, *xcol_indices;
37   PetscScalar **xcol_vals, *x, *solve_uu, *solve_w;
38   int *ycol_sz, *ycol_indices;
39   PetscScalar **ycol_vals, *y;
40   int nsolves;
41   PetscScalar tot_solve_time;
42 } xyt_info;
43 
44 
45 typedef struct matvec_info {
46   int n, m, n_global, m_global;
47   int *local2global;
48   gs_ADT gs_handle;
49   PetscErrorCode (*matvec)(struct matvec_info*,PetscScalar*,PetscScalar*);
50   void *grid_data;
51 } mv_info;
52 
53 struct xyt_CDT{
54   int id;
55   int ns;
56   int level;
57   xyt_info *info;
58   mv_info  *mvi;
59 };
60 
61 static int n_xyt=0;
62 static int n_xyt_handles=0;
63 
64 /* prototypes */
65 static void do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *rhs);
66 static void check_init(void);
67 static void check_handle(xyt_ADT xyt_handle);
68 static void det_separators(xyt_ADT xyt_handle);
69 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u);
70 static int xyt_generate(xyt_ADT xyt_handle);
71 static int do_xyt_factor(xyt_ADT xyt_handle);
72 static mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data);
73 
74 
75 /*************************************xyt.c************************************
76 Function: XYT_new()
77 
78 Input :
79 Output:
80 Return:
81 Description:
82 **************************************xyt.c***********************************/
83 xyt_ADT
84 XYT_new(void)
85 {
86   xyt_ADT xyt_handle;
87 
88 
89 
90   /* rolling count on n_xyt ... pot. problem here */
91   n_xyt_handles++;
92   xyt_handle       = (xyt_ADT)malloc(sizeof(struct xyt_CDT));
93   xyt_handle->id   = ++n_xyt;
94   xyt_handle->info = NULL;
95   xyt_handle->mvi  = NULL;
96 
97   return(xyt_handle);
98 }
99 
100 
101 /*************************************xyt.c************************************
102 Function: XYT_factor()
103 
104 Input :
105 Output:
106 Return:
107 Description:
108 **************************************xyt.c***********************************/
109 int
110 XYT_factor(xyt_ADT xyt_handle, /* prev. allocated xyt  handle */
111 	   int *local2global,  /* global column mapping       */
112 	   int n,              /* local num rows              */
113 	   int m,              /* local num cols              */
114 	   void *matvec,       /* b_loc=A_local.x_loc         */
115 	   void *grid_data     /* grid data for matvec        */
116 	   )
117 {
118 
119   check_init();
120   check_handle(xyt_handle);
121 
122   /* only 2^k for now and all nodes participating */
123   if ((1<<(xyt_handle->level=i_log2_num_nodes))!=num_nodes)
124     {error_msg_fatal("only 2^k for now and MPI_COMM_WORLD!!! %d != %d\n",1<<i_log2_num_nodes,num_nodes);}
125 
126   /* space for X info */
127   xyt_handle->info = (xyt_info*)malloc(sizeof(xyt_info));
128 
129   /* set up matvec handles */
130   xyt_handle->mvi  = set_mvi(local2global, n, m, matvec, grid_data);
131 
132   /* matrix is assumed to be of full rank */
133   /* LATER we can reset to indicate rank def. */
134   xyt_handle->ns=0;
135 
136   /* determine separators and generate firing order - NB xyt info set here */
137   det_separators(xyt_handle);
138 
139   return(do_xyt_factor(xyt_handle));
140 }
141 
142 
143 /*************************************xyt.c************************************
144 Function: XYT_solve
145 
146 Input :
147 Output:
148 Return:
149 Description:
150 **************************************xyt.c***********************************/
151 int
152 XYT_solve(xyt_ADT xyt_handle, double *x, double *b)
153 {
154   check_init();
155   check_handle(xyt_handle);
156 
157   /* need to copy b into x? */
158   if (b)
159     {rvec_copy(x,b,xyt_handle->mvi->n);}
160   do_xyt_solve(xyt_handle,x);
161 
162   return(0);
163 }
164 
165 
166 /*************************************xyt.c************************************
167 Function: XYT_free()
168 
169 Input :
170 Output:
171 Return:
172 Description:
173 **************************************xyt.c***********************************/
174 int
175 XYT_free(xyt_ADT xyt_handle)
176 {
177   check_init();
178   check_handle(xyt_handle);
179   n_xyt_handles--;
180 
181   free(xyt_handle->info->nsep);
182   free(xyt_handle->info->lnsep);
183   free(xyt_handle->info->fo);
184   free(xyt_handle->info->stages);
185   free(xyt_handle->info->solve_uu);
186   free(xyt_handle->info->solve_w);
187   free(xyt_handle->info->x);
188   free(xyt_handle->info->xcol_vals);
189   free(xyt_handle->info->xcol_sz);
190   free(xyt_handle->info->xcol_indices);
191   free(xyt_handle->info->y);
192   free(xyt_handle->info->ycol_vals);
193   free(xyt_handle->info->ycol_sz);
194   free(xyt_handle->info->ycol_indices);
195   free(xyt_handle->info);
196   free(xyt_handle->mvi->local2global);
197    gs_free(xyt_handle->mvi->gs_handle);
198   free(xyt_handle->mvi);
199   free(xyt_handle);
200 
201 
202   /* if the check fails we nuke */
203   /* if NULL pointer passed to free we nuke */
204   /* if the calls to free fail that's not my problem */
205   return(0);
206 }
207 
208 
209 
210 /*************************************xyt.c************************************
211 Function:
212 
213 Input :
214 Output:
215 Return:
216 Description:
217 **************************************xyt.c***********************************/
218 int
219 XYT_stats(xyt_ADT xyt_handle)
220 {
221   int  op[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD};
222   int fop[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD};
223   int   vals[9],  work[9];
224   PetscScalar fvals[3], fwork[3];
225 
226 
227   check_init();
228   check_handle(xyt_handle);
229 
230   /* if factorization not done there are no stats */
231   if (!xyt_handle->info||!xyt_handle->mvi)
232     {
233       if (!my_id)
234 	{printf("XYT_stats() :: no stats available!\n");}
235       return 1;
236     }
237 
238   vals[0]=vals[1]=vals[2]=xyt_handle->info->nnz;
239   vals[3]=vals[4]=vals[5]=xyt_handle->mvi->n;
240   vals[6]=vals[7]=vals[8]=xyt_handle->info->msg_buf_sz;
241   giop(vals,work,sizeof(op)/sizeof(op[0])-1,op);
242 
243   fvals[0]=fvals[1]=fvals[2]
244     =xyt_handle->info->tot_solve_time/xyt_handle->info->nsolves++;
245   grop(fvals,fwork,sizeof(fop)/sizeof(fop[0])-1,fop);
246 
247   if (!my_id)
248     {
249       printf("%d :: min   xyt_nnz=%d\n",my_id,vals[0]);
250       printf("%d :: max   xyt_nnz=%d\n",my_id,vals[1]);
251       printf("%d :: avg   xyt_nnz=%g\n",my_id,1.0*vals[2]/num_nodes);
252       printf("%d :: tot   xyt_nnz=%d\n",my_id,vals[2]);
253       printf("%d :: xyt   C(2d)  =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.5)));
254       printf("%d :: xyt   C(3d)  =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.6667)));
255       printf("%d :: min   xyt_n  =%d\n",my_id,vals[3]);
256       printf("%d :: max   xyt_n  =%d\n",my_id,vals[4]);
257       printf("%d :: avg   xyt_n  =%g\n",my_id,1.0*vals[5]/num_nodes);
258       printf("%d :: tot   xyt_n  =%d\n",my_id,vals[5]);
259       printf("%d :: min   xyt_buf=%d\n",my_id,vals[6]);
260       printf("%d :: max   xyt_buf=%d\n",my_id,vals[7]);
261       printf("%d :: avg   xyt_buf=%g\n",my_id,1.0*vals[8]/num_nodes);
262       printf("%d :: min   xyt_slv=%g\n",my_id,fvals[0]);
263       printf("%d :: max   xyt_slv=%g\n",my_id,fvals[1]);
264       printf("%d :: avg   xyt_slv=%g\n",my_id,fvals[2]/num_nodes);
265     }
266 
267   return(0);
268 }
269 
270 
271 /*************************************xyt.c************************************
272 Function: do_xyt_factor
273 
274 Input :
275 Output:
276 Return:
277 Description: get A_local, local portion of global coarse matrix which
278 is a row dist. nxm matrix w/ n<m.
279    o my_ml holds address of ML struct associated w/A_local and coarse grid
280    o local2global holds global number of column i (i=0,...,m-1)
281    o local2global holds global number of row    i (i=0,...,n-1)
282    o mylocmatvec performs A_local . vec_local (note that gs is performed using
283    gs_init/gop).
284 
285 mylocmatvec = my_ml->Amat[grid_tag].matvec->external;
286 mylocmatvec (void :: void *data, double *in, double *out)
287 **************************************xyt.c***********************************/
288 static
289 int
290 do_xyt_factor(xyt_ADT xyt_handle)
291 {
292   int flag;
293 
294 
295   flag=xyt_generate(xyt_handle);
296   return(flag);
297 }
298 
299 
300 /*************************************xyt.c************************************
301 Function:
302 
303 Input :
304 Output:
305 Return:
306 Description:
307 **************************************xyt.c***********************************/
308 static
309 int
310 xyt_generate(xyt_ADT xyt_handle)
311 {
312   int i,j,k,idx;
313   int dim, col;
314   PetscScalar *u, *uu, *v, *z, *w, alpha, alpha_w;
315   int *segs;
316   int op[] = {GL_ADD,0};
317   int off, len;
318   PetscScalar *x_ptr, *y_ptr;
319   int *iptr, flag;
320   int start=0, end, work;
321   int op2[] = {GL_MIN,0};
322   gs_ADT gs_handle;
323   int *nsep, *lnsep, *fo;
324   int a_n=xyt_handle->mvi->n;
325   int a_m=xyt_handle->mvi->m;
326   int *a_local2global=xyt_handle->mvi->local2global;
327   int level;
328   int n, m;
329   int *xcol_sz, *xcol_indices, *stages;
330   PetscScalar **xcol_vals, *x;
331   int *ycol_sz, *ycol_indices;
332   PetscScalar **ycol_vals, *y;
333   int n_global;
334   int xt_nnz=0, xt_max_nnz=0;
335   int yt_nnz=0, yt_max_nnz=0;
336   int xt_zero_nnz  =0;
337   int xt_zero_nnz_0=0;
338   int yt_zero_nnz  =0;
339   int yt_zero_nnz_0=0;
340   PetscBLASInt i1 = 1;
341   PetscScalar dm1 = -1.0;
342 
343   n=xyt_handle->mvi->n;
344   nsep=xyt_handle->info->nsep;
345   lnsep=xyt_handle->info->lnsep;
346   fo=xyt_handle->info->fo;
347   end=lnsep[0];
348   level=xyt_handle->level;
349   gs_handle=xyt_handle->mvi->gs_handle;
350 
351   /* is there a null space? */
352   /* LATER add in ability to detect null space by checking alpha */
353   for (i=0, j=0; i<=level; i++)
354     {j+=nsep[i];}
355 
356   m = j-xyt_handle->ns;
357   if (m!=j)
358     {printf("xyt_generate() :: null space exists %d %d %d\n",m,j,xyt_handle->ns);}
359 
360   error_msg_warning("xyt_generate() :: X(%d,%d)\n",n,m);
361 
362   /* get and initialize storage for x local         */
363   /* note that x local is nxm and stored by columns */
364   xcol_sz = (int*) malloc(m*sizeof(PetscInt));
365   xcol_indices = (int*) malloc((2*m+1)*sizeof(int));
366   xcol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *));
367   for (i=j=0; i<m; i++, j+=2)
368     {
369       xcol_indices[j]=xcol_indices[j+1]=xcol_sz[i]=-1;
370       xcol_vals[i] = NULL;
371     }
372   xcol_indices[j]=-1;
373 
374   /* get and initialize storage for y local         */
375   /* note that y local is nxm and stored by columns */
376   ycol_sz = (int*) malloc(m*sizeof(PetscInt));
377   ycol_indices = (int*) malloc((2*m+1)*sizeof(int));
378   ycol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *));
379   for (i=j=0; i<m; i++, j+=2)
380     {
381       ycol_indices[j]=ycol_indices[j+1]=ycol_sz[i]=-1;
382       ycol_vals[i] = NULL;
383     }
384   ycol_indices[j]=-1;
385 
386   /* size of separators for each sub-hc working from bottom of tree to top */
387   /* this looks like nsep[]=segments */
388   stages = (int*) malloc((level+1)*sizeof(PetscInt));
389   segs   = (int*) malloc((level+1)*sizeof(PetscInt));
390   ivec_zero(stages,level+1);
391   ivec_copy(segs,nsep,level+1);
392   for (i=0; i<level; i++)
393     {segs[i+1] += segs[i];}
394   stages[0] = segs[0];
395 
396   /* temporary vectors  */
397   u  = (PetscScalar *) malloc(n*sizeof(PetscScalar));
398   z  = (PetscScalar *) malloc(n*sizeof(PetscScalar));
399   v  = (PetscScalar *) malloc(a_m*sizeof(PetscScalar));
400   uu = (PetscScalar *) malloc(m*sizeof(PetscScalar));
401   w  = (PetscScalar *) malloc(m*sizeof(PetscScalar));
402 
403   /* extra nnz due to replication of vertices across separators */
404   for (i=1, j=0; i<=level; i++)
405     {j+=nsep[i];}
406 
407   /* storage for sparse x values */
408   n_global = xyt_handle->info->n_global;
409   xt_max_nnz = yt_max_nnz = (int)(2.5*pow(1.0*n_global,1.6667) + j*n/2)/num_nodes;
410   x = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar));
411   y = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar));
412 
413   /* LATER - can embed next sep to fire in gs */
414   /* time to make the donuts - generate X factor */
415   for (dim=i=j=0;i<m;i++)
416     {
417       /* time to move to the next level? */
418       while (i==segs[dim])
419 	{
420 #ifdef SAFE
421 	  if (dim==level)
422 	    {error_msg_fatal("dim about to exceed level\n"); break;}
423 #endif
424 
425 	  stages[dim++]=i;
426 	  end+=lnsep[dim];
427 	}
428       stages[dim]=i;
429 
430       /* which column are we firing? */
431       /* i.e. set v_l */
432       /* use new seps and do global min across hc to determine which one to fire */
433       (start<end) ? (col=fo[start]) : (col=INT_MAX);
434       giop_hc(&col,&work,1,op2,dim);
435 
436       /* shouldn't need this */
437       if (col==INT_MAX)
438 	{
439 	  error_msg_warning("hey ... col==INT_MAX??\n");
440 	  continue;
441 	}
442 
443       /* do I own it? I should */
444       rvec_zero(v ,a_m);
445       if (col==fo[start])
446 	{
447 	  start++;
448 	  idx=ivec_linear_search(col, a_local2global, a_n);
449 	  if (idx!=-1)
450 	    {v[idx] = 1.0; j++;}
451 	  else
452 	    {error_msg_fatal("NOT FOUND!\n");}
453 	}
454       else
455 	{
456 	  idx=ivec_linear_search(col, a_local2global, a_m);
457 	  if (idx!=-1)
458 	    {v[idx] = 1.0;}
459 	}
460 
461       /* perform u = A.v_l */
462       rvec_zero(u,n);
463       do_matvec(xyt_handle->mvi,v,u);
464 
465       /* uu =  X^T.u_l (local portion) */
466       /* technically only need to zero out first i entries */
467       /* later turn this into an XYT_solve call ? */
468       rvec_zero(uu,m);
469       y_ptr=y;
470       iptr = ycol_indices;
471       for (k=0; k<i; k++)
472 	{
473 	  off = *iptr++;
474 	  len = *iptr++;
475 
476 	  uu[k] = BLASdot_(&len,u+off,&i1,y_ptr,&i1);
477 	  y_ptr+=len;
478 	}
479 
480       /* uu = X^T.u_l (comm portion) */
481       ssgl_radd  (uu, w, dim, stages);
482 
483       /* z = X.uu */
484       rvec_zero(z,n);
485       x_ptr=x;
486       iptr = xcol_indices;
487       for (k=0; k<i; k++)
488 	{
489 	  off = *iptr++;
490 	  len = *iptr++;
491 
492 	  BLASaxpy_(&len,&uu[k],x_ptr,&i1,z+off,&i1);
493 	  x_ptr+=len;
494 	}
495 
496       /* compute v_l = v_l - z */
497       rvec_zero(v+a_n,a_m-a_n);
498       BLASaxpy_(&n,&dm1,z,&i1,v,&i1);
499 
500       /* compute u_l = A.v_l */
501       if (a_n!=a_m)
502 	{gs_gop_hc(gs_handle,v,"+\0",dim);}
503       rvec_zero(u,n);
504      do_matvec(xyt_handle->mvi,v,u);
505 
506       /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - local portion */
507       alpha = BLASdot_(&n,u,&i1,u,&i1);
508       /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - comm portion */
509       grop_hc(&alpha, &alpha_w, 1, op, dim);
510 
511       alpha = (PetscScalar) sqrt((double)alpha);
512 
513       /* check for small alpha                             */
514       /* LATER use this to detect and determine null space */
515       if (fabs(alpha)<1.0e-14)
516 	{error_msg_fatal("bad alpha! %g\n",alpha);}
517 
518       /* compute v_l = v_l/sqrt(alpha) */
519       rvec_scale(v,1.0/alpha,n);
520       rvec_scale(u,1.0/alpha,n);
521 
522       /* add newly generated column, v_l, to X */
523       flag = 1;
524       off=len=0;
525       for (k=0; k<n; k++)
526 	{
527 	  if (v[k]!=0.0)
528 	    {
529 	      len=k;
530 	      if (flag)
531 		{off=k; flag=0;}
532 	    }
533 	}
534 
535       len -= (off-1);
536 
537       if (len>0)
538 	{
539 	  if ((xt_nnz+len)>xt_max_nnz)
540 	    {
541 	      error_msg_warning("increasing space for X by 2x!\n");
542 	      xt_max_nnz *= 2;
543 	      x_ptr = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar));
544 	      rvec_copy(x_ptr,x,xt_nnz);
545 	      free(x);
546 	      x = x_ptr;
547 	      x_ptr+=xt_nnz;
548 	    }
549 	  xt_nnz += len;
550 	  rvec_copy(x_ptr,v+off,len);
551 
552           /* keep track of number of zeros */
553 	  if (dim)
554 	    {
555 	      for (k=0; k<len; k++)
556 		{
557 		  if (x_ptr[k]==0.0)
558 		    {xt_zero_nnz++;}
559 		}
560 	    }
561 	  else
562 	    {
563 	      for (k=0; k<len; k++)
564 		{
565 		  if (x_ptr[k]==0.0)
566 		    {xt_zero_nnz_0++;}
567 		}
568 	    }
569 	  xcol_indices[2*i] = off;
570 	  xcol_sz[i] = xcol_indices[2*i+1] = len;
571 	  xcol_vals[i] = x_ptr;
572 	}
573       else
574 	{
575 	  xcol_indices[2*i] = 0;
576 	  xcol_sz[i] = xcol_indices[2*i+1] = 0;
577 	  xcol_vals[i] = x_ptr;
578 	}
579 
580 
581       /* add newly generated column, u_l, to Y */
582       flag = 1;
583       off=len=0;
584       for (k=0; k<n; k++)
585 	{
586 	  if (u[k]!=0.0)
587 	    {
588 	      len=k;
589 	      if (flag)
590 		{off=k; flag=0;}
591 	    }
592 	}
593 
594       len -= (off-1);
595 
596       if (len>0)
597 	{
598 	  if ((yt_nnz+len)>yt_max_nnz)
599 	    {
600 	      error_msg_warning("increasing space for Y by 2x!\n");
601 	      yt_max_nnz *= 2;
602 	      y_ptr = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar));
603 	      rvec_copy(y_ptr,y,yt_nnz);
604 	      free(y);
605 	      y = y_ptr;
606 	      y_ptr+=yt_nnz;
607 	    }
608 	  yt_nnz += len;
609 	  rvec_copy(y_ptr,u+off,len);
610 
611           /* keep track of number of zeros */
612 	  if (dim)
613 	    {
614 	      for (k=0; k<len; k++)
615 		{
616 		  if (y_ptr[k]==0.0)
617 		    {yt_zero_nnz++;}
618 		}
619 	    }
620 	  else
621 	    {
622 	      for (k=0; k<len; k++)
623 		{
624 		  if (y_ptr[k]==0.0)
625 		    {yt_zero_nnz_0++;}
626 		}
627 	    }
628 	  ycol_indices[2*i] = off;
629 	  ycol_sz[i] = ycol_indices[2*i+1] = len;
630 	  ycol_vals[i] = y_ptr;
631 	}
632       else
633 	{
634 	  ycol_indices[2*i] = 0;
635 	  ycol_sz[i] = ycol_indices[2*i+1] = 0;
636 	  ycol_vals[i] = y_ptr;
637 	}
638     }
639 
640   /* close off stages for execution phase */
641   while (dim!=level)
642     {
643       stages[dim++]=i;
644       error_msg_warning("disconnected!!! dim(%d)!=level(%d)\n",dim,level);
645     }
646   stages[dim]=i;
647 
648   xyt_handle->info->n=xyt_handle->mvi->n;
649   xyt_handle->info->m=m;
650   xyt_handle->info->nnz=xt_nnz + yt_nnz;
651   xyt_handle->info->max_nnz=xt_max_nnz + yt_max_nnz;
652   xyt_handle->info->msg_buf_sz=stages[level]-stages[0];
653   xyt_handle->info->solve_uu = (PetscScalar *) malloc(m*sizeof(PetscScalar));
654   xyt_handle->info->solve_w  = (PetscScalar *) malloc(m*sizeof(PetscScalar));
655   xyt_handle->info->x=x;
656   xyt_handle->info->xcol_vals=xcol_vals;
657   xyt_handle->info->xcol_sz=xcol_sz;
658   xyt_handle->info->xcol_indices=xcol_indices;
659   xyt_handle->info->stages=stages;
660   xyt_handle->info->y=y;
661   xyt_handle->info->ycol_vals=ycol_vals;
662   xyt_handle->info->ycol_sz=ycol_sz;
663   xyt_handle->info->ycol_indices=ycol_indices;
664 
665   free(segs);
666   free(u);
667   free(v);
668   free(uu);
669   free(z);
670   free(w);
671 
672   return(0);
673 }
674 
675 
676 /*************************************xyt.c************************************
677 Function:
678 
679 Input :
680 Output:
681 Return:
682 Description:
683 **************************************xyt.c***********************************/
684 static
685 void
686 do_xyt_solve(xyt_ADT xyt_handle,  PetscScalar *uc)
687 {
688   int off, len, *iptr;
689   int level       =xyt_handle->level;
690   int n           =xyt_handle->info->n;
691   int m           =xyt_handle->info->m;
692   int *stages     =xyt_handle->info->stages;
693   int *xcol_indices=xyt_handle->info->xcol_indices;
694   int *ycol_indices=xyt_handle->info->ycol_indices;
695    PetscScalar *x_ptr, *y_ptr, *uu_ptr;
696   PetscScalar *solve_uu=xyt_handle->info->solve_uu;
697   PetscScalar *solve_w =xyt_handle->info->solve_w;
698   PetscScalar *x       =xyt_handle->info->x;
699   PetscScalar *y       =xyt_handle->info->y;
700   PetscBLASInt i1 = 1;
701 
702 
703   uu_ptr=solve_uu;
704   rvec_zero(uu_ptr,m);
705 
706   /* x  = X.Y^T.b */
707   /* uu = Y^T.b */
708   for (y_ptr=y,iptr=ycol_indices; *iptr!=-1; y_ptr+=len)
709     {
710       off=*iptr++; len=*iptr++;
711       *uu_ptr++ = BLASdot_(&len,uc+off,&i1,y_ptr,&i1);
712     }
713 
714   /* comunication of beta */
715   uu_ptr=solve_uu;
716   if (level) {ssgl_radd(uu_ptr, solve_w, level, stages);}
717 
718   rvec_zero(uc,n);
719 
720   /* x = X.uu */
721   for (x_ptr=x,iptr=xcol_indices; *iptr!=-1; x_ptr+=len)
722     {
723       off=*iptr++; len=*iptr++;
724       BLASaxpy_(&len,uu_ptr++,x_ptr,&i1,uc+off,&i1);
725     }
726 
727 }
728 
729 
730 /*************************************Xyt.c************************************
731 Function: check_init
732 
733 Input :
734 Output:
735 Return:
736 Description:
737 **************************************xyt.c***********************************/
738 static
739 void
740 check_init(void)
741 {
742   comm_init();
743 
744 
745 }
746 
747 
748 /*************************************xyt.c************************************
749 Function: check_handle()
750 
751 Input :
752 Output:
753 Return:
754 Description:
755 **************************************xyt.c***********************************/
756 static
757 void
758 check_handle(xyt_ADT xyt_handle)
759 {
760 #ifdef SAFE
761   int vals[2], work[2], op[] = {NON_UNIFORM,GL_MIN,GL_MAX};
762 #endif
763 
764   if (xyt_handle==NULL)
765     {error_msg_fatal("check_handle() :: bad handle :: NULL %d\n",xyt_handle);}
766 
767 #ifdef SAFE
768   vals[0]=vals[1]=xyt_handle->id;
769   giop(vals,work,sizeof(op)/sizeof(op[0])-1,op);
770   if ((vals[0]!=vals[1])||(xyt_handle->id<=0))
771     {error_msg_fatal("check_handle() :: bad handle :: id mismatch min/max %d/%d %d\n",
772 		     vals[0],vals[1], xyt_handle->id);}
773 #endif
774 
775 }
776 
777 
778 /*************************************xyt.c************************************
779 Function: det_separators
780 
781 Input :
782 Output:
783 Return:
784 Description:
785   det_separators(xyt_handle, local2global, n, m, mylocmatvec, grid_data);
786 **************************************xyt.c***********************************/
787 static
788 void
789 det_separators(xyt_ADT xyt_handle)
790 {
791   int i, ct, id;
792   int mask, edge, *iptr;
793   int *dir, *used;
794   int sum[4], w[4];
795   PetscScalar rsum[4], rw[4];
796   int op[] = {GL_ADD,0};
797   PetscScalar *lhs, *rhs;
798   int *nsep, *lnsep, *fo, nfo=0;
799   gs_ADT gs_handle=xyt_handle->mvi->gs_handle;
800   int *local2global=xyt_handle->mvi->local2global;
801   int  n=xyt_handle->mvi->n;
802   int  m=xyt_handle->mvi->m;
803   int level=xyt_handle->level;
804   int shared=FALSE;
805 
806   dir  = (int*)malloc(sizeof(PetscInt)*(level+1));
807   nsep = (int*)malloc(sizeof(PetscInt)*(level+1));
808   lnsep= (int*)malloc(sizeof(PetscInt)*(level+1));
809   fo   = (int*)malloc(sizeof(PetscInt)*(n+1));
810   used = (int*)malloc(sizeof(PetscInt)*n);
811 
812   ivec_zero(dir  ,level+1);
813   ivec_zero(nsep ,level+1);
814   ivec_zero(lnsep,level+1);
815   ivec_set (fo   ,-1,n+1);
816   ivec_zero(used,n);
817 
818   lhs  = (double*)malloc(sizeof(PetscScalar)*m);
819   rhs  = (double*)malloc(sizeof(PetscScalar)*m);
820 
821   /* determine the # of unique dof */
822   rvec_zero(lhs,m);
823   rvec_set(lhs,1.0,n);
824   gs_gop_hc(gs_handle,lhs,"+\0",level);
825   error_msg_warning("done first gs_gop_hc\n");
826   rvec_zero(rsum,2);
827   for (ct=i=0;i<n;i++)
828     {
829       if (lhs[i]!=0.0)
830 	{rsum[0]+=1.0/lhs[i]; rsum[1]+=lhs[i];}
831 
832       if (lhs[i]!=1.0)
833 	{
834           shared=TRUE;
835         }
836     }
837 
838   grop_hc(rsum,rw,2,op,level);
839   rsum[0]+=0.1;
840   rsum[1]+=0.1;
841 
842   /*
843       if (!my_id)
844       {
845       printf("xyt n unique = %d (%g)\n",(int) rsum[0], rsum[0]);
846       printf("xyt n shared = %d (%g)\n",(int) rsum[1], rsum[1]);
847       }
848   */
849 
850   xyt_handle->info->n_global=xyt_handle->info->m_global=(int) rsum[0];
851   xyt_handle->mvi->n_global =xyt_handle->mvi->m_global =(int) rsum[0];
852 
853   /* determine separator sets top down */
854   if (shared)
855     {
856       /* solution is to do as in the symmetric shared case but then */
857       /* pick the sub-hc with the most free dofs and do a mat-vec   */
858       /* and pick up the responses on the other sub-hc from the     */
859       /* initial separator set obtained from the symm. shared case  */
860       error_msg_fatal("shared dof separator determination not ready ... see hmt!!!\n");
861       for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1)
862 	{
863 	  /* set rsh of hc, fire, and collect lhs responses */
864 	  (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m);
865 	  gs_gop_hc(gs_handle,lhs,"+\0",edge);
866 
867 	  /* set lsh of hc, fire, and collect rhs responses */
868 	  (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m);
869 	  gs_gop_hc(gs_handle,rhs,"+\0",edge);
870 
871 	  for (i=0;i<n;i++)
872 	    {
873 	      if (id< mask)
874 		{
875 		  if (lhs[i]!=0.0)
876 		    {lhs[i]=1.0;}
877 		}
878 	      if (id>=mask)
879 		{
880 		  if (rhs[i]!=0.0)
881 		    {rhs[i]=1.0;}
882 		}
883 	    }
884 
885 	  if (id< mask)
886 	    {gs_gop_hc(gs_handle,lhs,"+\0",edge-1);}
887 	  else
888 	    {gs_gop_hc(gs_handle,rhs,"+\0",edge-1);}
889 
890 	  /* count number of dofs I own that have signal and not in sep set */
891 	  rvec_zero(rsum,4);
892 	  for (ivec_zero(sum,4),ct=i=0;i<n;i++)
893 	    {
894 	      if (!used[i])
895 		{
896 		  /* number of unmarked dofs on node */
897 		  ct++;
898 		  /* number of dofs to be marked on lhs hc */
899 		  if (id< mask)
900 		    {
901 		      if (lhs[i]!=0.0)
902 			{sum[0]++; rsum[0]+=1.0/lhs[i];}
903 		    }
904 		  /* number of dofs to be marked on rhs hc */
905 		  if (id>=mask)
906 		    {
907 		      if (rhs[i]!=0.0)
908 			{sum[1]++; rsum[1]+=1.0/rhs[i];}
909 		    }
910 		}
911 	    }
912 
913 	  /* go for load balance - choose half with most unmarked dofs, bias LHS */
914 	  (id<mask) ? (sum[2]=ct) : (sum[3]=ct);
915 	  (id<mask) ? (rsum[2]=ct) : (rsum[3]=ct);
916 	  giop_hc(sum,w,4,op,edge);
917 	  grop_hc(rsum,rw,4,op,edge);
918 	  rsum[0]+=0.1; rsum[1]+=0.1; rsum[2]+=0.1; rsum[3]+=0.1;
919 
920 	  if (id<mask)
921 	    {
922 	      /* mark dofs I own that have signal and not in sep set */
923 	      for (ct=i=0;i<n;i++)
924 		{
925 		  if ((!used[i])&&(lhs[i]!=0.0))
926 		    {
927 		      ct++; nfo++;
928 
929 		      if (nfo>n)
930 			{error_msg_fatal("nfo about to exceed n\n");}
931 
932 		      *--iptr = local2global[i];
933 		      used[i]=edge;
934 		    }
935 		}
936 	      if (ct>1) {ivec_sort(iptr,ct);}
937 
938 	      lnsep[edge]=ct;
939 	      nsep[edge]=(int) rsum[0];
940 	      dir [edge]=LEFT;
941 	    }
942 
943 	  if (id>=mask)
944 	    {
945 	      /* mark dofs I own that have signal and not in sep set */
946 	      for (ct=i=0;i<n;i++)
947 		{
948 		  if ((!used[i])&&(rhs[i]!=0.0))
949 		    {
950 		      ct++; nfo++;
951 
952 		      if (nfo>n)
953 			{error_msg_fatal("nfo about to exceed n\n");}
954 
955 		      *--iptr = local2global[i];
956 		      used[i]=edge;
957 		    }
958 		}
959 	      if (ct>1) {ivec_sort(iptr,ct);}
960 
961 	      lnsep[edge]=ct;
962 	      nsep[edge]= (int) rsum[1];
963 	      dir [edge]=RIGHT;
964 	    }
965 
966 	  /* LATER or we can recur on these to order seps at this level */
967 	  /* do we need full set of separators for this?                */
968 
969 	  /* fold rhs hc into lower */
970 	  if (id>=mask)
971 	    {id-=mask;}
972 	}
973     }
974   else
975     {
976       for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1)
977 	{
978 	  /* set rsh of hc, fire, and collect lhs responses */
979 	  (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m);
980 	  gs_gop_hc(gs_handle,lhs,"+\0",edge);
981 
982 	  /* set lsh of hc, fire, and collect rhs responses */
983 	  (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m);
984 	  gs_gop_hc(gs_handle,rhs,"+\0",edge);
985 
986 	  /* count number of dofs I own that have signal and not in sep set */
987 	  for (ivec_zero(sum,4),ct=i=0;i<n;i++)
988 	    {
989 	      if (!used[i])
990 		{
991 		  /* number of unmarked dofs on node */
992 		  ct++;
993 		  /* number of dofs to be marked on lhs hc */
994 		  if ((id< mask)&&(lhs[i]!=0.0)) {sum[0]++;}
995 		  /* number of dofs to be marked on rhs hc */
996 		  if ((id>=mask)&&(rhs[i]!=0.0)) {sum[1]++;}
997 		}
998 	    }
999 
1000 	  /* for the non-symmetric case we need separators of width 2 */
1001 	  /* so take both sides */
1002 	  (id<mask) ? (sum[2]=ct) : (sum[3]=ct);
1003 	  giop_hc(sum,w,4,op,edge);
1004 
1005 	  ct=0;
1006 	  if (id<mask)
1007 	    {
1008 	      /* mark dofs I own that have signal and not in sep set */
1009 	      for (i=0;i<n;i++)
1010 		{
1011 		  if ((!used[i])&&(lhs[i]!=0.0))
1012 		    {
1013 		      ct++; nfo++;
1014 		      *--iptr = local2global[i];
1015 		      used[i]=edge;
1016 		    }
1017 		}
1018 	      /* LSH hc summation of ct should be sum[0] */
1019 	    }
1020 	  else
1021 	    {
1022 	      /* mark dofs I own that have signal and not in sep set */
1023 	      for (i=0;i<n;i++)
1024 		{
1025 		  if ((!used[i])&&(rhs[i]!=0.0))
1026 		    {
1027 		      ct++; nfo++;
1028 		      *--iptr = local2global[i];
1029 		      used[i]=edge;
1030 		    }
1031 		}
1032 	      /* RSH hc summation of ct should be sum[1] */
1033 	    }
1034 
1035 	  if (ct>1) {ivec_sort(iptr,ct);}
1036 	  lnsep[edge]=ct;
1037 	  nsep[edge]=sum[0]+sum[1];
1038 	  dir [edge]=BOTH;
1039 
1040 	  /* LATER or we can recur on these to order seps at this level */
1041 	  /* do we need full set of separators for this?                */
1042 
1043 	  /* fold rhs hc into lower */
1044 	  if (id>=mask)
1045 	    {id-=mask;}
1046 	}
1047     }
1048 
1049   /* level 0 is on processor case - so mark the remainder */
1050   for (ct=i=0;i<n;i++)
1051     {
1052       if (!used[i])
1053 	{
1054 	  ct++; nfo++;
1055 	  *--iptr = local2global[i];
1056 	  used[i]=edge;
1057 	}
1058     }
1059   if (ct>1) {ivec_sort(iptr,ct);}
1060   lnsep[edge]=ct;
1061   nsep [edge]=ct;
1062   dir  [edge]=BOTH;
1063 
1064   xyt_handle->info->nsep=nsep;
1065   xyt_handle->info->lnsep=lnsep;
1066   xyt_handle->info->fo=fo;
1067   xyt_handle->info->nfo=nfo;
1068 
1069   free(dir);
1070   free(lhs);
1071   free(rhs);
1072   free(used);
1073 
1074 }
1075 
1076 
1077 /*************************************xyt.c************************************
1078 Function: set_mvi
1079 
1080 Input :
1081 Output:
1082 Return:
1083 Description:
1084 **************************************xyt.c***********************************/
1085 static
1086 mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data)
1087 {
1088   mv_info *mvi;
1089 
1090 
1091   mvi = (mv_info*)malloc(sizeof(mv_info));
1092   mvi->n=n;
1093   mvi->m=m;
1094   mvi->n_global=-1;
1095   mvi->m_global=-1;
1096   mvi->local2global=(int*)malloc((m+1)*sizeof(PetscInt));
1097   ivec_copy(mvi->local2global,local2global,m);
1098   mvi->local2global[m] = INT_MAX;
1099   mvi->matvec=(PetscErrorCode (*)(mv_info*,PetscScalar*,PetscScalar*))matvec;
1100   mvi->grid_data=grid_data;
1101 
1102   /* set xyt communication handle to perform restricted matvec */
1103   mvi->gs_handle = gs_init(local2global, m, num_nodes);
1104 
1105   return(mvi);
1106 }
1107 
1108 
1109 /*************************************xyt.c************************************
1110 Function: set_mvi
1111 
1112 Input :
1113 Output:
1114 Return:
1115 Description:
1116 
1117       computes u = A.v
1118       do_matvec(xyt_handle->mvi,v,u);
1119 **************************************xyt.c***********************************/
1120 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u)
1121 {
1122   A->matvec((mv_info*)A->grid_data,v,u);
1123 }
1124 
1125 
1126 
1127