xref: /petsc/src/ksp/pc/impls/tfs/xyt.c (revision 73e7a55880f7211d317aabb87c8bd5e70ea1847f)
1 
2 /*************************************xyt.c************************************
3 Module Name: xyt
4 Module Info:
5 
6 author:  Henry M. Tufo III
7 e-mail:  hmt@asci.uchicago.edu
8 contact:
9 +--------------------------------+--------------------------------+
10 |MCS Division - Building 221     |Department of Computer Science  |
11 |Argonne National Laboratory     |Ryerson 152                     |
12 |9700 S. Cass Avenue             |The University of Chicago       |
13 |Argonne, IL  60439              |Chicago, IL  60637              |
14 |(630) 252-5354/5986 ph/fx       |(773) 702-6019/8487 ph/fx       |
15 +--------------------------------+--------------------------------+
16 
17 Last Modification: 3.20.01
18 **************************************xyt.c***********************************/
19 
20 
21 /*************************************xyt.c************************************
22 NOTES ON USAGE:
23 
24 **************************************xyt.c***********************************/
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <limits.h>
28 #include <float.h>
29 #include <math.h>
30 
31 #include "petsc.h"
32 
33 #include "const.h"
34 #include "types.h"
35 #include "comm.h"
36 #include "error.h"
37 #include "ivec.h"
38 #include "queue.h"
39 #include "gs.h"
40 #include "xyt.h"
41 
42 #define LEFT  -1
43 #define RIGHT  1
44 #define BOTH   0
45 #define MAX_FORTRAN_HANDLES  10
46 
47 typedef struct xyt_solver_info {
48   int n, m, n_global, m_global;
49   int nnz, max_nnz, msg_buf_sz;
50   int *nsep, *lnsep, *fo, nfo, *stages;
51   int *xcol_sz, *xcol_indices;
52   PetscScalar **xcol_vals, *x, *solve_uu, *solve_w;
53   int *ycol_sz, *ycol_indices;
54   PetscScalar **ycol_vals, *y;
55   int nsolves;
56   PetscScalar tot_solve_time;
57 } xyt_info;
58 
59 
60 typedef struct matvec_info {
61   int n, m, n_global, m_global;
62   int *local2global;
63   gs_ADT gs_handle;
64   PetscErrorCode (*matvec)(struct matvec_info*,PetscScalar*,PetscScalar*);
65   void *grid_data;
66 } mv_info;
67 
68 struct xyt_CDT{
69   int id;
70   int ns;
71   int level;
72   xyt_info *info;
73   mv_info  *mvi;
74 };
75 
76 static int n_xyt=0;
77 static int n_xyt_handles=0;
78 
79 /* prototypes */
80 static void do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *rhs);
81 static void check_init(void);
82 static void check_handle(xyt_ADT xyt_handle);
83 static void det_separators(xyt_ADT xyt_handle);
84 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u);
85 static int xyt_generate(xyt_ADT xyt_handle);
86 static int do_xyt_factor(xyt_ADT xyt_handle);
87 static mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data);
88 
89 
90 /*************************************xyt.c************************************
91 Function: XYT_new()
92 
93 Input :
94 Output:
95 Return:
96 Description:
97 **************************************xyt.c***********************************/
98 xyt_ADT
99 XYT_new(void)
100 {
101   xyt_ADT xyt_handle;
102 
103 
104 
105   /* rolling count on n_xyt ... pot. problem here */
106   n_xyt_handles++;
107   xyt_handle       = (xyt_ADT)malloc(sizeof(struct xyt_CDT));
108   xyt_handle->id   = ++n_xyt;
109   xyt_handle->info = NULL;
110   xyt_handle->mvi  = NULL;
111 
112   return(xyt_handle);
113 }
114 
115 
116 /*************************************xyt.c************************************
117 Function: XYT_factor()
118 
119 Input :
120 Output:
121 Return:
122 Description:
123 **************************************xyt.c***********************************/
124 int
125 XYT_factor(xyt_ADT xyt_handle, /* prev. allocated xyt  handle */
126 	   int *local2global,  /* global column mapping       */
127 	   int n,              /* local num rows              */
128 	   int m,              /* local num cols              */
129 	   void *matvec,       /* b_loc=A_local.x_loc         */
130 	   void *grid_data     /* grid data for matvec        */
131 	   )
132 {
133 
134   check_init();
135   check_handle(xyt_handle);
136 
137   /* only 2^k for now and all nodes participating */
138   if ((1<<(xyt_handle->level=i_log2_num_nodes))!=num_nodes)
139     {error_msg_fatal("only 2^k for now and MPI_COMM_WORLD!!! %d != %d\n",1<<i_log2_num_nodes,num_nodes);}
140 
141   /* space for X info */
142   xyt_handle->info = (xyt_info*)malloc(sizeof(xyt_info));
143 
144   /* set up matvec handles */
145   xyt_handle->mvi  = set_mvi(local2global, n, m, matvec, grid_data);
146 
147   /* matrix is assumed to be of full rank */
148   /* LATER we can reset to indicate rank def. */
149   xyt_handle->ns=0;
150 
151   /* determine separators and generate firing order - NB xyt info set here */
152   det_separators(xyt_handle);
153 
154   return(do_xyt_factor(xyt_handle));
155 }
156 
157 
158 /*************************************xyt.c************************************
159 Function: XYT_solve
160 
161 Input :
162 Output:
163 Return:
164 Description:
165 **************************************xyt.c***********************************/
166 int
167 XYT_solve(xyt_ADT xyt_handle, double *x, double *b)
168 {
169   check_init();
170   check_handle(xyt_handle);
171 
172   /* need to copy b into x? */
173   if (b)
174     {rvec_copy(x,b,xyt_handle->mvi->n);}
175   do_xyt_solve(xyt_handle,x);
176 
177   return(0);
178 }
179 
180 
181 /*************************************xyt.c************************************
182 Function: XYT_free()
183 
184 Input :
185 Output:
186 Return:
187 Description:
188 **************************************xyt.c***********************************/
189 int
190 XYT_free(xyt_ADT xyt_handle)
191 {
192   check_init();
193   check_handle(xyt_handle);
194   n_xyt_handles--;
195 
196   free(xyt_handle->info->nsep);
197   free(xyt_handle->info->lnsep);
198   free(xyt_handle->info->fo);
199   free(xyt_handle->info->stages);
200   free(xyt_handle->info->solve_uu);
201   free(xyt_handle->info->solve_w);
202   free(xyt_handle->info->x);
203   free(xyt_handle->info->xcol_vals);
204   free(xyt_handle->info->xcol_sz);
205   free(xyt_handle->info->xcol_indices);
206   free(xyt_handle->info->y);
207   free(xyt_handle->info->ycol_vals);
208   free(xyt_handle->info->ycol_sz);
209   free(xyt_handle->info->ycol_indices);
210   free(xyt_handle->info);
211   free(xyt_handle->mvi->local2global);
212    gs_free(xyt_handle->mvi->gs_handle);
213   free(xyt_handle->mvi);
214   free(xyt_handle);
215 
216 
217   /* if the check fails we nuke */
218   /* if NULL pointer passed to free we nuke */
219   /* if the calls to free fail that's not my problem */
220   return(0);
221 }
222 
223 
224 
225 /*************************************xyt.c************************************
226 Function:
227 
228 Input :
229 Output:
230 Return:
231 Description:
232 **************************************xyt.c***********************************/
233 int
234 XYT_stats(xyt_ADT xyt_handle)
235 {
236   int  op[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD};
237   int fop[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD};
238   int   vals[9],  work[9];
239   PetscScalar fvals[3], fwork[3];
240 
241 
242   check_init();
243   check_handle(xyt_handle);
244 
245   /* if factorization not done there are no stats */
246   if (!xyt_handle->info||!xyt_handle->mvi)
247     {
248       if (!my_id)
249 	{printf("XYT_stats() :: no stats available!\n");}
250       return 1;
251     }
252 
253   vals[0]=vals[1]=vals[2]=xyt_handle->info->nnz;
254   vals[3]=vals[4]=vals[5]=xyt_handle->mvi->n;
255   vals[6]=vals[7]=vals[8]=xyt_handle->info->msg_buf_sz;
256   giop(vals,work,sizeof(op)/sizeof(op[0])-1,op);
257 
258   fvals[0]=fvals[1]=fvals[2]
259     =xyt_handle->info->tot_solve_time/xyt_handle->info->nsolves++;
260   grop(fvals,fwork,sizeof(fop)/sizeof(fop[0])-1,fop);
261 
262   if (!my_id)
263     {
264       printf("%d :: min   xyt_nnz=%d\n",my_id,vals[0]);
265       printf("%d :: max   xyt_nnz=%d\n",my_id,vals[1]);
266       printf("%d :: avg   xyt_nnz=%g\n",my_id,1.0*vals[2]/num_nodes);
267       printf("%d :: tot   xyt_nnz=%d\n",my_id,vals[2]);
268       printf("%d :: xyt   C(2d)  =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.5)));
269       printf("%d :: xyt   C(3d)  =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.6667)));
270       printf("%d :: min   xyt_n  =%d\n",my_id,vals[3]);
271       printf("%d :: max   xyt_n  =%d\n",my_id,vals[4]);
272       printf("%d :: avg   xyt_n  =%g\n",my_id,1.0*vals[5]/num_nodes);
273       printf("%d :: tot   xyt_n  =%d\n",my_id,vals[5]);
274       printf("%d :: min   xyt_buf=%d\n",my_id,vals[6]);
275       printf("%d :: max   xyt_buf=%d\n",my_id,vals[7]);
276       printf("%d :: avg   xyt_buf=%g\n",my_id,1.0*vals[8]/num_nodes);
277       printf("%d :: min   xyt_slv=%g\n",my_id,fvals[0]);
278       printf("%d :: max   xyt_slv=%g\n",my_id,fvals[1]);
279       printf("%d :: avg   xyt_slv=%g\n",my_id,fvals[2]/num_nodes);
280     }
281 
282   return(0);
283 }
284 
285 
286 /*************************************xyt.c************************************
287 Function: do_xyt_factor
288 
289 Input :
290 Output:
291 Return:
292 Description: get A_local, local portion of global coarse matrix which
293 is a row dist. nxm matrix w/ n<m.
294    o my_ml holds address of ML struct associated w/A_local and coarse grid
295    o local2global holds global number of column i (i=0,...,m-1)
296    o local2global holds global number of row    i (i=0,...,n-1)
297    o mylocmatvec performs A_local . vec_local (note that gs is performed using
298    gs_init/gop).
299 
300 mylocmatvec = my_ml->Amat[grid_tag].matvec->external;
301 mylocmatvec (void :: void *data, double *in, double *out)
302 **************************************xyt.c***********************************/
303 static
304 int
305 do_xyt_factor(xyt_ADT xyt_handle)
306 {
307   int flag;
308 
309 
310   flag=xyt_generate(xyt_handle);
311   return(flag);
312 }
313 
314 
315 /*************************************xyt.c************************************
316 Function:
317 
318 Input :
319 Output:
320 Return:
321 Description:
322 **************************************xyt.c***********************************/
323 static
324 int
325 xyt_generate(xyt_ADT xyt_handle)
326 {
327   int i,j,k,idx;
328   int dim, col;
329   PetscScalar *u, *uu, *v, *z, *w, alpha, alpha_w;
330   int *segs;
331   int op[] = {GL_ADD,0};
332   int off, len;
333   PetscScalar *x_ptr, *y_ptr;
334   int *iptr, flag;
335   int start=0, end, work;
336   int op2[] = {GL_MIN,0};
337   gs_ADT gs_handle;
338   int *nsep, *lnsep, *fo;
339   int a_n=xyt_handle->mvi->n;
340   int a_m=xyt_handle->mvi->m;
341   int *a_local2global=xyt_handle->mvi->local2global;
342   int level;
343   int n, m;
344   int *xcol_sz, *xcol_indices, *stages;
345   PetscScalar **xcol_vals, *x;
346   int *ycol_sz, *ycol_indices;
347   PetscScalar **ycol_vals, *y;
348   int n_global;
349   int xt_nnz=0, xt_max_nnz=0;
350   int yt_nnz=0, yt_max_nnz=0;
351   int xt_zero_nnz  =0;
352   int xt_zero_nnz_0=0;
353   int yt_zero_nnz  =0;
354   int yt_zero_nnz_0=0;
355   PetscBLASInt i1 = 1;
356   PetscScalar dm1 = -1.0;
357 
358   n=xyt_handle->mvi->n;
359   nsep=xyt_handle->info->nsep;
360   lnsep=xyt_handle->info->lnsep;
361   fo=xyt_handle->info->fo;
362   end=lnsep[0];
363   level=xyt_handle->level;
364   gs_handle=xyt_handle->mvi->gs_handle;
365 
366   /* is there a null space? */
367   /* LATER add in ability to detect null space by checking alpha */
368   for (i=0, j=0; i<=level; i++)
369     {j+=nsep[i];}
370 
371   m = j-xyt_handle->ns;
372   if (m!=j)
373     {printf("xyt_generate() :: null space exists %d %d %d\n",m,j,xyt_handle->ns);}
374 
375   error_msg_warning("xyt_generate() :: X(%d,%d)\n",n,m);
376 
377   /* get and initialize storage for x local         */
378   /* note that x local is nxm and stored by columns */
379   xcol_sz = (int*) malloc(m*sizeof(PetscInt));
380   xcol_indices = (int*) malloc((2*m+1)*sizeof(int));
381   xcol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *));
382   for (i=j=0; i<m; i++, j+=2)
383     {
384       xcol_indices[j]=xcol_indices[j+1]=xcol_sz[i]=-1;
385       xcol_vals[i] = NULL;
386     }
387   xcol_indices[j]=-1;
388 
389   /* get and initialize storage for y local         */
390   /* note that y local is nxm and stored by columns */
391   ycol_sz = (int*) malloc(m*sizeof(PetscInt));
392   ycol_indices = (int*) malloc((2*m+1)*sizeof(int));
393   ycol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *));
394   for (i=j=0; i<m; i++, j+=2)
395     {
396       ycol_indices[j]=ycol_indices[j+1]=ycol_sz[i]=-1;
397       ycol_vals[i] = NULL;
398     }
399   ycol_indices[j]=-1;
400 
401   /* size of separators for each sub-hc working from bottom of tree to top */
402   /* this looks like nsep[]=segments */
403   stages = (int*) malloc((level+1)*sizeof(PetscInt));
404   segs   = (int*) malloc((level+1)*sizeof(PetscInt));
405   ivec_zero(stages,level+1);
406   ivec_copy(segs,nsep,level+1);
407   for (i=0; i<level; i++)
408     {segs[i+1] += segs[i];}
409   stages[0] = segs[0];
410 
411   /* temporary vectors  */
412   u  = (PetscScalar *) malloc(n*sizeof(PetscScalar));
413   z  = (PetscScalar *) malloc(n*sizeof(PetscScalar));
414   v  = (PetscScalar *) malloc(a_m*sizeof(PetscScalar));
415   uu = (PetscScalar *) malloc(m*sizeof(PetscScalar));
416   w  = (PetscScalar *) malloc(m*sizeof(PetscScalar));
417 
418   /* extra nnz due to replication of vertices across separators */
419   for (i=1, j=0; i<=level; i++)
420     {j+=nsep[i];}
421 
422   /* storage for sparse x values */
423   n_global = xyt_handle->info->n_global;
424   xt_max_nnz = yt_max_nnz = (int)(2.5*pow(1.0*n_global,1.6667) + j*n/2)/num_nodes;
425   x = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar));
426   y = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar));
427 
428   /* LATER - can embed next sep to fire in gs */
429   /* time to make the donuts - generate X factor */
430   for (dim=i=j=0;i<m;i++)
431     {
432       /* time to move to the next level? */
433       while (i==segs[dim])
434 	{
435 #ifdef SAFE
436 	  if (dim==level)
437 	    {error_msg_fatal("dim about to exceed level\n"); break;}
438 #endif
439 
440 	  stages[dim++]=i;
441 	  end+=lnsep[dim];
442 	}
443       stages[dim]=i;
444 
445       /* which column are we firing? */
446       /* i.e. set v_l */
447       /* use new seps and do global min across hc to determine which one to fire */
448       (start<end) ? (col=fo[start]) : (col=INT_MAX);
449       giop_hc(&col,&work,1,op2,dim);
450 
451       /* shouldn't need this */
452       if (col==INT_MAX)
453 	{
454 	  error_msg_warning("hey ... col==INT_MAX??\n");
455 	  continue;
456 	}
457 
458       /* do I own it? I should */
459       rvec_zero(v ,a_m);
460       if (col==fo[start])
461 	{
462 	  start++;
463 	  idx=ivec_linear_search(col, a_local2global, a_n);
464 	  if (idx!=-1)
465 	    {v[idx] = 1.0; j++;}
466 	  else
467 	    {error_msg_fatal("NOT FOUND!\n");}
468 	}
469       else
470 	{
471 	  idx=ivec_linear_search(col, a_local2global, a_m);
472 	  if (idx!=-1)
473 	    {v[idx] = 1.0;}
474 	}
475 
476       /* perform u = A.v_l */
477       rvec_zero(u,n);
478       do_matvec(xyt_handle->mvi,v,u);
479 
480       /* uu =  X^T.u_l (local portion) */
481       /* technically only need to zero out first i entries */
482       /* later turn this into an XYT_solve call ? */
483       rvec_zero(uu,m);
484       y_ptr=y;
485       iptr = ycol_indices;
486       for (k=0; k<i; k++)
487 	{
488 	  off = *iptr++;
489 	  len = *iptr++;
490 
491 	  uu[k] = BLdot_(&len,u+off,&i1,y_ptr,&i1);
492 	  y_ptr+=len;
493 	}
494 
495       /* uu = X^T.u_l (comm portion) */
496       ssgl_radd  (uu, w, dim, stages);
497 
498       /* z = X.uu */
499       rvec_zero(z,n);
500       x_ptr=x;
501       iptr = xcol_indices;
502       for (k=0; k<i; k++)
503 	{
504 	  off = *iptr++;
505 	  len = *iptr++;
506 
507 	  BLaxpy_(&len,&uu[k],x_ptr,&i1,z+off,&i1);
508 	  x_ptr+=len;
509 	}
510 
511       /* compute v_l = v_l - z */
512       rvec_zero(v+a_n,a_m-a_n);
513       BLaxpy_(&n,&dm1,z,&i1,v,&i1);
514 
515       /* compute u_l = A.v_l */
516       if (a_n!=a_m)
517 	{gs_gop_hc(gs_handle,v,"+\0",dim);}
518       rvec_zero(u,n);
519      do_matvec(xyt_handle->mvi,v,u);
520 
521       /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - local portion */
522       alpha = BLdot_(&n,u,&i1,u,&i1);
523       /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - comm portion */
524       grop_hc(&alpha, &alpha_w, 1, op, dim);
525 
526       alpha = (PetscScalar) sqrt((double)alpha);
527 
528       /* check for small alpha                             */
529       /* LATER use this to detect and determine null space */
530       if (fabs(alpha)<1.0e-14)
531 	{error_msg_fatal("bad alpha! %g\n",alpha);}
532 
533       /* compute v_l = v_l/sqrt(alpha) */
534       rvec_scale(v,1.0/alpha,n);
535       rvec_scale(u,1.0/alpha,n);
536 
537       /* add newly generated column, v_l, to X */
538       flag = 1;
539       off=len=0;
540       for (k=0; k<n; k++)
541 	{
542 	  if (v[k]!=0.0)
543 	    {
544 	      len=k;
545 	      if (flag)
546 		{off=k; flag=0;}
547 	    }
548 	}
549 
550       len -= (off-1);
551 
552       if (len>0)
553 	{
554 	  if ((xt_nnz+len)>xt_max_nnz)
555 	    {
556 	      error_msg_warning("increasing space for X by 2x!\n");
557 	      xt_max_nnz *= 2;
558 	      x_ptr = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar));
559 	      rvec_copy(x_ptr,x,xt_nnz);
560 	      free(x);
561 	      x = x_ptr;
562 	      x_ptr+=xt_nnz;
563 	    }
564 	  xt_nnz += len;
565 	  rvec_copy(x_ptr,v+off,len);
566 
567           /* keep track of number of zeros */
568 	  if (dim)
569 	    {
570 	      for (k=0; k<len; k++)
571 		{
572 		  if (x_ptr[k]==0.0)
573 		    {xt_zero_nnz++;}
574 		}
575 	    }
576 	  else
577 	    {
578 	      for (k=0; k<len; k++)
579 		{
580 		  if (x_ptr[k]==0.0)
581 		    {xt_zero_nnz_0++;}
582 		}
583 	    }
584 	  xcol_indices[2*i] = off;
585 	  xcol_sz[i] = xcol_indices[2*i+1] = len;
586 	  xcol_vals[i] = x_ptr;
587 	}
588       else
589 	{
590 	  xcol_indices[2*i] = 0;
591 	  xcol_sz[i] = xcol_indices[2*i+1] = 0;
592 	  xcol_vals[i] = x_ptr;
593 	}
594 
595 
596       /* add newly generated column, u_l, to Y */
597       flag = 1;
598       off=len=0;
599       for (k=0; k<n; k++)
600 	{
601 	  if (u[k]!=0.0)
602 	    {
603 	      len=k;
604 	      if (flag)
605 		{off=k; flag=0;}
606 	    }
607 	}
608 
609       len -= (off-1);
610 
611       if (len>0)
612 	{
613 	  if ((yt_nnz+len)>yt_max_nnz)
614 	    {
615 	      error_msg_warning("increasing space for Y by 2x!\n");
616 	      yt_max_nnz *= 2;
617 	      y_ptr = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar));
618 	      rvec_copy(y_ptr,y,yt_nnz);
619 	      free(y);
620 	      y = y_ptr;
621 	      y_ptr+=yt_nnz;
622 	    }
623 	  yt_nnz += len;
624 	  rvec_copy(y_ptr,u+off,len);
625 
626           /* keep track of number of zeros */
627 	  if (dim)
628 	    {
629 	      for (k=0; k<len; k++)
630 		{
631 		  if (y_ptr[k]==0.0)
632 		    {yt_zero_nnz++;}
633 		}
634 	    }
635 	  else
636 	    {
637 	      for (k=0; k<len; k++)
638 		{
639 		  if (y_ptr[k]==0.0)
640 		    {yt_zero_nnz_0++;}
641 		}
642 	    }
643 	  ycol_indices[2*i] = off;
644 	  ycol_sz[i] = ycol_indices[2*i+1] = len;
645 	  ycol_vals[i] = y_ptr;
646 	}
647       else
648 	{
649 	  ycol_indices[2*i] = 0;
650 	  ycol_sz[i] = ycol_indices[2*i+1] = 0;
651 	  ycol_vals[i] = y_ptr;
652 	}
653     }
654 
655   /* close off stages for execution phase */
656   while (dim!=level)
657     {
658       stages[dim++]=i;
659       error_msg_warning("disconnected!!! dim(%d)!=level(%d)\n",dim,level);
660     }
661   stages[dim]=i;
662 
663   xyt_handle->info->n=xyt_handle->mvi->n;
664   xyt_handle->info->m=m;
665   xyt_handle->info->nnz=xt_nnz + yt_nnz;
666   xyt_handle->info->max_nnz=xt_max_nnz + yt_max_nnz;
667   xyt_handle->info->msg_buf_sz=stages[level]-stages[0];
668   xyt_handle->info->solve_uu = (PetscScalar *) malloc(m*sizeof(PetscScalar));
669   xyt_handle->info->solve_w  = (PetscScalar *) malloc(m*sizeof(PetscScalar));
670   xyt_handle->info->x=x;
671   xyt_handle->info->xcol_vals=xcol_vals;
672   xyt_handle->info->xcol_sz=xcol_sz;
673   xyt_handle->info->xcol_indices=xcol_indices;
674   xyt_handle->info->stages=stages;
675   xyt_handle->info->y=y;
676   xyt_handle->info->ycol_vals=ycol_vals;
677   xyt_handle->info->ycol_sz=ycol_sz;
678   xyt_handle->info->ycol_indices=ycol_indices;
679 
680   free(segs);
681   free(u);
682   free(v);
683   free(uu);
684   free(z);
685   free(w);
686 
687   return(0);
688 }
689 
690 
691 /*************************************xyt.c************************************
692 Function:
693 
694 Input :
695 Output:
696 Return:
697 Description:
698 **************************************xyt.c***********************************/
699 static
700 void
701 do_xyt_solve(xyt_ADT xyt_handle,  PetscScalar *uc)
702 {
703   int off, len, *iptr;
704   int level       =xyt_handle->level;
705   int n           =xyt_handle->info->n;
706   int m           =xyt_handle->info->m;
707   int *stages     =xyt_handle->info->stages;
708   int *xcol_indices=xyt_handle->info->xcol_indices;
709   int *ycol_indices=xyt_handle->info->ycol_indices;
710    PetscScalar *x_ptr, *y_ptr, *uu_ptr;
711   PetscScalar *solve_uu=xyt_handle->info->solve_uu;
712   PetscScalar *solve_w =xyt_handle->info->solve_w;
713   PetscScalar *x       =xyt_handle->info->x;
714   PetscScalar *y       =xyt_handle->info->y;
715   PetscBLASInt i1 = 1;
716 
717 
718   uu_ptr=solve_uu;
719   rvec_zero(uu_ptr,m);
720 
721   /* x  = X.Y^T.b */
722   /* uu = Y^T.b */
723   for (y_ptr=y,iptr=ycol_indices; *iptr!=-1; y_ptr+=len)
724     {
725       off=*iptr++; len=*iptr++;
726       *uu_ptr++ = BLdot_(&len,uc+off,&i1,y_ptr,&i1);
727     }
728 
729   /* comunication of beta */
730   uu_ptr=solve_uu;
731   if (level) {ssgl_radd(uu_ptr, solve_w, level, stages);}
732 
733   rvec_zero(uc,n);
734 
735   /* x = X.uu */
736   for (x_ptr=x,iptr=xcol_indices; *iptr!=-1; x_ptr+=len)
737     {
738       off=*iptr++; len=*iptr++;
739       BLaxpy_(&len,uu_ptr++,x_ptr,&i1,uc+off,&i1);
740     }
741 
742 }
743 
744 
745 /*************************************Xyt.c************************************
746 Function: check_init
747 
748 Input :
749 Output:
750 Return:
751 Description:
752 **************************************xyt.c***********************************/
753 static
754 void
755 check_init(void)
756 {
757   comm_init();
758 
759 
760 }
761 
762 
763 /*************************************xyt.c************************************
764 Function: check_handle()
765 
766 Input :
767 Output:
768 Return:
769 Description:
770 **************************************xyt.c***********************************/
771 static
772 void
773 check_handle(xyt_ADT xyt_handle)
774 {
775 #ifdef SAFE
776   int vals[2], work[2], op[] = {NON_UNIFORM,GL_MIN,GL_MAX};
777 #endif
778 
779   if (xyt_handle==NULL)
780     {error_msg_fatal("check_handle() :: bad handle :: NULL %d\n",xyt_handle);}
781 
782 #ifdef SAFE
783   vals[0]=vals[1]=xyt_handle->id;
784   giop(vals,work,sizeof(op)/sizeof(op[0])-1,op);
785   if ((vals[0]!=vals[1])||(xyt_handle->id<=0))
786     {error_msg_fatal("check_handle() :: bad handle :: id mismatch min/max %d/%d %d\n",
787 		     vals[0],vals[1], xyt_handle->id);}
788 #endif
789 
790 }
791 
792 
793 /*************************************xyt.c************************************
794 Function: det_separators
795 
796 Input :
797 Output:
798 Return:
799 Description:
800   det_separators(xyt_handle, local2global, n, m, mylocmatvec, grid_data);
801 **************************************xyt.c***********************************/
802 static
803 void
804 det_separators(xyt_ADT xyt_handle)
805 {
806   int i, ct, id;
807   int mask, edge, *iptr;
808   int *dir, *used;
809   int sum[4], w[4];
810   PetscScalar rsum[4], rw[4];
811   int op[] = {GL_ADD,0};
812   PetscScalar *lhs, *rhs;
813   int *nsep, *lnsep, *fo, nfo=0;
814   gs_ADT gs_handle=xyt_handle->mvi->gs_handle;
815   int *local2global=xyt_handle->mvi->local2global;
816   int  n=xyt_handle->mvi->n;
817   int  m=xyt_handle->mvi->m;
818   int level=xyt_handle->level;
819   int shared=FALSE;
820 
821   dir  = (int*)malloc(sizeof(PetscInt)*(level+1));
822   nsep = (int*)malloc(sizeof(PetscInt)*(level+1));
823   lnsep= (int*)malloc(sizeof(PetscInt)*(level+1));
824   fo   = (int*)malloc(sizeof(PetscInt)*(n+1));
825   used = (int*)malloc(sizeof(PetscInt)*n);
826 
827   ivec_zero(dir  ,level+1);
828   ivec_zero(nsep ,level+1);
829   ivec_zero(lnsep,level+1);
830   ivec_set (fo   ,-1,n+1);
831   ivec_zero(used,n);
832 
833   lhs  = (double*)malloc(sizeof(PetscScalar)*m);
834   rhs  = (double*)malloc(sizeof(PetscScalar)*m);
835 
836   /* determine the # of unique dof */
837   rvec_zero(lhs,m);
838   rvec_set(lhs,1.0,n);
839   gs_gop_hc(gs_handle,lhs,"+\0",level);
840   error_msg_warning("done first gs_gop_hc\n");
841   rvec_zero(rsum,2);
842   for (ct=i=0;i<n;i++)
843     {
844       if (lhs[i]!=0.0)
845 	{rsum[0]+=1.0/lhs[i]; rsum[1]+=lhs[i];}
846 
847       if (lhs[i]!=1.0)
848 	{
849           shared=TRUE;
850         }
851     }
852 
853   grop_hc(rsum,rw,2,op,level);
854   rsum[0]+=0.1;
855   rsum[1]+=0.1;
856 
857   /*
858       if (!my_id)
859       {
860       printf("xyt n unique = %d (%g)\n",(int) rsum[0], rsum[0]);
861       printf("xyt n shared = %d (%g)\n",(int) rsum[1], rsum[1]);
862       }
863   */
864 
865   xyt_handle->info->n_global=xyt_handle->info->m_global=(int) rsum[0];
866   xyt_handle->mvi->n_global =xyt_handle->mvi->m_global =(int) rsum[0];
867 
868   /* determine separator sets top down */
869   if (shared)
870     {
871       /* solution is to do as in the symmetric shared case but then */
872       /* pick the sub-hc with the most free dofs and do a mat-vec   */
873       /* and pick up the responses on the other sub-hc from the     */
874       /* initial separator set obtained from the symm. shared case  */
875       error_msg_fatal("shared dof separator determination not ready ... see hmt!!!\n");
876       for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1)
877 	{
878 	  /* set rsh of hc, fire, and collect lhs responses */
879 	  (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m);
880 	  gs_gop_hc(gs_handle,lhs,"+\0",edge);
881 
882 	  /* set lsh of hc, fire, and collect rhs responses */
883 	  (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m);
884 	  gs_gop_hc(gs_handle,rhs,"+\0",edge);
885 
886 	  for (i=0;i<n;i++)
887 	    {
888 	      if (id< mask)
889 		{
890 		  if (lhs[i]!=0.0)
891 		    {lhs[i]=1.0;}
892 		}
893 	      if (id>=mask)
894 		{
895 		  if (rhs[i]!=0.0)
896 		    {rhs[i]=1.0;}
897 		}
898 	    }
899 
900 	  if (id< mask)
901 	    {gs_gop_hc(gs_handle,lhs,"+\0",edge-1);}
902 	  else
903 	    {gs_gop_hc(gs_handle,rhs,"+\0",edge-1);}
904 
905 	  /* count number of dofs I own that have signal and not in sep set */
906 	  rvec_zero(rsum,4);
907 	  for (ivec_zero(sum,4),ct=i=0;i<n;i++)
908 	    {
909 	      if (!used[i])
910 		{
911 		  /* number of unmarked dofs on node */
912 		  ct++;
913 		  /* number of dofs to be marked on lhs hc */
914 		  if (id< mask)
915 		    {
916 		      if (lhs[i]!=0.0)
917 			{sum[0]++; rsum[0]+=1.0/lhs[i];}
918 		    }
919 		  /* number of dofs to be marked on rhs hc */
920 		  if (id>=mask)
921 		    {
922 		      if (rhs[i]!=0.0)
923 			{sum[1]++; rsum[1]+=1.0/rhs[i];}
924 		    }
925 		}
926 	    }
927 
928 	  /* go for load balance - choose half with most unmarked dofs, bias LHS */
929 	  (id<mask) ? (sum[2]=ct) : (sum[3]=ct);
930 	  (id<mask) ? (rsum[2]=ct) : (rsum[3]=ct);
931 	  giop_hc(sum,w,4,op,edge);
932 	  grop_hc(rsum,rw,4,op,edge);
933 	  rsum[0]+=0.1; rsum[1]+=0.1; rsum[2]+=0.1; rsum[3]+=0.1;
934 
935 	  if (id<mask)
936 	    {
937 	      /* mark dofs I own that have signal and not in sep set */
938 	      for (ct=i=0;i<n;i++)
939 		{
940 		  if ((!used[i])&&(lhs[i]!=0.0))
941 		    {
942 		      ct++; nfo++;
943 
944 		      if (nfo>n)
945 			{error_msg_fatal("nfo about to exceed n\n");}
946 
947 		      *--iptr = local2global[i];
948 		      used[i]=edge;
949 		    }
950 		}
951 	      if (ct>1) {ivec_sort(iptr,ct);}
952 
953 	      lnsep[edge]=ct;
954 	      nsep[edge]=(int) rsum[0];
955 	      dir [edge]=LEFT;
956 	    }
957 
958 	  if (id>=mask)
959 	    {
960 	      /* mark dofs I own that have signal and not in sep set */
961 	      for (ct=i=0;i<n;i++)
962 		{
963 		  if ((!used[i])&&(rhs[i]!=0.0))
964 		    {
965 		      ct++; nfo++;
966 
967 		      if (nfo>n)
968 			{error_msg_fatal("nfo about to exceed n\n");}
969 
970 		      *--iptr = local2global[i];
971 		      used[i]=edge;
972 		    }
973 		}
974 	      if (ct>1) {ivec_sort(iptr,ct);}
975 
976 	      lnsep[edge]=ct;
977 	      nsep[edge]= (int) rsum[1];
978 	      dir [edge]=RIGHT;
979 	    }
980 
981 	  /* LATER or we can recur on these to order seps at this level */
982 	  /* do we need full set of separators for this?                */
983 
984 	  /* fold rhs hc into lower */
985 	  if (id>=mask)
986 	    {id-=mask;}
987 	}
988     }
989   else
990     {
991       for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1)
992 	{
993 	  /* set rsh of hc, fire, and collect lhs responses */
994 	  (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m);
995 	  gs_gop_hc(gs_handle,lhs,"+\0",edge);
996 
997 	  /* set lsh of hc, fire, and collect rhs responses */
998 	  (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m);
999 	  gs_gop_hc(gs_handle,rhs,"+\0",edge);
1000 
1001 	  /* count number of dofs I own that have signal and not in sep set */
1002 	  for (ivec_zero(sum,4),ct=i=0;i<n;i++)
1003 	    {
1004 	      if (!used[i])
1005 		{
1006 		  /* number of unmarked dofs on node */
1007 		  ct++;
1008 		  /* number of dofs to be marked on lhs hc */
1009 		  if ((id< mask)&&(lhs[i]!=0.0)) {sum[0]++;}
1010 		  /* number of dofs to be marked on rhs hc */
1011 		  if ((id>=mask)&&(rhs[i]!=0.0)) {sum[1]++;}
1012 		}
1013 	    }
1014 
1015 	  /* for the non-symmetric case we need separators of width 2 */
1016 	  /* so take both sides */
1017 	  (id<mask) ? (sum[2]=ct) : (sum[3]=ct);
1018 	  giop_hc(sum,w,4,op,edge);
1019 
1020 	  ct=0;
1021 	  if (id<mask)
1022 	    {
1023 	      /* mark dofs I own that have signal and not in sep set */
1024 	      for (i=0;i<n;i++)
1025 		{
1026 		  if ((!used[i])&&(lhs[i]!=0.0))
1027 		    {
1028 		      ct++; nfo++;
1029 		      *--iptr = local2global[i];
1030 		      used[i]=edge;
1031 		    }
1032 		}
1033 	      /* LSH hc summation of ct should be sum[0] */
1034 	    }
1035 	  else
1036 	    {
1037 	      /* mark dofs I own that have signal and not in sep set */
1038 	      for (i=0;i<n;i++)
1039 		{
1040 		  if ((!used[i])&&(rhs[i]!=0.0))
1041 		    {
1042 		      ct++; nfo++;
1043 		      *--iptr = local2global[i];
1044 		      used[i]=edge;
1045 		    }
1046 		}
1047 	      /* RSH hc summation of ct should be sum[1] */
1048 	    }
1049 
1050 	  if (ct>1) {ivec_sort(iptr,ct);}
1051 	  lnsep[edge]=ct;
1052 	  nsep[edge]=sum[0]+sum[1];
1053 	  dir [edge]=BOTH;
1054 
1055 	  /* LATER or we can recur on these to order seps at this level */
1056 	  /* do we need full set of separators for this?                */
1057 
1058 	  /* fold rhs hc into lower */
1059 	  if (id>=mask)
1060 	    {id-=mask;}
1061 	}
1062     }
1063 
1064   /* level 0 is on processor case - so mark the remainder */
1065   for (ct=i=0;i<n;i++)
1066     {
1067       if (!used[i])
1068 	{
1069 	  ct++; nfo++;
1070 	  *--iptr = local2global[i];
1071 	  used[i]=edge;
1072 	}
1073     }
1074   if (ct>1) {ivec_sort(iptr,ct);}
1075   lnsep[edge]=ct;
1076   nsep [edge]=ct;
1077   dir  [edge]=BOTH;
1078 
1079   xyt_handle->info->nsep=nsep;
1080   xyt_handle->info->lnsep=lnsep;
1081   xyt_handle->info->fo=fo;
1082   xyt_handle->info->nfo=nfo;
1083 
1084   free(dir);
1085   free(lhs);
1086   free(rhs);
1087   free(used);
1088 
1089 }
1090 
1091 
1092 /*************************************xyt.c************************************
1093 Function: set_mvi
1094 
1095 Input :
1096 Output:
1097 Return:
1098 Description:
1099 **************************************xyt.c***********************************/
1100 static
1101 mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data)
1102 {
1103   mv_info *mvi;
1104 
1105 
1106   mvi = (mv_info*)malloc(sizeof(mv_info));
1107   mvi->n=n;
1108   mvi->m=m;
1109   mvi->n_global=-1;
1110   mvi->m_global=-1;
1111   mvi->local2global=(int*)malloc((m+1)*sizeof(PetscInt));
1112   ivec_copy(mvi->local2global,local2global,m);
1113   mvi->local2global[m] = INT_MAX;
1114   mvi->matvec=(PetscErrorCode (*)(mv_info*,PetscScalar*,PetscScalar*))matvec;
1115   mvi->grid_data=grid_data;
1116 
1117   /* set xyt communication handle to perform restricted matvec */
1118   mvi->gs_handle = gs_init(local2global, m, num_nodes);
1119 
1120   return(mvi);
1121 }
1122 
1123 
1124 /*************************************xyt.c************************************
1125 Function: set_mvi
1126 
1127 Input :
1128 Output:
1129 Return:
1130 Description:
1131 
1132       computes u = A.v
1133       do_matvec(xyt_handle->mvi,v,u);
1134 **************************************xyt.c***********************************/
1135 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u)
1136 {
1137   A->matvec((mv_info*)A->grid_data,v,u);
1138 }
1139 
1140 
1141 
1142