#ifndef lint
static char vcid[] = "$Id: mpiov.c,v 1.8 1996/01/30 23:57:32 balay Exp balay $";
#endif

#include "mpiaij.h"
int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *);

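/*
   MatIncreaseOverlap_MPIAIJ - increases the overlap of the index sets 'is'
   by 'ov' levels; each level of overlap is obtained by one call to the
   private routine below.
*/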
int MatIncreaseOverlap_MPIAIJ(Mat C, int is_max, IS *is, int ov)
{
  int i, ierr;
  if (ov < 0) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: negative overlap specified\n"); }
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_private(C, is_max, is); CHKERRQ(ierr);
  }
  return 0;
}

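/*
   MatIncreaseOverlap_MPIAIJ_private - increases the overlap of the index
   sets by exactly one level. Outline:
     1. Determine, for every other processor, how many rows (and from which
        index sets) have to be sent to it, since only the owner of a row
        knows that row's column indices.
     2. Pack and exchange these row requests with nonblocking sends/receives.
     3. While the messages are in flight, expand the locally owned rows of
        each index set using the diagonal (A) and off-diagonal (B) blocks.
     4. Expand the rows requested by other processors and send the results
        back.
     5. Merge the returned rows and recreate each IS with the enlarged set.
*/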
int MatIncreaseOverlap_MPIAIJ_private(Mat C, int is_max, IS *is)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  Mat          A = c->A, B = c->B;
  Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable, **table, **data;
  int         size, rank, m, i, j, k, ierr, **rbuf, row, proc, mct, msz, **outdat, **ptr;
  int         *ctr, sum, *pa, tag, *tmp, bsz, nmsg, *isz, *isz1, **xdata, *xtable, rstart;
  int         cstart, *ai, *aj, *bi, *bj, *garray, bsz1, **rbuf2, ashift, bshift;
  MPI_Comm    comm;
  MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2;
  MPI_Status  *send_status, *recv_status;
  int         space, fr, maxs;
                              /* assume overlap = 1 */
  comm   = C->comm;
  tag    = C->tag;
  size   = c->size;
  rank   = c->rank;
  m      = c->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j + ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j + bshift;
  garray = c->garray;

  TrSpace( &space, &fr, &maxs );
  MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %d fragments = %d max ever allocated = %d\n", rank, space, fr, maxs);

  idx    = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(idx);
  n      = (int *)PetscMalloc((is_max+1)*sizeof(int)); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(rtable);
                                /* lookup table mapping row -> owning proc */

  for ( i=0; i<is_max; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);  CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);  CHKERRQ(ierr);
  }

  /* Create the lookup table for the mapping: row -> owning proc */
  for ( i=0, j=0; i<size; i++) {
    for (; j<c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }

  /* Evaluate the communication - message to whom, length of each message, and the buffer
     space required. Based on this, buffers are allocated, and data copied into them */
  w1     = (int *)PetscMalloc((size)*4*sizeof(int)); CHKPTRQ(w1); /* message size to each proc */
  w2     = w1 + size;         /* if w2[i] is marked, then there is a message to proc i */
  w3     = w2 + size;         /* no. of index sets that need to be sent to proc i */
  w4     = w3 + size;         /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<is_max; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector */
    for ( j=0; j<n[i]; j++) {
      row  = idx[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j] += 1;}
    }
  }

  mct      = 0;              /* no. of outgoing messages */
  msz      = 0;              /* total message length (over all procs) */
  w1[rank] = 0;              /* no message is sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; mct++;} /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((mct+1)*sizeof(int)); CHKPTRQ(pa); /* proc array: list of destination procs */
  for (i=0, j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message has a header of 1 + 2*(no. of index sets) entries, followed by the data */
  for (i=0; i<mct; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

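  /* Layout of an outgoing message to proc j (length w1[j]):
       [0]               number of index sets in the message
       [2k-1], [2k]      for k=1..w3[j]: (index set number, row count)
       remaining entries the requested row numbers, grouped by index set */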
  /* Allocate memory for the outgoing messages */
  outdat    = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat);
  PetscMemzero(outdat, 2*size*sizeof(int*));
  tmp       = (int *)PetscMalloc((msz+1)*sizeof(int)); CHKPTRQ(tmp); /* message array */
  ptr       = outdat + size;     /* pointers to the data in the outgoing buffers */
  ctr       = (int *)PetscMalloc( size*sizeof(int));   CHKPTRQ(ctr);

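  /* Hand each destination proc a contiguous slice of tmp: outdat[j] points to
     the start of proc j's message, which is w1[j] integers long */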
  {
    int *iptr = tmp;
    int ict  = 0;
    for (i=0; i<mct; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for ( i=0; i<mct; i++) {
    j = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1, 2*w3[j]*sizeof(int));
    ptr[j] = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing the local proc's work */
  table = (int **)PetscMalloc((is_max+1)*sizeof(int *));  CHKPTRQ(table);
  data  = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(data);
  table[0] = (int *)PetscMalloc((m+1)*(is_max)*sizeof(int)); CHKPTRQ(table[0]);
  data[0]  = (int *)PetscMalloc((m+1)*(is_max)*sizeof(int)); CHKPTRQ(data[0]);

  for (i=1; i<is_max; i++) {
    table[i] = table[0] + (m+1)*i;
    data[i]  = data[0] + (m+1)*i;
  }

  PetscMemzero((void*)*table,(m+1)*(is_max)*sizeof(int));
  isz = (int *)PetscMalloc((is_max+1)*sizeof(int)); CHKPTRQ(isz);
  PetscMemzero((void *)isz,(is_max+1)*sizeof(int));

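  /* For each index set i:
       table[i][row] is nonzero once 'row' has been added to the set,
       data[i][]     lists the rows of the enlarged set, and
       isz[i]        is the current length of data[i] */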
  /* Parse the ISs and update the local tables and the outgoing buffers with the data */
  for ( i=0; i<is_max; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for ( j=0; j<n[i]; j++) {  /* parse the indices of each IS */
      row  = idx[i][j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buffer */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
      else { /* update the local table */
        if (!table[i][row]++) { data[i][isz[i]++] = row;}
      }
    }
    /* Update the headers for the current IS */
    for ( j=0; j<size; j++) { /* could optimise this loop too */
      if (ctr[j]) {
        k = ++outdat[j][0];
        outdat[j][2*k]   = ctr[j];
        outdat[j][2*k-1] = i;
      }
    }
  }

  /* Check validity of the outgoing messages */
  for ( i=0; i<mct; i++) {
    j = pa[i];
    if (w3[j] != outdat[j][0]) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[1] mismatch!\n"); }
  }

  for ( i=0; i<mct; i++) {
    j = pa[i];
    sum = 1;
    for (k=1; k<=w3[j]; k++) sum += outdat[j][2*k]+2;
    if (sum != w1[j]) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[2-n] mismatch!\n"); }
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz   = rw1[rank];
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nmsg  = rw2[rank];
    PetscFree(rw1);
  }

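  /* bsz  = length of the longest message destined to this proc,
     nmsg = number of messages this proc will receive; every receive buffer
     is given the maximal size bsz so the receives can be posted with
     MPI_ANY_SOURCE before the senders are known */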
  /* Allocate memory for the recv buffers (none may be needed if nmsg == 0) */
  rbuf    = (int**) PetscMalloc((nmsg+1)*sizeof(int*));  CHKPTRQ(rbuf);
  rbuf[0] = (int *) PetscMalloc((nmsg*bsz+1)*sizeof(int));  CHKPTRQ(rbuf[0]);
  for (i=1; i<nmsg; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nmsg; ++i) {
    MPI_Irecv((void *)rbuf[i], bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Now post the sends */
  send_waits = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for ( i=0; i<mct; ++i) {
    j = pa[i];
    MPI_Isend((void *)outdat[j], w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

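  /* While the messages are in transit, do the local part of the work: for every
     row already in an index set, add the (global) column indices of that row,
     taken from the diagonal block A and, via garray, from the off-diagonal block B */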
  /* Do the local work */
  /* Extract the matrices */
  {
    int  start, end, val, max;

    for ( i=0; i<is_max; i++) {
      for ( j=0, max=isz[i]; j<max; j++) {
        row   = data[i][j] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for ( k=start; k<end; k++) { /* Amat */
          val = aj[k] + ashift + cstart;
          if (!table[i][val]++) { data[i][isz[i]++] = val;}
        }
        start = bi[row];
        end   = bi[row+1];
        for ( k=start; k<end; k++) { /* Bmat */
          val = garray[bj[k]+bshift];
          if (!table[i][val]++) { data[i][isz[i]++] = val;}
        }
      }
    }
  }
  /* Receive the messages */
  {
    recv_status = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status);
    /* wait on all the requests at once so that recv_status[i] corresponds to
       recv_waits[i], and hence to the message held in rbuf[i]; the sources
       recorded here determine where the replies are sent below */
    MPI_Waitall(nmsg,recv_waits,recv_status);

    send_status = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(mct,send_waits,send_status);
  }

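  /* For each request received, build the reply in xdata[i] using the same
     layout as the requests: a count of index sets, (index set, row count)
     pairs, followed by the rows obtained by expanding the requested rows
     through the A and B blocks; isz1[i] records the total length of reply i */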
  /* Do the work on the received messages */
  {
    int ct, ct1, ct2;
    int oct2, l, start, end, val, max1, max2;

    for (i=0, ct=0; i<nmsg; ++i) ct += rbuf[i][0];

    xdata    = (int **)PetscMalloc((nmsg+1)*sizeof(int *)); CHKPTRQ(xdata);
    xdata[0] = (int *)PetscMalloc((ct+nmsg+(m+1)*ct)*sizeof(int)); CHKPTRQ(xdata[0]);
    xtable   = (int *)PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(xtable);
    isz1     = (int *)PetscMalloc((nmsg+1)*sizeof(int)); CHKPTRQ(isz1);
    PetscMemzero((void *)isz1,(nmsg+1)*sizeof(int));

    for (i=0; i<nmsg; i++) { /* for each message from proc i */
      ct1 = 2*rbuf[i][0]+1;
      ct2 = 2*rbuf[i][0]+1;
      for (j=1, max1=rbuf[i][0]; j<=max1; j++) { /* for each IS in the message */
        PetscMemzero((void *)xtable,(m+1)*sizeof(int));
        oct2 = ct2;
        for (k=0; k<rbuf[i][2*j]; k++, ct1++) {
          row = rbuf[i][ct1];
          if (!xtable[row]++) { xdata[i][ct2++] = row;}
        }
        for ( k=oct2, max2=ct2; k<max2; k++) {
          row   = xdata[i][k] - rstart;
          start = ai[row];
          end   = ai[row+1];
          for ( l=start; l<end; l++) {
            val = aj[l] + ashift + cstart;
            if (!xtable[val]++) { xdata[i][ct2++] = val;}
          }
          start = bi[row];
          end   = bi[row+1];
          for ( l=start; l<end; l++) {
            val = garray[bj[l]+bshift];
            if (!xtable[val]++) { xdata[i][ct2++] = val;}
          }
        }
        /* Update the header of the reply */
        xdata[i][2*j]   = ct2-oct2; /* the isz1 vector could be replaced by a scalar */
        xdata[i][2*j-1] = rbuf[i][2*j-1];
      }
      xdata[i][0] = rbuf[i][0];
      xdata[i+1]  = xdata[i] + ct2;
      isz1[i]     = ct2; /* size of the reply to message i */
    }

  }
  /* results of this block: isz1[] (reply sizes) and xdata[] (reply buffers) */

  /* Send the data back */
  /* Do a global reduction to find the buffer space required for the incoming replies */
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero((void*)rw1,2*size*sizeof(int));
    rw2 = rw1 + size;
    for (i=0; i<nmsg; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1   = rw2[rank];
    PetscFree(rw1);
  }

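  /* bsz1 = length of the longest reply destined to this proc; exactly mct
     replies will arrive, one from each proc that was sent a request */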
  /* Allocate buffers */

  /* Allocate memory for the recv buffers (none may be needed if mct == 0) */
  rbuf2    = (int**) PetscMalloc((mct+1)*sizeof(int*));  CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) PetscMalloc((mct*bsz1+1)*sizeof(int));  CHKPTRQ(rbuf2[0]);
  for (i=1; i<mct; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Now post the receives */
  recv_waits2 = (MPI_Request *)PetscMalloc((mct+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits2);
  for ( i=0; i<mct; ++i) {
    MPI_Irecv((void *)rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i);
  }

  /* Now post the sends */
  send_waits2 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  for ( i=0; i<nmsg; ++i) {
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend((void *)xdata[i], isz1[i], MPI_INT, j, tag, comm, send_waits2+i);
  }

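  /* As each reply arrives, use its (index set, row count) headers to merge the
     returned rows into the correct local table[]/data[] entries */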
  /* Receive the work done on the other processors */
  {
    int         index, is_no, ct1, max;
    MPI_Status  *send_status2, *recv_status2;

    recv_status2 = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status2);

    for ( i=0; i<mct; ++i) {
      MPI_Waitany(mct, recv_waits2, &index, recv_status2+i);
      /* Process the message */
      ct1 = 2*rbuf2[index][0]+1;
      for (j=1; j<=rbuf2[index][0]; j++) {
        max   = rbuf2[index][2*j];
        is_no = rbuf2[index][2*j-1];
        for (k=0; k<max; k++, ct1++) {
          row = rbuf2[index][ct1];
          if (!table[is_no][row]++) { data[is_no][isz[is_no]++] = row;}
        }
      }
    }

    send_status2 = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status2);
    MPI_Waitall(nmsg,send_waits2,send_status2);

    PetscFree(send_status2); PetscFree(recv_status2);
  }
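  /* Replace each input IS by a new (sequential) IS holding the enlarged
     row set accumulated in data[i] */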
  for ( i=0; i<is_max; ++i) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  for ( i=0; i<is_max; ++i) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }
  for ( i=0; i<is_max; ++i) {
    ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }

  TrSpace( &space, &fr, &maxs );
  MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %d fragments = %d max ever allocated = %d\n", rank, space, fr, maxs);

  /* whew! already done? check again :) */
  /* PetscFree(idx);
  PetscFree(n);
  PetscFree(rtable);
  PetscFree(w1);
  PetscFree(tmp);
  PetscFree(outdat);
  PetscFree(ctr);
  PetscFree(pa);
  PetscFree(rbuf[0]);
  PetscFree(rbuf);
  PetscFree(rbuf2[0]);
  PetscFree(rbuf2);
  PetscFree(send_waits);
  PetscFree(recv_waits);
  PetscFree(send_waits2);
  PetscFree(recv_waits2);
  PetscFree(table[0]);
  PetscFree(table);
  PetscFree(data[0]);
  PetscFree(data);
  PetscFree(send_status);
  PetscFree(recv_status);
  PetscFree(isz);
  PetscFree(isz1);
  PetscFree(xtable);
  PetscFree(xdata[0]);
  PetscFree(xdata); */

  /* Don't forget to ISRestoreIndices */
  return 0;
}