#ifndef lint
static char vcid[] = "$Id: mpiov.c,v 1.8 1996/01/30 23:57:32 balay Exp balay $";
#endif

#include "mpiaij.h"
int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *);

int MatIncreaseOverlap_MPIAIJ(Mat C, int is_max, IS *is, int ov)
{
  int i, ierr;
  if (ov < 0) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: negative overlap specified\n"); }
  for ( i=0; i<ov; ++i ) {
    ierr = MatIncreaseOverlap_MPIAIJ_private(C, is_max, is); CHKERRQ(ierr);
  }
  return 0;
}

int MatIncreaseOverlap_MPIAIJ_private(Mat C, int is_max, IS *is)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable, **table, **data;
  int         size, rank, m, i, j, k, ierr, **rbuf, row, proc, mct, msz, **outdat, **ptr;
  int         *ctr, sum, *pa, tag, *tmp, bsz, nmsg, *isz, *isz1, **xdata, *xtable, rstart;
  int         cstart, *ai, *aj, *bi, *bj, *garray, bsz1, **rbuf2, ashift, bshift;
  MPI_Comm    comm;
  MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2;
  MPI_Status  *send_status, *recv_status;
  int         space, fr, maxs;

  /* assume overlap = 1 */
  comm   = C->comm;
  tag    = C->tag;
  size   = c->size;
  rank   = c->rank;
  m      = c->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j + ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j + bshift;
  garray = c->garray;

  TrSpace(&space, &fr, &maxs);
  MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %d fragments = %d max ever allocated = %d\n", rank, space, fr, maxs);

  idx    = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(idx);
  n      = (int *)PetscMalloc((is_max+1)*sizeof(int)); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(rtable);
  /* Hash table for mapping row -> proc */

  for ( i=0; i<is_max; i++ ) {
    ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]); CHKERRQ(ierr);
  }

  /* Create the hash table for the mapping: row -> proc */
  for ( i=0, j=0; i<size; i++ ) {
    for (; j<c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }
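  /*
     A small worked example of the row -> processor table built above
     (the values are hypothetical, for illustration only): with 3
     processors and c->rowners = {0,4,9,12}, the loop yields
         rtable = { 0,0,0,0, 1,1,1,1,1, 2,2,2 }
     so rtable[row] gives the processor that owns global row 'row'.
  */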
  /* Evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them. */
  w1 = (int *)PetscMalloc((size)*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;   /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;   /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;   /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<is_max; i++ ) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector */
    for ( j=0; j<n[i]; j++ ) {
      row  = idx[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++ ) {
      if (w4[j]) { w1[j] += w4[j]; w3[j] += 1; }
    }
  }

  mct      = 0;   /* no of outgoing messages */
  msz      = 0;   /* total mesg length (for all procs) */
  w1[rank] = 0;   /* no mesg sent to itself */
  w3[rank] = 0;
  for ( i=0; i<size; i++ ) {
    if (w1[i]) { w2[i] = 1; mct++; } /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((mct+1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) */
  for ( i=0, j=0; i<size; i++ ) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for ( i=0; i<mct; i++ ) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Allocate memory for outgoing messages */
  outdat = (int **)PetscMalloc(2*size*sizeof(int*)); CHKPTRQ(outdat);
  PetscMemzero(outdat, 2*size*sizeof(int*));
  tmp    = (int *)PetscMalloc((msz+1)*sizeof(int)); CHKPTRQ(tmp); /* mesg arr */
  ptr    = outdat + size; /* pointers to the data in outgoing buffers */
  ctr    = (int *)PetscMalloc(size*sizeof(int)); CHKPTRQ(ctr);

  {
    int *iptr = tmp;
    int ict   = 0;
    for ( i=0; i<mct; i++ ) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Plug in the headers */
  for ( i=0; i<mct; i++ ) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1, 2*w3[j]*sizeof(int));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing the local proc's work */
  table    = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(table);
  data     = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(data);
  table[0] = (int *)PetscMalloc((m+1)*(is_max)*sizeof(int)); CHKPTRQ(table[0]);
  data[0]  = (int *)PetscMalloc((m+1)*(is_max)*sizeof(int)); CHKPTRQ(data[0]);

  for ( i=1; i<is_max; i++ ) {
    table[i] = table[0] + (m+1)*i;
    data[i]  = data[0] + (m+1)*i;
  }

  PetscMemzero((void*)*table,(m+1)*(is_max)*sizeof(int));
  isz = (int *)PetscMalloc((is_max+1)*sizeof(int)); CHKPTRQ(isz);
  PetscMemzero((void *)isz,(is_max+1)*sizeof(int));

  /* Parse the IS and update the local tables and the outgoing buf with the data */
  for ( i=0; i<is_max; i++ ) {
    PetscMemzero(ctr,size*sizeof(int));
    for ( j=0; j<n[i]; j++ ) {      /* parse the indices of each IS */
      row  = idx[i][j];
      proc = rtable[row];
      if (proc != rank) {           /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
      else {                        /* update the local table */
        if (!table[i][row]++) { data[i][isz[i]++] = row; }
      }
    }
    /* Update the headers for the current IS */
    for ( j=0; j<size; j++ ) {      /* can optimise this loop too */
      if (ctr[j]) {
        k                = ++outdat[j][0];
        outdat[j][2*k]   = ctr[j];
        outdat[j][2*k-1] = i;
      }
    }
  }
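  /*
     Sketch of the outgoing buffer layout assembled above (the particular
     IS ids and counts are hypothetical).  For a destination proc j that is
     sent rows from two index sets, say IS 0 (3 rows) and IS 2 (2 rows):

         outdat[j] = { 2,          number of index sets in this message
                       0, 3,       IS id 0, followed by its row count
                       2, 2,       IS id 2, followed by its row count
                       r0,r1,r2,   the 3 rows of IS 0 that proc j owns
                       r3,r4 }     the 2 rows of IS 2 that proc j owns

     so the total message length is 1 + 2*w3[j] + (number of rows) = w1[j].
  */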
  /* Check the validity of the outgoing messages */
  for ( i=0; i<mct; i++ ) {
    j = pa[i];
    if (w3[j] != outdat[j][0]) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[1] mismatch!\n"); }
  }

  for ( i=0; i<mct; i++ ) {
    j   = pa[i];
    sum = 1;
    for ( k=1; k<=w3[j]; k++ ) sum += outdat[j][2*k] + 2;
    if (sum != w1[j]) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[2-n] mismatch!\n"); }
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz = rw1[rank];
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nmsg = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers. Prob none if nmsg = 0 ???? */
  rbuf    = (int **) PetscMalloc((nmsg+1)*sizeof(int*)); CHKPTRQ(rbuf);
  rbuf[0] = (int *) PetscMalloc((nmsg*bsz+1)*sizeof(int)); CHKPTRQ(rbuf[0]);
  for ( i=1; i<nmsg; ++i ) rbuf[i] = rbuf[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nmsg; ++i ) {
    MPI_Irecv((void *)rbuf[i], bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Now post the sends */
  send_waits = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for ( i=0; i<mct; ++i ) {
    j = pa[i];
    MPI_Isend((void *)outdat[j], w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  /* Do the local work */
  /* Extract the matrices */
  {
    int start, end, val, max;

    for ( i=0; i<is_max; i++ ) {
      for ( j=0, max=isz[i]; j<max; j++ ) {
        row   = data[i][j] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for ( k=start; k<end; k++ ) {  /* Amat */
          val = aj[k] + ashift + cstart;
          if (!table[i][val]++) { data[i][isz[i]++] = val; }
        }
        start = bi[row];
        end   = bi[row+1];
        for ( k=start; k<end; k++ ) {  /* Bmat */
          val = garray[bj[k]+bshift];
          if (!table[i][val]++) { data[i][isz[i]++] = val; }
        }
      }
    }
  }

  /* Receive the messages */
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc((nmsg+1)*sizeof(MPI_Status));
    CHKPTRQ(recv_status);
    for ( i=0; i<nmsg; ++i ) {
      MPI_Waitany(nmsg, recv_waits, &index, recv_status+i);
    }

    send_status = (MPI_Status *) PetscMalloc((mct+1)*sizeof(MPI_Status));
    CHKPTRQ(send_status);
    MPI_Waitall(mct, send_waits, send_status);
  }
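  /*
     The block below mirrors the local work above, but on behalf of the
     senders: for each received message, every listed row is expanded with
     the column indices of its diagonal (A) and off-diagonal (B) parts,
     duplicates are filtered through xtable, and the result is packed into
     xdata[i] using the same { count, (IS id, length) pairs, data } layout
     so it can be shipped straight back to the originating processor.
  */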
  /* Do the work on the received messages */
  {
    int ct, ct1, ct2;
    int oct2, l, start, end, val, max1, max2;

    for ( i=0, ct=0; i<nmsg; ++i ) ct += rbuf[i][0];

    xdata    = (int **)PetscMalloc((nmsg+1)*sizeof(int *)); CHKPTRQ(xdata);
    xdata[0] = (int *)PetscMalloc((ct+nmsg+(m+1)*ct)*sizeof(int)); CHKPTRQ(xdata[0]);
    xtable   = (int *)PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(xtable);
    isz1     = (int *)PetscMalloc((nmsg+1)*sizeof(int)); CHKPTRQ(isz1);
    PetscMemzero((void *)isz1,(nmsg+1)*sizeof(int));

    for ( i=0; i<nmsg; i++ ) {         /* for each mesg from proc i */
      ct1 = 2*rbuf[i][0] + 1;
      ct2 = 2*rbuf[i][0] + 1;
      for ( j=1, max1=rbuf[i][0]; j<=max1; j++ ) {  /* for each IS from proc i */
        PetscMemzero((void *)xtable,(m+1)*sizeof(int));
        oct2 = ct2;
        for ( k=0; k<rbuf[i][2*j]; k++, ct1++ ) {
          row = rbuf[i][ct1];
          if (!xtable[row]++) { xdata[i][ct2++] = row; }
        }
        for ( k=oct2, max2=ct2; k<max2; k++ ) {
          row   = xdata[i][k] - rstart;
          start = ai[row];
          end   = ai[row+1];
          for ( l=start; l<end; l++ ) {
            val = aj[l] + ashift + cstart;
            if (!xtable[val]++) { xdata[i][ct2++] = val; }
          }
          start = bi[row];
          end   = bi[row+1];
          for ( l=start; l<end; l++ ) {
            val = garray[bj[l]+bshift];
            if (!xtable[val]++) { xdata[i][ct2++] = val; }
          }
        }
        /* Update the header */
        xdata[i][2*j]   = ct2 - oct2; /* undo the vector isz1 and use only a var */
        xdata[i][2*j-1] = rbuf[i][2*j-1];
      }
      xdata[i][0] = rbuf[i][0];
      xdata[i+1]  = xdata[i] + ct2;
      isz1[i]     = ct2;  /* size of each message */
    }
  }
  /* need isz, xdata; */

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for the incoming messages */
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero((void*)rw1,2*size*sizeof(int));
    rw2 = rw1 + size;
    for ( i=0; i<nmsg; ++i ) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers */

  /* Allocate memory for recv buffers. Prob none if mct = 0 ???? */
  rbuf2    = (int **) PetscMalloc((mct+1)*sizeof(int*)); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) PetscMalloc((mct*bsz1+1)*sizeof(int)); CHKPTRQ(rbuf2[0]);
  for ( i=1; i<mct; ++i ) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Now post the receives */
  recv_waits2 = (MPI_Request *)PetscMalloc((mct+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits2);
  for ( i=0; i<mct; ++i ) {
    MPI_Irecv((void *)rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i);
  }

  /* Now post the sends */
  send_waits2 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  for ( i=0; i<nmsg; ++i ) {
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend((void *)xdata[i], isz1[i], MPI_INT, j, tag, comm, send_waits2+i);
  }

  /* Receive the work done on the other processors */
  {
    int index, is_no, ct1, max;
    MPI_Status *send_status2, *recv_status2;

    recv_status2 = (MPI_Status *) PetscMalloc((mct+1)*sizeof(MPI_Status));
    CHKPTRQ(recv_status2);

    for ( i=0; i<mct; ++i ) {
      MPI_Waitany(mct, recv_waits2, &index, recv_status2+i);
      /* Process the message */
      ct1 = 2*rbuf2[index][0] + 1;
      for ( j=1; j<=rbuf2[index][0]; j++ ) {
        max   = rbuf2[index][2*j];
        is_no = rbuf2[index][2*j-1];
        for ( k=0; k<max; k++, ct1++ ) {
          row = rbuf2[index][ct1];
          if (!table[is_no][row]++) { data[is_no][isz[is_no]++] = row; }
        }
      }
    }

    send_status2 = (MPI_Status *) PetscMalloc((nmsg+1)*sizeof(MPI_Status));
    CHKPTRQ(send_status2);
    MPI_Waitall(nmsg, send_waits2, send_status2);

    PetscFree(send_status2); PetscFree(recv_status2);
  }

  for ( i=0; i<is_max; ++i ) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  for ( i=0; i<is_max; ++i ) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }
  for ( i=0; i<is_max; ++i ) {
    ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }
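  /*
     At this point each original index set has been replaced by a new
     sequential IS of length isz[i]: the initial rows plus every row reached
     through one level of overlap, whether found locally or returned by the
     other processors.
  */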
  TrSpace(&space, &fr, &maxs);
  MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %d fragments = %d max ever allocated = %d\n", rank, space, fr, maxs);

  /* whew! already done? check again :) */
  /* PetscFree(idx);
     PetscFree(n);
     PetscFree(rtable);
     PetscFree(w1);
     PetscFree(tmp);
     PetscFree(outdat);
     PetscFree(ctr);
     PetscFree(pa);
     PetscFree(rbuf[0]);
     PetscFree(rbuf);
     PetscFree(rbuf2[0]);
     PetscFree(rbuf2);
     PetscFree(send_waits);
     PetscFree(recv_waits);
     PetscFree(send_waits2);
     PetscFree(recv_waits2);
     PetscFree(table[0]);
     PetscFree(table);
     PetscFree(data[0]);
     PetscFree(data);
     PetscFree(send_status);
     PetscFree(recv_status);
     PetscFree(isz);
     PetscFree(isz1);
     PetscFree(xtable);
     PetscFree(xdata[0]);
     PetscFree(xdata); */

  /* Don't forget to ISRestoreIndices */
  return 0;
}