#ifndef lint
static char vcid[] = "$Id: mpiov.c,v 1.26.1.29 1996/04/26 00:04:17 balay Exp $";
#endif
/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include "mpiaij.h"
#include "src/inline/bitarray.h"

static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int *,int **);
static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int **,int *);
extern int MatGetRow_MPIAIJ(Mat,int,int*,int**,Scalar**);
extern int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,Scalar**);

int MatIncreaseOverlap_MPIAIJ(Mat C, int imax, IS *is, int ov)
{
  int i, ierr;

  if (ov < 0) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ:Negative overlap specified"); }
  for ( i=0; i<ov; ++i ) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C, imax, is); CHKERRQ(ierr);
  }
  return 0;
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1], is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  ...          data(is[1])
  mesg [n]
  -----------
  mesg [n+1]
  ...          data(is[5])
  mesg [m]
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or have been, processed)
*/
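/*
   Worked example of the format above (illustrative only): suppose processor A
   asks processor B to process rows {3,9} for is[1] and row {12} for is[5].
   The message then reads

      mesg[] = { 2,  1, 2,  5, 1,  3, 9,  12 }

   i.e. the count of index sets, one (index,size) header pair per set, then
   the row data for each set in order.  The total length is
   1 + 2*(no of IS) + (total no of rows), which is exactly the w1[]/w3[]
   bookkeeping done below.
*/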
static int MatIncreaseOverlap_MPIAIJ_Once(Mat C, int imax, IS *is)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable, **data, len, *idx_i;
  int         size, rank, m, i, j, k, ierr, **rbuf, row, proc, nrqs, msz, **outdat, **ptr;
  int         *ctr, *pa, tag, *tmp, bsz, nrqr, *isz, *isz1, **xdata, bsz1, **rbuf2;
  char        **table;
  MPI_Comm    comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *s_status, *recv_status;

  comm = C->comm;
  tag  = C->tag;
  size = c->size;
  rank = c->rank;
  m    = c->M;

  len    = (imax+1)*sizeof(int *) + (imax + m)*sizeof(int);
  idx    = (int **) PetscMalloc(len); CHKPTRQ(idx);
  n      = (int *) (idx + imax);
  rtable = n + imax;

  for ( i=0; i<imax; i++ ) {
    ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetSize(is[i],&n[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for ( i=0, j=0; i<size; i++ ) {
    len = c->rowners[i+1];
    for ( ; j<len; j++ ) {
      rtable[j] = i;
    }
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them. */
  w1 = (int *)PetscMalloc(size*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;  /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;  /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;  /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<imax; i++ ) {
    PetscMemzero(w4,size*sizeof(int)); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for ( j=0; j<len; j++ ) {
      row  = idx_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++ ) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for ( i=0; i<size; i++ ) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa);
  for ( i=0, j=0; i<size; i++ ) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for ( i=0; i<nrqs; i++ ) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *) PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz = rw1[rank];
    MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
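
  /* Note: the two reductions above give each processor, without any
     point-to-point traffic, (a) bsz = the largest message anyone will send
     it (MPI_MAX over the w1 columns), so a single receive-buffer size works
     for all incoming messages, and (b) nrqr = how many messages to expect
     (MPI_SUM over the 0/1 flags in w2).  For example (illustrative only),
     with 3 procs whose w2 rows are {0,1,1}, {1,0,0} and {0,1,0}, the
     elementwise sums are {1,2,1}, so proc 1 posts nrqr = 2 receives. */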

  /* Allocate memory for recv buffers (probably none needed if nrqr == 0?) */
  len     = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf    = (int**) PetscMalloc(len); CHKPTRQ(rbuf);
  rbuf[0] = (int *) (rbuf + nrqr);
  for ( i=1; i<nrqr; ++i ) rbuf[i] = rbuf[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits1);
  for ( i=0; i<nrqr; ++i ) {
    MPI_Irecv(rbuf[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);
  }

  /* Allocate memory for outgoing messages */
  len    = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
  outdat = (int **)PetscMalloc(len); CHKPTRQ(outdat);
  ptr    = outdat + size; /* pointers to the data in outgoing buffers */
  PetscMemzero(outdat,2*size*sizeof(int*));
  tmp    = (int *) (outdat + 2*size);
  ctr    = tmp + msz;

  {
    int *iptr = tmp, ict = 0;
    for ( i=0; i<nrqs; i++ ) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for ( i=0; i<nrqs; i++ ) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    int  *d_p;
    char *t_p;

    len      = (imax+1)*(sizeof(char *) + sizeof(int *) + sizeof(int)) +
               (m+1)*imax*sizeof(int) + (m/BITSPERBYTE+1)*imax*sizeof(char);
    table    = (char **)PetscMalloc(len); CHKPTRQ(table);
    data     = (int **)(table + imax);
    data[0]  = (int *)(data + imax);
    isz      = data[0] + (m+1)*imax;
    table[0] = (char *)(isz + imax);
    d_p      = data[0]; t_p = table[0];
    for ( i=1; i<imax; i++ ) {
      table[i] = t_p + (m/BITSPERBYTE+1)*i;
      data[i]  = d_p + (m+1)*i;
    }
  }
  PetscMemzero(*table,(m/BITSPERBYTE+1)*imax);
  PetscMemzero(isz,imax*sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    int  n_i, *data_i, isz_i, *outdat_j, ctr_j;
    char *table_i;

    for ( i=0; i<imax; i++ ) {
      PetscMemzero(ctr,size*sizeof(int));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for ( j=0; j<n_i; j++ ) { /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        }
        else { /* update the local table */
          if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row; }
        }
      }
      /* Update the headers for the current IS */
      for ( j=0; j<size; j++ ) { /* Can optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits1);
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    MPI_Isend(outdat[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i);
  }

  /* No longer need the original indices */
  for ( i=0; i<imax; ++i ) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  PetscFree(idx);

  for ( i=0; i<imax; ++i ) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }

  /* Do local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data); CHKERRQ(ierr);
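
  /* The char arrays in table[] act as bit arrays of m bits: BT_LOOKUP (from
     bitarray.h) tests and sets the bit for a row in one pass, so each row
     index enters data[i] at most once no matter how many adjacency paths
     reach it.  This is what keeps the enlarged index sets duplicate-free
     without any sorting. */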

  /* Receive messages */
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
    CHKPTRQ(recv_status);
    for ( i=0; i<nrqr; ++i ) {
      MPI_Waitany(nrqr, r_waits1, &index, recv_status+i);
    }

    s_status = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
    CHKPTRQ(s_status);
    MPI_Waitall(nrqs,s_waits1,s_status);
  }

  /* Phase 1 sends are complete - deallocate buffers */
  PetscFree(outdat);
  PetscFree(w1);

  xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata);
  isz1  = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1);
  ierr  = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1); CHKERRQ(ierr);
  PetscFree(rbuf);

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero(rw1,2*size*sizeof(int));
    rw2 = rw1 + size;
    for ( i=0; i<nrqr; ++i ) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce(rw1, rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers */

  /* Allocate memory for recv buffers (probably none needed if nrqs == 0?) */
  len      = (nrqs+1)*sizeof(int*) + nrqs*bsz1*sizeof(int);
  rbuf2    = (int**) PetscMalloc(len); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) (rbuf2 + nrqs);
  for ( i=1; i<nrqs; ++i ) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Post the receives */
  r_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits2);
  for ( i=0; i<nrqs; ++i ) {
    MPI_Irecv(rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, r_waits2+i);
  }

  /* Now post the sends */
  s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits2);
  for ( i=0; i<nrqr; ++i ) {
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend(xdata[i], isz1[i], MPI_INT, j, tag, comm, s_waits2+i);
  }

  /* Receive work done on other processors */
  {
    int        index, is_no, ct1, max, *rbuf2_i, isz_i, *data_i, jmax;
    char       *table_i;
    MPI_Status *status2;

    /* sized for both the Waitany loop (nrqs) and the Waitall on nrqr below */
    status2 = (MPI_Status *) PetscMalloc(((nrqs > nrqr ? nrqs : nrqr)+1)*sizeof(MPI_Status));
    CHKPTRQ(status2);

    for ( i=0; i<nrqs; ++i ) {
      MPI_Waitany(nrqs, r_waits2, &index, status2+i);
      /* Process the message */
      rbuf2_i = rbuf2[index];
      ct1     = 2*rbuf2_i[0] + 1;
      jmax    = rbuf2_i[0];
      for ( j=1; j<=jmax; j++ ) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for ( k=0; k<max; k++, ct1++ ) {
          row = rbuf2_i[ct1];
          if (!BT_LOOKUP(table_i,row)) { data_i[isz_i++] = row; }
        }
        isz[is_no] = isz_i;
      }
    }
    MPI_Waitall(nrqr,s_waits2,status2);
    PetscFree(status2);
  }

  for ( i=0; i<imax; ++i ) {
    ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }

  PetscFree(pa);
  PetscFree(rbuf2);
  PetscFree(s_waits1);
  PetscFree(r_waits1);
  PetscFree(s_waits2);
  PetscFree(r_waits2);
  PetscFree(table);
  PetscFree(s_status);
  PetscFree(recv_status);
  PetscFree(xdata[0]);
  PetscFree(xdata);
  PetscFree(isz1);
  return 0;
}
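
/*
   Usage sketch (illustrative, not part of this file): this routine is the
   MPIAIJ implementation behind the public MatIncreaseOverlap interface.
   Starting from non-overlapping index sets, each pass of
   MatIncreaseOverlap_MPIAIJ_Once grows every is[i] by one level of matrix
   adjacency, so a domain-decomposition code typically does

       IS is[2];
       ... create the locally owned subdomains is[0], is[1] ...
       ierr = MatIncreaseOverlap(C,2,is,1); CHKERRQ(ierr);
       ... is[0], is[1] now include one extra layer of overlap ...

   The original index sets are destroyed and replaced by the enlarged
   (sequential) ones, as done at the end of the routine above.
*/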

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
    C     - MAT_MPIAIJ;
    imax  - total no of index sets processed at a time;
    table - an array of char - size = m bits.

   Output:
    isz  - array containing the count of the solution elements corresponding
           to each index set;
    data - pointer to the solutions
*/
static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,char **table,int *isz,
                                           int **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
  Mat        A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data;
  int        start, end, val, max, rstart, cstart, ashift, bshift, *ai, *aj;
  int        *bi, *bj, *garray, i, j, k, row, *data_i, isz_i;
  char       *table_i;

  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j + ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j + bshift;
  garray = c->garray;

  for ( i=0; i<imax; i++ ) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for ( j=0, max=isz[i]; j<max; j++ ) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for ( k=start; k<end; k++ ) { /* Amat */
        val = aj[k] + ashift + cstart;
        if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for ( k=start; k<end; k++ ) { /* Bmat */
        val = garray[bj[k]+bshift];
        if (!BT_LOOKUP(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  return 0;
}
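
/*
   Worked example of the local step (illustrative): suppose data[i] initially
   holds the locally owned row r.  If row r-rstart of the diagonal block A
   has nonzeros whose global columns (via cstart and the index shift) are
   {c1,c2}, and the off-diagonal block B has nonzeros whose global columns
   garray[...] are {c3}, the loop leaves data[i] = {r,c1,c2,c3} and
   isz[i] = 4.  The bit array table[i] guarantees each column index is
   appended at most once.
*/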

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf - an array of pointers to the received requests

   Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a realloc is required we need only copy the data for that one row
   rather than all previous rows, as happens now where a single large chunk
   of memory is used.
*/
static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,
                                             int **xdata, int *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
  Mat        A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data;
  int        rstart, cstart, ashift, bshift, *ai, *aj, *bi, *bj, *garray, i, j, k;
  int        row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  int        val, max1, max2, rank, m, no_malloc = 0, *tmp, new_estimate, ctr;
  int        *rbuf_i, kmax, rbuf_0;
  char       *xtable;

  rank   = c->rank;
  m      = c->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j + ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j + bshift;
  garray = c->garray;

  for ( i=0, ct=0, total_sz=0; i<nrqr; ++i ) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for ( j=1; j<=rbuf_0; j++ ) { total_sz += rbuf_i[2*j]; }
  }

  max1         = ct*(a->nz + b->nz)/c->m;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  xdata[0]     = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]);
  ++no_malloc;
  xtable       = (char *)PetscMalloc((m/BITSPERBYTE+1)*sizeof(char)); CHKPTRQ(xtable);
  PetscMemzero(isz1,nrqr*sizeof(int));

  ct3 = 0;
  for ( i=0; i<nrqr; i++ ) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0 + 1;
    ct2    = ct1;
    ct3   += ct1;
    for ( j=1; j<=rbuf_0; j++ ) { /* for each IS from proc i */
      PetscMemzero(xtable,(m/BITSPERBYTE+1)*sizeof(char));
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for ( k=0; k<kmax; k++, ct1++ ) {
        row = rbuf_i[ct1];
        if (!BT_LOOKUP(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (int)(1.5*mem_estimate)+1;
            tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
            PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
            PetscFree(xdata[0]);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for ( k=oct2, max2=ct2; k<max2; k++ ) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for ( l=start; l<end; l++ ) { /* Amat */
          val = aj[l] + ashift + cstart;
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for ( l=start; l<end; l++ ) { /* Bmat */
          val = garray[bj[l]+bshift];
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for ( ctr=1; ctr<=i; ctr++ ) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  PetscFree(xtable);
  PLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d ints, required %d ints, no of mallocs = %d\n",
           rank,mem_estimate,ct3,no_malloc);
  return 0;
}
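
/*
   Note on the reallocation scheme above: mem_estimate starts from a
   heuristic, 3*(max(total_sz, ct*average-row-length)+1), and grows by a
   factor of 1.5 whenever the next entry would not fit.  Because all the
   xdata[i] live inside the single chunk xdata[0], every grow must also
   re-anchor the pointers xdata[1..i], which is what the loop over ctr does
   after each PetscMemcpy.  The header comment records the alternative:
   separate mallocs per message would copy less data on each grow.
*/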

/* -------------------------------------------------------------------------*/
int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,IS *isrow,IS *iscol,
                             MatGetSubMatrixCall scall,Mat **submat)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  Mat         A = c->A, *submats = *submat;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)c->B->data, *mat;
  int         **irow, **icol, *nrow, *ncol, *w1, *w2, *w3, *w4, *rtable, start, end, size;
  int         **sbuf1, **sbuf2, rank, m, i, j, k, l, ct1, ct2, ierr, **rbuf1, row, proc;
  int         nrqs, msz, **ptr, index, *req_size, *ctr, *pa, tag, *tmp, tcol, bsz, nrqr;
  int         **rbuf3, *req_source, **sbuf_aj, ashift, **rbuf2, max1, max2, **rmap;
  int         **cmap, **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax, *irow_i;
  int         len, ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *cmap_i, *lens_i;
  int         *rmap_i;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
  MPI_Request *r_waits4, *s_waits3, *s_waits4;
  MPI_Status  *r_status1, *r_status2, *s_status1, *s_status3, *s_status2;
  MPI_Status  *r_status3, *r_status4, *s_status4;
  MPI_Comm    comm;
  Scalar      **rbuf4, **sbuf_aa, *vals, *mat_a, *sbuf_aa_i;

  comm   = C->comm;
  tag    = C->tag;
  size   = c->size;
  rank   = c->rank;
  m      = c->M;
  ashift = a->indexshift;

  /* Check if the col indices are sorted */
  for ( i=0; i<ismax; i++ ) {
    ierr = ISSorted(iscol[i],(PetscTruth*)&j); CHKERRQ(ierr);
    if (!j) SETERRQ(1,"MatGetSubMatrices_MPIAIJ:IS is not sorted");
  }

  len    = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (m+1)*sizeof(int);
  irow   = (int **)PetscMalloc(len); CHKPTRQ(irow);
  icol   = irow + ismax;
  nrow   = (int *) (icol + ismax);
  ncol   = nrow + ismax;
  rtable = ncol + ismax;

  for ( i=0; i<ismax; i++ ) {
    ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr);
    ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr);
    ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for ( i=0, j=0; i<size; i++ ) {
    jmax = c->rowners[i+1];
    for ( ; j<jmax; j++ ) {
      rtable[j] = i;
    }
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them. */
  w1 = (int *)PetscMalloc(size*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;  /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;  /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;  /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<ismax; i++ ) {
    PetscMemzero(w4,size*sizeof(int)); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for ( j=0; j<jmax; j++ ) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++ ) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for ( i=0; i<size; i++ ) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) */
  for ( i=0, j=0; i<size; i++ ) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for ( i=0; i<nrqs; i++ ) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }
  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz = rw1[rank];
    MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
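
  /* From here on the routine runs a four-phase exchange, one MPI tag per
     phase: tag carries the row requests, tag+1 returns the per-row reply
     sizes, and tag+2 / tag+3 carry the column indices (a->j) and numerical
     values (a->a) of the requested rows.  Each phase posts its receives
     before (or while) the matching sends are issued. */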

  /* Allocate memory for recv buffers (probably none needed if nrqr == 0?) */
  len      = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf1    = (int**) PetscMalloc(len); CHKPTRQ(rbuf1);
  rbuf1[0] = (int *) (rbuf1 + nrqr);
  for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits1);
  for ( i=0; i<nrqr; ++i ) {
    MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag,comm,r_waits1+i);
  }

  /* Allocate memory for outgoing messages */
  len   = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
  sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1);
  ptr   = sbuf1 + size; /* pointers to the data in outgoing buffers */
  PetscMemzero(sbuf1,2*size*sizeof(int*));
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp = (int *) (ptr + size);
  ctr = tmp + 2*msz;

  {
    int *iptr = tmp, ict = 0;
    for ( i=0; i<nrqs; i++ ) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for ( i=0; i<nrqs; i++ ) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int));
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for ( i=0; i<ismax; i++ ) {
    PetscMemzero(ctr,size*sizeof(int));
    irow_i = irow[i];
    jmax   = nrow[i];
    for ( j=0; j<jmax; j++ ) { /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for ( j=0; j<size; j++ ) { /* Can optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits1);
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    MPI_Isend(sbuf1[j], w1[j], MPI_INT, j, tag, comm, s_waits1+i);
  }

  /* Post receives to capture the buffer size */
  r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits2);
  rbuf2    = (int**)PetscMalloc((nrqs+1)*sizeof(int *)); CHKPTRQ(rbuf2);
  rbuf2[0] = tmp + msz;
  for ( i=1; i<nrqs; ++i ) {
    rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
  }
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    MPI_Irecv(rbuf2[i], w1[j], MPI_INT, j, tag+1, comm, r_waits2+i);
  }
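
  /* In the block below, each processor answers every request it received
     with a size message: for each requested row it reports ncols, the
     nonzero count summed over the diagonal (A) and off-diagonal (B) blocks,
     so the requester can allocate exact receive buffers for the index and
     value messages that follow. */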

  /* Send to the other procs the buf size they should allocate */
  s_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits2);
  r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status1);
  len        = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
  sbuf2      = (int**) PetscMalloc(len); CHKPTRQ(sbuf2);
  req_size   = (int *) (sbuf2 + nrqr);
  req_source = req_size + nrqr;

  {
    Mat_SeqAIJ *sA = (Mat_SeqAIJ*) c->A->data, *sB = (Mat_SeqAIJ*) c->B->data;
    int        *sAi = sA->i, *sBi = sB->i, id, rstart = c->rstart;
    int        *sbuf2_i;

    for ( i=0; i<nrqr; ++i ) {
      MPI_Waitany(nrqr, r_waits1, &index, r_status1+i);
      req_size[index] = 0;
      rbuf1_i         = rbuf1[index];
      start           = 2*rbuf1_i[0] + 1;
      MPI_Get_count(r_status1+i,MPI_INT,&end);
      sbuf2[index] = (int *)PetscMalloc(end*sizeof(int)); CHKPTRQ(sbuf2[index]);
      sbuf2_i      = sbuf2[index];
      for ( j=start; j<end; j++ ) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[index] += ncols;
      }
      req_source[index] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[index];
      for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; }
      MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag+1,comm,s_waits2+i);
    }
  }
  PetscFree(r_status1); PetscFree(r_waits1);

  /* Receive the buffer sizes and post receives for the actual row data */
  rbuf3    = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3);
  rbuf4    = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*)); CHKPTRQ(rbuf4);
  r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits3);
  r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(r_waits4);
  r_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status2);

  for ( i=0; i<nrqs; ++i ) {
    MPI_Waitany(nrqs, r_waits2, &index, r_status2+i);
    rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int));
    CHKPTRQ(rbuf3[index]);
    rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*sizeof(Scalar));
    CHKPTRQ(rbuf4[index]);
    MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT,
              r_status2[i].MPI_SOURCE, tag+2, comm, r_waits3+index);
    MPI_Irecv(rbuf4[index],rbuf2[index][0], MPIU_SCALAR,
              r_status2[i].MPI_SOURCE, tag+3, comm, r_waits4+index);
  }
  PetscFree(r_status2); PetscFree(r_waits2);

  /* Wait on sends1 and sends2 */
  s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status1);
  s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status2);

  MPI_Waitall(nrqs,s_waits1,s_status1);
  MPI_Waitall(nrqr,s_waits2,s_status2);
  PetscFree(s_status1); PetscFree(s_status2);
  PetscFree(s_waits1); PetscFree(s_waits2);
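
  /* In the two send loops below, the shift == 0 (C indexing) case assembles
     each row inline in ascending global column order: off-diagonal entries
     with global column < cstart first (through the bmap = garray
     translation), then the diagonal block offset by cstart, then the
     remaining off-diagonal entries.  The indexshift (Fortran indexing) case
     simply defers to MatGetRow_MPIAIJ for a correctly shifted row. */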

  /* Now allocate buffers for a->j, and send them off */
  sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(sbuf_aj);
  for ( i=0, j=0; i<nrqr; i++ ) j += req_size[i];
  sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]);
  for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits3);
  {
    int nzA, nzB, *a_i = a->i, *b_i = b->i, imark;
    int *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray;
    int *a_j = a->j, *b_j = b->j, shift = a->indexshift, ctmp, *t_cols;

    for ( i=0; i<nrqr; i++ ) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for ( j=1, max1=rbuf1_i[0]; j<=max1; j++ ) {
        kmax = rbuf1_i[2*j];
        for ( k=0; k<kmax; k++, ct1++ ) {
          row    = rbuf1_i[ct1] - cstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;
          if (!shift) {
            for ( l=0; l<nzB; l++ ) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
              else break;
            }
            imark = l;
            for ( l=0; l<nzA; l++ )     cols[imark+l] = cstart + cworkA[l];
            for ( l=imark; l<nzB; l++ ) cols[nzA+l]   = bmap[cworkB[l]];
          }
          else {
            ierr = MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0); CHKERRQ(ierr);
            PetscMemcpy(cols, t_cols, ncols*sizeof(int));
            ierr = MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0); CHKERRQ(ierr);
          }

          ct2 += ncols;
        }
      }
      MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag+2,comm,s_waits3+i);
    }
  }
  r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status3);
  s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status3);

  /* Allocate buffers for a->a, and send them off */
  sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa);
  for ( i=0, j=0; i<nrqr; i++ ) j += req_size[i];
  sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]);
  for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(s_waits4);
  {
    int    nzA, nzB, *a_i = a->i, *b_i = b->i, imark;
    int    *cworkA, *cworkB, cstart = c->cstart, *bmap = c->garray;
    int    *a_j = a->j, *b_j = b->j, shift = a->indexshift;
    Scalar *vworkA, *vworkB, *a_a = a->a, *b_a = b->a, *t_vals;

    for ( i=0; i<nrqr; i++ ) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for ( j=1, max1=rbuf1_i[0]; j<=max1; j++ ) {
        kmax = rbuf1_i[2*j];
        for ( k=0; k<kmax; k++, ct1++ ) {
          row    = rbuf1_i[ct1] - cstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]; vworkB = b_a + b_i[row];

          /* load the column values for this row into vals */
          vals = sbuf_aa_i + ct2;
          if (!shift) {
            for ( l=0; l<nzB; l++ ) {
              if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l];
              else break;
            }
            imark = l;
            for ( l=0; l<nzA; l++ )     vals[imark+l] = vworkA[l];
            for ( l=imark; l<nzB; l++ ) vals[nzA+l]   = vworkB[l];
          }
          else {
            ierr = MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals); CHKERRQ(ierr);
            PetscMemcpy(vals, t_vals, ncols*sizeof(Scalar));
            ierr = MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals); CHKERRQ(ierr);
          }
          ct2 += ncols;
        }
      }
      MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag+3,comm,s_waits4+i);
    }
  }
  r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(r_status4);
  s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(s_status4);
  PetscFree(rbuf1);

  /* Form the matrix */
  /* create col map */
  {
    int *icol_i;

    len     = (1+ismax)*sizeof(int *) + ismax*c->N*sizeof(int);
    cmap    = (int **)PetscMalloc(len); CHKPTRQ(cmap);
    cmap[0] = (int *)(cmap + ismax);
    PetscMemzero(cmap[0],(1+ismax*c->N)*sizeof(int));
    for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->N; }
    for ( i=0; i<ismax; i++ ) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for ( j=0; j<jmax; j++ ) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }
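
  /* cmap[i] is the inverse map of icol[i] over the global column space:
     cmap_i[col] == j+1 when global column col is the j-th column of
     submatrix i, and 0 when col is not in iscol[i].  The +1 shift lets 0
     double as "absent", which is why the lookups below test the value and
     then subtract one (in C indexing) when storing.  E.g. with
     iscol[i] = {4,7}: cmap_i[4] = 1, cmap_i[7] = 2, all other entries 0. */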

  /* Create lens, which is required for MatCreateSeqAIJ preallocation */
  for ( i=0, j=0; i<ismax; i++ ) { j += nrow[i]; }
  len     = (1+ismax)*sizeof(int *) + j*sizeof(int);
  lens    = (int **)PetscMalloc(len); CHKPTRQ(lens);
  lens[0] = (int *)(lens + ismax);
  PetscMemzero(lens[0], j*sizeof(int));
  for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for ( i=0; i<ismax; i++ ) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for ( j=0; j<jmax; j++ ) {
      row  = irow_i[j];
      proc = rtable[row];
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0); CHKERRQ(ierr);
        for ( k=0; k<ncols; k++ ) {
          if (cmap_i[cols[k]]) { lens_i[j]++; }
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0); CHKERRQ(ierr);
      }
    }
  }

  /* Create row map */
  len     = (1+ismax)*sizeof(int *) + ismax*c->M*sizeof(int);
  rmap    = (int **)PetscMalloc(len); CHKPTRQ(rmap);
  rmap[0] = (int *)(rmap + ismax);
  PetscMemzero(rmap[0],ismax*c->M*sizeof(int));
  for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->M; }
  for ( i=0; i<ismax; i++ ) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for ( j=0; j<jmax; j++ ) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  {
    int *rbuf2_i, *rbuf3_i, *sbuf1_i;

    for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
      MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      for ( j=1; j<=jmax; j++ ) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for ( k=0; k<max1; k++, ct1++ ) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the row in the new matrix to be updated */
          max2 = rbuf2_i[ct1];
          for ( l=0; l<max2; l++, ct2++ ) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  PetscFree(r_status3); PetscFree(r_waits3);
  MPI_Waitall(nrqr,s_waits3,s_status3);
  PetscFree(s_status3); PetscFree(s_waits3);
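
  /* lens[i][row] now holds the exact nonzero count of each row of
     submatrix i (local contributions counted via MatGetRow, remote ones via
     the rbuf3 index messages), so the MatCreateSeqAIJ calls below can
     preallocate precisely and the assembly loops never trigger a malloc. */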

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
        Assumes new rows are same length as the old rows, hence bug!
    */
    for ( i=0; i<ismax; i++ ) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((mat->m != nrow[i]) || (mat->n != ncol[i])) {
        SETERRQ(1,"MatGetSubMatrices_MPIAIJ:Cannot reuse matrix. wrong size");
      }
      if (PetscMemcmp(mat->ilen,lens[i], mat->m*sizeof(int))) {
        SETERRQ(1,"MatGetSubMatrices_MPIAIJ:Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initialise matrix as if empty */
      PetscMemzero(mat->ilen,mat->m*sizeof(int));
    }
  }
  else {
    *submat = submats = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(submats);
    for ( i=0; i<ismax; i++ ) {
      /* the submatrices are sequential, so they live on MPI_COMM_SELF,
         matching the ISCreateSeq usage in MatIncreaseOverlap above */
      ierr = MatCreateSeqAIJ(MPI_COMM_SELF,nrow[i],ncol[i],0,lens[i],submats+i);
      CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int    ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
    Scalar *imat_a;

    for ( i=0; i<ismax; i++ ) {
      mat       = (Mat_SeqAIJ *) submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for ( j=0; j<jmax; j++ ) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr);
          mat_i    = imat_i[row] + ashift;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for ( k=0; k<ncols; k++ ) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - (!ashift);
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals); CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows */
  {
    int    *sbuf1_i, *rbuf2_i, *rbuf3_i, *imat_ilen, ilen;
    int    *imat_j, *imat_i;
    Scalar *imat_a, *rbuf4_i;

    for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
      MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      rbuf4_i = rbuf4[i];
      for ( j=1; j<=jmax; j++ ) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ *) submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for ( k=0; k<max1; k++, ct1++ ) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] + ashift;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for ( l=0; l<max2; l++, ct2++ ) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - (!ashift);
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  PetscFree(r_status4); PetscFree(r_waits4);
  MPI_Waitall(nrqr,s_waits4,s_status4);
  PetscFree(s_waits4); PetscFree(s_status4);

  /* Restore the indices */
  for ( i=0; i<ismax; i++ ) {
    ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  PetscFree(irow);
  PetscFree(w1);
  PetscFree(pa);

  PetscFree(sbuf1);
  PetscFree(rbuf2);
  for ( i=0; i<nrqr; ++i ) {
    PetscFree(sbuf2[i]);
  }
  for ( i=0; i<nrqs; ++i ) {
    PetscFree(rbuf3[i]);
    PetscFree(rbuf4[i]);
  }

  PetscFree(sbuf2);
  PetscFree(rbuf3);
  PetscFree(rbuf4);
  PetscFree(sbuf_aj[0]);
  PetscFree(sbuf_aj);
  PetscFree(sbuf_aa[0]);
  PetscFree(sbuf_aa);

  PetscFree(cmap);
  PetscFree(rmap);
  PetscFree(lens);
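
  /* The nonzero structure and values were written directly into the
     Mat_SeqAIJ arrays (mat->i, mat->j, mat->a and ilen) above, so the
     assembly calls below involve no communication; they let each sequential
     matrix finish its internal bookkeeping. */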
  for ( i=0; i<ismax; i++ ) {
    ierr = MatAssemblyBegin(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i], FINAL_ASSEMBLY); CHKERRQ(ierr);
  }

  return 0;
}
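
/*
   Usage sketch (illustrative, not part of this file; names follow the public
   MatGetSubMatrices interface this routine implements):

       IS  isrow[1], iscol[1];
       Mat *submat;
       ... build sorted isrow[0], iscol[0] on each processor ...
       ierr = MatGetSubMatrices(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);
       CHKERRQ(ierr);

   Each processor gets back sequential AIJ matrices, one per index-set pair;
   a later call may pass MAT_REUSE_MATRIX to refill the same matrices,
   subject to the unchanged-size and unchanged-nonzero-count checks above.
*/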