/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include "src/mat/impls/baij/mpi/mpibaij.h"
#include "petscbt.h"

static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
EXTERN int MatGetRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
EXTERN int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ"
int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS is[],int ov)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  int         i,ierr,N=C->N,bs=c->bs;
  IS          *is_new;

  PetscFunctionBegin;
  ierr = PetscMalloc(imax*sizeof(IS),&is_new);CHKERRQ(ierr);
  /* Convert the indices into block format */
  ierr = ISCompressIndicesGeneral(N,bs,imax,is,is_new);CHKERRQ(ierr);
  if (ov < 0) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n"); }
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
  }
  for (i=0; i<imax; i++) { ierr = ISDestroy(is[i]);CHKERRQ(ierr); }
  ierr = ISExpandIndicesGeneral(N,bs,imax,is_new,is);CHKERRQ(ierr);
  for (i=0; i<imax; i++) { ierr = ISDestroy(is_new[i]);CHKERRQ(ierr); }
  ierr = PetscFree(is_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Sample message format:
   If a processor A wants processor B to process some elements corresponding
   to index sets is[1], is[5]
     mesg [0] = 2   (no of index sets in the mesg)
     -----------
     mesg [1] = 1 => is[1]
     mesg [2] = sizeof(is[1]);
     -----------
     mesg [3] = 5 => is[5]
     mesg [4] = sizeof(is[5]);
     -----------
     mesg [5] ... mesg [n]    data(is[1])
     -----------
     mesg [n+1] ... mesg [m]  data(is[5])
     -----------

   Notes:
   nrqs - no of requests sent (or to be sent out)
   nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Once"
static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS is[])
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
  int         size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
  int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
  PetscBT     *table;
  MPI_Comm    comm;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status  *s_status,*recv_status;

  PetscFunctionBegin;
  comm = C->comm;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  len    = (imax+1)*sizeof(int*) + (imax + Mbs)*sizeof(int);
  ierr   = PetscMalloc(len,&idx);CHKERRQ(ierr);
  n      = (int*)(idx + imax);
  rtable = n + imax;

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rowners[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }
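  /*
     Illustration only (not executed): with size = 2 processes and block-row
     ownership c->rowners = {0,4,9}, the loop above fills
     rtable = {0,0,0,0,1,1,1,1,1}, so rtable[row] gives the process that owns
     block row 'row'.  The concrete numbers are an example; the actual values
     come from the matrix layout.
  */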
  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
  w2   = w1 + size;  /* if w2[i] marked, then a message to proc i */
  w3   = w2 + size;  /* no of IS that need to be sent to proc i */
  w4   = w3 + size;  /* temp work space used in determining w1, w2, w3 */
  ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) {
        SETERRQ(1,"Index set cannot have negative entries");
      }
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  /* pa - list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and their from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  len  = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
  ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
  ptr  = outdat + size; /* pointers to the data in outgoing buffers */
  ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
  tmp  = (int*)(outdat + 2*size);
  ctr  = tmp + msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing the local proc's work */
  {
    int  *d_p;
    char *t_p;

    len  = (imax)*(sizeof(PetscBT) + sizeof(int*) + sizeof(int)) +
           (Mbs)*imax*sizeof(int) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
    ierr = PetscMalloc(len,&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,len);CHKERRQ(ierr);
    data = (int **)(table + imax);
    isz  = (int *)(data + imax);
    d_p  = (int *)(isz + imax);
    t_p  = (char *)(d_p + Mbs*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (Mbs)*i;
    }
  }
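  /*
     Note on the workspace just allocated: for each of the imax index sets,
     table[i] is a bitmap with one bit per block row (Mbs bits) used for fast
     membership tests, data[i] is an array of up to Mbs block-row indices that
     belong to index set i, and isz[i] counts how many entries of data[i] are
     in use.  A row is appended to data[i] only the first time its bit in
     table[i] gets set.
  */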
  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree(idx);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do local work */
  ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree(outdat);CHKERRQ(ierr);
  ierr = PetscFree(w1);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(int*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    int *rw1;

    ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and their from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }
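  /*
     Each reply (built in MatIncreaseOverlap_MPIBAIJ_Receive) mirrors the request
     format: xdata[i][0] is the number of index sets in the message, followed by
     (is#,count) pairs and then the augmented block-row lists.  The block below
     merges those rows back into the local table/data arrays, skipping rows that
     are already present.
  */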
  /* receive work done on other processors */
  {
    int        idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscBT    table_i;
    MPI_Status *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);

    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
        isz[is_no] = isz_i;
      }
    }
    ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Local"
/*
   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
     C     - MAT_MPIBAIJ;
     imax  - total no of index sets processed at a time;
     table - an array of char - size = Mbs bits.

   Output:
     isz   - array containing the count of the solution elements corresponding
             to each index set;
     data  - pointer to the solutions
*/
static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  int         start,end,val,max,rstart,cstart,*ai,*aj;
  int         *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT     table_i;

  PetscFunctionBegin;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}
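/*
   Example of what the local pass computes (illustrative only): if block row r is
   owned locally and appears in index set i, then every block column of row r -
   both the diagonal part (aj[] shifted by cstart) and the off-diagonal part
   (bj[] mapped through garray[]) - is added to data[i], provided it is not
   already marked in the bitmap table[i].  One overlap sweep therefore grows each
   index set by the immediate (block) neighbours of its locally owned rows.
*/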
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIBAIJ_Receive"
/*
   MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
   and return the output

   Input:
     C    - the matrix
     nrqr - no of messages being processed
     rbuf - an array of pointers to the received requests

   Output:
     xdata - array of messages to be sent back
     isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i];
   then if a realloc is required we need only copy the data for that one row
   rather than all previous rows, as happens now where a single large chunk of
   memory is used.
*/
static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int *isz1)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
  int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  int         val,max1,max2,rank,Mbs,no_malloc=0,*tmp,new_estimate,ctr;
  int         *rbuf_i,kmax,rbuf_0,ierr;
  PetscBT     xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  Mbs    = c->Mbs;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  if (c->Mbs) max1 = ct*(a->nz + b->nz)/c->Mbs;
  else        max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(Mbs,xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);
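  /*
     All replies share the single buffer xdata[0]; each xdata[i] points into it.
     If the initial estimate proves too small, the loop below reallocates the
     whole buffer at roughly 1.5 times its previous size, copies the data over,
     and re-derives the xdata[i] pointers from the sizes isz1[] accumulated so far.
  */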
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct1     = 2*rbuf_0+1;
    ct2     = ct1;
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (int)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
  PetscFunctionReturn(0);
}

static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,const IS[],const IS[],MatReuse,Mat *);

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIBAIJ"
int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  IS          *isrow_new,*iscol_new;
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  int         nmax,nstages_local,nstages,i,pos,max_no,ierr,N=C->N,bs=c->bs;

  PetscFunctionBegin;
  /* The compression and expansion should be avoided: it does not flag errors,
     and it might change the indices, hence it is buggy */

  ierr      = PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);CHKERRQ(ierr);
  iscol_new = isrow_new + ismax;
  ierr = ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);CHKERRQ(ierr);
  ierr = ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);CHKERRQ(ierr);

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (c->Nbs * sizeof(int));
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
  for (i=0,pos=0; i<nstages; i++) {
    if      (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax)      max_no = 0;
    else                        max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  for (i=0; i<ismax; i++) {
    ierr = ISDestroy(isrow_new[i]);CHKERRQ(ierr);
    ierr = ISDestroy(iscol_new[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isrow_new);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined (PETSC_USE_CTABLE)
#undef __FUNCT__
#define __FUNCT__ "PetscGetProc"
int PetscGetProc(const int gid,const int size,const int proc_gnode[],int *proc)
{
  int nGlobalNd = proc_gnode[size];
  int fproc     = (int)((float)gid * (float)size / (float)nGlobalNd + 0.5);

  PetscFunctionBegin;
  /* if (fproc < 0) SETERRQ(1,"fproc < 0"); */
  if (fproc > size) fproc = size;
  while (gid < proc_gnode[fproc] || gid >= proc_gnode[fproc+1]) {
    if (gid < proc_gnode[fproc]) fproc--;
    else                         fproc++;
  }
  /* if (fproc < 0 || fproc >= size) { SETERRQ(1,"fproc < 0 || fproc >= size"); } */
  *proc = fproc;
  PetscFunctionReturn(0);
}
#endif
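/*
   PetscGetProc, illustrative example (numbers assumed, not taken from any run):
   with size = 2 and proc_gnode = {0,4,9}, a global block row gid = 6 gives the
   initial guess fproc = (int)(6*2/9.0 + 0.5) = 1; since proc_gnode[1] <= 6 <
   proc_gnode[2], the guess is already correct and *proc = 1 is returned.  The
   while loop only walks up or down when the interpolation lands in the wrong
   ownership range.
*/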
/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local"
static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
  Mat         A = c->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
  int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,start,end,size;
  int         **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
  int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
  int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  int         **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  int         bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
  int         cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
  int         *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status  *r_status3,*r_status4,*s_status4;
  MPI_Comm    comm;
  MatScalar   **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
  MatScalar   *a_a=a->a,*b_a=b->a;
  PetscTruth  flag;
  int         *onodes1,*olengths1;

#if defined (PETSC_USE_CTABLE)
  int         tt;
  PetscTable  *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
#else
  int         **cmap,*cmap_i,*rtable,*rmap_i,**rmap,Mbs = c->Mbs;
#endif

  PetscFunctionBegin;
  comm = C->comm;
  tag0 = C->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Check if the col indices are sorted */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],(PetscTruth*)&j);CHKERRQ(ierr);
    if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
  }

  len = (2*ismax+1)*(sizeof(int*) + sizeof(int));
#if !defined (PETSC_USE_CTABLE)
  len += (Mbs+1)*sizeof(int);
#endif
  ierr = PetscMalloc(len,&irow);CHKERRQ(ierr);
  icol = irow + ismax;
  nrow = (int*)(icol + ismax);
  ncol = nrow + ismax;
#if !defined (PETSC_USE_CTABLE)
  rtable = ncol + ismax;
  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    jmax = c->rowners[i+1];
    for (; j<jmax; j++) {
      rtable[j] = i;
    }
  }
#endif

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }
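  /*
     Reminder of the request layout used below (same as the sample message
     format at the top of this file); the concrete numbers are illustrative
     only: a request sent to one process could look like
       {2, 0,3, 4,2, r0,r1,r2, r3,r4}
     i.e. two index sets, three rows wanted for is[0] and two for is[4],
     followed by the requested block-row numbers themselves.
  */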
  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
  w2   = w1 + size;  /* if w2[i] marked, then a message to proc i */
  w3   = w2 + size;  /* no of IS that need to be sent to proc i */
  w4   = w3 + size;  /* temp work space used in determining w1, w2, w3 */
  ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length for all procs */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and their from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  len  = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
  ierr = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
  ptr  = sbuf1 + size; /* pointers to the data in outgoing buffers */
  ierr = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp  = (int*)(ptr + size);
  ctr  = tmp + 2*msz;

  {
    int *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) { /* parse the indices of each IS */
      row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }
  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post receives to capture the buffer size */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    j        = pa[i];
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  len  = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
  ierr = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
  req_size   = (int*)(sbuf2 + nrqr);
  req_source = req_size + nrqr;

  {
    Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
    int         *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr           = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc(end*sizeof(int),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages */

  ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*sizeof(int),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,
                     r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,
                     r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
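  /*
     Summary of the traffic in flight at this point: tag0 carries the row
     requests (sbuf1/rbuf1), tag1 carries back the per-row column counts
     (sbuf2/rbuf2), tag2 will carry the column indices (sbuf_aj/rbuf3), and
     tag3 the block values (sbuf_aa/rbuf4).
  */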
  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
            else break;
          }
          imark = l;
          for (l=0; l<nzA; l++)     cols[imark+l] = cstart + cworkA[l];
          for (l=imark; l<nzB; l++) cols[nzA+l]   = bmap[cworkB[l]];
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;

          /* load the column values for this row into vals */
          vals = sbuf_aa_i + ct2*bs2;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) {
              ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
            ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          for (l=imark; l<nzB; l++) {
            ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
          }
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
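  /*
     The maps built next translate global block indices into local submatrix
     indices.  In the non-CTABLE branch, cmap[i][gcol] holds lcol+1 for a global
     block column that appears in iscol[i] (0 means "not present"), while
     rmap[i][grow] holds the local block row for a global block row of isrow[i].
     The CTABLE branch stores the same mappings in hash tables, with keys and
     values shifted by one so that 0 can mean "absent".
  */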
  /* Form the matrix */
  /* create col map */
  {
    int *icol_i;
#if defined (PETSC_USE_CTABLE)
    /* Create col map */
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr);
    }
#else
    len     = (1+ismax)*sizeof(int*) + ismax*c->Nbs*sizeof(int);
    ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
    cmap[0] = (int*)(cmap + ismax);
    ierr    = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
#endif
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
#else
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
#endif
    }
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  len     = (1+ismax)*sizeof(int*) + j*sizeof(int);
  ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
  lens[0] = (int*)(lens + ismax);
  ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lcol1_gcol1 = colmaps[i];
#else
    cmap_i = cmap[i];
#endif
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
      ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
      proc = rtable[row];
#endif
      if (proc == rank) {
        /* Get indices from matA and then from matB */
        row    = row - rstart;
        nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
        cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
#if defined (PETSC_USE_CTABLE)
        for (k=0; k<nzA; k++) {
          ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
          if (tt) { lens_i[j]++; }
        }
#else
        for (k=0; k<nzA; k++) {
          if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
        }
        for (k=0; k<nzB; k++) {
          if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
        }
#endif
      }
    }
  }

#if defined (PETSC_USE_CTABLE)
  /* Create row map */
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr);
  }
#else
  /* Create row map */
  len     = (1+ismax)*sizeof(int*) + ismax*Mbs*sizeof(int);
  ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
  rmap[0] = (int*)(rmap + ismax);
  ierr    = PetscMemzero(rmap[0],ismax*Mbs*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs; }
#endif
  for (i=0; i<ismax; i++) {
    irow_i = irow[i];
    jmax   = nrow[i];
#if defined (PETSC_USE_CTABLE)
    lrow1_grow1 = rowmaps[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr);
    }
#else
    rmap_i = rmap[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
#endif
  }

  /* Update lens from offproc data */
  {
    int *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[i];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
#if defined (PETSC_USE_CTABLE)
        lcol1_gcol1 = colmaps[is_no];
        lrow1_grow1 = rowmaps[is_no];
#else
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
#endif
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(1,"row not found in table"); }
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr);
            if (tt) {
              lens_i[row]++;
            }
#else
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
#endif
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    /*
       Assumes new rows are same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqBAIJ *)(submats[i]->data);
      if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      }
      ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs*sizeof(int),&flag);CHKERRQ(ierr);
      if (flag == PETSC_FALSE) {
        SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initialize the matrix as if it were empty */
      ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(int));CHKERRQ(ierr);
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs,submats+i);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
      ierr = MatSeqBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);CHKERRQ(ierr);
      ierr = MatSeqSBAIJSetPreallocation(submats[i],a->bs,0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int       ilen_row,*imat_ilen,*imat_j,*imat_i;
    MatScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqBAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

#if defined (PETSC_USE_CTABLE)
      lcol1_gcol1 = colmaps[i];
      lrow1_grow1 = rowmaps[i];
#else
      cmap_i = cmap[i];
      rmap_i = rmap[i];
#endif
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
#if defined (PETSC_USE_CTABLE)
        ierr = PetscGetProc(row,size,c->rowners,&proc);CHKERRQ(ierr);
#else
        proc = rtable[row];
#endif
        if (proc == rank) {
          row    = row - rstart;
          nzA    = a_i[row+1] - a_i[row];
          nzB    = b_i[row+1] - b_i[row];
          cworkA = a_j + a_i[row];
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row]*bs2;
          vworkB = b_a + b_i[row]*bs2;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(1,"row not found in table"); }
#else
          row = rmap_i[row + rstart];
#endif
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i*bs2;
          mat_j    = imat_j + mat_i;
          ilen_row = imat_ilen[row];

          /* load the column indices for this row into cols */
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr);
              if (tcol) {
#else
              if ((tcol = cmap_i[ctmp])) {
#endif
                *mat_j++ = tcol - 1;
                ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
                mat_a   += bs2;
                ilen_row++;
              }
            } else break;
          }
          imark = l;
          for (l=0; l<nzA; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[cstart + cworkA[l]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          for (l=imark; l<nzB; l++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[bmap[cworkB[l]]])) {
#endif
              *mat_j++ = tcol - 1;
              ierr     = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a   += bs2;
              ilen_row++;
            }
          }
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows */
  {
    int       *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    int       *imat_j,*imat_i;
    MatScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[i];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      rbuf4_i = rbuf4[i];
      for (j=1; j<=jmax; j++) {
        is_no = sbuf1_i[2*j-1];
#if defined (PETSC_USE_CTABLE)
        lrow1_grow1 = rowmaps[is_no];
        lcol1_gcol1 = colmaps[is_no];
#else
        rmap_i = rmap[is_no];
        cmap_i = cmap[is_no];
#endif
        mat       = (Mat_SeqBAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(1,"row not found in table"); }
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i*bs2;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            if (tcol) {
#else
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
#endif
              *mat_j++ = tcol - 1;
              /* *mat_a++ = rbuf4_i[ct2]; */
              ierr   = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
              mat_a += bs2;
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(irow);CHKERRQ(ierr);
  ierr = PetscFree(w1);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree(sbuf1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(sbuf2);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    ierr = PetscTableDelete(rowmaps[i]);CHKERRQ(ierr);
    ierr = PetscTableDelete(colmaps[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(colmaps);CHKERRQ(ierr);
  ierr = PetscFree(rowmaps);CHKERRQ(ierr);
#else
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}
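/*
   Usage sketch (illustrative only, not part of this file's interface): these
   routines are normally reached through the generic MatIncreaseOverlap() and
   MatGetSubMatrices() interfaces, e.g. in an additive-Schwarz style setup.
   The sequence below assumes the PETSc 2.x-era API used throughout this file
   (int sizes, ISDestroy() taking the IS by value); the variables n and rows
   are placeholders for the caller's subdomain description.

     IS  is;
     Mat *submat;
     int ierr,n;      // number of block rows in the local subdomain (caller-defined)
     int *rows;       // global block-row indices of the subdomain (caller-defined)

     ierr = ISCreateGeneral(PETSC_COMM_SELF,n,rows,&is);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,&is,2);CHKERRQ(ierr);   // grow the subdomain by overlap 2
     ierr = ISSort(is);CHKERRQ(ierr);                      // column IS must be sorted
     ierr = MatGetSubMatrices(C,1,&is,&is,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     // ... use submat[0] as the sequential BAIJ subdomain matrix ...
     ierr = ISDestroy(is);CHKERRQ(ierr);
*/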