1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/baij/mpi/mpibaij.h> 7 #include <petscbt.h> 8 9 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 10 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 11 extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 12 extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 13 14 PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 15 { 16 PetscErrorCode ierr; 17 PetscInt i,N=C->cmap->N, bs=C->rmap->bs; 18 IS *is_new; 19 20 PetscFunctionBegin; 21 ierr = PetscMalloc1(imax,&is_new);CHKERRQ(ierr); 22 /* Convert the indices into block format */ 23 ierr = ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);CHKERRQ(ierr); 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n"); 25 for (i=0; i<ov; ++i) { 26 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr); 27 } 28 for (i=0; i<imax; i++) {ierr = ISDestroy(&is[i]);CHKERRQ(ierr);} 29 ierr = ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);CHKERRQ(ierr); 30 for (i=0; i<imax; i++) {ierr = ISDestroy(&is_new[i]);CHKERRQ(ierr);} 31 ierr = PetscFree(is_new);CHKERRQ(ierr); 32 PetscFunctionReturn(0); 33 } 34 35 /* 36 Sample message format: 37 If a processor A wants processor B to process some elements corresponding 38 to index sets is[1], is[5] 39 mesg [0] = 2 (no of index sets in the mesg) 40 ----------- 41 mesg [1] = 1 => is[1] 42 mesg [2] = sizeof(is[1]); 43 ----------- 44 mesg [5] = 5 => is[5] 45 mesg [6] = sizeof(is[5]); 46 ----------- 47 mesg [7] 48 mesg [n] data(is[1]) 49 ----------- 50 mesg[n+1] 51 mesg[m] data(is[5]) 52 ----------- 53 54 Notes: 55 nrqs - no of requests sent (or to be sent out) 56 nrqr - no of requests recieved (which have to be or which have been processed 57 */ 58 PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[]) 59 { 60 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 61 const PetscInt **idx,*idx_i; 62 PetscInt *n,*w3,*w4,**data,len; 63 PetscErrorCode ierr; 64 PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr; 65 PetscInt Mbs,i,j,k,**rbuf,row,proc=-1,nrqs,msz,**outdat,**ptr; 66 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 67 PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2; 68 PetscBT *table; 69 MPI_Comm comm,*iscomms; 70 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 71 MPI_Status *s_status,*recv_status; 72 char *t_p; 73 74 PetscFunctionBegin; 75 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 76 size = c->size; 77 rank = c->rank; 78 Mbs = c->Mbs; 79 80 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 81 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 82 83 ierr = PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 84 85 for (i=0; i<imax; i++) { 86 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 87 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 88 } 89 90 /* evaluate communication - mesg to who,length of mesg, and buffer space 91 required. Based on this, buffers are allocated, and data copied into them*/ 92 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 93 for (i=0; i<imax; i++) { 94 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/ 95 idx_i = idx[i]; 96 len = n[i]; 97 for (j=0; j<len; j++) { 98 row = idx_i[j]; 99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 100 ierr = PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);CHKERRQ(ierr); 101 w4[proc]++; 102 } 103 for (j=0; j<size; j++) { 104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 105 } 106 } 107 108 nrqs = 0; /* no of outgoing messages */ 109 msz = 0; /* total mesg length (for all proc */ 110 w1[rank] = 0; /* no mesg sent to itself */ 111 w3[rank] = 0; 112 for (i=0; i<size; i++) { 113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 114 } 115 /* pa - is list of processors to communicate with */ 116 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 117 for (i=0,j=0; i<size; i++) { 118 if (w1[i]) {pa[j] = i; j++;} 119 } 120 121 /* Each message would have a header = 1 + 2*(no of IS) + data */ 122 for (i=0; i<nrqs; i++) { 123 j = pa[i]; 124 w1[j] += w2[j] + 2*w3[j]; 125 msz += w1[j]; 126 } 127 128 /* Determine the number of messages to expect, their lengths, from from-ids */ 129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 131 132 /* Now post the Irecvs corresponding to these messages */ 133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 134 135 /* Allocate Memory for outgoing messages */ 136 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 137 /* TODO: use PetscCalloc2() */ 138 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr); 139 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 140 { 141 PetscInt *iptr = tmp,ict = 0; 142 for (i=0; i<nrqs; i++) { 143 j = pa[i]; 144 iptr += ict; 145 outdat[j] = iptr; 146 ict = w1[j]; 147 } 148 } 149 150 /* Form the outgoing messages */ 151 /*plug in the headers*/ 152 for (i=0; i<nrqs; i++) { 153 j = pa[i]; 154 outdat[j][0] = 0; 155 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr); 156 ptr[j] = outdat[j] + 2*w3[j] + 1; 157 } 158 159 /* Memory for doing local proc's work*/ 160 { 161 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 162 163 for (i=0; i<imax; i++) { 164 table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i; 165 data[i] = d_p + (Mbs)*i; 166 } 167 } 168 169 /* Parse the IS and update local tables and the outgoing buf with the data*/ 170 { 171 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 172 PetscBT table_i; 173 174 for (i=0; i<imax; i++) { 175 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 176 n_i = n[i]; 177 table_i = table[i]; 178 idx_i = idx[i]; 179 data_i = data[i]; 180 isz_i = isz[i]; 181 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 182 row = idx_i[j]; 183 ierr = PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);CHKERRQ(ierr); 184 if (proc != rank) { /* copy to the outgoing buffer */ 185 ctr[proc]++; 186 *ptr[proc] = row; 187 ptr[proc]++; 188 } else { /* Update the local table */ 189 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 190 } 191 } 192 /* Update the headers for the current IS */ 193 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 194 if ((ctr_j = ctr[j])) { 195 outdat_j = outdat[j]; 196 k = ++outdat_j[0]; 197 outdat_j[2*k] = ctr_j; 198 outdat_j[2*k-1] = i; 199 } 200 } 201 isz[i] = isz_i; 202 } 203 } 204 205 /* Now post the sends */ 206 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 207 for (i=0; i<nrqs; ++i) { 208 j = pa[i]; 209 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 210 } 211 212 /* No longer need the original indices*/ 213 for (i=0; i<imax; ++i) { 214 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 215 } 216 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 217 218 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 219 for (i=0; i<imax; ++i) { 220 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 221 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 222 } 223 224 /* Do Local work*/ 225 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 226 227 /* Receive messages*/ 228 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 229 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 230 231 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 232 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 233 234 /* Phase 1 sends are complete - deallocate buffers */ 235 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 236 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 237 238 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 239 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 240 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 241 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 242 ierr = PetscFree(rbuf);CHKERRQ(ierr); 243 244 /* Send the data back*/ 245 /* Do a global reduction to know the buffer space req for incoming messages*/ 246 { 247 PetscMPIInt *rw1; 248 249 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 250 251 for (i=0; i<nrqr; ++i) { 252 proc = recv_status[i].MPI_SOURCE; 253 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 254 rw1[proc] = isz1[i]; 255 } 256 257 ierr = PetscFree(onodes1);CHKERRQ(ierr); 258 ierr = PetscFree(olengths1);CHKERRQ(ierr); 259 260 /* Determine the number of messages to expect, their lengths, from from-ids */ 261 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 262 ierr = PetscFree(rw1);CHKERRQ(ierr); 263 } 264 /* Now post the Irecvs corresponding to these messages */ 265 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 266 267 /* Now post the sends */ 268 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 269 for (i=0; i<nrqr; ++i) { 270 j = recv_status[i].MPI_SOURCE; 271 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 272 } 273 274 /* receive work done on other processors*/ 275 { 276 PetscMPIInt idex; 277 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 278 PetscBT table_i; 279 MPI_Status *status2; 280 281 ierr = PetscMalloc1(PetscMax(nrqr,nrqs)+1,&status2);CHKERRQ(ierr); 282 for (i=0; i<nrqs; ++i) { 283 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 284 /* Process the message*/ 285 rbuf2_i = rbuf2[idex]; 286 ct1 = 2*rbuf2_i[0]+1; 287 jmax = rbuf2[idex][0]; 288 for (j=1; j<=jmax; j++) { 289 max = rbuf2_i[2*j]; 290 is_no = rbuf2_i[2*j-1]; 291 isz_i = isz[is_no]; 292 data_i = data[is_no]; 293 table_i = table[is_no]; 294 for (k=0; k<max; k++,ct1++) { 295 row = rbuf2_i[ct1]; 296 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 297 } 298 isz[is_no] = isz_i; 299 } 300 } 301 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 302 ierr = PetscFree(status2);CHKERRQ(ierr); 303 } 304 305 for (i=0; i<imax; ++i) { 306 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 307 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 308 } 309 310 ierr = PetscFree(iscomms);CHKERRQ(ierr); 311 ierr = PetscFree(onodes2);CHKERRQ(ierr); 312 ierr = PetscFree(olengths2);CHKERRQ(ierr); 313 314 ierr = PetscFree(pa);CHKERRQ(ierr); 315 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 316 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 317 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 318 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 319 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 320 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 321 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 322 ierr = PetscFree(s_status);CHKERRQ(ierr); 323 ierr = PetscFree(recv_status);CHKERRQ(ierr); 324 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 325 ierr = PetscFree(xdata);CHKERRQ(ierr); 326 ierr = PetscFree(isz1);CHKERRQ(ierr); 327 PetscFunctionReturn(0); 328 } 329 330 /* 331 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 332 the work on the local processor. 333 334 Inputs: 335 C - MAT_MPIBAIJ; 336 imax - total no of index sets processed at a time; 337 table - an array of char - size = Mbs bits. 338 339 Output: 340 isz - array containing the count of the solution elements corresponding 341 to each index set; 342 data - pointer to the solutions 343 */ 344 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 345 { 346 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 347 Mat A = c->A,B = c->B; 348 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 349 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 350 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 351 PetscBT table_i; 352 353 PetscFunctionBegin; 354 rstart = c->rstartbs; 355 cstart = c->cstartbs; 356 ai = a->i; 357 aj = a->j; 358 bi = b->i; 359 bj = b->j; 360 garray = c->garray; 361 362 363 for (i=0; i<imax; i++) { 364 data_i = data[i]; 365 table_i = table[i]; 366 isz_i = isz[i]; 367 for (j=0,max=isz[i]; j<max; j++) { 368 row = data_i[j] - rstart; 369 start = ai[row]; 370 end = ai[row+1]; 371 for (k=start; k<end; k++) { /* Amat */ 372 val = aj[k] + cstart; 373 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 374 } 375 start = bi[row]; 376 end = bi[row+1]; 377 for (k=start; k<end; k++) { /* Bmat */ 378 val = garray[bj[k]]; 379 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 380 } 381 } 382 isz[i] = isz_i; 383 } 384 PetscFunctionReturn(0); 385 } 386 /* 387 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 388 and return the output 389 390 Input: 391 C - the matrix 392 nrqr - no of messages being processed. 393 rbuf - an array of pointers to the recieved requests 394 395 Output: 396 xdata - array of messages to be sent back 397 isz1 - size of each message 398 399 For better efficiency perhaps we should malloc separately each xdata[i], 400 then if a remalloc is required we need only copy the data for that one row 401 rather than all previous rows as it is now where a single large chunck of 402 memory is used. 403 404 */ 405 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 406 { 407 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 408 Mat A = c->A,B = c->B; 409 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 410 PetscErrorCode ierr; 411 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 412 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 413 PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr; 414 PetscInt *rbuf_i,kmax,rbuf_0; 415 PetscBT xtable; 416 417 PetscFunctionBegin; 418 Mbs = c->Mbs; 419 rstart = c->rstartbs; 420 cstart = c->cstartbs; 421 ai = a->i; 422 aj = a->j; 423 bi = b->i; 424 bj = b->j; 425 garray = c->garray; 426 427 428 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 429 rbuf_i = rbuf[i]; 430 rbuf_0 = rbuf_i[0]; 431 ct += rbuf_0; 432 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 433 } 434 435 if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs; 436 else max1 = 1; 437 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 438 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 439 ++no_malloc; 440 ierr = PetscBTCreate(Mbs,&xtable);CHKERRQ(ierr); 441 ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr); 442 443 ct3 = 0; 444 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 445 rbuf_i = rbuf[i]; 446 rbuf_0 = rbuf_i[0]; 447 ct1 = 2*rbuf_0+1; 448 ct2 = ct1; 449 ct3 += ct1; 450 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 451 ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr); 452 oct2 = ct2; 453 kmax = rbuf_i[2*j]; 454 for (k=0; k<kmax; k++,ct1++) { 455 row = rbuf_i[ct1]; 456 if (!PetscBTLookupSet(xtable,row)) { 457 if (!(ct3 < mem_estimate)) { 458 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 459 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 460 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 461 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 462 xdata[0] = tmp; 463 mem_estimate = new_estimate; ++no_malloc; 464 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 465 } 466 xdata[i][ct2++] = row; 467 ct3++; 468 } 469 } 470 for (k=oct2,max2=ct2; k<max2; k++) { 471 row = xdata[i][k] - rstart; 472 start = ai[row]; 473 end = ai[row+1]; 474 for (l=start; l<end; l++) { 475 val = aj[l] + cstart; 476 if (!PetscBTLookupSet(xtable,val)) { 477 if (!(ct3 < mem_estimate)) { 478 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 479 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 480 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 481 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 482 xdata[0] = tmp; 483 mem_estimate = new_estimate; ++no_malloc; 484 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 485 } 486 xdata[i][ct2++] = val; 487 ct3++; 488 } 489 } 490 start = bi[row]; 491 end = bi[row+1]; 492 for (l=start; l<end; l++) { 493 val = garray[bj[l]]; 494 if (!PetscBTLookupSet(xtable,val)) { 495 if (!(ct3 < mem_estimate)) { 496 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 497 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 498 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 499 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 500 xdata[0] = tmp; 501 mem_estimate = new_estimate; ++no_malloc; 502 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 503 } 504 xdata[i][ct2++] = val; 505 ct3++; 506 } 507 } 508 } 509 /* Update the header*/ 510 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 511 xdata[i][2*j-1] = rbuf_i[2*j-1]; 512 } 513 xdata[i][0] = rbuf_0; 514 xdata[i+1] = xdata[i] + ct2; 515 isz1[i] = ct2; /* size of each message */ 516 } 517 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 518 ierr = PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 519 PetscFunctionReturn(0); 520 } 521 522 PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 523 { 524 IS *isrow_block,*iscol_block; 525 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 526 PetscErrorCode ierr; 527 PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs; 528 Mat_SeqBAIJ *subc; 529 Mat_SubSppt *smat; 530 531 PetscFunctionBegin; 532 /* The compression and expansion should be avoided. Doesn't point 533 out errors, might change the indices, hence buggey */ 534 ierr = PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);CHKERRQ(ierr); 535 ierr = ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);CHKERRQ(ierr); 536 ierr = ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);CHKERRQ(ierr); 537 538 /* Determine the number of stages through which submatrices are done */ 539 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 540 else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt)); 541 if (!nmax) nmax = 1; 542 543 if (scall == MAT_INITIAL_MATRIX) { 544 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 545 546 /* Make sure every processor loops through the nstages */ 547 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 548 549 /* Allocate memory to hold all the submatrices and dummy submatrices */ 550 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 551 } else { /* MAT_REUSE_MATRIX */ 552 if (ismax) { 553 subc = (Mat_SeqBAIJ*)((*submat)[0]->data); 554 smat = subc->submatis1; 555 } else { /* (*submat)[0] is a dummy matrix */ 556 smat = (Mat_SubSppt*)(*submat)[0]->data; 557 } 558 if (!smat) { 559 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat"); 560 } 561 nstages = smat->nstages; 562 } 563 564 for (i=0,pos=0; i<nstages; i++) { 565 if (pos+nmax <= ismax) max_no = nmax; 566 else if (pos == ismax) max_no = 0; 567 else max_no = ismax-pos; 568 569 ierr = MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);CHKERRQ(ierr); 570 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 571 smat = (Mat_SubSppt*)(*submat)[pos]->data; 572 smat->nstages = nstages; 573 } 574 pos += max_no; 575 } 576 577 if (scall == MAT_INITIAL_MATRIX && ismax) { 578 /* save nstages for reuse */ 579 subc = (Mat_SeqBAIJ*)((*submat)[0]->data); 580 smat = subc->submatis1; 581 smat->nstages = nstages; 582 } 583 584 for (i=0; i<ismax; i++) { 585 ierr = ISDestroy(&isrow_block[i]);CHKERRQ(ierr); 586 ierr = ISDestroy(&iscol_block[i]);CHKERRQ(ierr); 587 } 588 ierr = PetscFree2(isrow_block,iscol_block);CHKERRQ(ierr); 589 PetscFunctionReturn(0); 590 } 591 592 #if defined(PETSC_USE_CTABLE) 593 PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank) 594 { 595 PetscInt nGlobalNd = proc_gnode[size]; 596 PetscMPIInt fproc; 597 PetscErrorCode ierr; 598 599 PetscFunctionBegin; 600 ierr = PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);CHKERRQ(ierr); 601 if (fproc > size) fproc = size; 602 while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) { 603 if (row < proc_gnode[fproc]) fproc--; 604 else fproc++; 605 } 606 *rank = fproc; 607 PetscFunctionReturn(0); 608 } 609 #endif 610 611 /* -------------------------------------------------------------------------*/ 612 /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */ 613 PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 614 { 615 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 616 Mat A = c->A; 617 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc; 618 const PetscInt **icol,**irow; 619 PetscInt *nrow,*ncol,start; 620 PetscErrorCode ierr; 621 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 622 PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 623 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 624 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 625 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 626 #if defined(PETSC_USE_CTABLE) 627 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 628 #else 629 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 630 #endif 631 const PetscInt *irow_i,*icol_i; 632 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 633 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 634 MPI_Request *r_waits4,*s_waits3,*s_waits4; 635 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 636 MPI_Status *r_status3,*r_status4,*s_status4; 637 MPI_Comm comm; 638 PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a=NULL,*sbuf_aa_i; 639 PetscMPIInt *onodes1,*olengths1,end; 640 PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i; 641 Mat_SubSppt *smat_i; 642 PetscBool *issorted,colflag,iscsorted=PETSC_TRUE; 643 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 644 PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs; 645 PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */ 646 PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB; 647 PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a; 648 PetscInt cstart = c->cstartbs,*bmap = c->garray; 649 PetscBool *allrows,*allcolumns; 650 651 PetscFunctionBegin; 652 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 653 size = c->size; 654 rank = c->rank; 655 656 ierr = PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);CHKERRQ(ierr); 657 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 658 659 for (i=0; i<ismax; i++) { 660 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 661 if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */ 662 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 663 664 /* Check for special case: allcolumns */ 665 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 666 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 667 668 if (colflag && ncol[i] == c->Nbs) { 669 allcolumns[i] = PETSC_TRUE; 670 icol[i] = NULL; 671 } else { 672 allcolumns[i] = PETSC_FALSE; 673 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 674 } 675 676 /* Check for special case: allrows */ 677 ierr = ISIdentity(isrow[i],&colflag);CHKERRQ(ierr); 678 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 679 if (colflag && nrow[i] == c->Mbs) { 680 allrows[i] = PETSC_TRUE; 681 irow[i] = NULL; 682 } else { 683 allrows[i] = PETSC_FALSE; 684 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 685 } 686 } 687 688 if (scall == MAT_REUSE_MATRIX) { 689 /* Assumes new rows are same length as the old rows */ 690 for (i=0; i<ismax; i++) { 691 subc = (Mat_SeqBAIJ*)(submats[i]->data); 692 if (subc->mbs != nrow[i] || subc->nbs != ncol[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 693 694 /* Initial matrix as if empty */ 695 ierr = PetscArrayzero(subc->ilen,subc->mbs);CHKERRQ(ierr); 696 697 /* Initial matrix as if empty */ 698 submats[i]->factortype = C->factortype; 699 700 smat_i = subc->submatis1; 701 702 nrqs = smat_i->nrqs; 703 nrqr = smat_i->nrqr; 704 rbuf1 = smat_i->rbuf1; 705 rbuf2 = smat_i->rbuf2; 706 rbuf3 = smat_i->rbuf3; 707 req_source2 = smat_i->req_source2; 708 709 sbuf1 = smat_i->sbuf1; 710 sbuf2 = smat_i->sbuf2; 711 ptr = smat_i->ptr; 712 tmp = smat_i->tmp; 713 ctr = smat_i->ctr; 714 715 pa = smat_i->pa; 716 req_size = smat_i->req_size; 717 req_source1 = smat_i->req_source1; 718 719 allcolumns[i] = smat_i->allcolumns; 720 allrows[i] = smat_i->allrows; 721 row2proc[i] = smat_i->row2proc; 722 rmap[i] = smat_i->rmap; 723 cmap[i] = smat_i->cmap; 724 } 725 726 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 727 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 728 smat_i = (Mat_SubSppt*)submats[0]->data; 729 730 nrqs = smat_i->nrqs; 731 nrqr = smat_i->nrqr; 732 rbuf1 = smat_i->rbuf1; 733 rbuf2 = smat_i->rbuf2; 734 rbuf3 = smat_i->rbuf3; 735 req_source2 = smat_i->req_source2; 736 737 sbuf1 = smat_i->sbuf1; 738 sbuf2 = smat_i->sbuf2; 739 ptr = smat_i->ptr; 740 tmp = smat_i->tmp; 741 ctr = smat_i->ctr; 742 743 pa = smat_i->pa; 744 req_size = smat_i->req_size; 745 req_source1 = smat_i->req_source1; 746 747 allcolumns[0] = PETSC_FALSE; 748 } 749 } else { /* scall == MAT_INITIAL_MATRIX */ 750 /* Get some new tags to keep the communication clean */ 751 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 752 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 753 754 /* evaluate communication - mesg to who, length of mesg, and buffer space 755 required. Based on this, buffers are allocated, and data copied into them*/ 756 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 757 758 for (i=0; i<ismax; i++) { 759 jmax = nrow[i]; 760 irow_i = irow[i]; 761 762 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 763 row2proc[i] = row2proc_i; 764 765 if (issorted[i]) proc = 0; 766 for (j=0; j<jmax; j++) { 767 if (!issorted[i]) proc = 0; 768 if (allrows[i]) row = j; 769 else row = irow_i[j]; 770 771 while (row >= c->rangebs[proc+1]) proc++; 772 w4[proc]++; 773 row2proc_i[j] = proc; /* map row index to proc */ 774 } 775 for (j=0; j<size; j++) { 776 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 777 } 778 } 779 780 nrqs = 0; /* no of outgoing messages */ 781 msz = 0; /* total mesg length (for all procs) */ 782 w1[rank] = 0; /* no mesg sent to self */ 783 w3[rank] = 0; 784 for (i=0; i<size; i++) { 785 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 786 } 787 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 788 for (i=0,j=0; i<size; i++) { 789 if (w1[i]) { pa[j] = i; j++; } 790 } 791 792 /* Each message would have a header = 1 + 2*(no of IS) + data */ 793 for (i=0; i<nrqs; i++) { 794 j = pa[i]; 795 w1[j] += w2[j] + 2* w3[j]; 796 msz += w1[j]; 797 } 798 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 799 800 /* Determine the number of messages to expect, their lengths, from from-ids */ 801 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 802 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 803 804 /* Now post the Irecvs corresponding to these messages */ 805 tag0 = ((PetscObject)C)->tag; 806 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 807 808 ierr = PetscFree(onodes1);CHKERRQ(ierr); 809 ierr = PetscFree(olengths1);CHKERRQ(ierr); 810 811 /* Allocate Memory for outgoing messages */ 812 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 813 /* TODO: use PetscCalloc2() */ 814 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 815 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 816 817 { 818 PetscInt *iptr = tmp; 819 k = 0; 820 for (i=0; i<nrqs; i++) { 821 j = pa[i]; 822 iptr += k; 823 sbuf1[j] = iptr; 824 k = w1[j]; 825 } 826 } 827 828 /* Form the outgoing messages. Initialize the header space */ 829 for (i=0; i<nrqs; i++) { 830 j = pa[i]; 831 sbuf1[j][0] = 0; 832 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 833 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 834 } 835 836 /* Parse the isrow and copy data into outbuf */ 837 for (i=0; i<ismax; i++) { 838 row2proc_i = row2proc[i]; 839 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 840 irow_i = irow[i]; 841 jmax = nrow[i]; 842 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 843 proc = row2proc_i[j]; 844 if (allrows[i]) row = j; 845 else row = irow_i[j]; 846 847 if (proc != rank) { /* copy to the outgoing buf*/ 848 ctr[proc]++; 849 *ptr[proc] = row; 850 ptr[proc]++; 851 } 852 } 853 /* Update the headers for the current IS */ 854 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 855 if ((ctr_j = ctr[j])) { 856 sbuf1_j = sbuf1[j]; 857 k = ++sbuf1_j[0]; 858 sbuf1_j[2*k] = ctr_j; 859 sbuf1_j[2*k-1] = i; 860 } 861 } 862 } 863 864 /* Now post the sends */ 865 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 866 for (i=0; i<nrqs; ++i) { 867 j = pa[i]; 868 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 869 } 870 871 /* Post Receives to capture the buffer size */ 872 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 873 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 874 rbuf2[0] = tmp + msz; 875 for (i=1; i<nrqs; ++i) { 876 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 877 } 878 for (i=0; i<nrqs; ++i) { 879 j = pa[i]; 880 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 881 } 882 883 /* Send to other procs the buf size they should allocate */ 884 /* Receive messages*/ 885 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 886 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 887 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 888 889 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 890 for (i=0; i<nrqr; ++i) { 891 req_size[i] = 0; 892 rbuf1_i = rbuf1[i]; 893 start = 2*rbuf1_i[0] + 1; 894 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 895 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 896 sbuf2_i = sbuf2[i]; 897 for (j=start; j<end; j++) { 898 row = rbuf1_i[j] - rstart; 899 ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row]; 900 sbuf2_i[j] = ncols; 901 req_size[i] += ncols; 902 } 903 req_source1[i] = r_status1[i].MPI_SOURCE; 904 /* form the header */ 905 sbuf2_i[0] = req_size[i]; 906 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 907 908 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 909 } 910 911 ierr = PetscFree(r_status1);CHKERRQ(ierr); 912 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 913 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 914 915 /* Receive messages*/ 916 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 917 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 918 919 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 920 for (i=0; i<nrqs; ++i) { 921 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 922 req_source2[i] = r_status2[i].MPI_SOURCE; 923 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 924 } 925 ierr = PetscFree(r_status2);CHKERRQ(ierr); 926 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 927 928 /* Wait on sends1 and sends2 */ 929 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 930 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 931 932 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 933 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 934 ierr = PetscFree(s_status1);CHKERRQ(ierr); 935 ierr = PetscFree(s_status2);CHKERRQ(ierr); 936 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 937 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 938 939 /* Now allocate sending buffers for a->j, and send them off */ 940 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 941 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 942 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 943 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 944 945 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 946 { 947 948 for (i=0; i<nrqr; i++) { 949 rbuf1_i = rbuf1[i]; 950 sbuf_aj_i = sbuf_aj[i]; 951 ct1 = 2*rbuf1_i[0] + 1; 952 ct2 = 0; 953 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 954 kmax = rbuf1[i][2*j]; 955 for (k=0; k<kmax; k++,ct1++) { 956 row = rbuf1_i[ct1] - rstart; 957 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 958 ncols = nzA + nzB; 959 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 960 961 /* load the column indices for this row into cols */ 962 cols = sbuf_aj_i + ct2; 963 for (l=0; l<nzB; l++) { 964 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 965 else break; 966 } 967 imark = l; 968 for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];} 969 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 970 ct2 += ncols; 971 } 972 } 973 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 974 } 975 } 976 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 977 978 /* create col map: global col of C -> local col of submatrices */ 979 #if defined(PETSC_USE_CTABLE) 980 for (i=0; i<ismax; i++) { 981 if (!allcolumns[i]) { 982 ierr = PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);CHKERRQ(ierr); 983 984 jmax = ncol[i]; 985 icol_i = icol[i]; 986 cmap_i = cmap[i]; 987 for (j=0; j<jmax; j++) { 988 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 989 } 990 } else cmap[i] = NULL; 991 } 992 #else 993 for (i=0; i<ismax; i++) { 994 if (!allcolumns[i]) { 995 ierr = PetscCalloc1(c->Nbs,&cmap[i]);CHKERRQ(ierr); 996 jmax = ncol[i]; 997 icol_i = icol[i]; 998 cmap_i = cmap[i]; 999 for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1; 1000 } else cmap[i] = NULL; 1001 } 1002 #endif 1003 1004 /* Create lens which is required for MatCreate... */ 1005 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1006 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1007 1008 if (ismax) { 1009 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 1010 } 1011 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1012 1013 /* Update lens from local data */ 1014 for (i=0; i<ismax; i++) { 1015 row2proc_i = row2proc[i]; 1016 jmax = nrow[i]; 1017 if (!allcolumns[i]) cmap_i = cmap[i]; 1018 irow_i = irow[i]; 1019 lens_i = lens[i]; 1020 for (j=0; j<jmax; j++) { 1021 if (allrows[i]) row = j; 1022 else row = irow_i[j]; /* global blocked row of C */ 1023 1024 proc = row2proc_i[j]; 1025 if (proc == rank) { 1026 /* Get indices from matA and then from matB */ 1027 #if defined(PETSC_USE_CTABLE) 1028 PetscInt tt; 1029 #endif 1030 row = row - rstart; 1031 nzA = a_i[row+1] - a_i[row]; 1032 nzB = b_i[row+1] - b_i[row]; 1033 cworkA = a_j + a_i[row]; 1034 cworkB = b_j + b_i[row]; 1035 1036 if (!allcolumns[i]) { 1037 #if defined(PETSC_USE_CTABLE) 1038 for (k=0; k<nzA; k++) { 1039 ierr = PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr); 1040 if (tt) lens_i[j]++; 1041 } 1042 for (k=0; k<nzB; k++) { 1043 ierr = PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr); 1044 if (tt) lens_i[j]++; 1045 } 1046 1047 #else 1048 for (k=0; k<nzA; k++) { 1049 if (cmap_i[cstart + cworkA[k]]) lens_i[j]++; 1050 } 1051 for (k=0; k<nzB; k++) { 1052 if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++; 1053 } 1054 #endif 1055 } else { /* allcolumns */ 1056 lens_i[j] = nzA + nzB; 1057 } 1058 } 1059 } 1060 } 1061 1062 /* Create row map: global row of C -> local row of submatrices */ 1063 for (i=0; i<ismax; i++) { 1064 if (!allrows[i]) { 1065 #if defined(PETSC_USE_CTABLE) 1066 ierr = PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);CHKERRQ(ierr); 1067 irow_i = irow[i]; 1068 jmax = nrow[i]; 1069 for (j=0; j<jmax; j++) { 1070 if (allrows[i]) { 1071 ierr = PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1072 } else { 1073 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1074 } 1075 } 1076 #else 1077 ierr = PetscCalloc1(c->Mbs,&rmap[i]);CHKERRQ(ierr); 1078 rmap_i = rmap[i]; 1079 irow_i = irow[i]; 1080 jmax = nrow[i]; 1081 for (j=0; j<jmax; j++) { 1082 if (allrows[i]) rmap_i[j] = j; 1083 else rmap_i[irow_i[j]] = j; 1084 } 1085 #endif 1086 } else rmap[i] = NULL; 1087 } 1088 1089 /* Update lens from offproc data */ 1090 { 1091 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1092 1093 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1094 for (tmp2=0; tmp2<nrqs; tmp2++) { 1095 sbuf1_i = sbuf1[pa[tmp2]]; 1096 jmax = sbuf1_i[0]; 1097 ct1 = 2*jmax+1; 1098 ct2 = 0; 1099 rbuf2_i = rbuf2[tmp2]; 1100 rbuf3_i = rbuf3[tmp2]; 1101 for (j=1; j<=jmax; j++) { 1102 is_no = sbuf1_i[2*j-1]; 1103 max1 = sbuf1_i[2*j]; 1104 lens_i = lens[is_no]; 1105 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1106 rmap_i = rmap[is_no]; 1107 for (k=0; k<max1; k++,ct1++) { 1108 if (allrows[is_no]) { 1109 row = sbuf1_i[ct1]; 1110 } else { 1111 #if defined(PETSC_USE_CTABLE) 1112 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1113 row--; 1114 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1115 #else 1116 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1117 #endif 1118 } 1119 max2 = rbuf2_i[ct1]; 1120 for (l=0; l<max2; l++,ct2++) { 1121 if (!allcolumns[is_no]) { 1122 #if defined(PETSC_USE_CTABLE) 1123 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1124 #else 1125 tcol = cmap_i[rbuf3_i[ct2]]; 1126 #endif 1127 if (tcol) lens_i[row]++; 1128 } else { /* allcolumns */ 1129 lens_i[row]++; /* lens_i[row] += max2 ? */ 1130 } 1131 } 1132 } 1133 } 1134 } 1135 } 1136 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1137 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1138 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 1139 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1140 1141 /* Create the submatrices */ 1142 for (i=0; i<ismax; i++) { 1143 PetscInt bs_tmp; 1144 if (ijonly) bs_tmp = 1; 1145 else bs_tmp = bs; 1146 1147 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1148 ierr = MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1149 1150 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1151 ierr = MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);CHKERRQ(ierr); 1152 ierr = MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by SBAIJ routines */ 1153 1154 /* create struct Mat_SubSppt and attached it to submat */ 1155 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 1156 subc = (Mat_SeqBAIJ*)submats[i]->data; 1157 subc->submatis1 = smat_i; 1158 1159 smat_i->destroy = submats[i]->ops->destroy; 1160 submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ; 1161 submats[i]->factortype = C->factortype; 1162 1163 smat_i->id = i; 1164 smat_i->nrqs = nrqs; 1165 smat_i->nrqr = nrqr; 1166 smat_i->rbuf1 = rbuf1; 1167 smat_i->rbuf2 = rbuf2; 1168 smat_i->rbuf3 = rbuf3; 1169 smat_i->sbuf2 = sbuf2; 1170 smat_i->req_source2 = req_source2; 1171 1172 smat_i->sbuf1 = sbuf1; 1173 smat_i->ptr = ptr; 1174 smat_i->tmp = tmp; 1175 smat_i->ctr = ctr; 1176 1177 smat_i->pa = pa; 1178 smat_i->req_size = req_size; 1179 smat_i->req_source1 = req_source1; 1180 1181 smat_i->allcolumns = allcolumns[i]; 1182 smat_i->allrows = allrows[i]; 1183 smat_i->singleis = PETSC_FALSE; 1184 smat_i->row2proc = row2proc[i]; 1185 smat_i->rmap = rmap[i]; 1186 smat_i->cmap = cmap[i]; 1187 } 1188 1189 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 1190 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 1191 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1192 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 1193 1194 /* create struct Mat_SubSppt and attached it to submat */ 1195 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 1196 submats[0]->data = (void*)smat_i; 1197 1198 smat_i->destroy = submats[0]->ops->destroy; 1199 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 1200 submats[0]->factortype = C->factortype; 1201 1202 smat_i->id = 0; 1203 smat_i->nrqs = nrqs; 1204 smat_i->nrqr = nrqr; 1205 smat_i->rbuf1 = rbuf1; 1206 smat_i->rbuf2 = rbuf2; 1207 smat_i->rbuf3 = rbuf3; 1208 smat_i->sbuf2 = sbuf2; 1209 smat_i->req_source2 = req_source2; 1210 1211 smat_i->sbuf1 = sbuf1; 1212 smat_i->ptr = ptr; 1213 smat_i->tmp = tmp; 1214 smat_i->ctr = ctr; 1215 1216 smat_i->pa = pa; 1217 smat_i->req_size = req_size; 1218 smat_i->req_source1 = req_source1; 1219 1220 smat_i->allcolumns = PETSC_FALSE; 1221 smat_i->singleis = PETSC_FALSE; 1222 smat_i->row2proc = NULL; 1223 smat_i->rmap = NULL; 1224 smat_i->cmap = NULL; 1225 } 1226 1227 1228 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1229 ierr = PetscFree(lens);CHKERRQ(ierr); 1230 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1231 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1232 1233 } /* endof scall == MAT_INITIAL_MATRIX */ 1234 1235 /* Post recv matrix values */ 1236 if (!ijonly) { 1237 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1238 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1239 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1240 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1241 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1242 for (i=0; i<nrqs; ++i) { 1243 ierr = PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);CHKERRQ(ierr); 1244 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1245 } 1246 1247 /* Allocate sending buffers for a->a, and send them off */ 1248 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1249 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1250 1251 ierr = PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);CHKERRQ(ierr); 1252 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 1253 1254 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1255 1256 for (i=0; i<nrqr; i++) { 1257 rbuf1_i = rbuf1[i]; 1258 sbuf_aa_i = sbuf_aa[i]; 1259 ct1 = 2*rbuf1_i[0]+1; 1260 ct2 = 0; 1261 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1262 kmax = rbuf1_i[2*j]; 1263 for (k=0; k<kmax; k++,ct1++) { 1264 row = rbuf1_i[ct1] - rstart; 1265 nzA = a_i[row+1] - a_i[row]; 1266 nzB = b_i[row+1] - b_i[row]; 1267 ncols = nzA + nzB; 1268 cworkB = b_j + b_i[row]; 1269 vworkA = a_a + a_i[row]*bs2; 1270 vworkB = b_a + b_i[row]*bs2; 1271 1272 /* load the column values for this row into vals*/ 1273 vals = sbuf_aa_i+ct2*bs2; 1274 for (l=0; l<nzB; l++) { 1275 if ((bmap[cworkB[l]]) < cstart) { 1276 ierr = PetscArraycpy(vals+l*bs2,vworkB+l*bs2,bs2);CHKERRQ(ierr); 1277 } else break; 1278 } 1279 imark = l; 1280 for (l=0; l<nzA; l++) { 1281 ierr = PetscArraycpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2);CHKERRQ(ierr); 1282 } 1283 for (l=imark; l<nzB; l++) { 1284 ierr = PetscArraycpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2);CHKERRQ(ierr); 1285 } 1286 1287 ct2 += ncols; 1288 } 1289 } 1290 ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1291 } 1292 } 1293 1294 /* Assemble the matrices */ 1295 /* First assemble the local rows */ 1296 for (i=0; i<ismax; i++) { 1297 row2proc_i = row2proc[i]; 1298 subc = (Mat_SeqBAIJ*)submats[i]->data; 1299 imat_ilen = subc->ilen; 1300 imat_j = subc->j; 1301 imat_i = subc->i; 1302 imat_a = subc->a; 1303 1304 if (!allcolumns[i]) cmap_i = cmap[i]; 1305 rmap_i = rmap[i]; 1306 irow_i = irow[i]; 1307 jmax = nrow[i]; 1308 for (j=0; j<jmax; j++) { 1309 if (allrows[i]) row = j; 1310 else row = irow_i[j]; 1311 proc = row2proc_i[j]; 1312 1313 if (proc == rank) { 1314 1315 row = row - rstart; 1316 nzA = a_i[row+1] - a_i[row]; 1317 nzB = b_i[row+1] - b_i[row]; 1318 cworkA = a_j + a_i[row]; 1319 cworkB = b_j + b_i[row]; 1320 if (!ijonly) { 1321 vworkA = a_a + a_i[row]*bs2; 1322 vworkB = b_a + b_i[row]*bs2; 1323 } 1324 1325 if (allrows[i]) { 1326 row = row+rstart; 1327 } else { 1328 #if defined(PETSC_USE_CTABLE) 1329 ierr = PetscTableFind(rmap_i,row+rstart+1,&row);CHKERRQ(ierr); 1330 row--; 1331 1332 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1333 #else 1334 row = rmap_i[row + rstart]; 1335 #endif 1336 } 1337 mat_i = imat_i[row]; 1338 if (!ijonly) mat_a = imat_a + mat_i*bs2; 1339 mat_j = imat_j + mat_i; 1340 ilen = imat_ilen[row]; 1341 1342 /* load the column indices for this row into cols*/ 1343 if (!allcolumns[i]) { 1344 for (l=0; l<nzB; l++) { 1345 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1346 #if defined(PETSC_USE_CTABLE) 1347 ierr = PetscTableFind(cmap_i,ctmp+1,&tcol);CHKERRQ(ierr); 1348 if (tcol) { 1349 #else 1350 if ((tcol = cmap_i[ctmp])) { 1351 #endif 1352 *mat_j++ = tcol - 1; 1353 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr); 1354 mat_a += bs2; 1355 ilen++; 1356 } 1357 } else break; 1358 } 1359 imark = l; 1360 for (l=0; l<nzA; l++) { 1361 #if defined(PETSC_USE_CTABLE) 1362 ierr = PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr); 1363 if (tcol) { 1364 #else 1365 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1366 #endif 1367 *mat_j++ = tcol - 1; 1368 if (!ijonly) { 1369 ierr = PetscArraycpy(mat_a,vworkA+l*bs2,bs2);CHKERRQ(ierr); 1370 mat_a += bs2; 1371 } 1372 ilen++; 1373 } 1374 } 1375 for (l=imark; l<nzB; l++) { 1376 #if defined(PETSC_USE_CTABLE) 1377 ierr = PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr); 1378 if (tcol) { 1379 #else 1380 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1381 #endif 1382 *mat_j++ = tcol - 1; 1383 if (!ijonly) { 1384 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr); 1385 mat_a += bs2; 1386 } 1387 ilen++; 1388 } 1389 } 1390 } else { /* allcolumns */ 1391 for (l=0; l<nzB; l++) { 1392 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1393 *mat_j++ = ctmp; 1394 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr); 1395 mat_a += bs2; 1396 ilen++; 1397 } else break; 1398 } 1399 imark = l; 1400 for (l=0; l<nzA; l++) { 1401 *mat_j++ = cstart+cworkA[l]; 1402 if (!ijonly) { 1403 ierr = PetscArraycpy(mat_a,vworkA+l*bs2,bs2);CHKERRQ(ierr); 1404 mat_a += bs2; 1405 } 1406 ilen++; 1407 } 1408 for (l=imark; l<nzB; l++) { 1409 *mat_j++ = bmap[cworkB[l]]; 1410 if (!ijonly) { 1411 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr); 1412 mat_a += bs2; 1413 } 1414 ilen++; 1415 } 1416 } 1417 imat_ilen[row] = ilen; 1418 } 1419 } 1420 } 1421 1422 /* Now assemble the off proc rows */ 1423 if (!ijonly) { 1424 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 1425 } 1426 for (tmp2=0; tmp2<nrqs; tmp2++) { 1427 sbuf1_i = sbuf1[pa[tmp2]]; 1428 jmax = sbuf1_i[0]; 1429 ct1 = 2*jmax + 1; 1430 ct2 = 0; 1431 rbuf2_i = rbuf2[tmp2]; 1432 rbuf3_i = rbuf3[tmp2]; 1433 if (!ijonly) rbuf4_i = rbuf4[tmp2]; 1434 for (j=1; j<=jmax; j++) { 1435 is_no = sbuf1_i[2*j-1]; 1436 rmap_i = rmap[is_no]; 1437 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1438 subc = (Mat_SeqBAIJ*)submats[is_no]->data; 1439 imat_ilen = subc->ilen; 1440 imat_j = subc->j; 1441 imat_i = subc->i; 1442 if (!ijonly) imat_a = subc->a; 1443 max1 = sbuf1_i[2*j]; 1444 for (k=0; k<max1; k++,ct1++) { /* for each recved block row */ 1445 row = sbuf1_i[ct1]; 1446 1447 if (allrows[is_no]) { 1448 row = sbuf1_i[ct1]; 1449 } else { 1450 #if defined(PETSC_USE_CTABLE) 1451 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1452 row--; 1453 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1454 #else 1455 row = rmap_i[row]; 1456 #endif 1457 } 1458 ilen = imat_ilen[row]; 1459 mat_i = imat_i[row]; 1460 if (!ijonly) mat_a = imat_a + mat_i*bs2; 1461 mat_j = imat_j + mat_i; 1462 max2 = rbuf2_i[ct1]; 1463 if (!allcolumns[is_no]) { 1464 for (l=0; l<max2; l++,ct2++) { 1465 #if defined(PETSC_USE_CTABLE) 1466 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1467 #else 1468 tcol = cmap_i[rbuf3_i[ct2]]; 1469 #endif 1470 if (tcol) { 1471 *mat_j++ = tcol - 1; 1472 if (!ijonly) { 1473 ierr = PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);CHKERRQ(ierr); 1474 mat_a += bs2; 1475 } 1476 ilen++; 1477 } 1478 } 1479 } else { /* allcolumns */ 1480 for (l=0; l<max2; l++,ct2++) { 1481 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1482 if (!ijonly) { 1483 ierr = PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);CHKERRQ(ierr); 1484 mat_a += bs2; 1485 } 1486 ilen++; 1487 } 1488 } 1489 imat_ilen[row] = ilen; 1490 } 1491 } 1492 } 1493 1494 if (!iscsorted) { /* sort column indices of the rows */ 1495 MatScalar *work; 1496 1497 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 1498 for (i=0; i<ismax; i++) { 1499 subc = (Mat_SeqBAIJ*)submats[i]->data; 1500 imat_ilen = subc->ilen; 1501 imat_j = subc->j; 1502 imat_i = subc->i; 1503 if (!ijonly) imat_a = subc->a; 1504 if (allcolumns[i]) continue; 1505 1506 jmax = nrow[i]; 1507 for (j=0; j<jmax; j++) { 1508 mat_i = imat_i[j]; 1509 mat_j = imat_j + mat_i; 1510 ilen = imat_ilen[j]; 1511 if (ijonly) { 1512 ierr = PetscSortInt(ilen,mat_j);CHKERRQ(ierr); 1513 } else { 1514 mat_a = imat_a + mat_i*bs2; 1515 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 1516 } 1517 } 1518 } 1519 ierr = PetscFree(work);CHKERRQ(ierr); 1520 } 1521 1522 if (!ijonly) { 1523 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1524 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1525 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1526 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1527 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1528 } 1529 1530 /* Restore the indices */ 1531 for (i=0; i<ismax; i++) { 1532 if (!allrows[i]) { 1533 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1534 } 1535 if (!allcolumns[i]) { 1536 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1537 } 1538 } 1539 1540 for (i=0; i<ismax; i++) { 1541 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1542 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1543 } 1544 1545 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 1546 ierr = PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);CHKERRQ(ierr); 1547 1548 if (!ijonly) { 1549 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1550 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1551 1552 for (i=0; i<nrqs; ++i) { 1553 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1554 } 1555 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1556 } 1557 c->ijonly = PETSC_FALSE; /* set back to the default */ 1558 PetscFunctionReturn(0); 1559 } 1560