1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/baij/mpi/mpibaij.h> 7 #include <petscbt.h> 8 9 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 10 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 11 extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 12 extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 13 14 PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 15 { 16 PetscErrorCode ierr; 17 PetscInt i,N=C->cmap->N, bs=C->rmap->bs; 18 IS *is_new; 19 20 PetscFunctionBegin; 21 ierr = PetscMalloc1(imax,&is_new);CHKERRQ(ierr); 22 /* Convert the indices into block format */ 23 ierr = ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);CHKERRQ(ierr); 24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n"); 25 for (i=0; i<ov; ++i) { 26 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr); 27 } 28 for (i=0; i<imax; i++) {ierr = ISDestroy(&is[i]);CHKERRQ(ierr);} 29 ierr = ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);CHKERRQ(ierr); 30 for (i=0; i<imax; i++) {ierr = ISDestroy(&is_new[i]);CHKERRQ(ierr);} 31 ierr = PetscFree(is_new);CHKERRQ(ierr); 32 PetscFunctionReturn(0); 33 } 34 35 /* 36 Sample message format: 37 If a processor A wants processor B to process some elements corresponding 38 to index sets is[1], is[5] 39 mesg [0] = 2 (no of index sets in the mesg) 40 ----------- 41 mesg [1] = 1 => is[1] 42 mesg [2] = sizeof(is[1]); 43 ----------- 44 mesg [5] = 5 => is[5] 45 mesg [6] = sizeof(is[5]); 46 ----------- 47 mesg [7] 48 mesg [n] data(is[1]) 49 ----------- 50 mesg[n+1] 51 mesg[m] data(is[5]) 52 ----------- 53 54 Notes: 55 nrqs - no of requests sent (or to be sent out) 56 nrqr - no of requests recieved (which have to be or which have been processed 57 */ 58 PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[]) 59 { 60 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 61 const PetscInt **idx,*idx_i; 62 PetscInt *n,*w3,*w4,**data,len; 63 PetscErrorCode ierr; 64 PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr; 65 PetscInt Mbs,i,j,k,**rbuf,row,proc=-1,nrqs,msz,**outdat,**ptr; 66 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 67 PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2; 68 PetscBT *table; 69 MPI_Comm comm,*iscomms; 70 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 71 MPI_Status *s_status,*recv_status; 72 char *t_p; 73 74 PetscFunctionBegin; 75 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 76 size = c->size; 77 rank = c->rank; 78 Mbs = c->Mbs; 79 80 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 81 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 82 83 ierr = PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 84 85 for (i=0; i<imax; i++) { 86 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 87 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 88 } 89 90 /* evaluate communication - mesg to who,length of mesg, and buffer space 91 required. Based on this, buffers are allocated, and data copied into them*/ 92 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 93 for (i=0; i<imax; i++) { 94 ierr = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/ 95 idx_i = idx[i]; 96 len = n[i]; 97 for (j=0; j<len; j++) { 98 row = idx_i[j]; 99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 100 ierr = PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);CHKERRQ(ierr); 101 w4[proc]++; 102 } 103 for (j=0; j<size; j++) { 104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 105 } 106 } 107 108 nrqs = 0; /* no of outgoing messages */ 109 msz = 0; /* total mesg length (for all proc */ 110 w1[rank] = 0; /* no mesg sent to itself */ 111 w3[rank] = 0; 112 for (i=0; i<size; i++) { 113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 114 } 115 /* pa - is list of processors to communicate with */ 116 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 117 for (i=0,j=0; i<size; i++) { 118 if (w1[i]) {pa[j] = i; j++;} 119 } 120 121 /* Each message would have a header = 1 + 2*(no of IS) + data */ 122 for (i=0; i<nrqs; i++) { 123 j = pa[i]; 124 w1[j] += w2[j] + 2*w3[j]; 125 msz += w1[j]; 126 } 127 128 /* Determine the number of messages to expect, their lengths, from from-ids */ 129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 131 132 /* Now post the Irecvs corresponding to these messages */ 133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 134 135 /* Allocate Memory for outgoing messages */ 136 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 137 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 138 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 139 { 140 PetscInt *iptr = tmp,ict = 0; 141 for (i=0; i<nrqs; i++) { 142 j = pa[i]; 143 iptr += ict; 144 outdat[j] = iptr; 145 ict = w1[j]; 146 } 147 } 148 149 /* Form the outgoing messages */ 150 /*plug in the headers*/ 151 for (i=0; i<nrqs; i++) { 152 j = pa[i]; 153 outdat[j][0] = 0; 154 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 155 ptr[j] = outdat[j] + 2*w3[j] + 1; 156 } 157 158 /* Memory for doing local proc's work*/ 159 { 160 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 161 162 for (i=0; i<imax; i++) { 163 table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i; 164 data[i] = d_p + (Mbs)*i; 165 } 166 } 167 168 /* Parse the IS and update local tables and the outgoing buf with the data*/ 169 { 170 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 171 PetscBT table_i; 172 173 for (i=0; i<imax; i++) { 174 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 175 n_i = n[i]; 176 table_i = table[i]; 177 idx_i = idx[i]; 178 data_i = data[i]; 179 isz_i = isz[i]; 180 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 181 row = idx_i[j]; 182 ierr = PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);CHKERRQ(ierr); 183 if (proc != rank) { /* copy to the outgoing buffer */ 184 ctr[proc]++; 185 *ptr[proc] = row; 186 ptr[proc]++; 187 } else { /* Update the local table */ 188 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 189 } 190 } 191 /* Update the headers for the current IS */ 192 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 193 if ((ctr_j = ctr[j])) { 194 outdat_j = outdat[j]; 195 k = ++outdat_j[0]; 196 outdat_j[2*k] = ctr_j; 197 outdat_j[2*k-1] = i; 198 } 199 } 200 isz[i] = isz_i; 201 } 202 } 203 204 /* Now post the sends */ 205 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 206 for (i=0; i<nrqs; ++i) { 207 j = pa[i]; 208 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 209 } 210 211 /* No longer need the original indices*/ 212 for (i=0; i<imax; ++i) { 213 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 214 } 215 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 216 217 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 218 for (i=0; i<imax; ++i) { 219 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 220 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 221 } 222 223 /* Do Local work*/ 224 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 225 226 /* Receive messages*/ 227 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 228 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 229 230 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 231 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 232 233 /* Phase 1 sends are complete - deallocate buffers */ 234 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 235 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 236 237 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 238 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 239 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 240 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 241 ierr = PetscFree(rbuf);CHKERRQ(ierr); 242 243 /* Send the data back*/ 244 /* Do a global reduction to know the buffer space req for incoming messages*/ 245 { 246 PetscMPIInt *rw1; 247 248 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 249 250 for (i=0; i<nrqr; ++i) { 251 proc = recv_status[i].MPI_SOURCE; 252 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 253 rw1[proc] = isz1[i]; 254 } 255 256 ierr = PetscFree(onodes1);CHKERRQ(ierr); 257 ierr = PetscFree(olengths1);CHKERRQ(ierr); 258 259 /* Determine the number of messages to expect, their lengths, from from-ids */ 260 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 261 ierr = PetscFree(rw1);CHKERRQ(ierr); 262 } 263 /* Now post the Irecvs corresponding to these messages */ 264 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 265 266 /* Now post the sends */ 267 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 268 for (i=0; i<nrqr; ++i) { 269 j = recv_status[i].MPI_SOURCE; 270 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 271 } 272 273 /* receive work done on other processors*/ 274 { 275 PetscMPIInt idex; 276 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 277 PetscBT table_i; 278 MPI_Status *status2; 279 280 ierr = PetscMalloc1(PetscMax(nrqr,nrqs)+1,&status2);CHKERRQ(ierr); 281 for (i=0; i<nrqs; ++i) { 282 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 283 /* Process the message*/ 284 rbuf2_i = rbuf2[idex]; 285 ct1 = 2*rbuf2_i[0]+1; 286 jmax = rbuf2[idex][0]; 287 for (j=1; j<=jmax; j++) { 288 max = rbuf2_i[2*j]; 289 is_no = rbuf2_i[2*j-1]; 290 isz_i = isz[is_no]; 291 data_i = data[is_no]; 292 table_i = table[is_no]; 293 for (k=0; k<max; k++,ct1++) { 294 row = rbuf2_i[ct1]; 295 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 296 } 297 isz[is_no] = isz_i; 298 } 299 } 300 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 301 ierr = PetscFree(status2);CHKERRQ(ierr); 302 } 303 304 for (i=0; i<imax; ++i) { 305 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 306 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 307 } 308 309 ierr = PetscFree(iscomms);CHKERRQ(ierr); 310 ierr = PetscFree(onodes2);CHKERRQ(ierr); 311 ierr = PetscFree(olengths2);CHKERRQ(ierr); 312 313 ierr = PetscFree(pa);CHKERRQ(ierr); 314 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 315 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 316 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 317 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 318 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 319 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 320 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 321 ierr = PetscFree(s_status);CHKERRQ(ierr); 322 ierr = PetscFree(recv_status);CHKERRQ(ierr); 323 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 324 ierr = PetscFree(xdata);CHKERRQ(ierr); 325 ierr = PetscFree(isz1);CHKERRQ(ierr); 326 PetscFunctionReturn(0); 327 } 328 329 /* 330 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do 331 the work on the local processor. 332 333 Inputs: 334 C - MAT_MPIBAIJ; 335 imax - total no of index sets processed at a time; 336 table - an array of char - size = Mbs bits. 337 338 Output: 339 isz - array containing the count of the solution elements corresponding 340 to each index set; 341 data - pointer to the solutions 342 */ 343 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 344 { 345 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 346 Mat A = c->A,B = c->B; 347 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 348 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 349 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 350 PetscBT table_i; 351 352 PetscFunctionBegin; 353 rstart = c->rstartbs; 354 cstart = c->cstartbs; 355 ai = a->i; 356 aj = a->j; 357 bi = b->i; 358 bj = b->j; 359 garray = c->garray; 360 361 362 for (i=0; i<imax; i++) { 363 data_i = data[i]; 364 table_i = table[i]; 365 isz_i = isz[i]; 366 for (j=0,max=isz[i]; j<max; j++) { 367 row = data_i[j] - rstart; 368 start = ai[row]; 369 end = ai[row+1]; 370 for (k=start; k<end; k++) { /* Amat */ 371 val = aj[k] + cstart; 372 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 373 } 374 start = bi[row]; 375 end = bi[row+1]; 376 for (k=start; k<end; k++) { /* Bmat */ 377 val = garray[bj[k]]; 378 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 379 } 380 } 381 isz[i] = isz_i; 382 } 383 PetscFunctionReturn(0); 384 } 385 /* 386 MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages, 387 and return the output 388 389 Input: 390 C - the matrix 391 nrqr - no of messages being processed. 392 rbuf - an array of pointers to the recieved requests 393 394 Output: 395 xdata - array of messages to be sent back 396 isz1 - size of each message 397 398 For better efficiency perhaps we should malloc separately each xdata[i], 399 then if a remalloc is required we need only copy the data for that one row 400 rather than all previous rows as it is now where a single large chunck of 401 memory is used. 402 403 */ 404 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 405 { 406 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 407 Mat A = c->A,B = c->B; 408 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data; 409 PetscErrorCode ierr; 410 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 411 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 412 PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr; 413 PetscInt *rbuf_i,kmax,rbuf_0; 414 PetscBT xtable; 415 416 PetscFunctionBegin; 417 Mbs = c->Mbs; 418 rstart = c->rstartbs; 419 cstart = c->cstartbs; 420 ai = a->i; 421 aj = a->j; 422 bi = b->i; 423 bj = b->j; 424 garray = c->garray; 425 426 427 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 428 rbuf_i = rbuf[i]; 429 rbuf_0 = rbuf_i[0]; 430 ct += rbuf_0; 431 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 432 } 433 434 if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs; 435 else max1 = 1; 436 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 437 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 438 ++no_malloc; 439 ierr = PetscBTCreate(Mbs,&xtable);CHKERRQ(ierr); 440 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 441 442 ct3 = 0; 443 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 444 rbuf_i = rbuf[i]; 445 rbuf_0 = rbuf_i[0]; 446 ct1 = 2*rbuf_0+1; 447 ct2 = ct1; 448 ct3 += ct1; 449 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 450 ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr); 451 oct2 = ct2; 452 kmax = rbuf_i[2*j]; 453 for (k=0; k<kmax; k++,ct1++) { 454 row = rbuf_i[ct1]; 455 if (!PetscBTLookupSet(xtable,row)) { 456 if (!(ct3 < mem_estimate)) { 457 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 458 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 459 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 460 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 461 xdata[0] = tmp; 462 mem_estimate = new_estimate; ++no_malloc; 463 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 464 } 465 xdata[i][ct2++] = row; 466 ct3++; 467 } 468 } 469 for (k=oct2,max2=ct2; k<max2; k++) { 470 row = xdata[i][k] - rstart; 471 start = ai[row]; 472 end = ai[row+1]; 473 for (l=start; l<end; l++) { 474 val = aj[l] + cstart; 475 if (!PetscBTLookupSet(xtable,val)) { 476 if (!(ct3 < mem_estimate)) { 477 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 478 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 479 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 480 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 481 xdata[0] = tmp; 482 mem_estimate = new_estimate; ++no_malloc; 483 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 484 } 485 xdata[i][ct2++] = val; 486 ct3++; 487 } 488 } 489 start = bi[row]; 490 end = bi[row+1]; 491 for (l=start; l<end; l++) { 492 val = garray[bj[l]]; 493 if (!PetscBTLookupSet(xtable,val)) { 494 if (!(ct3 < mem_estimate)) { 495 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 496 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 497 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 498 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 499 xdata[0] = tmp; 500 mem_estimate = new_estimate; ++no_malloc; 501 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 502 } 503 xdata[i][ct2++] = val; 504 ct3++; 505 } 506 } 507 } 508 /* Update the header*/ 509 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 510 xdata[i][2*j-1] = rbuf_i[2*j-1]; 511 } 512 xdata[i][0] = rbuf_0; 513 xdata[i+1] = xdata[i] + ct2; 514 isz1[i] = ct2; /* size of each message */ 515 } 516 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 517 ierr = PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 518 PetscFunctionReturn(0); 519 } 520 521 PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 522 { 523 IS *isrow_block,*iscol_block; 524 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 525 PetscErrorCode ierr; 526 PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs; 527 Mat_SeqBAIJ *subc; 528 Mat_SubSppt *smat; 529 530 PetscFunctionBegin; 531 /* The compression and expansion should be avoided. Doesn't point 532 out errors, might change the indices, hence buggey */ 533 ierr = PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);CHKERRQ(ierr); 534 ierr = ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);CHKERRQ(ierr); 535 ierr = ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);CHKERRQ(ierr); 536 537 /* Determine the number of stages through which submatrices are done */ 538 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 539 else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt)); 540 if (!nmax) nmax = 1; 541 542 if (scall == MAT_INITIAL_MATRIX) { 543 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 544 545 /* Make sure every processor loops through the nstages */ 546 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 547 548 /* Allocate memory to hold all the submatrices and dummy submatrices */ 549 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 550 } else { /* MAT_REUSE_MATRIX */ 551 if (ismax) { 552 subc = (Mat_SeqBAIJ*)((*submat)[0]->data); 553 smat = subc->submatis1; 554 } else { /* (*submat)[0] is a dummy matrix */ 555 smat = (Mat_SubSppt*)(*submat)[0]->data; 556 } 557 if (!smat) { 558 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat"); 559 } 560 nstages = smat->nstages; 561 } 562 563 for (i=0,pos=0; i<nstages; i++) { 564 if (pos+nmax <= ismax) max_no = nmax; 565 else if (pos == ismax) max_no = 0; 566 else max_no = ismax-pos; 567 568 ierr = MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);CHKERRQ(ierr); 569 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 570 smat = (Mat_SubSppt*)(*submat)[pos]->data; 571 smat->nstages = nstages; 572 } 573 pos += max_no; 574 } 575 576 if (scall == MAT_INITIAL_MATRIX && ismax) { 577 /* save nstages for reuse */ 578 subc = (Mat_SeqBAIJ*)((*submat)[0]->data); 579 smat = subc->submatis1; 580 smat->nstages = nstages; 581 } 582 583 for (i=0; i<ismax; i++) { 584 ierr = ISDestroy(&isrow_block[i]);CHKERRQ(ierr); 585 ierr = ISDestroy(&iscol_block[i]);CHKERRQ(ierr); 586 } 587 ierr = PetscFree2(isrow_block,iscol_block);CHKERRQ(ierr); 588 PetscFunctionReturn(0); 589 } 590 591 #if defined(PETSC_USE_CTABLE) 592 PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank) 593 { 594 PetscInt nGlobalNd = proc_gnode[size]; 595 PetscMPIInt fproc; 596 PetscErrorCode ierr; 597 598 PetscFunctionBegin; 599 ierr = PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);CHKERRQ(ierr); 600 if (fproc > size) fproc = size; 601 while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) { 602 if (row < proc_gnode[fproc]) fproc--; 603 else fproc++; 604 } 605 *rank = fproc; 606 PetscFunctionReturn(0); 607 } 608 #endif 609 610 /* -------------------------------------------------------------------------*/ 611 /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */ 612 PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 613 { 614 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 615 Mat A = c->A; 616 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc; 617 const PetscInt **icol,**irow; 618 PetscInt *nrow,*ncol,start; 619 PetscErrorCode ierr; 620 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 621 PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 622 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 623 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 624 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 625 #if defined(PETSC_USE_CTABLE) 626 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 627 #else 628 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 629 #endif 630 const PetscInt *irow_i,*icol_i; 631 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 632 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 633 MPI_Request *r_waits4,*s_waits3,*s_waits4; 634 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 635 MPI_Status *r_status3,*r_status4,*s_status4; 636 MPI_Comm comm; 637 PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a,*sbuf_aa_i; 638 PetscMPIInt *onodes1,*olengths1,end; 639 PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i; 640 Mat_SubSppt *smat_i; 641 PetscBool *issorted,colflag,iscsorted=PETSC_TRUE; 642 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 643 PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs; 644 PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */ 645 PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB; 646 PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a; 647 PetscInt cstart = c->cstartbs,*bmap = c->garray; 648 PetscBool *allrows,*allcolumns; 649 650 PetscFunctionBegin; 651 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 652 size = c->size; 653 rank = c->rank; 654 655 ierr = PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);CHKERRQ(ierr); 656 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 657 658 for (i=0; i<ismax; i++) { 659 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 660 if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */ 661 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 662 663 /* Check for special case: allcolumns */ 664 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 665 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 666 667 if (colflag && ncol[i] == c->Nbs) { 668 allcolumns[i] = PETSC_TRUE; 669 icol[i] = NULL; 670 } else { 671 allcolumns[i] = PETSC_FALSE; 672 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 673 } 674 675 /* Check for special case: allrows */ 676 ierr = ISIdentity(isrow[i],&colflag);CHKERRQ(ierr); 677 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 678 if (colflag && nrow[i] == c->Mbs) { 679 allrows[i] = PETSC_TRUE; 680 irow[i] = NULL; 681 } else { 682 allrows[i] = PETSC_FALSE; 683 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 684 } 685 } 686 687 if (scall == MAT_REUSE_MATRIX) { 688 /* Assumes new rows are same length as the old rows */ 689 for (i=0; i<ismax; i++) { 690 subc = (Mat_SeqBAIJ*)(submats[i]->data); 691 if (subc->mbs != nrow[i] || subc->nbs != ncol[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 692 693 /* Initial matrix as if empty */ 694 ierr = PetscMemzero(subc->ilen,subc->mbs*sizeof(PetscInt));CHKERRQ(ierr); 695 696 /* Initial matrix as if empty */ 697 submats[i]->factortype = C->factortype; 698 699 smat_i = subc->submatis1; 700 701 nrqs = smat_i->nrqs; 702 nrqr = smat_i->nrqr; 703 rbuf1 = smat_i->rbuf1; 704 rbuf2 = smat_i->rbuf2; 705 rbuf3 = smat_i->rbuf3; 706 req_source2 = smat_i->req_source2; 707 708 sbuf1 = smat_i->sbuf1; 709 sbuf2 = smat_i->sbuf2; 710 ptr = smat_i->ptr; 711 tmp = smat_i->tmp; 712 ctr = smat_i->ctr; 713 714 pa = smat_i->pa; 715 req_size = smat_i->req_size; 716 req_source1 = smat_i->req_source1; 717 718 allcolumns[i] = smat_i->allcolumns; 719 allrows[i] = smat_i->allrows; 720 row2proc[i] = smat_i->row2proc; 721 rmap[i] = smat_i->rmap; 722 cmap[i] = smat_i->cmap; 723 } 724 725 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 726 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 727 smat_i = (Mat_SubSppt*)submats[0]->data; 728 729 nrqs = smat_i->nrqs; 730 nrqr = smat_i->nrqr; 731 rbuf1 = smat_i->rbuf1; 732 rbuf2 = smat_i->rbuf2; 733 rbuf3 = smat_i->rbuf3; 734 req_source2 = smat_i->req_source2; 735 736 sbuf1 = smat_i->sbuf1; 737 sbuf2 = smat_i->sbuf2; 738 ptr = smat_i->ptr; 739 tmp = smat_i->tmp; 740 ctr = smat_i->ctr; 741 742 pa = smat_i->pa; 743 req_size = smat_i->req_size; 744 req_source1 = smat_i->req_source1; 745 746 allcolumns[0] = PETSC_FALSE; 747 } 748 } else { /* scall == MAT_INITIAL_MATRIX */ 749 /* Get some new tags to keep the communication clean */ 750 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 751 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 752 753 /* evaluate communication - mesg to who, length of mesg, and buffer space 754 required. Based on this, buffers are allocated, and data copied into them*/ 755 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 756 757 for (i=0; i<ismax; i++) { 758 jmax = nrow[i]; 759 irow_i = irow[i]; 760 761 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 762 row2proc[i] = row2proc_i; 763 764 if (issorted[i]) proc = 0; 765 for (j=0; j<jmax; j++) { 766 if (!issorted[i]) proc = 0; 767 if (allrows[i]) row = j; 768 else row = irow_i[j]; 769 770 while (row >= c->rangebs[proc+1]) proc++; 771 w4[proc]++; 772 row2proc_i[j] = proc; /* map row index to proc */ 773 } 774 for (j=0; j<size; j++) { 775 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 776 } 777 } 778 779 nrqs = 0; /* no of outgoing messages */ 780 msz = 0; /* total mesg length (for all procs) */ 781 w1[rank] = 0; /* no mesg sent to self */ 782 w3[rank] = 0; 783 for (i=0; i<size; i++) { 784 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 785 } 786 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 787 for (i=0,j=0; i<size; i++) { 788 if (w1[i]) { pa[j] = i; j++; } 789 } 790 791 /* Each message would have a header = 1 + 2*(no of IS) + data */ 792 for (i=0; i<nrqs; i++) { 793 j = pa[i]; 794 w1[j] += w2[j] + 2* w3[j]; 795 msz += w1[j]; 796 } 797 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 798 799 /* Determine the number of messages to expect, their lengths, from from-ids */ 800 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 801 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 802 803 /* Now post the Irecvs corresponding to these messages */ 804 tag0 = ((PetscObject)C)->tag; 805 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 806 807 ierr = PetscFree(onodes1);CHKERRQ(ierr); 808 ierr = PetscFree(olengths1);CHKERRQ(ierr); 809 810 /* Allocate Memory for outgoing messages */ 811 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 812 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 813 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 814 815 { 816 PetscInt *iptr = tmp; 817 k = 0; 818 for (i=0; i<nrqs; i++) { 819 j = pa[i]; 820 iptr += k; 821 sbuf1[j] = iptr; 822 k = w1[j]; 823 } 824 } 825 826 /* Form the outgoing messages. Initialize the header space */ 827 for (i=0; i<nrqs; i++) { 828 j = pa[i]; 829 sbuf1[j][0] = 0; 830 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 831 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 832 } 833 834 /* Parse the isrow and copy data into outbuf */ 835 for (i=0; i<ismax; i++) { 836 row2proc_i = row2proc[i]; 837 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 838 irow_i = irow[i]; 839 jmax = nrow[i]; 840 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 841 proc = row2proc_i[j]; 842 if (allrows[i]) row = j; 843 else row = irow_i[j]; 844 845 if (proc != rank) { /* copy to the outgoing buf*/ 846 ctr[proc]++; 847 *ptr[proc] = row; 848 ptr[proc]++; 849 } 850 } 851 /* Update the headers for the current IS */ 852 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 853 if ((ctr_j = ctr[j])) { 854 sbuf1_j = sbuf1[j]; 855 k = ++sbuf1_j[0]; 856 sbuf1_j[2*k] = ctr_j; 857 sbuf1_j[2*k-1] = i; 858 } 859 } 860 } 861 862 /* Now post the sends */ 863 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 864 for (i=0; i<nrqs; ++i) { 865 j = pa[i]; 866 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 867 } 868 869 /* Post Receives to capture the buffer size */ 870 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 871 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 872 rbuf2[0] = tmp + msz; 873 for (i=1; i<nrqs; ++i) { 874 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 875 } 876 for (i=0; i<nrqs; ++i) { 877 j = pa[i]; 878 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 879 } 880 881 /* Send to other procs the buf size they should allocate */ 882 /* Receive messages*/ 883 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 884 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 885 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 886 887 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 888 for (i=0; i<nrqr; ++i) { 889 req_size[i] = 0; 890 rbuf1_i = rbuf1[i]; 891 start = 2*rbuf1_i[0] + 1; 892 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 893 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 894 sbuf2_i = sbuf2[i]; 895 for (j=start; j<end; j++) { 896 row = rbuf1_i[j] - rstart; 897 ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row]; 898 sbuf2_i[j] = ncols; 899 req_size[i] += ncols; 900 } 901 req_source1[i] = r_status1[i].MPI_SOURCE; 902 /* form the header */ 903 sbuf2_i[0] = req_size[i]; 904 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 905 906 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 907 } 908 909 ierr = PetscFree(r_status1);CHKERRQ(ierr); 910 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 911 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 912 913 /* Receive messages*/ 914 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 915 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 916 917 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 918 for (i=0; i<nrqs; ++i) { 919 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 920 req_source2[i] = r_status2[i].MPI_SOURCE; 921 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 922 } 923 ierr = PetscFree(r_status2);CHKERRQ(ierr); 924 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 925 926 /* Wait on sends1 and sends2 */ 927 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 928 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 929 930 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 931 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 932 ierr = PetscFree(s_status1);CHKERRQ(ierr); 933 ierr = PetscFree(s_status2);CHKERRQ(ierr); 934 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 935 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 936 937 /* Now allocate sending buffers for a->j, and send them off */ 938 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 939 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 940 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 941 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 942 943 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 944 { 945 946 for (i=0; i<nrqr; i++) { 947 rbuf1_i = rbuf1[i]; 948 sbuf_aj_i = sbuf_aj[i]; 949 ct1 = 2*rbuf1_i[0] + 1; 950 ct2 = 0; 951 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 952 kmax = rbuf1[i][2*j]; 953 for (k=0; k<kmax; k++,ct1++) { 954 row = rbuf1_i[ct1] - rstart; 955 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 956 ncols = nzA + nzB; 957 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 958 959 /* load the column indices for this row into cols */ 960 cols = sbuf_aj_i + ct2; 961 for (l=0; l<nzB; l++) { 962 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 963 else break; 964 } 965 imark = l; 966 for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];} 967 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 968 ct2 += ncols; 969 } 970 } 971 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 972 } 973 } 974 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 975 976 /* create col map: global col of C -> local col of submatrices */ 977 #if defined(PETSC_USE_CTABLE) 978 for (i=0; i<ismax; i++) { 979 if (!allcolumns[i]) { 980 ierr = PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);CHKERRQ(ierr); 981 982 jmax = ncol[i]; 983 icol_i = icol[i]; 984 cmap_i = cmap[i]; 985 for (j=0; j<jmax; j++) { 986 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 987 } 988 } else cmap[i] = NULL; 989 } 990 #else 991 for (i=0; i<ismax; i++) { 992 if (!allcolumns[i]) { 993 ierr = PetscCalloc1(c->Nbs,&cmap[i]);CHKERRQ(ierr); 994 jmax = ncol[i]; 995 icol_i = icol[i]; 996 cmap_i = cmap[i]; 997 for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1; 998 } else cmap[i] = NULL; 999 } 1000 #endif 1001 1002 /* Create lens which is required for MatCreate... */ 1003 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1004 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1005 1006 if (ismax) { 1007 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 1008 } 1009 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1010 1011 /* Update lens from local data */ 1012 for (i=0; i<ismax; i++) { 1013 row2proc_i = row2proc[i]; 1014 jmax = nrow[i]; 1015 if (!allcolumns[i]) cmap_i = cmap[i]; 1016 irow_i = irow[i]; 1017 lens_i = lens[i]; 1018 for (j=0; j<jmax; j++) { 1019 if (allrows[i]) row = j; 1020 else row = irow_i[j]; /* global blocked row of C */ 1021 1022 proc = row2proc_i[j]; 1023 if (proc == rank) { 1024 /* Get indices from matA and then from matB */ 1025 #if defined(PETSC_USE_CTABLE) 1026 PetscInt tt; 1027 #endif 1028 row = row - rstart; 1029 nzA = a_i[row+1] - a_i[row]; 1030 nzB = b_i[row+1] - b_i[row]; 1031 cworkA = a_j + a_i[row]; 1032 cworkB = b_j + b_i[row]; 1033 1034 if (!allcolumns[i]) { 1035 #if defined(PETSC_USE_CTABLE) 1036 for (k=0; k<nzA; k++) { 1037 ierr = PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr); 1038 if (tt) lens_i[j]++; 1039 } 1040 for (k=0; k<nzB; k++) { 1041 ierr = PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr); 1042 if (tt) lens_i[j]++; 1043 } 1044 1045 #else 1046 for (k=0; k<nzA; k++) { 1047 if (cmap_i[cstart + cworkA[k]]) lens_i[j]++; 1048 } 1049 for (k=0; k<nzB; k++) { 1050 if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++; 1051 } 1052 #endif 1053 } else { /* allcolumns */ 1054 lens_i[j] = nzA + nzB; 1055 } 1056 } 1057 } 1058 } 1059 1060 /* Create row map: global row of C -> local row of submatrices */ 1061 for (i=0; i<ismax; i++) { 1062 if (!allrows[i]) { 1063 #if defined(PETSC_USE_CTABLE) 1064 ierr = PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);CHKERRQ(ierr); 1065 irow_i = irow[i]; 1066 jmax = nrow[i]; 1067 for (j=0; j<jmax; j++) { 1068 if (allrows[i]) { 1069 ierr = PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1070 } else { 1071 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1072 } 1073 } 1074 #else 1075 ierr = PetscCalloc1(c->Mbs,&rmap[i]);CHKERRQ(ierr); 1076 rmap_i = rmap[i]; 1077 irow_i = irow[i]; 1078 jmax = nrow[i]; 1079 for (j=0; j<jmax; j++) { 1080 if (allrows[i]) rmap_i[j] = j; 1081 else rmap_i[irow_i[j]] = j; 1082 } 1083 #endif 1084 } else rmap[i] = NULL; 1085 } 1086 1087 /* Update lens from offproc data */ 1088 { 1089 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1090 1091 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1092 for (tmp2=0; tmp2<nrqs; tmp2++) { 1093 sbuf1_i = sbuf1[pa[tmp2]]; 1094 jmax = sbuf1_i[0]; 1095 ct1 = 2*jmax+1; 1096 ct2 = 0; 1097 rbuf2_i = rbuf2[tmp2]; 1098 rbuf3_i = rbuf3[tmp2]; 1099 for (j=1; j<=jmax; j++) { 1100 is_no = sbuf1_i[2*j-1]; 1101 max1 = sbuf1_i[2*j]; 1102 lens_i = lens[is_no]; 1103 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1104 rmap_i = rmap[is_no]; 1105 for (k=0; k<max1; k++,ct1++) { 1106 if (allrows[is_no]) { 1107 row = sbuf1_i[ct1]; 1108 } else { 1109 #if defined(PETSC_USE_CTABLE) 1110 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1111 row--; 1112 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1113 #else 1114 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1115 #endif 1116 } 1117 max2 = rbuf2_i[ct1]; 1118 for (l=0; l<max2; l++,ct2++) { 1119 if (!allcolumns[is_no]) { 1120 #if defined(PETSC_USE_CTABLE) 1121 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1122 #else 1123 tcol = cmap_i[rbuf3_i[ct2]]; 1124 #endif 1125 if (tcol) lens_i[row]++; 1126 } else { /* allcolumns */ 1127 lens_i[row]++; /* lens_i[row] += max2 ? */ 1128 } 1129 } 1130 } 1131 } 1132 } 1133 } 1134 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1135 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1136 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 1137 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1138 1139 /* Create the submatrices */ 1140 for (i=0; i<ismax; i++) { 1141 PetscInt bs_tmp; 1142 if (ijonly) bs_tmp = 1; 1143 else bs_tmp = bs; 1144 1145 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1146 ierr = MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1147 1148 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1149 ierr = MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);CHKERRQ(ierr); 1150 ierr = MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by SBAIJ routines */ 1151 1152 /* create struct Mat_SubSppt and attached it to submat */ 1153 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 1154 subc = (Mat_SeqBAIJ*)submats[i]->data; 1155 subc->submatis1 = smat_i; 1156 1157 smat_i->destroy = submats[i]->ops->destroy; 1158 submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ; 1159 submats[i]->factortype = C->factortype; 1160 1161 smat_i->id = i; 1162 smat_i->nrqs = nrqs; 1163 smat_i->nrqr = nrqr; 1164 smat_i->rbuf1 = rbuf1; 1165 smat_i->rbuf2 = rbuf2; 1166 smat_i->rbuf3 = rbuf3; 1167 smat_i->sbuf2 = sbuf2; 1168 smat_i->req_source2 = req_source2; 1169 1170 smat_i->sbuf1 = sbuf1; 1171 smat_i->ptr = ptr; 1172 smat_i->tmp = tmp; 1173 smat_i->ctr = ctr; 1174 1175 smat_i->pa = pa; 1176 smat_i->req_size = req_size; 1177 smat_i->req_source1 = req_source1; 1178 1179 smat_i->allcolumns = allcolumns[i]; 1180 smat_i->allrows = allrows[i]; 1181 smat_i->singleis = PETSC_FALSE; 1182 smat_i->row2proc = row2proc[i]; 1183 smat_i->rmap = rmap[i]; 1184 smat_i->cmap = cmap[i]; 1185 } 1186 1187 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 1188 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 1189 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1190 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 1191 1192 /* create struct Mat_SubSppt and attached it to submat */ 1193 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 1194 submats[0]->data = (void*)smat_i; 1195 1196 smat_i->destroy = submats[0]->ops->destroy; 1197 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 1198 submats[0]->factortype = C->factortype; 1199 1200 smat_i->id = 0; 1201 smat_i->nrqs = nrqs; 1202 smat_i->nrqr = nrqr; 1203 smat_i->rbuf1 = rbuf1; 1204 smat_i->rbuf2 = rbuf2; 1205 smat_i->rbuf3 = rbuf3; 1206 smat_i->sbuf2 = sbuf2; 1207 smat_i->req_source2 = req_source2; 1208 1209 smat_i->sbuf1 = sbuf1; 1210 smat_i->ptr = ptr; 1211 smat_i->tmp = tmp; 1212 smat_i->ctr = ctr; 1213 1214 smat_i->pa = pa; 1215 smat_i->req_size = req_size; 1216 smat_i->req_source1 = req_source1; 1217 1218 smat_i->allcolumns = PETSC_FALSE; 1219 smat_i->singleis = PETSC_FALSE; 1220 smat_i->row2proc = NULL; 1221 smat_i->rmap = NULL; 1222 smat_i->cmap = NULL; 1223 } 1224 1225 1226 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1227 ierr = PetscFree(lens);CHKERRQ(ierr); 1228 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1229 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1230 1231 } /* endof scall == MAT_INITIAL_MATRIX */ 1232 1233 /* Post recv matrix values */ 1234 if (!ijonly) { 1235 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1236 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1237 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1238 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1239 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1240 for (i=0; i<nrqs; ++i) { 1241 ierr = PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);CHKERRQ(ierr); 1242 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1243 } 1244 1245 /* Allocate sending buffers for a->a, and send them off */ 1246 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1247 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1248 1249 ierr = PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);CHKERRQ(ierr); 1250 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 1251 1252 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1253 1254 for (i=0; i<nrqr; i++) { 1255 rbuf1_i = rbuf1[i]; 1256 sbuf_aa_i = sbuf_aa[i]; 1257 ct1 = 2*rbuf1_i[0]+1; 1258 ct2 = 0; 1259 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1260 kmax = rbuf1_i[2*j]; 1261 for (k=0; k<kmax; k++,ct1++) { 1262 row = rbuf1_i[ct1] - rstart; 1263 nzA = a_i[row+1] - a_i[row]; 1264 nzB = b_i[row+1] - b_i[row]; 1265 ncols = nzA + nzB; 1266 cworkB = b_j + b_i[row]; 1267 vworkA = a_a + a_i[row]*bs2; 1268 vworkB = b_a + b_i[row]*bs2; 1269 1270 /* load the column values for this row into vals*/ 1271 vals = sbuf_aa_i+ct2*bs2; 1272 for (l=0; l<nzB; l++) { 1273 if ((bmap[cworkB[l]]) < cstart) { 1274 ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1275 } else break; 1276 } 1277 imark = l; 1278 for (l=0; l<nzA; l++) { 1279 ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1280 } 1281 for (l=imark; l<nzB; l++) { 1282 ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1283 } 1284 1285 ct2 += ncols; 1286 } 1287 } 1288 ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1289 } 1290 } 1291 1292 /* Assemble the matrices */ 1293 /* First assemble the local rows */ 1294 for (i=0; i<ismax; i++) { 1295 row2proc_i = row2proc[i]; 1296 subc = (Mat_SeqBAIJ*)submats[i]->data; 1297 imat_ilen = subc->ilen; 1298 imat_j = subc->j; 1299 imat_i = subc->i; 1300 imat_a = subc->a; 1301 1302 if (!allcolumns[i]) cmap_i = cmap[i]; 1303 rmap_i = rmap[i]; 1304 irow_i = irow[i]; 1305 jmax = nrow[i]; 1306 for (j=0; j<jmax; j++) { 1307 if (allrows[i]) row = j; 1308 else row = irow_i[j]; 1309 proc = row2proc_i[j]; 1310 1311 if (proc == rank) { 1312 1313 row = row - rstart; 1314 nzA = a_i[row+1] - a_i[row]; 1315 nzB = b_i[row+1] - b_i[row]; 1316 cworkA = a_j + a_i[row]; 1317 cworkB = b_j + b_i[row]; 1318 if (!ijonly) { 1319 vworkA = a_a + a_i[row]*bs2; 1320 vworkB = b_a + b_i[row]*bs2; 1321 } 1322 1323 if (allrows[i]) { 1324 row = row+rstart; 1325 } else { 1326 #if defined(PETSC_USE_CTABLE) 1327 ierr = PetscTableFind(rmap_i,row+rstart+1,&row);CHKERRQ(ierr); 1328 row--; 1329 1330 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1331 #else 1332 row = rmap_i[row + rstart]; 1333 #endif 1334 } 1335 mat_i = imat_i[row]; 1336 if (!ijonly) mat_a = imat_a + mat_i*bs2; 1337 mat_j = imat_j + mat_i; 1338 ilen = imat_ilen[row]; 1339 1340 /* load the column indices for this row into cols*/ 1341 if (!allcolumns[i]) { 1342 for (l=0; l<nzB; l++) { 1343 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1344 #if defined(PETSC_USE_CTABLE) 1345 ierr = PetscTableFind(cmap_i,ctmp+1,&tcol);CHKERRQ(ierr); 1346 if (tcol) { 1347 #else 1348 if ((tcol = cmap_i[ctmp])) { 1349 #endif 1350 *mat_j++ = tcol - 1; 1351 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1352 mat_a += bs2; 1353 ilen++; 1354 } 1355 } else break; 1356 } 1357 imark = l; 1358 for (l=0; l<nzA; l++) { 1359 #if defined(PETSC_USE_CTABLE) 1360 ierr = PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr); 1361 if (tcol) { 1362 #else 1363 if ((tcol = cmap_i[cstart + cworkA[l]])) { 1364 #endif 1365 *mat_j++ = tcol - 1; 1366 if (!ijonly) { 1367 ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1368 mat_a += bs2; 1369 } 1370 ilen++; 1371 } 1372 } 1373 for (l=imark; l<nzB; l++) { 1374 #if defined(PETSC_USE_CTABLE) 1375 ierr = PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr); 1376 if (tcol) { 1377 #else 1378 if ((tcol = cmap_i[bmap[cworkB[l]]])) { 1379 #endif 1380 *mat_j++ = tcol - 1; 1381 if (!ijonly) { 1382 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1383 mat_a += bs2; 1384 } 1385 ilen++; 1386 } 1387 } 1388 } else { /* allcolumns */ 1389 for (l=0; l<nzB; l++) { 1390 if ((ctmp = bmap[cworkB[l]]) < cstart) { 1391 *mat_j++ = ctmp; 1392 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1393 mat_a += bs2; 1394 ilen++; 1395 } else break; 1396 } 1397 imark = l; 1398 for (l=0; l<nzA; l++) { 1399 *mat_j++ = cstart+cworkA[l]; 1400 if (!ijonly) { 1401 ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1402 mat_a += bs2; 1403 } 1404 ilen++; 1405 } 1406 for (l=imark; l<nzB; l++) { 1407 *mat_j++ = bmap[cworkB[l]]; 1408 if (!ijonly) { 1409 ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1410 mat_a += bs2; 1411 } 1412 ilen++; 1413 } 1414 } 1415 imat_ilen[row] = ilen; 1416 } 1417 } 1418 } 1419 1420 /* Now assemble the off proc rows */ 1421 if (!ijonly) { 1422 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 1423 } 1424 for (tmp2=0; tmp2<nrqs; tmp2++) { 1425 sbuf1_i = sbuf1[pa[tmp2]]; 1426 jmax = sbuf1_i[0]; 1427 ct1 = 2*jmax + 1; 1428 ct2 = 0; 1429 rbuf2_i = rbuf2[tmp2]; 1430 rbuf3_i = rbuf3[tmp2]; 1431 if (!ijonly) rbuf4_i = rbuf4[tmp2]; 1432 for (j=1; j<=jmax; j++) { 1433 is_no = sbuf1_i[2*j-1]; 1434 rmap_i = rmap[is_no]; 1435 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1436 subc = (Mat_SeqBAIJ*)submats[is_no]->data; 1437 imat_ilen = subc->ilen; 1438 imat_j = subc->j; 1439 imat_i = subc->i; 1440 if (!ijonly) imat_a = subc->a; 1441 max1 = sbuf1_i[2*j]; 1442 for (k=0; k<max1; k++,ct1++) { /* for each recved block row */ 1443 row = sbuf1_i[ct1]; 1444 1445 if (allrows[is_no]) { 1446 row = sbuf1_i[ct1]; 1447 } else { 1448 #if defined(PETSC_USE_CTABLE) 1449 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1450 row--; 1451 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1452 #else 1453 row = rmap_i[row]; 1454 #endif 1455 } 1456 ilen = imat_ilen[row]; 1457 mat_i = imat_i[row]; 1458 if (!ijonly) mat_a = imat_a + mat_i*bs2; 1459 mat_j = imat_j + mat_i; 1460 max2 = rbuf2_i[ct1]; 1461 if (!allcolumns[is_no]) { 1462 for (l=0; l<max2; l++,ct2++) { 1463 #if defined(PETSC_USE_CTABLE) 1464 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1465 #else 1466 tcol = cmap_i[rbuf3_i[ct2]]; 1467 #endif 1468 if (tcol) { 1469 *mat_j++ = tcol - 1; 1470 if (!ijonly) { 1471 ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1472 mat_a += bs2; 1473 } 1474 ilen++; 1475 } 1476 } 1477 } else { /* allcolumns */ 1478 for (l=0; l<max2; l++,ct2++) { 1479 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1480 if (!ijonly) { 1481 ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1482 mat_a += bs2; 1483 } 1484 ilen++; 1485 } 1486 } 1487 imat_ilen[row] = ilen; 1488 } 1489 } 1490 } 1491 1492 if (!iscsorted) { /* sort column indices of the rows */ 1493 MatScalar *work; 1494 1495 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr); 1496 for (i=0; i<ismax; i++) { 1497 subc = (Mat_SeqBAIJ*)submats[i]->data; 1498 imat_ilen = subc->ilen; 1499 imat_j = subc->j; 1500 imat_i = subc->i; 1501 if (!ijonly) imat_a = subc->a; 1502 if (allcolumns[i]) continue; 1503 1504 jmax = nrow[i]; 1505 for (j=0; j<jmax; j++) { 1506 mat_i = imat_i[j]; 1507 mat_j = imat_j + mat_i; 1508 ilen = imat_ilen[j]; 1509 if (ijonly) { 1510 ierr = PetscSortInt(ilen,mat_j);CHKERRQ(ierr); 1511 } else { 1512 mat_a = imat_a + mat_i*bs2; 1513 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr); 1514 } 1515 } 1516 } 1517 ierr = PetscFree(work);CHKERRQ(ierr); 1518 } 1519 1520 if (!ijonly) { 1521 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1522 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1523 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1524 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1525 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1526 } 1527 1528 /* Restore the indices */ 1529 for (i=0; i<ismax; i++) { 1530 if (!allrows[i]) { 1531 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1532 } 1533 if (!allcolumns[i]) { 1534 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1535 } 1536 } 1537 1538 for (i=0; i<ismax; i++) { 1539 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1540 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1541 } 1542 1543 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 1544 ierr = PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);CHKERRQ(ierr); 1545 1546 if (!ijonly) { 1547 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1548 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1549 1550 for (i=0; i<nrqs; ++i) { 1551 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1552 } 1553 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1554 } 1555 c->ijonly = PETSC_FALSE; /* set back to the default */ 1556 PetscFunctionReturn(0); 1557 } 1558