1632d0f97SHong Zhang 2632d0f97SHong Zhang /* 3632d0f97SHong Zhang Routines to compute overlapping regions of a parallel MPI matrix. 4632d0f97SHong Zhang Used for finding submatrices that were shared across processors. 5632d0f97SHong Zhang */ 6c6db04a5SJed Brown #include <../src/mat/impls/sbaij/mpi/mpisbaij.h> 7c6db04a5SJed Brown #include <petscbt.h> 8632d0f97SHong Zhang 913f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat,PetscInt,IS*); 1013f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat,PetscInt*,PetscInt,PetscInt*,PetscBT*); 11632d0f97SHong Zhang 12db41cccfSHong Zhang /* -------------------------------------------------------------------------*/ 13db41cccfSHong Zhang /* This code is modified from MatGetSubMatrices_MPIBAIJ_local() */ 14db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 15db41cccfSHong Zhang #undef __FUNCT__ 16db41cccfSHong Zhang #define __FUNCT__ "PetscGetProc_ijonly" 17db41cccfSHong Zhang PetscErrorCode PetscGetProc_ijonly(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank) 18db41cccfSHong Zhang { 19db41cccfSHong Zhang PetscInt nGlobalNd = proc_gnode[size]; 20db41cccfSHong Zhang PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5))); 21db41cccfSHong Zhang 22db41cccfSHong Zhang PetscFunctionBegin; 23db41cccfSHong Zhang if (fproc > size) fproc = size; 24db41cccfSHong Zhang while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) { 25db41cccfSHong Zhang if (row < proc_gnode[fproc]) fproc--; 26db41cccfSHong Zhang else fproc++; 27db41cccfSHong Zhang } 28db41cccfSHong Zhang *rank = fproc; 29db41cccfSHong Zhang PetscFunctionReturn(0); 30db41cccfSHong Zhang } 31db41cccfSHong Zhang #endif 32db41cccfSHong Zhang 33db41cccfSHong Zhang #undef __FUNCT__ 34db41cccfSHong Zhang #define __FUNCT__ "MatGetSubMatrices_MPIBAIJ_local_ijonly" 35db41cccfSHong Zhang PetscErrorCode MatGetSubMatrices_MPIBAIJ_local_ijonly(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 36db41cccfSHong Zhang { 37db41cccfSHong Zhang Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data; 38db41cccfSHong Zhang Mat A = c->A; 39db41cccfSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat; 40db41cccfSHong Zhang const PetscInt **irow,**icol,*irow_i; 41db41cccfSHong Zhang PetscInt *nrow,*ncol,*w3,*w4,start; 42db41cccfSHong Zhang PetscErrorCode ierr; 43db41cccfSHong Zhang PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc; 44db41cccfSHong Zhang PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row; 45db41cccfSHong Zhang PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 46db41cccfSHong Zhang PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 47db41cccfSHong Zhang PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 48db41cccfSHong Zhang PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 49db41cccfSHong Zhang PetscInt *a_j=a->j,*b_j=b->j,*cworkA,*cworkB; //bs=C->rmap->bs,bs2=c->bs2 50db41cccfSHong Zhang PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark; 51db41cccfSHong Zhang PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs; 52db41cccfSHong Zhang MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3; 53db41cccfSHong Zhang //MPI_Request *r_waits4,*s_waits4; 54db41cccfSHong Zhang MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3; 55db41cccfSHong Zhang //MPI_Status *r_status4,*s_status4; 56db41cccfSHong Zhang MPI_Comm comm; 57db41cccfSHong Zhang //MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB; 58db41cccfSHong Zhang //MatScalar *a_a=a->a,*b_a=b->a; 59db41cccfSHong Zhang PetscBool flag; 60db41cccfSHong Zhang PetscMPIInt *onodes1,*olengths1; 61db41cccfSHong Zhang 62db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 63db41cccfSHong Zhang PetscInt tt; 64db41cccfSHong Zhang PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1; 65db41cccfSHong Zhang #else 66db41cccfSHong Zhang PetscInt **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs; 67db41cccfSHong Zhang #endif 68db41cccfSHong Zhang 69db41cccfSHong Zhang PetscFunctionBegin; 70db41cccfSHong Zhang comm = ((PetscObject)C)->comm; 71db41cccfSHong Zhang tag0 = ((PetscObject)C)->tag; 72db41cccfSHong Zhang size = c->size; 73db41cccfSHong Zhang rank = c->rank; 74db41cccfSHong Zhang 75db41cccfSHong Zhang /* Get some new tags to keep the communication clean */ 76db41cccfSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 77db41cccfSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 78db41cccfSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 79db41cccfSHong Zhang 80db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE) 81db41cccfSHong Zhang ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 82db41cccfSHong Zhang #else 83db41cccfSHong Zhang ierr = PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);CHKERRQ(ierr); 84db41cccfSHong Zhang /* Create hash table for the mapping :row -> proc*/ 85db41cccfSHong Zhang for (i=0,j=0; i<size; i++) { 86db41cccfSHong Zhang jmax = c->rowners[i+1]; 87db41cccfSHong Zhang for (; j<jmax; j++) { 88db41cccfSHong Zhang rtable[j] = i; 89db41cccfSHong Zhang } 90db41cccfSHong Zhang } 91db41cccfSHong Zhang #endif 92db41cccfSHong Zhang 93db41cccfSHong Zhang for (i=0; i<ismax; i++) { 94db41cccfSHong Zhang ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 95db41cccfSHong Zhang ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 96db41cccfSHong Zhang ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 97db41cccfSHong Zhang ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 98db41cccfSHong Zhang } 99db41cccfSHong Zhang 100db41cccfSHong Zhang /* evaluate communication - mesg to who,length of mesg,and buffer space 101db41cccfSHong Zhang required. Based on this, buffers are allocated, and data copied into them*/ 102db41cccfSHong Zhang ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);CHKERRQ(ierr); 103db41cccfSHong Zhang ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 104db41cccfSHong Zhang ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 105db41cccfSHong Zhang ierr = PetscMemzero(w3,size*sizeof(PetscInt));CHKERRQ(ierr); 106db41cccfSHong Zhang for (i=0; i<ismax; i++) { 107db41cccfSHong Zhang ierr = PetscMemzero(w4,size*sizeof(PetscInt));CHKERRQ(ierr); /* initialise work vector*/ 108db41cccfSHong Zhang jmax = nrow[i]; 109db41cccfSHong Zhang irow_i = irow[i]; 110db41cccfSHong Zhang for (j=0; j<jmax; j++) { 111db41cccfSHong Zhang row = irow_i[j]; 112db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 113db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 114db41cccfSHong Zhang #else 115db41cccfSHong Zhang proc = rtable[row]; 116db41cccfSHong Zhang #endif 117db41cccfSHong Zhang w4[proc]++; 118db41cccfSHong Zhang } 119db41cccfSHong Zhang for (j=0; j<size; j++) { 120db41cccfSHong Zhang if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 121db41cccfSHong Zhang } 122db41cccfSHong Zhang } 123db41cccfSHong Zhang 124db41cccfSHong Zhang nrqs = 0; /* no of outgoing messages */ 125db41cccfSHong Zhang msz = 0; /* total mesg length for all proc */ 126db41cccfSHong Zhang w1[rank] = 0; /* no mesg sent to intself */ 127db41cccfSHong Zhang w3[rank] = 0; 128db41cccfSHong Zhang for (i=0; i<size; i++) { 129db41cccfSHong Zhang if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 130db41cccfSHong Zhang } 131db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 132db41cccfSHong Zhang for (i=0,j=0; i<size; i++) { 133db41cccfSHong Zhang if (w1[i]) { pa[j] = i; j++; } 134db41cccfSHong Zhang } 135db41cccfSHong Zhang 136db41cccfSHong Zhang /* Each message would have a header = 1 + 2*(no of IS) + data */ 137db41cccfSHong Zhang for (i=0; i<nrqs; i++) { 138db41cccfSHong Zhang j = pa[i]; 139db41cccfSHong Zhang w1[j] += w2[j] + 2* w3[j]; 140db41cccfSHong Zhang msz += w1[j]; 141db41cccfSHong Zhang } 142db41cccfSHong Zhang 143db41cccfSHong Zhang /* Determine the number of messages to expect, their lengths, from from-ids */ 144db41cccfSHong Zhang ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 145db41cccfSHong Zhang ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 146db41cccfSHong Zhang 147db41cccfSHong Zhang /* Now post the Irecvs corresponding to these messages */ 148db41cccfSHong Zhang ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 149db41cccfSHong Zhang 150db41cccfSHong Zhang ierr = PetscFree(onodes1);CHKERRQ(ierr); 151db41cccfSHong Zhang ierr = PetscFree(olengths1);CHKERRQ(ierr); 152db41cccfSHong Zhang 153db41cccfSHong Zhang /* Allocate Memory for outgoing messages */ 154db41cccfSHong Zhang ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 155db41cccfSHong Zhang ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 156db41cccfSHong Zhang ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 157db41cccfSHong Zhang { 158db41cccfSHong Zhang PetscInt *iptr = tmp,ict = 0; 159db41cccfSHong Zhang for (i=0; i<nrqs; i++) { 160db41cccfSHong Zhang j = pa[i]; 161db41cccfSHong Zhang iptr += ict; 162db41cccfSHong Zhang sbuf1[j] = iptr; 163db41cccfSHong Zhang ict = w1[j]; 164db41cccfSHong Zhang } 165db41cccfSHong Zhang } 166db41cccfSHong Zhang 167db41cccfSHong Zhang /* Form the outgoing messages */ 168db41cccfSHong Zhang /* Initialise the header space */ 169db41cccfSHong Zhang for (i=0; i<nrqs; i++) { 170db41cccfSHong Zhang j = pa[i]; 171db41cccfSHong Zhang sbuf1[j][0] = 0; 172db41cccfSHong Zhang ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 173db41cccfSHong Zhang ptr[j] = sbuf1[j] + 2*w3[j] + 1; 174db41cccfSHong Zhang } 175db41cccfSHong Zhang 176db41cccfSHong Zhang /* Parse the isrow and copy data into outbuf */ 177db41cccfSHong Zhang for (i=0; i<ismax; i++) { 178db41cccfSHong Zhang ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 179db41cccfSHong Zhang irow_i = irow[i]; 180db41cccfSHong Zhang jmax = nrow[i]; 181db41cccfSHong Zhang for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 182db41cccfSHong Zhang row = irow_i[j]; 183db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 184db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 185db41cccfSHong Zhang #else 186db41cccfSHong Zhang proc = rtable[row]; 187db41cccfSHong Zhang #endif 188db41cccfSHong Zhang if (proc != rank) { /* copy to the outgoing buf*/ 189db41cccfSHong Zhang ctr[proc]++; 190db41cccfSHong Zhang *ptr[proc] = row; 191db41cccfSHong Zhang ptr[proc]++; 192db41cccfSHong Zhang } 193db41cccfSHong Zhang } 194db41cccfSHong Zhang /* Update the headers for the current IS */ 195db41cccfSHong Zhang for (j=0; j<size; j++) { /* Can Optimise this loop too */ 196db41cccfSHong Zhang if ((ctr_j = ctr[j])) { 197db41cccfSHong Zhang sbuf1_j = sbuf1[j]; 198db41cccfSHong Zhang k = ++sbuf1_j[0]; 199db41cccfSHong Zhang sbuf1_j[2*k] = ctr_j; 200db41cccfSHong Zhang sbuf1_j[2*k-1] = i; 201db41cccfSHong Zhang } 202db41cccfSHong Zhang } 203db41cccfSHong Zhang } 204db41cccfSHong Zhang 205db41cccfSHong Zhang /* Now post the sends */ 206db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 207db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 208db41cccfSHong Zhang j = pa[i]; 209db41cccfSHong Zhang ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 210db41cccfSHong Zhang } 211db41cccfSHong Zhang 212db41cccfSHong Zhang /* Post Recieves to capture the buffer size */ 213db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 214db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 215db41cccfSHong Zhang rbuf2[0] = tmp + msz; 216db41cccfSHong Zhang for (i=1; i<nrqs; ++i) { 217db41cccfSHong Zhang j = pa[i]; 218db41cccfSHong Zhang rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 219db41cccfSHong Zhang } 220db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 221db41cccfSHong Zhang j = pa[i]; 222db41cccfSHong Zhang ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 223db41cccfSHong Zhang } 224db41cccfSHong Zhang 225db41cccfSHong Zhang /* Send to other procs the buf size they should allocate */ 226db41cccfSHong Zhang 227db41cccfSHong Zhang /* Receive messages*/ 228db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 229db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 230db41cccfSHong Zhang ierr = PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 231db41cccfSHong Zhang { 232db41cccfSHong Zhang Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data; 233db41cccfSHong Zhang PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i; 234db41cccfSHong Zhang 235db41cccfSHong Zhang for (i=0; i<nrqr; ++i) { 236db41cccfSHong Zhang ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 237db41cccfSHong Zhang req_size[idex] = 0; 238db41cccfSHong Zhang rbuf1_i = rbuf1[idex]; 239db41cccfSHong Zhang start = 2*rbuf1_i[0] + 1; 240db41cccfSHong Zhang ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 241db41cccfSHong Zhang ierr = PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 242db41cccfSHong Zhang sbuf2_i = sbuf2[idex]; 243db41cccfSHong Zhang for (j=start; j<end; j++) { 244db41cccfSHong Zhang id = rbuf1_i[j] - rstart; 245db41cccfSHong Zhang ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 246db41cccfSHong Zhang sbuf2_i[j] = ncols; 247db41cccfSHong Zhang req_size[idex] += ncols; 248db41cccfSHong Zhang } 249db41cccfSHong Zhang req_source[idex] = r_status1[i].MPI_SOURCE; 250db41cccfSHong Zhang /* form the header */ 251db41cccfSHong Zhang sbuf2_i[0] = req_size[idex]; 252db41cccfSHong Zhang for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 253db41cccfSHong Zhang ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 254db41cccfSHong Zhang } 255db41cccfSHong Zhang } 256db41cccfSHong Zhang ierr = PetscFree(r_status1);CHKERRQ(ierr); 257db41cccfSHong Zhang ierr = PetscFree(r_waits1);CHKERRQ(ierr); 258db41cccfSHong Zhang 259db41cccfSHong Zhang /* recv buffer sizes */ 260db41cccfSHong Zhang /* Receive messages*/ 261db41cccfSHong Zhang 262db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 263db41cccfSHong Zhang //ierr = PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);CHKERRQ(ierr); 264db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 265db41cccfSHong Zhang //ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 266db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 267db41cccfSHong Zhang 268db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 269db41cccfSHong Zhang ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 270db41cccfSHong Zhang ierr = PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 271db41cccfSHong Zhang //ierr = PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);CHKERRQ(ierr); 272db41cccfSHong Zhang ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 273db41cccfSHong Zhang //ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 274db41cccfSHong Zhang } 275db41cccfSHong Zhang ierr = PetscFree(r_status2);CHKERRQ(ierr); 276db41cccfSHong Zhang ierr = PetscFree(r_waits2);CHKERRQ(ierr); 277db41cccfSHong Zhang 278db41cccfSHong Zhang /* Wait on sends1 and sends2 */ 279db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 280db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 281db41cccfSHong Zhang 282db41cccfSHong Zhang if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 283db41cccfSHong Zhang if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 284db41cccfSHong Zhang ierr = PetscFree(s_status1);CHKERRQ(ierr); 285db41cccfSHong Zhang ierr = PetscFree(s_status2);CHKERRQ(ierr); 286db41cccfSHong Zhang ierr = PetscFree(s_waits1);CHKERRQ(ierr); 287db41cccfSHong Zhang ierr = PetscFree(s_waits2);CHKERRQ(ierr); 288db41cccfSHong Zhang 289db41cccfSHong Zhang /* Now allocate buffers for a->j, and send them off */ 290db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 291db41cccfSHong Zhang for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 292db41cccfSHong Zhang ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 293db41cccfSHong Zhang for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 294db41cccfSHong Zhang 295db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 296db41cccfSHong Zhang { 297db41cccfSHong Zhang for (i=0; i<nrqr; i++) { 298db41cccfSHong Zhang rbuf1_i = rbuf1[i]; 299db41cccfSHong Zhang sbuf_aj_i = sbuf_aj[i]; 300db41cccfSHong Zhang ct1 = 2*rbuf1_i[0] + 1; 301db41cccfSHong Zhang ct2 = 0; 302db41cccfSHong Zhang for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 303db41cccfSHong Zhang kmax = rbuf1[i][2*j]; 304db41cccfSHong Zhang for (k=0; k<kmax; k++,ct1++) { 305db41cccfSHong Zhang row = rbuf1_i[ct1] - rstart; 306db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 307db41cccfSHong Zhang ncols = nzA + nzB; 308db41cccfSHong Zhang cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 309db41cccfSHong Zhang 310db41cccfSHong Zhang /* load the column indices for this row into cols*/ 311db41cccfSHong Zhang cols = sbuf_aj_i + ct2; 312db41cccfSHong Zhang for (l=0; l<nzB; l++) { 313db41cccfSHong Zhang if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp; 314db41cccfSHong Zhang else break; 315db41cccfSHong Zhang } 316db41cccfSHong Zhang imark = l; 317db41cccfSHong Zhang for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l]; 318db41cccfSHong Zhang for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]]; 319db41cccfSHong Zhang ct2 += ncols; 320db41cccfSHong Zhang } 321db41cccfSHong Zhang } 322db41cccfSHong Zhang ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 323db41cccfSHong Zhang } 324db41cccfSHong Zhang } 325db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 326db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 327db41cccfSHong Zhang 328db41cccfSHong Zhang #if defined(MV) 329db41cccfSHong Zhang /* Allocate buffers for a->a, and send them off */ 330db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);CHKERRQ(ierr); 331db41cccfSHong Zhang for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 332db41cccfSHong Zhang ierr = PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);CHKERRQ(ierr); 333db41cccfSHong Zhang for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2; 334db41cccfSHong Zhang 335db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 336db41cccfSHong Zhang { 337db41cccfSHong Zhang for (i=0; i<nrqr; i++) { 338db41cccfSHong Zhang rbuf1_i = rbuf1[i]; 339db41cccfSHong Zhang sbuf_aa_i = sbuf_aa[i]; 340db41cccfSHong Zhang ct1 = 2*rbuf1_i[0]+1; 341db41cccfSHong Zhang ct2 = 0; 342db41cccfSHong Zhang for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 343db41cccfSHong Zhang kmax = rbuf1_i[2*j]; 344db41cccfSHong Zhang for (k=0; k<kmax; k++,ct1++) { 345db41cccfSHong Zhang row = rbuf1_i[ct1] - rstart; 346db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 347db41cccfSHong Zhang ncols = nzA + nzB; 348db41cccfSHong Zhang cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 349db41cccfSHong Zhang vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2; 350db41cccfSHong Zhang 351db41cccfSHong Zhang /* load the column values for this row into vals*/ 352db41cccfSHong Zhang vals = sbuf_aa_i+ct2*bs2; 353db41cccfSHong Zhang for (l=0; l<nzB; l++) { 354db41cccfSHong Zhang if ((bmap[cworkB[l]]) < cstart) { 355db41cccfSHong Zhang ierr = PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 356db41cccfSHong Zhang } 357db41cccfSHong Zhang else break; 358db41cccfSHong Zhang } 359db41cccfSHong Zhang imark = l; 360db41cccfSHong Zhang for (l=0; l<nzA; l++) { 361db41cccfSHong Zhang ierr = PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 362db41cccfSHong Zhang } 363db41cccfSHong Zhang for (l=imark; l<nzB; l++) { 364db41cccfSHong Zhang ierr = PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 365db41cccfSHong Zhang } 366db41cccfSHong Zhang ct2 += ncols; 367db41cccfSHong Zhang } 368db41cccfSHong Zhang } 369db41cccfSHong Zhang ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 370db41cccfSHong Zhang } 371db41cccfSHong Zhang } 372db41cccfSHong Zhang ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 373db41cccfSHong Zhang ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 374db41cccfSHong Zhang #endif 375db41cccfSHong Zhang ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 376db41cccfSHong Zhang ierr = PetscFree(rbuf1);CHKERRQ(ierr); 377db41cccfSHong Zhang 378db41cccfSHong Zhang /* Form the matrix */ 379db41cccfSHong Zhang /* create col map */ 380db41cccfSHong Zhang { 381db41cccfSHong Zhang const PetscInt *icol_i; 382db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 383db41cccfSHong Zhang /* Create row map*/ 384db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);CHKERRQ(ierr); 385db41cccfSHong Zhang for (i=0; i<ismax; i++) { 386db41cccfSHong Zhang ierr = PetscTableCreate(ncol[i]+1,&colmaps[i]);CHKERRQ(ierr); 387db41cccfSHong Zhang } 388db41cccfSHong Zhang #else 389db41cccfSHong Zhang ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 390db41cccfSHong Zhang ierr = PetscMalloc(ismax*c->Nbs*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 391db41cccfSHong Zhang for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; } 392db41cccfSHong Zhang #endif 393db41cccfSHong Zhang for (i=0; i<ismax; i++) { 394db41cccfSHong Zhang jmax = ncol[i]; 395db41cccfSHong Zhang icol_i = icol[i]; 396db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 397db41cccfSHong Zhang lcol1_gcol1 = colmaps[i]; 398db41cccfSHong Zhang for (j=0; j<jmax; j++) { 399db41cccfSHong Zhang ierr = PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);CHKERRQ(ierr); 400db41cccfSHong Zhang } 401db41cccfSHong Zhang #else 402db41cccfSHong Zhang cmap_i = cmap[i]; 403db41cccfSHong Zhang for (j=0; j<jmax; j++) { 404db41cccfSHong Zhang cmap_i[icol_i[j]] = j+1; 405db41cccfSHong Zhang } 406db41cccfSHong Zhang #endif 407db41cccfSHong Zhang } 408db41cccfSHong Zhang } 409db41cccfSHong Zhang 410db41cccfSHong Zhang /* Create lens which is required for MatCreate... */ 411db41cccfSHong Zhang for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 412db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);CHKERRQ(ierr); 413db41cccfSHong Zhang lens[0] = (PetscInt*)(lens + ismax); 414db41cccfSHong Zhang ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 415db41cccfSHong Zhang for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 416db41cccfSHong Zhang 417db41cccfSHong Zhang /* Update lens from local data */ 418db41cccfSHong Zhang for (i=0; i<ismax; i++) { 419db41cccfSHong Zhang jmax = nrow[i]; 420db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 421db41cccfSHong Zhang lcol1_gcol1 = colmaps[i]; 422db41cccfSHong Zhang #else 423db41cccfSHong Zhang cmap_i = cmap[i]; 424db41cccfSHong Zhang #endif 425db41cccfSHong Zhang irow_i = irow[i]; 426db41cccfSHong Zhang lens_i = lens[i]; 427db41cccfSHong Zhang for (j=0; j<jmax; j++) { 428db41cccfSHong Zhang row = irow_i[j]; 429db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 430db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 431db41cccfSHong Zhang #else 432db41cccfSHong Zhang proc = rtable[row]; 433db41cccfSHong Zhang #endif 434db41cccfSHong Zhang if (proc == rank) { 435db41cccfSHong Zhang /* Get indices from matA and then from matB */ 436db41cccfSHong Zhang row = row - rstart; 437db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 438db41cccfSHong Zhang cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 439db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 440db41cccfSHong Zhang for (k=0; k<nzA; k++) { 441db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr); 442db41cccfSHong Zhang if (tt) { lens_i[j]++; } 443db41cccfSHong Zhang } 444db41cccfSHong Zhang for (k=0; k<nzB; k++) { 445db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr); 446db41cccfSHong Zhang if (tt) { lens_i[j]++; } 447db41cccfSHong Zhang } 448db41cccfSHong Zhang #else 449db41cccfSHong Zhang for (k=0; k<nzA; k++) { 450db41cccfSHong Zhang if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; } 451db41cccfSHong Zhang } 452db41cccfSHong Zhang for (k=0; k<nzB; k++) { 453db41cccfSHong Zhang if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; } 454db41cccfSHong Zhang } 455db41cccfSHong Zhang #endif 456db41cccfSHong Zhang } 457db41cccfSHong Zhang } 458db41cccfSHong Zhang } 459db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 460db41cccfSHong Zhang /* Create row map*/ 461db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);CHKERRQ(ierr); 462db41cccfSHong Zhang for (i=0; i<ismax; i++){ 463db41cccfSHong Zhang ierr = PetscTableCreate(nrow[i]+1,&rowmaps[i]);CHKERRQ(ierr); 464db41cccfSHong Zhang } 465db41cccfSHong Zhang #else 466db41cccfSHong Zhang /* Create row map*/ 467db41cccfSHong Zhang ierr = PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);CHKERRQ(ierr); 468db41cccfSHong Zhang rmap[0] = (PetscInt*)(rmap + ismax); 469db41cccfSHong Zhang ierr = PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));CHKERRQ(ierr); 470db41cccfSHong Zhang for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;} 471db41cccfSHong Zhang #endif 472db41cccfSHong Zhang for (i=0; i<ismax; i++) { 473db41cccfSHong Zhang irow_i = irow[i]; 474db41cccfSHong Zhang jmax = nrow[i]; 475db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 476db41cccfSHong Zhang lrow1_grow1 = rowmaps[i]; 477db41cccfSHong Zhang for (j=0; j<jmax; j++) { 478db41cccfSHong Zhang ierr = PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);CHKERRQ(ierr); 479db41cccfSHong Zhang } 480db41cccfSHong Zhang #else 481db41cccfSHong Zhang rmap_i = rmap[i]; 482db41cccfSHong Zhang for (j=0; j<jmax; j++) { 483db41cccfSHong Zhang rmap_i[irow_i[j]] = j; 484db41cccfSHong Zhang } 485db41cccfSHong Zhang #endif 486db41cccfSHong Zhang } 487db41cccfSHong Zhang 488db41cccfSHong Zhang /* Update lens from offproc data */ 489db41cccfSHong Zhang { 490db41cccfSHong Zhang PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 491db41cccfSHong Zhang PetscMPIInt ii; 492db41cccfSHong Zhang 493db41cccfSHong Zhang for (tmp2=0; tmp2<nrqs; tmp2++) { 494db41cccfSHong Zhang ierr = MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);CHKERRQ(ierr); 495db41cccfSHong Zhang idex = pa[ii]; 496db41cccfSHong Zhang sbuf1_i = sbuf1[idex]; 497db41cccfSHong Zhang jmax = sbuf1_i[0]; 498db41cccfSHong Zhang ct1 = 2*jmax+1; 499db41cccfSHong Zhang ct2 = 0; 500db41cccfSHong Zhang rbuf2_i = rbuf2[ii]; 501db41cccfSHong Zhang rbuf3_i = rbuf3[ii]; 502db41cccfSHong Zhang for (j=1; j<=jmax; j++) { 503db41cccfSHong Zhang is_no = sbuf1_i[2*j-1]; 504db41cccfSHong Zhang max1 = sbuf1_i[2*j]; 505db41cccfSHong Zhang lens_i = lens[is_no]; 506db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 507db41cccfSHong Zhang lcol1_gcol1 = colmaps[is_no]; 508db41cccfSHong Zhang lrow1_grow1 = rowmaps[is_no]; 509db41cccfSHong Zhang #else 510db41cccfSHong Zhang cmap_i = cmap[is_no]; 511db41cccfSHong Zhang rmap_i = rmap[is_no]; 512db41cccfSHong Zhang #endif 513db41cccfSHong Zhang for (k=0; k<max1; k++,ct1++) { 514db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 515db41cccfSHong Zhang ierr = PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 516db41cccfSHong Zhang row--; 517db41cccfSHong Zhang if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 518db41cccfSHong Zhang #else 519db41cccfSHong Zhang row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 520db41cccfSHong Zhang #endif 521db41cccfSHong Zhang max2 = rbuf2_i[ct1]; 522db41cccfSHong Zhang for (l=0; l<max2; l++,ct2++) { 523db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 524db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);CHKERRQ(ierr); 525db41cccfSHong Zhang if (tt) { 526db41cccfSHong Zhang lens_i[row]++; 527db41cccfSHong Zhang } 528db41cccfSHong Zhang #else 529db41cccfSHong Zhang if (cmap_i[rbuf3_i[ct2]]) { 530db41cccfSHong Zhang lens_i[row]++; 531db41cccfSHong Zhang } 532db41cccfSHong Zhang #endif 533db41cccfSHong Zhang } 534db41cccfSHong Zhang } 535db41cccfSHong Zhang } 536db41cccfSHong Zhang } 537db41cccfSHong Zhang } 538db41cccfSHong Zhang ierr = PetscFree(r_status3);CHKERRQ(ierr); 539db41cccfSHong Zhang ierr = PetscFree(r_waits3);CHKERRQ(ierr); 540db41cccfSHong Zhang if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 541db41cccfSHong Zhang ierr = PetscFree(s_status3);CHKERRQ(ierr); 542db41cccfSHong Zhang ierr = PetscFree(s_waits3);CHKERRQ(ierr); 543db41cccfSHong Zhang 544db41cccfSHong Zhang /* Create the submatrices */ 545db41cccfSHong Zhang if (scall == MAT_REUSE_MATRIX) { 546db41cccfSHong Zhang /* 547db41cccfSHong Zhang Assumes new rows are same length as the old rows, hence bug! 548db41cccfSHong Zhang */ 549db41cccfSHong Zhang for (i=0; i<ismax; i++) { 550db41cccfSHong Zhang mat = (Mat_SeqBAIJ *)(submats[i]->data); 551db41cccfSHong Zhang //if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 552db41cccfSHong Zhang if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 553db41cccfSHong Zhang ierr = PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr); 554db41cccfSHong Zhang if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros"); 555db41cccfSHong Zhang /* Initial matrix as if empty */ 556db41cccfSHong Zhang ierr = PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));CHKERRQ(ierr); 557db41cccfSHong Zhang submats[i]->factortype = C->factortype; 558db41cccfSHong Zhang } 559db41cccfSHong Zhang } else { 560db41cccfSHong Zhang for (i=0; i<ismax; i++) { 561db41cccfSHong Zhang ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 562db41cccfSHong Zhang //ierr = MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);CHKERRQ(ierr); 563db41cccfSHong Zhang ierr = MatSetSizes(submats[i],nrow[i],ncol[i],nrow[i],ncol[i]);CHKERRQ(ierr); 564db41cccfSHong Zhang ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 565db41cccfSHong Zhang ierr = MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr); 566db41cccfSHong Zhang //ierr = MatSeqBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr); 567db41cccfSHong Zhang ierr = MatSeqSBAIJSetPreallocation(submats[i],1,0,lens[i]);CHKERRQ(ierr); 568db41cccfSHong Zhang //ierr = MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by MatGetSubMatrices_MPISBAIJ()*/ 569db41cccfSHong Zhang } 570db41cccfSHong Zhang } 571db41cccfSHong Zhang 572db41cccfSHong Zhang /* Assemble the matrices */ 573db41cccfSHong Zhang /* First assemble the local rows */ 574db41cccfSHong Zhang { 575db41cccfSHong Zhang PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i; 576db41cccfSHong Zhang //MatScalar *imat_a; 577db41cccfSHong Zhang 578db41cccfSHong Zhang for (i=0; i<ismax; i++) { 579db41cccfSHong Zhang mat = (Mat_SeqBAIJ*)submats[i]->data; 580db41cccfSHong Zhang imat_ilen = mat->ilen; 581db41cccfSHong Zhang imat_j = mat->j; 582db41cccfSHong Zhang imat_i = mat->i; 583db41cccfSHong Zhang //imat_a = mat->a; 584db41cccfSHong Zhang 585db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 586db41cccfSHong Zhang lcol1_gcol1 = colmaps[i]; 587db41cccfSHong Zhang lrow1_grow1 = rowmaps[i]; 588db41cccfSHong Zhang #else 589db41cccfSHong Zhang cmap_i = cmap[i]; 590db41cccfSHong Zhang rmap_i = rmap[i]; 591db41cccfSHong Zhang #endif 592db41cccfSHong Zhang irow_i = irow[i]; 593db41cccfSHong Zhang jmax = nrow[i]; 594db41cccfSHong Zhang for (j=0; j<jmax; j++) { 595db41cccfSHong Zhang row = irow_i[j]; 596db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 597db41cccfSHong Zhang ierr = PetscGetProc_ijonly(row,size,c->rangebs,&proc);CHKERRQ(ierr); 598db41cccfSHong Zhang #else 599db41cccfSHong Zhang proc = rtable[row]; 600db41cccfSHong Zhang #endif 601db41cccfSHong Zhang if (proc == rank) { 602db41cccfSHong Zhang row = row - rstart; 603db41cccfSHong Zhang nzA = a_i[row+1] - a_i[row]; 604db41cccfSHong Zhang nzB = b_i[row+1] - b_i[row]; 605db41cccfSHong Zhang cworkA = a_j + a_i[row]; 606db41cccfSHong Zhang cworkB = b_j + b_i[row]; 607db41cccfSHong Zhang //vworkA = a_a + a_i[row]*bs2; 608db41cccfSHong Zhang //vworkB = b_a + b_i[row]*bs2; 609db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 610db41cccfSHong Zhang ierr = PetscTableFind(lrow1_grow1,row+rstart+1,&row);CHKERRQ(ierr); 611db41cccfSHong Zhang row--; 612db41cccfSHong Zhang if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 613db41cccfSHong Zhang #else 614db41cccfSHong Zhang row = rmap_i[row + rstart]; 615db41cccfSHong Zhang #endif 616db41cccfSHong Zhang mat_i = imat_i[row]; 617db41cccfSHong Zhang //mat_a = imat_a + mat_i*bs2; 618db41cccfSHong Zhang mat_j = imat_j + mat_i; 619db41cccfSHong Zhang ilen_row = imat_ilen[row]; 620db41cccfSHong Zhang 621db41cccfSHong Zhang /* load the column indices for this row into cols*/ 622db41cccfSHong Zhang for (l=0; l<nzB; l++) { 623db41cccfSHong Zhang if ((ctmp = bmap[cworkB[l]]) < cstart) { 624db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 625db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);CHKERRQ(ierr); 626db41cccfSHong Zhang if (tcol) { 627db41cccfSHong Zhang #else 628db41cccfSHong Zhang if ((tcol = cmap_i[ctmp])) { 629db41cccfSHong Zhang #endif 630db41cccfSHong Zhang *mat_j++ = tcol - 1; 631db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 632db41cccfSHong Zhang //mat_a += bs2; 633db41cccfSHong Zhang ilen_row++; 634db41cccfSHong Zhang } 635db41cccfSHong Zhang } else break; 636db41cccfSHong Zhang } 637db41cccfSHong Zhang imark = l; 638db41cccfSHong Zhang for (l=0; l<nzA; l++) { 639db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 640db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr); 641db41cccfSHong Zhang if (tcol) { 642db41cccfSHong Zhang #else 643db41cccfSHong Zhang if ((tcol = cmap_i[cstart + cworkA[l]])) { 644db41cccfSHong Zhang #endif 645db41cccfSHong Zhang *mat_j++ = tcol - 1; 646db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 647db41cccfSHong Zhang //mat_a += bs2; 648db41cccfSHong Zhang ilen_row++; 649db41cccfSHong Zhang } 650db41cccfSHong Zhang } 651db41cccfSHong Zhang for (l=imark; l<nzB; l++) { 652db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 653db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr); 654db41cccfSHong Zhang if (tcol) { 655db41cccfSHong Zhang #else 656db41cccfSHong Zhang if ((tcol = cmap_i[bmap[cworkB[l]]])) { 657db41cccfSHong Zhang #endif 658db41cccfSHong Zhang *mat_j++ = tcol - 1; 659db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 660db41cccfSHong Zhang //mat_a += bs2; 661db41cccfSHong Zhang ilen_row++; 662db41cccfSHong Zhang } 663db41cccfSHong Zhang } 664db41cccfSHong Zhang imat_ilen[row] = ilen_row; 665db41cccfSHong Zhang } 666db41cccfSHong Zhang } 667db41cccfSHong Zhang } 668db41cccfSHong Zhang } 669db41cccfSHong Zhang 670db41cccfSHong Zhang /* Now assemble the off proc rows*/ 671db41cccfSHong Zhang { 672db41cccfSHong Zhang PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 673db41cccfSHong Zhang PetscInt *imat_j,*imat_i; 674db41cccfSHong Zhang //MatScalar *imat_a,*rbuf4_i; 675db41cccfSHong Zhang PetscMPIInt ii; 676db41cccfSHong Zhang 677db41cccfSHong Zhang for (tmp2=0; tmp2<nrqs; tmp2++) { 678db41cccfSHong Zhang //ierr = MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);CHKERRQ(ierr); 679db41cccfSHong Zhang ii = tmp2; //new 680db41cccfSHong Zhang idex = pa[ii]; 681db41cccfSHong Zhang sbuf1_i = sbuf1[idex]; 682db41cccfSHong Zhang jmax = sbuf1_i[0]; 683db41cccfSHong Zhang ct1 = 2*jmax + 1; 684db41cccfSHong Zhang ct2 = 0; 685db41cccfSHong Zhang rbuf2_i = rbuf2[ii]; 686db41cccfSHong Zhang rbuf3_i = rbuf3[ii]; 687db41cccfSHong Zhang //rbuf4_i = rbuf4[ii]; 688db41cccfSHong Zhang for (j=1; j<=jmax; j++) { 689db41cccfSHong Zhang is_no = sbuf1_i[2*j-1]; 690db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 691db41cccfSHong Zhang lrow1_grow1 = rowmaps[is_no]; 692db41cccfSHong Zhang lcol1_gcol1 = colmaps[is_no]; 693db41cccfSHong Zhang #else 694db41cccfSHong Zhang rmap_i = rmap[is_no]; 695db41cccfSHong Zhang cmap_i = cmap[is_no]; 696db41cccfSHong Zhang #endif 697db41cccfSHong Zhang mat = (Mat_SeqBAIJ*)submats[is_no]->data; 698db41cccfSHong Zhang imat_ilen = mat->ilen; 699db41cccfSHong Zhang imat_j = mat->j; 700db41cccfSHong Zhang imat_i = mat->i; 701db41cccfSHong Zhang //imat_a = mat->a; 702db41cccfSHong Zhang max1 = sbuf1_i[2*j]; 703db41cccfSHong Zhang for (k=0; k<max1; k++,ct1++) { 704db41cccfSHong Zhang row = sbuf1_i[ct1]; 705db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 706db41cccfSHong Zhang ierr = PetscTableFind(lrow1_grow1,row+1,&row);CHKERRQ(ierr); 707db41cccfSHong Zhang row--; 708db41cccfSHong Zhang if(row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 709db41cccfSHong Zhang #else 710db41cccfSHong Zhang row = rmap_i[row]; 711db41cccfSHong Zhang #endif 712db41cccfSHong Zhang ilen = imat_ilen[row]; 713db41cccfSHong Zhang mat_i = imat_i[row]; 714db41cccfSHong Zhang //mat_a = imat_a + mat_i*bs2; 715db41cccfSHong Zhang mat_j = imat_j + mat_i; 716db41cccfSHong Zhang max2 = rbuf2_i[ct1]; 717db41cccfSHong Zhang for (l=0; l<max2; l++,ct2++) { 718db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 719db41cccfSHong Zhang ierr = PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 720db41cccfSHong Zhang if (tcol) { 721db41cccfSHong Zhang #else 722db41cccfSHong Zhang if ((tcol = cmap_i[rbuf3_i[ct2]])) { 723db41cccfSHong Zhang #endif 724db41cccfSHong Zhang *mat_j++ = tcol - 1; 725db41cccfSHong Zhang /* *mat_a++= rbuf4_i[ct2]; */ 726db41cccfSHong Zhang //ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 727db41cccfSHong Zhang //mat_a += bs2; 728db41cccfSHong Zhang ilen++; 729db41cccfSHong Zhang } 730db41cccfSHong Zhang } 731db41cccfSHong Zhang imat_ilen[row] = ilen; 732db41cccfSHong Zhang } 733db41cccfSHong Zhang } 734db41cccfSHong Zhang } 735db41cccfSHong Zhang } 736db41cccfSHong Zhang //ierr = PetscFree(r_status4);CHKERRQ(ierr); 737db41cccfSHong Zhang //ierr = PetscFree(r_waits4);CHKERRQ(ierr); 738db41cccfSHong Zhang //if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 739db41cccfSHong Zhang //ierr = PetscFree(s_waits4);CHKERRQ(ierr); 740db41cccfSHong Zhang //ierr = PetscFree(s_status4);CHKERRQ(ierr); 741db41cccfSHong Zhang 742db41cccfSHong Zhang /* Restore the indices */ 743db41cccfSHong Zhang for (i=0; i<ismax; i++) { 744db41cccfSHong Zhang ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 745db41cccfSHong Zhang ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 746db41cccfSHong Zhang } 747db41cccfSHong Zhang 748db41cccfSHong Zhang /* Destroy allocated memory */ 749db41cccfSHong Zhang #if defined(PETSC_USE_CTABLE) 750db41cccfSHong Zhang ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 751db41cccfSHong Zhang #else 752db41cccfSHong Zhang ierr = PetscFree5(irow,icol,nrow,ncol,rtable);CHKERRQ(ierr); 753db41cccfSHong Zhang #endif 754db41cccfSHong Zhang ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 755db41cccfSHong Zhang ierr = PetscFree(pa);CHKERRQ(ierr); 756db41cccfSHong Zhang 757db41cccfSHong Zhang ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 758db41cccfSHong Zhang ierr = PetscFree(sbuf1);CHKERRQ(ierr); 759db41cccfSHong Zhang ierr = PetscFree(rbuf2);CHKERRQ(ierr); 760db41cccfSHong Zhang for (i=0; i<nrqr; ++i) { 761db41cccfSHong Zhang ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 762db41cccfSHong Zhang } 763db41cccfSHong Zhang for (i=0; i<nrqs; ++i) { 764db41cccfSHong Zhang ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 765db41cccfSHong Zhang //ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 766db41cccfSHong Zhang } 767db41cccfSHong Zhang ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 768db41cccfSHong Zhang ierr = PetscFree(rbuf3);CHKERRQ(ierr); 769db41cccfSHong Zhang //ierr = PetscFree(rbuf4);CHKERRQ(ierr); 770db41cccfSHong Zhang ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 771db41cccfSHong Zhang ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 772db41cccfSHong Zhang //ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 773db41cccfSHong Zhang //ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 774db41cccfSHong Zhang 775db41cccfSHong Zhang #if defined (PETSC_USE_CTABLE) 776db41cccfSHong Zhang for (i=0; i<ismax; i++){ 777db41cccfSHong Zhang ierr = PetscTableDestroy(rowmaps[i]);CHKERRQ(ierr); 778db41cccfSHong Zhang ierr = PetscTableDestroy(colmaps[i]);CHKERRQ(ierr); 779db41cccfSHong Zhang } 780db41cccfSHong Zhang ierr = PetscFree(colmaps);CHKERRQ(ierr); 781db41cccfSHong Zhang ierr = PetscFree(rowmaps);CHKERRQ(ierr); 782db41cccfSHong Zhang #else 783db41cccfSHong Zhang ierr = PetscFree(rmap);CHKERRQ(ierr); 784db41cccfSHong Zhang ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 785db41cccfSHong Zhang ierr = PetscFree(cmap);CHKERRQ(ierr); 786db41cccfSHong Zhang #endif 787db41cccfSHong Zhang ierr = PetscFree(lens);CHKERRQ(ierr); 788db41cccfSHong Zhang 789db41cccfSHong Zhang for (i=0; i<ismax; i++) { 790db41cccfSHong Zhang ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 791db41cccfSHong Zhang ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 792db41cccfSHong Zhang } 793db41cccfSHong Zhang PetscFunctionReturn(0); 794db41cccfSHong Zhang } 795db41cccfSHong Zhang 796db41cccfSHong Zhang /* ------------------------------------------------------- */ 797db41cccfSHong Zhang 798632d0f97SHong Zhang #undef __FUNCT__ 799632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ" 80013f74950SBarry Smith PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C,PetscInt is_max,IS is[],PetscInt ov) 801632d0f97SHong Zhang { 8026849ba73SBarry Smith PetscErrorCode ierr; 803d0f46423SBarry Smith PetscInt i,N=C->cmap->N, bs=C->rmap->bs; 804632d0f97SHong Zhang IS *is_new; 805632d0f97SHong Zhang 806632d0f97SHong Zhang PetscFunctionBegin; 807c910923dSHong Zhang ierr = PetscMalloc(is_max*sizeof(IS),&is_new);CHKERRQ(ierr); 808632d0f97SHong Zhang /* Convert the indices into block format */ 80927f478b1SHong Zhang ierr = ISCompressIndicesGeneral(N,bs,is_max,is,is_new);CHKERRQ(ierr); 810e32f2f54SBarry Smith if (ov < 0){ SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");} 811db41cccfSHong Zhang 812db41cccfSHong Zhang //------- new ---------- 813db41cccfSHong Zhang PetscBool flg=PETSC_FALSE; 814db41cccfSHong Zhang ierr = PetscOptionsHasName(PETSC_NULL, "-IncreaseOverlap_new", &flg);CHKERRQ(ierr); 815db41cccfSHong Zhang if (!flg){ /* previous non-scalable implementation */ 816632d0f97SHong Zhang for (i=0; i<ov; ++i) { 817c910923dSHong Zhang ierr = MatIncreaseOverlap_MPISBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr); 818632d0f97SHong Zhang } 819db41cccfSHong Zhang } else { /* scalable implementation using modified BAIJ routines */ 820db41cccfSHong Zhang 821db41cccfSHong Zhang Mat *submats; 822db41cccfSHong Zhang IS *is_row; 823db41cccfSHong Zhang PetscInt M=C->rmap->N,Mbs=M/bs,*nidx,isz,iov; 824db41cccfSHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 825db41cccfSHong Zhang PetscMPIInt rank=c->rank; 826db41cccfSHong Zhang 827db41cccfSHong Zhang ierr = PetscMalloc((Mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr); 828db41cccfSHong Zhang 829db41cccfSHong Zhang /* Create is_row */ 830db41cccfSHong Zhang ierr = PetscMalloc(is_max*sizeof(IS **),&is_row);CHKERRQ(ierr); 831*487daaacSHong Zhang ierr = ISCreateStride(PETSC_COMM_SELF,Mbs,0,1,&is_row[0]);CHKERRQ(ierr); 832*487daaacSHong Zhang for (i=1; i<is_max; i++) is_row[i] = is_row[0]; /* reuse is_row[0] */ 833db41cccfSHong Zhang 834db41cccfSHong Zhang for (iov=0; iov<ov; ++iov) { 835*487daaacSHong Zhang /* Get submats for column search - Modified from MatGetSubMatrices_MPIBAIJ() */ 836*487daaacSHong Zhang 837db41cccfSHong Zhang /* Allocate memory to hold all the submatrices */ 838db41cccfSHong Zhang ierr = PetscMalloc((is_max+1)*sizeof(Mat),&submats);CHKERRQ(ierr); 839db41cccfSHong Zhang /* Determine the number of stages through which submatrices are done */ 840db41cccfSHong Zhang PetscInt nmax,nstages_local,nstages,max_no,pos; 841db41cccfSHong Zhang nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt)); 842db41cccfSHong Zhang if (!nmax) nmax = 1; 843db41cccfSHong Zhang nstages_local = is_max/nmax + ((is_max % nmax)?1:0); 844db41cccfSHong Zhang 845db41cccfSHong Zhang /* Make sure every processor loops through the nstages */ 846db41cccfSHong Zhang ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 847db41cccfSHong Zhang for (i=0,pos=0; i<nstages; i++) { 848db41cccfSHong Zhang if (pos+nmax <= is_max) max_no = nmax; 849db41cccfSHong Zhang else if (pos == is_max) max_no = 0; 850db41cccfSHong Zhang else max_no = is_max-pos; 851db41cccfSHong Zhang 852db41cccfSHong Zhang ierr = MatGetSubMatrices_MPIBAIJ_local_ijonly(C,max_no,is_row+pos,is_new+pos,MAT_INITIAL_MATRIX,submats+pos);CHKERRQ(ierr); 853db41cccfSHong Zhang if (rank == 10 && !i){ 854db41cccfSHong Zhang printf("submats[0]:\n"); 855db41cccfSHong Zhang ierr = MatView(submats[0],PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr); 856db41cccfSHong Zhang } 857db41cccfSHong Zhang pos += max_no; 858db41cccfSHong Zhang } 859db41cccfSHong Zhang 860db41cccfSHong Zhang /* Row search */ 861db41cccfSHong Zhang ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,is_max,is_new);CHKERRQ(ierr); 862db41cccfSHong Zhang 863db41cccfSHong Zhang /* Column search */ 864db41cccfSHong Zhang PetscBT table; 865db41cccfSHong Zhang Mat_SeqSBAIJ *asub_i; 866db41cccfSHong Zhang PetscInt *ai,brow,nz,nis,l; 867db41cccfSHong Zhang const PetscInt *idx; 868db41cccfSHong Zhang 869db41cccfSHong Zhang ierr = PetscBTCreate(Mbs,table);CHKERRQ(ierr); 870db41cccfSHong Zhang for (i=0; i<is_max; i++){ 871db41cccfSHong Zhang asub_i = (Mat_SeqSBAIJ*)submats[i]->data; 872db41cccfSHong Zhang ai=asub_i->i;; 873db41cccfSHong Zhang 874db41cccfSHong Zhang /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */ 875db41cccfSHong Zhang ierr = PetscBTMemzero(Mbs,table);CHKERRQ(ierr); 876db41cccfSHong Zhang 877db41cccfSHong Zhang ierr = ISGetIndices(is_new[i],&idx);CHKERRQ(ierr); 878db41cccfSHong Zhang ierr = ISGetLocalSize(is_new[i],&nis);CHKERRQ(ierr); 879db41cccfSHong Zhang for (l=0; l<nis; l++) { 880db41cccfSHong Zhang ierr = PetscBTSet(table,idx[l]);CHKERRQ(ierr); 881db41cccfSHong Zhang nidx[l] = idx[l]; 882db41cccfSHong Zhang } 883db41cccfSHong Zhang isz = nis; 884db41cccfSHong Zhang 885db41cccfSHong Zhang for (brow=0; brow<Mbs; brow++){ 886db41cccfSHong Zhang nz = ai[brow+1] - ai[brow]; 887db41cccfSHong Zhang if (nz) { 888db41cccfSHong Zhang if (!PetscBTLookupSet(table,brow)) nidx[isz++] = brow; 889db41cccfSHong Zhang } 890db41cccfSHong Zhang } 891db41cccfSHong Zhang ierr = ISRestoreIndices(is_new[i],&idx);CHKERRQ(ierr); 892db41cccfSHong Zhang ierr = ISDestroy(is_new[i]);CHKERRQ(ierr); 893db41cccfSHong Zhang 894db41cccfSHong Zhang /* create updated is_new */ 895db41cccfSHong Zhang ierr = ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,PETSC_COPY_VALUES,is_new+i);CHKERRQ(ierr); 896db41cccfSHong Zhang } 897db41cccfSHong Zhang 898db41cccfSHong Zhang /* Free tmp spaces */ 899db41cccfSHong Zhang ierr = PetscBTDestroy(table);CHKERRQ(ierr); 900db41cccfSHong Zhang for (i=0; i<is_max; i++){ 901db41cccfSHong Zhang ierr = MatDestroy(submats[i]);CHKERRQ(ierr); 902db41cccfSHong Zhang } 903db41cccfSHong Zhang ierr = PetscFree(submats);CHKERRQ(ierr); 904db41cccfSHong Zhang } 905*487daaacSHong Zhang ierr = ISDestroy(is_row[0]);CHKERRQ(ierr); 906db41cccfSHong Zhang ierr = PetscFree(is_row);CHKERRQ(ierr); 907db41cccfSHong Zhang ierr = PetscFree(nidx);CHKERRQ(ierr); 908db41cccfSHong Zhang } 909db41cccfSHong Zhang //--------end of new---------- 910db41cccfSHong Zhang 911c910923dSHong Zhang for (i=0; i<is_max; i++) {ierr = ISDestroy(is[i]);CHKERRQ(ierr);} 91227f478b1SHong Zhang ierr = ISExpandIndicesGeneral(N,bs,is_max,is_new,is);CHKERRQ(ierr); 913db41cccfSHong Zhang 914c910923dSHong Zhang for (i=0; i<is_max; i++) {ierr = ISDestroy(is_new[i]);CHKERRQ(ierr);} 915632d0f97SHong Zhang ierr = PetscFree(is_new);CHKERRQ(ierr); 916632d0f97SHong Zhang PetscFunctionReturn(0); 917632d0f97SHong Zhang } 918632d0f97SHong Zhang 9194a69c536SBarry Smith typedef enum {MINE,OTHER} WhoseOwner; 9200472cc68SHong Zhang /* data1, odata1 and odata2 are packed in the format (for communication): 921a2a9f22aSHong Zhang data[0] = is_max, no of is 922a2a9f22aSHong Zhang data[1] = size of is[0] 923a2a9f22aSHong Zhang ... 924a2a9f22aSHong Zhang data[is_max] = size of is[is_max-1] 925a2a9f22aSHong Zhang data[is_max + 1] = data(is[0]) 926a2a9f22aSHong Zhang ... 927a2a9f22aSHong Zhang data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i]) 928a2a9f22aSHong Zhang ... 9290472cc68SHong Zhang data2 is packed in the format (for creating output is[]): 9300472cc68SHong Zhang data[0] = is_max, no of is 9310472cc68SHong Zhang data[1] = size of is[0] 9320472cc68SHong Zhang ... 9330472cc68SHong Zhang data[is_max] = size of is[is_max-1] 9340472cc68SHong Zhang data[is_max + 1] = data(is[0]) 9350472cc68SHong Zhang ... 9360472cc68SHong Zhang data[is_max + 1 + Mbs*i) = data(is[i]) 9370472cc68SHong Zhang ... 938a2a9f22aSHong Zhang */ 939632d0f97SHong Zhang #undef __FUNCT__ 940632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Once" 94113f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[]) 942632d0f97SHong Zhang { 943632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 9446849ba73SBarry Smith PetscErrorCode ierr; 94513f74950SBarry Smith PetscMPIInt size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len; 9465d0c19d7SBarry Smith const PetscInt *idx_i; 9475d0c19d7SBarry Smith PetscInt idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i, 94813f74950SBarry Smith Mbs,i,j,k,*odata1,*odata2, 94913f74950SBarry Smith proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est; 95013f74950SBarry Smith PetscInt proc_end=0,*iwork,len_unused,nodata2; 95113f74950SBarry Smith PetscInt ois_max; /* max no of is[] in each of processor */ 952bfc6803cSHong Zhang char *t_p; 953632d0f97SHong Zhang MPI_Comm comm; 954e8527bf2SHong Zhang MPI_Request *s_waits1,*s_waits2,r_req; 955632d0f97SHong Zhang MPI_Status *s_status,r_status; 95645f43ab7SHong Zhang PetscBT *table; /* mark indices of this processor's is[] */ 957fc70d14eSHong Zhang PetscBT table_i; 958bfc6803cSHong Zhang PetscBT otable; /* mark indices of other processors' is[] */ 959d0f46423SBarry Smith PetscInt bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners; 96010eea884SHong Zhang IS garray_local,garray_gl; 9615483b11dSHong Zhang 962632d0f97SHong Zhang PetscFunctionBegin; 9637adad957SLisandro Dalcin comm = ((PetscObject)C)->comm; 964632d0f97SHong Zhang size = c->size; 965632d0f97SHong Zhang rank = c->rank; 966632d0f97SHong Zhang Mbs = c->Mbs; 967632d0f97SHong Zhang 968c910923dSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 969c910923dSHong Zhang ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 970c910923dSHong Zhang 971430a0127SHong Zhang /* create tables used in 972430a0127SHong Zhang step 1: table[i] - mark c->garray of proc [i] 97345f43ab7SHong Zhang step 3: table[i] - mark indices of is[i] when whose=MINE 974430a0127SHong Zhang table[0] - mark incideces of is[] when whose=OTHER */ 975430a0127SHong Zhang len = PetscMax(is_max, size);CHKERRQ(ierr); 97674ed9c26SBarry Smith ierr = PetscMalloc2(len,PetscBT,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,char,&t_p);CHKERRQ(ierr); 977430a0127SHong Zhang for (i=0; i<len; i++) { 978430a0127SHong Zhang table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i; 979430a0127SHong Zhang } 980430a0127SHong Zhang 98113f74950SBarry Smith ierr = MPI_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr); 98210eea884SHong Zhang 9835483b11dSHong Zhang /* 1. Send this processor's is[] to other processors */ 9845483b11dSHong Zhang /*---------------------------------------------------*/ 985e8527bf2SHong Zhang /* allocate spaces */ 98613f74950SBarry Smith ierr = PetscMalloc(is_max*sizeof(PetscInt),&n);CHKERRQ(ierr); 9875483b11dSHong Zhang len = 0; 988c910923dSHong Zhang for (i=0; i<is_max; i++) { 989632d0f97SHong Zhang ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 990632d0f97SHong Zhang len += n[i]; 991632d0f97SHong Zhang } 992958c9bccSBarry Smith if (!len) { 9935483b11dSHong Zhang is_max = 0; 9945483b11dSHong Zhang } else { 9955483b11dSHong Zhang len += 1 + is_max; /* max length of data1 for one processor */ 9965483b11dSHong Zhang } 9975483b11dSHong Zhang 998430a0127SHong Zhang 99913f74950SBarry Smith ierr = PetscMalloc((size*len+1)*sizeof(PetscInt),&data1);CHKERRQ(ierr); 100013f74950SBarry Smith ierr = PetscMalloc(size*sizeof(PetscInt*),&data1_start);CHKERRQ(ierr); 10015483b11dSHong Zhang for (i=0; i<size; i++) data1_start[i] = data1 + i*len; 10025483b11dSHong Zhang 100340cb64c9SJed Brown ierr = PetscMalloc4(size,PetscInt,&len_s,size,PetscInt,&btable,size,PetscInt,&iwork,size+1,PetscInt,&Bowners);CHKERRQ(ierr); 1004e8527bf2SHong Zhang 100576f244e2SHong Zhang /* gather c->garray from all processors */ 100670b3c8c7SBarry Smith ierr = ISCreateGeneral(comm,Bnbs,c->garray,PETSC_COPY_VALUES,&garray_local);CHKERRQ(ierr); 100776f244e2SHong Zhang ierr = ISAllGather(garray_local, &garray_gl);CHKERRQ(ierr); 100876f244e2SHong Zhang ierr = ISDestroy(garray_local);CHKERRQ(ierr); 1009a7cc72afSBarry Smith ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);CHKERRQ(ierr); 101076f244e2SHong Zhang Bowners[0] = 0; 101176f244e2SHong Zhang for (i=0; i<size; i++) Bowners[i+1] += Bowners[i]; 101276f244e2SHong Zhang 10135483b11dSHong Zhang if (is_max){ 1014430a0127SHong Zhang /* hash table ctable which maps c->row to proc_id) */ 101513f74950SBarry Smith ierr = PetscMalloc(Mbs*sizeof(PetscInt),&ctable);CHKERRQ(ierr); 10165483b11dSHong Zhang for (proc_id=0,j=0; proc_id<size; proc_id++) { 1017288a2dc7SJed Brown for (; j<C->rmap->range[proc_id+1]/bs; j++) { 10185483b11dSHong Zhang ctable[j] = proc_id; 10195483b11dSHong Zhang } 10205483b11dSHong Zhang } 10215483b11dSHong Zhang 1022430a0127SHong Zhang /* hash tables marking c->garray */ 102310eea884SHong Zhang ierr = ISGetIndices(garray_gl,&idx_i); 10245483b11dSHong Zhang for (i=0; i<size; i++){ 1025430a0127SHong Zhang table_i = table[i]; 1026430a0127SHong Zhang ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr); 1027430a0127SHong Zhang for (j = Bowners[i]; j<Bowners[i+1]; j++){ /* go through B cols of proc[i]*/ 1028430a0127SHong Zhang ierr = PetscBTSet(table_i,idx_i[j]);CHKERRQ(ierr); 10295483b11dSHong Zhang } 10305483b11dSHong Zhang } 103110eea884SHong Zhang ierr = ISRestoreIndices(garray_gl,&idx_i);CHKERRQ(ierr); 10325483b11dSHong Zhang } /* if (is_max) */ 103376f244e2SHong Zhang ierr = ISDestroy(garray_gl);CHKERRQ(ierr); 10345483b11dSHong Zhang 10355483b11dSHong Zhang /* evaluate communication - mesg to who, length, and buffer space */ 1036e8527bf2SHong Zhang for (i=0; i<size; i++) len_s[i] = 0; 10375483b11dSHong Zhang 10385483b11dSHong Zhang /* header of data1 */ 10395483b11dSHong Zhang for (proc_id=0; proc_id<size; proc_id++){ 10405483b11dSHong Zhang iwork[proc_id] = 0; 10415483b11dSHong Zhang *data1_start[proc_id] = is_max; 10425483b11dSHong Zhang data1_start[proc_id]++; 10435483b11dSHong Zhang for (j=0; j<is_max; j++) { 10445483b11dSHong Zhang if (proc_id == rank){ 10455483b11dSHong Zhang *data1_start[proc_id] = n[j]; 10465483b11dSHong Zhang } else { 10475483b11dSHong Zhang *data1_start[proc_id] = 0; 10485483b11dSHong Zhang } 10495483b11dSHong Zhang data1_start[proc_id]++; 10505483b11dSHong Zhang } 10515483b11dSHong Zhang } 10525483b11dSHong Zhang 10535483b11dSHong Zhang for (i=0; i<is_max; i++) { 10545483b11dSHong Zhang ierr = ISGetIndices(is[i],&idx_i);CHKERRQ(ierr); 10555483b11dSHong Zhang for (j=0; j<n[i]; j++){ 10565483b11dSHong Zhang idx = idx_i[j]; 10575483b11dSHong Zhang *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */ 10585483b11dSHong Zhang proc_end = ctable[idx]; 10595483b11dSHong Zhang for (proc_id=0; proc_id<=proc_end; proc_id++){ /* for others to process */ 1060e8527bf2SHong Zhang if (proc_id == rank ) continue; /* done before this loop */ 1061430a0127SHong Zhang if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx)) 1062430a0127SHong Zhang continue; /* no need for sending idx to [proc_id] */ 10635483b11dSHong Zhang *data1_start[proc_id] = idx; data1_start[proc_id]++; 10645483b11dSHong Zhang len_s[proc_id]++; 10655483b11dSHong Zhang } 10665483b11dSHong Zhang } 10675483b11dSHong Zhang /* update header data */ 10682cfbe0a4SHong Zhang for (proc_id=0; proc_id<size; proc_id++){ 10695483b11dSHong Zhang if (proc_id== rank) continue; 10705483b11dSHong Zhang *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id]; 10715483b11dSHong Zhang iwork[proc_id] = len_s[proc_id] ; 10725483b11dSHong Zhang } 10735483b11dSHong Zhang ierr = ISRestoreIndices(is[i],&idx_i);CHKERRQ(ierr); 1074e8527bf2SHong Zhang } 10755483b11dSHong Zhang 10765483b11dSHong Zhang nrqs = 0; nrqr = 0; 10775483b11dSHong Zhang for (i=0; i<size; i++){ 10785483b11dSHong Zhang data1_start[i] = data1 + i*len; 10795483b11dSHong Zhang if (len_s[i]){ 10805483b11dSHong Zhang nrqs++; 10815483b11dSHong Zhang len_s[i] += 1 + is_max; /* add no. of header msg */ 10825483b11dSHong Zhang } 10835483b11dSHong Zhang } 10845483b11dSHong Zhang 10855483b11dSHong Zhang for (i=0; i<is_max; i++) { 1086a2a9f22aSHong Zhang ierr = ISDestroy(is[i]);CHKERRQ(ierr); 1087632d0f97SHong Zhang } 1088bfc6803cSHong Zhang ierr = PetscFree(n);CHKERRQ(ierr); 108905b42c5fSBarry Smith ierr = PetscFree(ctable);CHKERRQ(ierr); 10905483b11dSHong Zhang 10915483b11dSHong Zhang /* Determine the number of messages to expect, their lengths, from from-ids */ 10925483b11dSHong Zhang ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&nrqr);CHKERRQ(ierr); 10935483b11dSHong Zhang ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);CHKERRQ(ierr); 1094632d0f97SHong Zhang 1095632d0f97SHong Zhang /* Now post the sends */ 109674ed9c26SBarry Smith ierr = PetscMalloc2(size,MPI_Request,&s_waits1,size,MPI_Request,&s_waits2);CHKERRQ(ierr); 1097632d0f97SHong Zhang k = 0; 10985483b11dSHong Zhang for (proc_id=0; proc_id<size; proc_id++){ /* send data1 to processor [proc_id] */ 10995483b11dSHong Zhang if (len_s[proc_id]){ 110013f74950SBarry Smith ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);CHKERRQ(ierr); 1101632d0f97SHong Zhang k++; 1102632d0f97SHong Zhang } 1103632d0f97SHong Zhang } 1104632d0f97SHong Zhang 110545f43ab7SHong Zhang /* 2. Receive other's is[] and process. Then send back */ 1106bfc6803cSHong Zhang /*-----------------------------------------------------*/ 11075483b11dSHong Zhang len = 0; 11085483b11dSHong Zhang for (i=0; i<nrqr; i++){ 11095483b11dSHong Zhang if (len_r1[i] > len)len = len_r1[i]; 11105483b11dSHong Zhang } 111145f43ab7SHong Zhang ierr = PetscFree(len_r1);CHKERRQ(ierr); 111245f43ab7SHong Zhang ierr = PetscFree(id_r1);CHKERRQ(ierr); 111345f43ab7SHong Zhang 111445f43ab7SHong Zhang for (proc_id=0; proc_id<size; proc_id++) 111545f43ab7SHong Zhang len_s[proc_id] = iwork[proc_id] = 0; 111645f43ab7SHong Zhang 111713f74950SBarry Smith ierr = PetscMalloc((len+1)*sizeof(PetscInt),&odata1);CHKERRQ(ierr); 111813f74950SBarry Smith ierr = PetscMalloc(size*sizeof(PetscInt**),&odata2_ptr);CHKERRQ(ierr); 111976f244e2SHong Zhang ierr = PetscBTCreate(Mbs,otable);CHKERRQ(ierr); 112010eea884SHong Zhang 112110eea884SHong Zhang len_max = ois_max*(Mbs+1); /* max space storing all is[] for each receive */ 1122240e5896SHong Zhang len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */ 112313f74950SBarry Smith ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); 112410eea884SHong Zhang nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */ 1125240e5896SHong Zhang odata2_ptr[nodata2] = odata2; 112610eea884SHong Zhang len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */ 112710eea884SHong Zhang 1128632d0f97SHong Zhang k = 0; 11295483b11dSHong Zhang while (k < nrqr){ 1130632d0f97SHong Zhang /* Receive messages */ 1131bfc6803cSHong Zhang ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);CHKERRQ(ierr); 1132632d0f97SHong Zhang if (flag){ 113313f74950SBarry Smith ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr); 1134632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 113513f74950SBarry Smith ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr); 11365483b11dSHong Zhang ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr); 1137632d0f97SHong Zhang 1138fc70d14eSHong Zhang /* Process messages */ 1139240e5896SHong Zhang /* make sure there is enough unused space in odata2 array */ 114010eea884SHong Zhang if (len_unused < len_max){ /* allocate more space for odata2 */ 114113f74950SBarry Smith ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); 1142240e5896SHong Zhang odata2_ptr[++nodata2] = odata2; 114310eea884SHong Zhang len_unused = len_est; 114410eea884SHong Zhang } 114510eea884SHong Zhang 114610eea884SHong Zhang ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);CHKERRQ(ierr); 1147a2a9f22aSHong Zhang len = 1 + odata2[0]; 1148a2a9f22aSHong Zhang for (i=0; i<odata2[0]; i++){ 1149a2a9f22aSHong Zhang len += odata2[1 + i]; 1150fc70d14eSHong Zhang } 1151632d0f97SHong Zhang 1152632d0f97SHong Zhang /* Send messages back */ 115313f74950SBarry Smith ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);CHKERRQ(ierr); 1154fc70d14eSHong Zhang k++; 115510eea884SHong Zhang odata2 += len; 115610eea884SHong Zhang len_unused -= len; 115745f43ab7SHong Zhang len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */ 1158632d0f97SHong Zhang } 11595483b11dSHong Zhang } 11605483b11dSHong Zhang ierr = PetscFree(odata1);CHKERRQ(ierr); 116145f43ab7SHong Zhang ierr = PetscBTDestroy(otable);CHKERRQ(ierr); 1162632d0f97SHong Zhang 116345f43ab7SHong Zhang /* 3. Do local work on this processor's is[] */ 116445f43ab7SHong Zhang /*-------------------------------------------*/ 116545f43ab7SHong Zhang /* make sure there is enough unused space in odata2(=data) array */ 116645f43ab7SHong Zhang len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */ 116710eea884SHong Zhang if (len_unused < len_max){ /* allocate more space for odata2 */ 116813f74950SBarry Smith ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr); 1169240e5896SHong Zhang odata2_ptr[++nodata2] = odata2; 117010eea884SHong Zhang len_unused = len_est; 117110eea884SHong Zhang } 1172bfc6803cSHong Zhang 1173240e5896SHong Zhang data = odata2; 117445f43ab7SHong Zhang ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);CHKERRQ(ierr); 117545f43ab7SHong Zhang ierr = PetscFree(data1_start);CHKERRQ(ierr); 117645f43ab7SHong Zhang 117745f43ab7SHong Zhang /* 4. Receive work done on other processors, then merge */ 117845f43ab7SHong Zhang /*------------------------------------------------------*/ 117945f43ab7SHong Zhang /* get max number of messages that this processor expects to recv */ 118013f74950SBarry Smith ierr = MPI_Allreduce(len_s,iwork,size,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr); 118113f74950SBarry Smith ierr = PetscMalloc((iwork[rank]+1)*sizeof(PetscInt),&data2);CHKERRQ(ierr); 118274ed9c26SBarry Smith ierr = PetscFree4(len_s,btable,iwork,Bowners);CHKERRQ(ierr); 118345f43ab7SHong Zhang 1184632d0f97SHong Zhang k = 0; 11855483b11dSHong Zhang while (k < nrqs){ 1186632d0f97SHong Zhang /* Receive messages */ 1187c910923dSHong Zhang ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status); 1188632d0f97SHong Zhang if (flag){ 118913f74950SBarry Smith ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr); 1190632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 119113f74950SBarry Smith ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr); 11925483b11dSHong Zhang ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr); 11935483b11dSHong Zhang if (len > 1+is_max){ /* Add data2 into data */ 11940472cc68SHong Zhang data2_i = data2 + 1 + is_max; 1195fc70d14eSHong Zhang for (i=0; i<is_max; i++){ 1196fc70d14eSHong Zhang table_i = table[i]; 1197bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs*i; 1198bfc6803cSHong Zhang isz = data[1+i]; 11990472cc68SHong Zhang for (j=0; j<data2[1+i]; j++){ 12000472cc68SHong Zhang col = data2_i[j]; 1201bfc6803cSHong Zhang if (!PetscBTLookupSet(table_i,col)) {data_i[isz++] = col;} 1202fc70d14eSHong Zhang } 1203bfc6803cSHong Zhang data[1+i] = isz; 12040472cc68SHong Zhang if (i < is_max - 1) data2_i += data2[1+i]; 1205fc70d14eSHong Zhang } 12065483b11dSHong Zhang } 1207632d0f97SHong Zhang k++; 1208632d0f97SHong Zhang } 12095483b11dSHong Zhang } 121045f43ab7SHong Zhang ierr = PetscFree(data2);CHKERRQ(ierr); 121174ed9c26SBarry Smith ierr = PetscFree2(table,t_p);CHKERRQ(ierr); 12125483b11dSHong Zhang 12135483b11dSHong Zhang /* phase 1 sends are complete */ 12145483b11dSHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 12150c468ba9SBarry Smith if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 12165483b11dSHong Zhang ierr = PetscFree(data1);CHKERRQ(ierr); 12175483b11dSHong Zhang 1218240e5896SHong Zhang /* phase 2 sends are complete */ 12190c468ba9SBarry Smith if (nrqr){ierr = MPI_Waitall(nrqr,s_waits2,s_status);CHKERRQ(ierr);} 122074ed9c26SBarry Smith ierr = PetscFree2(s_waits1,s_waits2);CHKERRQ(ierr); 122145f43ab7SHong Zhang ierr = PetscFree(s_status);CHKERRQ(ierr); 1222632d0f97SHong Zhang 1223c910923dSHong Zhang /* 5. Create new is[] */ 1224c910923dSHong Zhang /*--------------------*/ 1225c910923dSHong Zhang for (i=0; i<is_max; i++) { 1226bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs*i; 122770b3c8c7SBarry Smith ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 1228632d0f97SHong Zhang } 122945f43ab7SHong Zhang for (k=0; k<=nodata2; k++){ 123045f43ab7SHong Zhang ierr = PetscFree(odata2_ptr[k]);CHKERRQ(ierr); 123145f43ab7SHong Zhang } 123245f43ab7SHong Zhang ierr = PetscFree(odata2_ptr);CHKERRQ(ierr); 12335483b11dSHong Zhang 1234632d0f97SHong Zhang PetscFunctionReturn(0); 1235632d0f97SHong Zhang } 1236632d0f97SHong Zhang 1237632d0f97SHong Zhang #undef __FUNCT__ 1238632d0f97SHong Zhang #define __FUNCT__ "MatIncreaseOverlap_MPISBAIJ_Local" 1239632d0f97SHong Zhang /* 1240dc008846SHong Zhang MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do 1241632d0f97SHong Zhang the work on the local processor. 1242632d0f97SHong Zhang 1243632d0f97SHong Zhang Inputs: 1244632d0f97SHong Zhang C - MAT_MPISBAIJ; 1245bfc6803cSHong Zhang data - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format. 1246bfc6803cSHong Zhang whose - whose is[] to be processed, 1247bfc6803cSHong Zhang MINE: this processor's is[] 1248bfc6803cSHong Zhang OTHER: other processor's is[] 1249632d0f97SHong Zhang Output: 125010eea884SHong Zhang nidx - whose = MINE: 12510472cc68SHong Zhang holds input and newly found indices in the same format as data 12520472cc68SHong Zhang whose = OTHER: 12530472cc68SHong Zhang only holds the newly found indices 12540472cc68SHong Zhang table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'. 1255632d0f97SHong Zhang */ 125676f244e2SHong Zhang /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */ 125713f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C,PetscInt *data,PetscInt whose,PetscInt *nidx,PetscBT *table) 1258632d0f97SHong Zhang { 1259632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; 1260dc008846SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)(c->A)->data; 1261dc008846SHong Zhang Mat_SeqBAIJ *b = (Mat_SeqBAIJ*)(c->B)->data; 1262dfbe8321SBarry Smith PetscErrorCode ierr; 126313f74950SBarry Smith PetscInt row,mbs,Mbs,*nidx_i,col,col_max,isz,isz0,*ai,*aj,*bi,*bj,*garray,rstart,l; 126413f74950SBarry Smith PetscInt a_start,a_end,b_start,b_end,i,j,k,is_max,*idx_i,n; 1265bfc6803cSHong Zhang PetscBT table0; /* mark the indices of input is[] for look up */ 1266bfc6803cSHong Zhang PetscBT table_i; /* poits to i-th table. When whose=OTHER, a single table is used for all is[] */ 1267632d0f97SHong Zhang 1268632d0f97SHong Zhang PetscFunctionBegin; 126931f99336SHong Zhang Mbs = c->Mbs; mbs = a->mbs; 1270dc008846SHong Zhang ai = a->i; aj = a->j; 1271dc008846SHong Zhang bi = b->i; bj = b->j; 1272632d0f97SHong Zhang garray = c->garray; 1273899cda47SBarry Smith rstart = c->rstartbs; 1274dc008846SHong Zhang is_max = data[0]; 1275632d0f97SHong Zhang 1276fc70d14eSHong Zhang ierr = PetscBTCreate(Mbs,table0);CHKERRQ(ierr); 1277fc70d14eSHong Zhang 1278fc70d14eSHong Zhang nidx[0] = is_max; 1279fc70d14eSHong Zhang idx_i = data + is_max + 1; /* ptr to input is[0] array */ 1280bfc6803cSHong Zhang nidx_i = nidx + is_max + 1; /* ptr to output is[0] array */ 1281dc008846SHong Zhang for (i=0; i<is_max; i++) { /* for each is */ 1282dc008846SHong Zhang isz = 0; 1283fc70d14eSHong Zhang n = data[1+i]; /* size of input is[i] */ 1284dc008846SHong Zhang 128576f244e2SHong Zhang /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */ 1286bfc6803cSHong Zhang if (whose == MINE){ /* process this processor's is[] */ 1287bfc6803cSHong Zhang table_i = table[i]; 12880472cc68SHong Zhang nidx_i = nidx + 1+ is_max + Mbs*i; 1289bfc6803cSHong Zhang } else { /* process other processor's is[] - only use one temp table */ 1290430a0127SHong Zhang table_i = table[0]; 1291bfc6803cSHong Zhang } 1292bfc6803cSHong Zhang ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr); 1293bfc6803cSHong Zhang ierr = PetscBTMemzero(Mbs,table0);CHKERRQ(ierr); 129476f244e2SHong Zhang if (n==0) { 129576f244e2SHong Zhang nidx[1+i] = 0; /* size of new is[i] */ 129676f244e2SHong Zhang continue; 129776f244e2SHong Zhang } 129876f244e2SHong Zhang 129976f244e2SHong Zhang isz0 = 0; col_max = 0; 1300dc008846SHong Zhang for (j=0; j<n; j++){ 1301dc008846SHong Zhang col = idx_i[j]; 1302e32f2f54SBarry Smith if (col >= Mbs) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index col %D >= Mbs %D",col,Mbs); 1303bfc6803cSHong Zhang if(!PetscBTLookupSet(table_i,col)) { 1304bfc6803cSHong Zhang ierr = PetscBTSet(table0,col);CHKERRQ(ierr); 13050472cc68SHong Zhang if (whose == MINE) {nidx_i[isz0] = col;} 1306dbe03f88SHong Zhang if (col_max < col) col_max = col; 1307bfc6803cSHong Zhang isz0++; 1308bfc6803cSHong Zhang } 1309632d0f97SHong Zhang } 1310dc008846SHong Zhang 13110472cc68SHong Zhang if (whose == MINE) {isz = isz0;} 1312fc70d14eSHong Zhang k = 0; /* no. of indices from input is[i] that have been examined */ 1313dc008846SHong Zhang for (row=0; row<mbs; row++){ 1314dc008846SHong Zhang a_start = ai[row]; a_end = ai[row+1]; 1315dc008846SHong Zhang b_start = bi[row]; b_end = bi[row+1]; 13160472cc68SHong Zhang if (PetscBTLookup(table0,row+rstart)){ /* row is on input is[i]: 13170472cc68SHong Zhang do row search: collect all col in this row */ 1318dc008846SHong Zhang for (l = a_start; l<a_end ; l++){ /* Amat */ 1319dc008846SHong Zhang col = aj[l] + rstart; 1320fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;} 1321dc008846SHong Zhang } 1322dc008846SHong Zhang for (l = b_start; l<b_end ; l++){ /* Bmat */ 1323dc008846SHong Zhang col = garray[bj[l]]; 1324fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,col)) {nidx_i[isz++] = col;} 1325dc008846SHong Zhang } 1326dc008846SHong Zhang k++; 1327dc008846SHong Zhang if (k >= isz0) break; /* for (row=0; row<mbs; row++) */ 13280472cc68SHong Zhang } else { /* row is not on input is[i]: 13290472cc68SHong Zhang do col serach: add row onto nidx_i if there is a col in nidx_i */ 1330dc008846SHong Zhang for (l = a_start; l<a_end ; l++){ /* Amat */ 1331dc008846SHong Zhang col = aj[l] + rstart; 133276f244e2SHong Zhang if (col > col_max) break; 1333dc008846SHong Zhang if (PetscBTLookup(table0,col)){ 1334fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;} 1335dc008846SHong Zhang break; /* for l = start; l<end ; l++) */ 1336632d0f97SHong Zhang } 1337632d0f97SHong Zhang } 1338dc008846SHong Zhang for (l = b_start; l<b_end ; l++){ /* Bmat */ 1339dc008846SHong Zhang col = garray[bj[l]]; 134076f244e2SHong Zhang if (col > col_max) break; 1341dc008846SHong Zhang if (PetscBTLookup(table0,col)){ 1342fc70d14eSHong Zhang if (!PetscBTLookupSet(table_i,row+rstart)) {nidx_i[isz++] = row+rstart;} 1343dc008846SHong Zhang break; /* for l = start; l<end ; l++) */ 1344632d0f97SHong Zhang } 1345dc008846SHong Zhang } 1346dc008846SHong Zhang } 1347bfc6803cSHong Zhang } 1348dc008846SHong Zhang 1349dc008846SHong Zhang if (i < is_max - 1){ 1350fc70d14eSHong Zhang idx_i += n; /* ptr to input is[i+1] array */ 1351bfc6803cSHong Zhang nidx_i += isz; /* ptr to output is[i+1] array */ 1352dc008846SHong Zhang } 1353fc70d14eSHong Zhang nidx[1+i] = isz; /* size of new is[i] */ 13541ab97a2aSSatish Balay } /* for each is */ 1355dc008846SHong Zhang ierr = PetscBTDestroy(table0);CHKERRQ(ierr); 1356632d0f97SHong Zhang 1357632d0f97SHong Zhang PetscFunctionReturn(0); 1358632d0f97SHong Zhang } 1359632d0f97SHong Zhang 1360632d0f97SHong Zhang 1361