18a729477SBarry Smith 2*1eb62cbbSBarry Smith #include "mpiaij.h" 38a729477SBarry Smith #include "vec/vecimpl.h" 48a729477SBarry Smith 58a729477SBarry Smith 6*1eb62cbbSBarry Smith #define CHUNCKSIZE 100 7*1eb62cbbSBarry Smith /* 8*1eb62cbbSBarry Smith This is a simple minded stash. Do a linear search to determine if 9*1eb62cbbSBarry Smith in stash, if not add to end. 10*1eb62cbbSBarry Smith */ 11*1eb62cbbSBarry Smith static int StashValues(Stash *stash,int row,int n, int *idxn, 12*1eb62cbbSBarry Smith Scalar *values,InsertMode addv) 138a729477SBarry Smith { 14*1eb62cbbSBarry Smith int i,j,N = stash->n,found,*n_idx, *n_idy; 15*1eb62cbbSBarry Smith Scalar val,*n_array; 168a729477SBarry Smith 17*1eb62cbbSBarry Smith for ( i=0; i<n; i++ ) { 18*1eb62cbbSBarry Smith found = 0; 19*1eb62cbbSBarry Smith val = *values++; 208a729477SBarry Smith for ( j=0; j<N; j++ ) { 21*1eb62cbbSBarry Smith if ( stash->idx[j] == row && stash->idy[j] == idxn[i]) { 22*1eb62cbbSBarry Smith /* found a match */ 23*1eb62cbbSBarry Smith if (addv == AddValues) stash->array[j] += val; 24*1eb62cbbSBarry Smith else stash->array[j] = val; 25*1eb62cbbSBarry Smith found = 1; 268a729477SBarry Smith break; 278a729477SBarry Smith } 288a729477SBarry Smith } 29*1eb62cbbSBarry Smith if (!found) { /* not found so add to end */ 30*1eb62cbbSBarry Smith if ( stash->n == stash->nmax ) { 31*1eb62cbbSBarry Smith /* allocate a larger stash */ 32*1eb62cbbSBarry Smith n_array = (Scalar *) MALLOC( (stash->nmax + CHUNCKSIZE)*( 33*1eb62cbbSBarry Smith 2*sizeof(int) + sizeof(Scalar))); 34*1eb62cbbSBarry Smith CHKPTR(n_array); 35*1eb62cbbSBarry Smith n_idx = (int *) (n_array + stash->nmax + CHUNCKSIZE); 36*1eb62cbbSBarry Smith n_idy = (int *) (n_idx + stash->nmax + CHUNCKSIZE); 37*1eb62cbbSBarry Smith MEMCPY(n_array,stash->array,stash->nmax*sizeof(Scalar)); 38*1eb62cbbSBarry Smith MEMCPY(n_idx,stash->idx,stash->nmax*sizeof(int)); 39*1eb62cbbSBarry Smith MEMCPY(n_idy,stash->idy,stash->nmax*sizeof(int)); 40*1eb62cbbSBarry Smith if (stash->array) FREE(stash->array); 41*1eb62cbbSBarry Smith stash->array = n_array; stash->idx = n_idx; stash->idy = n_idy; 42*1eb62cbbSBarry Smith stash->nmax += CHUNCKSIZE; 43*1eb62cbbSBarry Smith } 44*1eb62cbbSBarry Smith stash->array[stash->n] = val; 45*1eb62cbbSBarry Smith stash->idx[stash->n] = row; 46*1eb62cbbSBarry Smith stash->idy[stash->n++] = idxn[i]; 47*1eb62cbbSBarry Smith } 488a729477SBarry Smith } 498a729477SBarry Smith return 0; 508a729477SBarry Smith } 518a729477SBarry Smith 52*1eb62cbbSBarry Smith static int MatiAIJInsertValues(Mat mat,int m,int *idxm,int n, 53*1eb62cbbSBarry Smith int *idxn,Scalar *v,InsertMode addv) 548a729477SBarry Smith { 55*1eb62cbbSBarry Smith Matimpiaij *aij = (Matimpiaij *) mat->data; 56*1eb62cbbSBarry Smith int ierr,i,j, rstart = aij->rstart, rend = aij->rend; 57*1eb62cbbSBarry Smith int cstart = aij->cstart, cend = aij->cend,row,col; 588a729477SBarry Smith 59*1eb62cbbSBarry Smith if (aij->insertmode != NotSetValues && aij->insertmode != addv) { 60*1eb62cbbSBarry Smith SETERR(1,"You cannot mix inserts and adds"); 618a729477SBarry Smith } 62*1eb62cbbSBarry Smith aij->insertmode = addv; 638a729477SBarry Smith for ( i=0; i<m; i++ ) { 64*1eb62cbbSBarry Smith if (idxm[i] >= rstart && idxm[i] < rend) { 65*1eb62cbbSBarry Smith row = idxm[i] - rstart; 66*1eb62cbbSBarry Smith for ( j=0; j<n; j++ ) { 67*1eb62cbbSBarry Smith if (idxn[j] >= cstart && idxn[j] < cend){ 68*1eb62cbbSBarry Smith col = idxn[j] - cstart; 69*1eb62cbbSBarry Smith ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr); 70*1eb62cbbSBarry Smith } 71*1eb62cbbSBarry Smith else { 72*1eb62cbbSBarry Smith col = idxn[j]; 73*1eb62cbbSBarry Smith ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr); 74*1eb62cbbSBarry Smith } 75*1eb62cbbSBarry Smith } 76*1eb62cbbSBarry Smith } 77*1eb62cbbSBarry Smith else { 78*1eb62cbbSBarry Smith ierr = StashValues(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERR(ierr); 79*1eb62cbbSBarry Smith } 808a729477SBarry Smith } 818a729477SBarry Smith return 0; 828a729477SBarry Smith } 838a729477SBarry Smith 848a729477SBarry Smith /* 85*1eb62cbbSBarry Smith the assembly code is alot like the code for vectors, we should 86*1eb62cbbSBarry Smith sometime derive a single assembly code that can be used for 87*1eb62cbbSBarry Smith either case. 888a729477SBarry Smith */ 898a729477SBarry Smith 90*1eb62cbbSBarry Smith static int MatiAIJBeginAssemble(Mat mat) 918a729477SBarry Smith { 92*1eb62cbbSBarry Smith Matimpiaij *aij = (Matimpiaij *) mat->data; 93*1eb62cbbSBarry Smith MPI_Comm comm = aij->comm; 94*1eb62cbbSBarry Smith int ierr, numtids = aij->numtids, *owners = aij->rowners; 95*1eb62cbbSBarry Smith int mytid = aij->mytid; 96*1eb62cbbSBarry Smith MPI_Request *send_waits,*recv_waits; 97*1eb62cbbSBarry Smith int *nprocs,i,j,n,idx,*procs,nsends,nreceives,nmax,*work; 98*1eb62cbbSBarry Smith int tag = 50, *owner,*starts,count; 99*1eb62cbbSBarry Smith InsertMode addv; 100*1eb62cbbSBarry Smith Scalar *rvalues,*svalues; 101*1eb62cbbSBarry Smith 102*1eb62cbbSBarry Smith /* make sure all processors are either in INSERTMODE or ADDMODE */ 103*1eb62cbbSBarry Smith MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,numtids,MPI_INT, 104*1eb62cbbSBarry Smith MPI_BOR,comm); 105*1eb62cbbSBarry Smith if (addv == (AddValues|InsertValues)) { 106*1eb62cbbSBarry Smith SETERR(1,"Some processors have inserted while others have added"); 107*1eb62cbbSBarry Smith } 108*1eb62cbbSBarry Smith aij->insertmode = addv; /* in case this processor had no cache */ 109*1eb62cbbSBarry Smith 110*1eb62cbbSBarry Smith /* first count number of contributors to each processor */ 111*1eb62cbbSBarry Smith nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs); 112*1eb62cbbSBarry Smith MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids; 113*1eb62cbbSBarry Smith owner = (int *) MALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTR(owner); 114*1eb62cbbSBarry Smith for ( i=0; i<aij->stash.n; i++ ) { 115*1eb62cbbSBarry Smith idx = aij->stash.idx[i]; 116*1eb62cbbSBarry Smith for ( j=0; j<numtids; j++ ) { 117*1eb62cbbSBarry Smith if (idx >= owners[j] && idx < owners[j+1]) { 118*1eb62cbbSBarry Smith nprocs[j]++; procs[j] = 1; owner[i] = j; break; 1198a729477SBarry Smith } 1208a729477SBarry Smith } 1218a729477SBarry Smith } 122*1eb62cbbSBarry Smith nsends = 0; for ( i=0; i<numtids; i++ ) { nsends += procs[i];} 123*1eb62cbbSBarry Smith 124*1eb62cbbSBarry Smith /* inform other processors of number of messages and max length*/ 125*1eb62cbbSBarry Smith work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work); 126*1eb62cbbSBarry Smith MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm); 127*1eb62cbbSBarry Smith nreceives = work[mytid]; 128*1eb62cbbSBarry Smith MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm); 129*1eb62cbbSBarry Smith nmax = work[mytid]; 130*1eb62cbbSBarry Smith FREE(work); 131*1eb62cbbSBarry Smith 132*1eb62cbbSBarry Smith /* post receives: 133*1eb62cbbSBarry Smith 1) each message will consist of ordered pairs 134*1eb62cbbSBarry Smith (global index,value) we store the global index as a double 135*1eb62cbbSBarry Smith to simply the message passing. 136*1eb62cbbSBarry Smith 2) since we don't know how long each individual message is we 137*1eb62cbbSBarry Smith allocate the largest needed buffer for each receive. Potentially 138*1eb62cbbSBarry Smith this is a lot of wasted space. 139*1eb62cbbSBarry Smith 140*1eb62cbbSBarry Smith 141*1eb62cbbSBarry Smith This could be done better. 142*1eb62cbbSBarry Smith */ 143*1eb62cbbSBarry Smith rvalues = (Scalar *) MALLOC(3*(nreceives+1)*nmax*sizeof(Scalar)); 144*1eb62cbbSBarry Smith CHKPTR(rvalues); 145*1eb62cbbSBarry Smith recv_waits = (MPI_Request *) MALLOC((nreceives+1)*sizeof(MPI_Request)); 146*1eb62cbbSBarry Smith CHKPTR(recv_waits); 147*1eb62cbbSBarry Smith for ( i=0; i<nreceives; i++ ) { 148*1eb62cbbSBarry Smith MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPI_SCALAR,MPI_ANY_SOURCE,tag, 149*1eb62cbbSBarry Smith comm,recv_waits+i); 150*1eb62cbbSBarry Smith } 151*1eb62cbbSBarry Smith 152*1eb62cbbSBarry Smith /* do sends: 153*1eb62cbbSBarry Smith 1) starts[i] gives the starting index in svalues for stuff going to 154*1eb62cbbSBarry Smith the ith processor 155*1eb62cbbSBarry Smith */ 156*1eb62cbbSBarry Smith svalues = (Scalar *) MALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) ); 157*1eb62cbbSBarry Smith CHKPTR(svalues); 158*1eb62cbbSBarry Smith send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request)); 159*1eb62cbbSBarry Smith CHKPTR(send_waits); 160*1eb62cbbSBarry Smith starts = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(starts); 161*1eb62cbbSBarry Smith starts[0] = 0; 162*1eb62cbbSBarry Smith for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 163*1eb62cbbSBarry Smith for ( i=0; i<aij->stash.n; i++ ) { 164*1eb62cbbSBarry Smith svalues[3*starts[owner[i]]] = (Scalar) aij->stash.idx[i]; 165*1eb62cbbSBarry Smith svalues[3*starts[owner[i]]+1] = (Scalar) aij->stash.idy[i]; 166*1eb62cbbSBarry Smith svalues[3*(starts[owner[i]]++)+2] = aij->stash.array[i]; 167*1eb62cbbSBarry Smith } 168*1eb62cbbSBarry Smith FREE(owner); 169*1eb62cbbSBarry Smith starts[0] = 0; 170*1eb62cbbSBarry Smith for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 171*1eb62cbbSBarry Smith count = 0; 172*1eb62cbbSBarry Smith for ( i=0; i<numtids; i++ ) { 173*1eb62cbbSBarry Smith if (procs[i]) { 174*1eb62cbbSBarry Smith MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPI_SCALAR,i,tag, 175*1eb62cbbSBarry Smith comm,send_waits+count++); 176*1eb62cbbSBarry Smith } 177*1eb62cbbSBarry Smith } 178*1eb62cbbSBarry Smith FREE(starts); FREE(nprocs); 179*1eb62cbbSBarry Smith 180*1eb62cbbSBarry Smith /* Free cache space */ 181*1eb62cbbSBarry Smith aij->stash.nmax = aij->stash.n = 0; 182*1eb62cbbSBarry Smith if (aij->stash.array){ FREE(aij->stash.array); aij->stash.array = 0;} 183*1eb62cbbSBarry Smith 184*1eb62cbbSBarry Smith aij->svalues = svalues; aij->rvalues = rvalues; 185*1eb62cbbSBarry Smith aij->nsends = nsends; aij->nrecvs = nreceives; 186*1eb62cbbSBarry Smith aij->send_waits = send_waits; aij->recv_waits = recv_waits; 187*1eb62cbbSBarry Smith aij->rmax = nmax; 188*1eb62cbbSBarry Smith 189*1eb62cbbSBarry Smith return 0; 190*1eb62cbbSBarry Smith } 191*1eb62cbbSBarry Smith extern int MPIAIJSetUpMultiply(Mat); 192*1eb62cbbSBarry Smith 193*1eb62cbbSBarry Smith static int MatiAIJEndAssemble(Mat mat) 194*1eb62cbbSBarry Smith { 195*1eb62cbbSBarry Smith int ierr; 196*1eb62cbbSBarry Smith Matimpiaij *aij = (Matimpiaij *) mat->data; 197*1eb62cbbSBarry Smith 198*1eb62cbbSBarry Smith MPI_Status *send_status,recv_status; 199*1eb62cbbSBarry Smith int index,idx,nrecvs = aij->nrecvs, count = nrecvs, i, n; 200*1eb62cbbSBarry Smith int row,col; 201*1eb62cbbSBarry Smith Scalar *values,val; 202*1eb62cbbSBarry Smith InsertMode addv = aij->insertmode; 203*1eb62cbbSBarry Smith 204*1eb62cbbSBarry Smith /* wait on receives */ 205*1eb62cbbSBarry Smith while (count) { 206*1eb62cbbSBarry Smith MPI_Waitany(nrecvs,aij->recv_waits,&index,&recv_status); 207*1eb62cbbSBarry Smith /* unpack receives into our local space */ 208*1eb62cbbSBarry Smith values = aij->rvalues + 3*index*aij->rmax; 209*1eb62cbbSBarry Smith MPI_Get_count(&recv_status,MPI_SCALAR,&n); 210*1eb62cbbSBarry Smith n = n/3; 211*1eb62cbbSBarry Smith for ( i=0; i<n; i++ ) { 212*1eb62cbbSBarry Smith row = (int) PETSCREAL(values[3*i]) - aij->rstart; 213*1eb62cbbSBarry Smith col = (int) PETSCREAL(values[3*i+1]); 214*1eb62cbbSBarry Smith val = values[3*i+2]; 215*1eb62cbbSBarry Smith if (col >= aij->cstart && col < aij->cend) { 216*1eb62cbbSBarry Smith col -= aij->cstart; 217*1eb62cbbSBarry Smith MatSetValues(aij->A,1,&row,1,&col,&val,addv); 218*1eb62cbbSBarry Smith } 219*1eb62cbbSBarry Smith else { 220*1eb62cbbSBarry Smith MatSetValues(aij->B,1,&row,1,&col,&val,addv); 221*1eb62cbbSBarry Smith } 222*1eb62cbbSBarry Smith } 223*1eb62cbbSBarry Smith count--; 224*1eb62cbbSBarry Smith } 225*1eb62cbbSBarry Smith FREE(aij->recv_waits); FREE(aij->rvalues); 226*1eb62cbbSBarry Smith 227*1eb62cbbSBarry Smith /* wait on sends */ 228*1eb62cbbSBarry Smith if (aij->nsends) { 229*1eb62cbbSBarry Smith send_status = (MPI_Status *) MALLOC( aij->nsends*sizeof(MPI_Status) ); 230*1eb62cbbSBarry Smith CHKPTR(send_status); 231*1eb62cbbSBarry Smith MPI_Waitall(aij->nsends,aij->send_waits,send_status); 232*1eb62cbbSBarry Smith FREE(send_status); 233*1eb62cbbSBarry Smith } 234*1eb62cbbSBarry Smith FREE(aij->send_waits); FREE(aij->svalues); 235*1eb62cbbSBarry Smith 236*1eb62cbbSBarry Smith aij->insertmode = NotSetValues; 237*1eb62cbbSBarry Smith ierr = MatBeginAssembly(aij->A); CHKERR(ierr); 238*1eb62cbbSBarry Smith ierr = MatEndAssembly(aij->A); CHKERR(ierr); 239*1eb62cbbSBarry Smith 240*1eb62cbbSBarry Smith ierr = MPIAIJSetUpMultiply(mat); CHKERR(ierr); 241*1eb62cbbSBarry Smith ierr = MatBeginAssembly(aij->B); CHKERR(ierr); 242*1eb62cbbSBarry Smith ierr = MatEndAssembly(aij->B); CHKERR(ierr); 2438a729477SBarry Smith return 0; 2448a729477SBarry Smith } 2458a729477SBarry Smith 246*1eb62cbbSBarry Smith static int MatiZero(Mat A) 247*1eb62cbbSBarry Smith { 248*1eb62cbbSBarry Smith Matimpiaij *l = (Matimpiaij *) A->data; 249*1eb62cbbSBarry Smith 250*1eb62cbbSBarry Smith MatZeroEntries(l->A); MatZeroEntries(l->B); 251*1eb62cbbSBarry Smith return 0; 252*1eb62cbbSBarry Smith } 253*1eb62cbbSBarry Smith 254*1eb62cbbSBarry Smith /* again this uses the same basic stratagy as in the assembly and 255*1eb62cbbSBarry Smith scatter create routines, we should try to do it systemamatically 256*1eb62cbbSBarry Smith if we can figure out the proper level of generality. */ 257*1eb62cbbSBarry Smith 258*1eb62cbbSBarry Smith /* the code does not do the diagonal entries correctly unless the 259*1eb62cbbSBarry Smith matrix is square and the column and row owerships are identical. 260*1eb62cbbSBarry Smith This is a BUG. The only way to fix it seems to be to access 261*1eb62cbbSBarry Smith aij->A and aij->B directly and not through the MatZeroRows() 262*1eb62cbbSBarry Smith routine. 263*1eb62cbbSBarry Smith */ 264*1eb62cbbSBarry Smith static int MatiZerorows(Mat A,IS is,Scalar *diag) 265*1eb62cbbSBarry Smith { 266*1eb62cbbSBarry Smith Matimpiaij *l = (Matimpiaij *) A->data; 267*1eb62cbbSBarry Smith int i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids; 268*1eb62cbbSBarry Smith int *localrows,*procs,*nprocs,j,found,idx,nsends,*work; 269*1eb62cbbSBarry Smith int nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid; 270*1eb62cbbSBarry Smith int *rvalues,tag = 67,count,base,slen,n,len,*source; 271*1eb62cbbSBarry Smith int *lens,index,*lrows,*values; 272*1eb62cbbSBarry Smith MPI_Comm comm = l->comm; 273*1eb62cbbSBarry Smith MPI_Request *send_waits,*recv_waits; 274*1eb62cbbSBarry Smith MPI_Status recv_status,*send_status; 275*1eb62cbbSBarry Smith IS istmp; 276*1eb62cbbSBarry Smith 277*1eb62cbbSBarry Smith ierr = ISGetLocalSize(is,&N); CHKERR(ierr); 278*1eb62cbbSBarry Smith ierr = ISGetIndices(is,&rows); CHKERR(ierr); 279*1eb62cbbSBarry Smith 280*1eb62cbbSBarry Smith /* first count number of contributors to each processor */ 281*1eb62cbbSBarry Smith nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs); 282*1eb62cbbSBarry Smith MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids; 283*1eb62cbbSBarry Smith owner = (int *) MALLOC((N+1)*sizeof(int)); CHKPTR(owner); /* see note*/ 284*1eb62cbbSBarry Smith for ( i=0; i<N; i++ ) { 285*1eb62cbbSBarry Smith idx = rows[i]; 286*1eb62cbbSBarry Smith found = 0; 287*1eb62cbbSBarry Smith for ( j=0; j<numtids; j++ ) { 288*1eb62cbbSBarry Smith if (idx >= owners[j] && idx < owners[j+1]) { 289*1eb62cbbSBarry Smith nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break; 290*1eb62cbbSBarry Smith } 291*1eb62cbbSBarry Smith } 292*1eb62cbbSBarry Smith if (!found) SETERR(1,"Index out of range"); 293*1eb62cbbSBarry Smith } 294*1eb62cbbSBarry Smith nsends = 0; for ( i=0; i<numtids; i++ ) { nsends += procs[i];} 295*1eb62cbbSBarry Smith 296*1eb62cbbSBarry Smith /* inform other processors of number of messages and max length*/ 297*1eb62cbbSBarry Smith work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work); 298*1eb62cbbSBarry Smith MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm); 299*1eb62cbbSBarry Smith nrecvs = work[mytid]; 300*1eb62cbbSBarry Smith MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm); 301*1eb62cbbSBarry Smith nmax = work[mytid]; 302*1eb62cbbSBarry Smith FREE(work); 303*1eb62cbbSBarry Smith 304*1eb62cbbSBarry Smith /* post receives: */ 305*1eb62cbbSBarry Smith rvalues = (int *) MALLOC((nrecvs+1)*nmax*sizeof(int)); /*see note */ 306*1eb62cbbSBarry Smith CHKPTR(rvalues); 307*1eb62cbbSBarry Smith recv_waits = (MPI_Request *) MALLOC((nrecvs+1)*sizeof(MPI_Request)); 308*1eb62cbbSBarry Smith CHKPTR(recv_waits); 309*1eb62cbbSBarry Smith for ( i=0; i<nrecvs; i++ ) { 310*1eb62cbbSBarry Smith MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag, 311*1eb62cbbSBarry Smith comm,recv_waits+i); 312*1eb62cbbSBarry Smith } 313*1eb62cbbSBarry Smith 314*1eb62cbbSBarry Smith /* do sends: 315*1eb62cbbSBarry Smith 1) starts[i] gives the starting index in svalues for stuff going to 316*1eb62cbbSBarry Smith the ith processor 317*1eb62cbbSBarry Smith */ 318*1eb62cbbSBarry Smith svalues = (int *) MALLOC( (N+1)*sizeof(int) ); CHKPTR(svalues); 319*1eb62cbbSBarry Smith send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request)); 320*1eb62cbbSBarry Smith CHKPTR(send_waits); 321*1eb62cbbSBarry Smith starts = (int *) MALLOC( (numtids+1)*sizeof(int) ); CHKPTR(starts); 322*1eb62cbbSBarry Smith starts[0] = 0; 323*1eb62cbbSBarry Smith for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 324*1eb62cbbSBarry Smith for ( i=0; i<N; i++ ) { 325*1eb62cbbSBarry Smith svalues[starts[owner[i]]++] = rows[i]; 326*1eb62cbbSBarry Smith } 327*1eb62cbbSBarry Smith ISRestoreIndices(is,&rows); 328*1eb62cbbSBarry Smith 329*1eb62cbbSBarry Smith starts[0] = 0; 330*1eb62cbbSBarry Smith for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 331*1eb62cbbSBarry Smith count = 0; 332*1eb62cbbSBarry Smith for ( i=0; i<numtids; i++ ) { 333*1eb62cbbSBarry Smith if (procs[i]) { 334*1eb62cbbSBarry Smith MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag, 335*1eb62cbbSBarry Smith comm,send_waits+count++); 336*1eb62cbbSBarry Smith } 337*1eb62cbbSBarry Smith } 338*1eb62cbbSBarry Smith FREE(starts); 339*1eb62cbbSBarry Smith 340*1eb62cbbSBarry Smith base = owners[mytid]; 341*1eb62cbbSBarry Smith 342*1eb62cbbSBarry Smith /* wait on receives */ 343*1eb62cbbSBarry Smith lens = (int *) MALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTR(lens); 344*1eb62cbbSBarry Smith source = lens + nrecvs; 345*1eb62cbbSBarry Smith count = nrecvs; slen = 0; 346*1eb62cbbSBarry Smith while (count) { 347*1eb62cbbSBarry Smith MPI_Waitany(nrecvs,recv_waits,&index,&recv_status); 348*1eb62cbbSBarry Smith /* unpack receives into our local space */ 349*1eb62cbbSBarry Smith MPI_Get_count(&recv_status,MPI_INT,&n); 350*1eb62cbbSBarry Smith source[index] = recv_status.MPI_SOURCE; 351*1eb62cbbSBarry Smith lens[index] = n; 352*1eb62cbbSBarry Smith slen += n; 353*1eb62cbbSBarry Smith count--; 354*1eb62cbbSBarry Smith } 355*1eb62cbbSBarry Smith FREE(recv_waits); 356*1eb62cbbSBarry Smith 357*1eb62cbbSBarry Smith /* move the data into the send scatter */ 358*1eb62cbbSBarry Smith lrows = (int *) MALLOC( slen*sizeof(int) ); CHKPTR(lrows); 359*1eb62cbbSBarry Smith count = 0; 360*1eb62cbbSBarry Smith for ( i=0; i<nrecvs; i++ ) { 361*1eb62cbbSBarry Smith values = rvalues + i*nmax; 362*1eb62cbbSBarry Smith for ( j=0; j<lens[i]; j++ ) { 363*1eb62cbbSBarry Smith lrows[count++] = values[j] - base; 364*1eb62cbbSBarry Smith } 365*1eb62cbbSBarry Smith } 366*1eb62cbbSBarry Smith FREE(rvalues); FREE(lens); 367*1eb62cbbSBarry Smith FREE(owner); FREE(nprocs); 368*1eb62cbbSBarry Smith 369*1eb62cbbSBarry Smith /* actually zap the local rows */ 370*1eb62cbbSBarry Smith ierr = ISCreateSequential(slen,lrows,&istmp); CHKERR(ierr); FREE(lrows); 371*1eb62cbbSBarry Smith ierr = MatZeroRows(l->A,istmp,diag); CHKERR(ierr); 372*1eb62cbbSBarry Smith ierr = MatZeroRows(l->B,istmp,0); CHKERR(ierr); 373*1eb62cbbSBarry Smith ierr = ISDestroy(istmp); CHKERR(ierr); 374*1eb62cbbSBarry Smith 375*1eb62cbbSBarry Smith /* wait on sends */ 376*1eb62cbbSBarry Smith if (nsends) { 377*1eb62cbbSBarry Smith send_status = (MPI_Status *) MALLOC( nsends*sizeof(MPI_Status) ); 378*1eb62cbbSBarry Smith CHKPTR(send_status); 379*1eb62cbbSBarry Smith MPI_Waitall(nsends,send_waits,send_status); 380*1eb62cbbSBarry Smith FREE(send_status); 381*1eb62cbbSBarry Smith } 382*1eb62cbbSBarry Smith FREE(send_waits); FREE(svalues); 383*1eb62cbbSBarry Smith 384*1eb62cbbSBarry Smith 385*1eb62cbbSBarry Smith return 0; 386*1eb62cbbSBarry Smith } 387*1eb62cbbSBarry Smith 388*1eb62cbbSBarry Smith static int MatiAIJMult(Mat aijin,Vec xx,Vec yy) 389*1eb62cbbSBarry Smith { 390*1eb62cbbSBarry Smith Matimpiaij *aij = (Matimpiaij *) aijin->data; 391*1eb62cbbSBarry Smith int ierr; 392*1eb62cbbSBarry Smith 393*1eb62cbbSBarry Smith ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx); 394*1eb62cbbSBarry Smith CHKERR(ierr); 395*1eb62cbbSBarry Smith ierr = MatMult(aij->A,xx,yy); CHKERR(ierr); 396*1eb62cbbSBarry Smith ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx); CHKERR(ierr); 397*1eb62cbbSBarry Smith ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERR(ierr); 398*1eb62cbbSBarry Smith return 0; 399*1eb62cbbSBarry Smith } 400*1eb62cbbSBarry Smith 401*1eb62cbbSBarry Smith /* 402*1eb62cbbSBarry Smith This only works correctly for square matrices where the subblock A->A is the 403*1eb62cbbSBarry Smith diagonal block 404*1eb62cbbSBarry Smith */ 405*1eb62cbbSBarry Smith static int MatiAIJgetdiag(Mat Ain,Vec v) 406*1eb62cbbSBarry Smith { 407*1eb62cbbSBarry Smith Matimpiaij *A = (Matimpiaij *) Ain->data; 408*1eb62cbbSBarry Smith return MatGetDiagonal(A->A,v); 409*1eb62cbbSBarry Smith } 410*1eb62cbbSBarry Smith 411*1eb62cbbSBarry Smith static int MatiAIJdestroy(PetscObject obj) 412*1eb62cbbSBarry Smith { 413*1eb62cbbSBarry Smith Mat mat = (Mat) obj; 414*1eb62cbbSBarry Smith Matimpiaij *aij = (Matimpiaij *) mat->data; 415*1eb62cbbSBarry Smith int ierr; 416*1eb62cbbSBarry Smith FREE(aij->rowners); 417*1eb62cbbSBarry Smith ierr = MatDestroy(aij->A); CHKERR(ierr); 418*1eb62cbbSBarry Smith ierr = MatDestroy(aij->B); CHKERR(ierr); 419*1eb62cbbSBarry Smith FREE(aij); FREE(mat); 420*1eb62cbbSBarry Smith if (aij->lvec) VecDestroy(aij->lvec); 421*1eb62cbbSBarry Smith if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx); 422*1eb62cbbSBarry Smith return 0; 423*1eb62cbbSBarry Smith } 424*1eb62cbbSBarry Smith 425*1eb62cbbSBarry Smith static int MatiView(PetscObject obj,Viewer viewer) 426*1eb62cbbSBarry Smith { 427*1eb62cbbSBarry Smith Mat mat = (Mat) obj; 428*1eb62cbbSBarry Smith Matimpiaij *aij = (Matimpiaij *) mat->data; 429*1eb62cbbSBarry Smith int ierr; 430*1eb62cbbSBarry Smith 431*1eb62cbbSBarry Smith MPE_Seq_begin(aij->comm,1); 432*1eb62cbbSBarry Smith printf("[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n", 433*1eb62cbbSBarry Smith aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart, 434*1eb62cbbSBarry Smith aij->cend); 435*1eb62cbbSBarry Smith ierr = MatView(aij->A,viewer); CHKERR(ierr); 436*1eb62cbbSBarry Smith ierr = MatView(aij->B,viewer); CHKERR(ierr); 437*1eb62cbbSBarry Smith MPE_Seq_end(aij->comm,1); 438*1eb62cbbSBarry Smith return 0; 439*1eb62cbbSBarry Smith } 440*1eb62cbbSBarry Smith 441*1eb62cbbSBarry Smith /* 442*1eb62cbbSBarry Smith This has to provide several versions. 443*1eb62cbbSBarry Smith 444*1eb62cbbSBarry Smith 1) per sequential 445*1eb62cbbSBarry Smith 2) a) use only local smoothing updating outer values only once. 446*1eb62cbbSBarry Smith b) local smoothing updating outer values each inner iteration 447*1eb62cbbSBarry Smith 3) color updating out values betwen colors. (this imples an 448*1eb62cbbSBarry Smith ordering that is sort of related to the IS argument, it 449*1eb62cbbSBarry Smith is not clear a IS argument makes the most sense perhaps it 450*1eb62cbbSBarry Smith should be dropped. 451*1eb62cbbSBarry Smith */ 4528a729477SBarry Smith static int MatiAIJrelax(Mat matin,Vec bb,double omega,int flag,IS is, 4538a729477SBarry Smith int its,Vec xx) 4548a729477SBarry Smith { 455*1eb62cbbSBarry Smith Matimpiaij *mat = (Matimpiaij *) matin->data; 456*1eb62cbbSBarry Smith Scalar zero = 0.0; 4578a729477SBarry Smith int ierr,one = 1, tmp, *idx, *diag; 4588a729477SBarry Smith int n = mat->n, m = mat->m, i, j; 4598a729477SBarry Smith 4608a729477SBarry Smith if (is) SETERR(1,"No support for ordering in relaxation"); 4618a729477SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) { 4628a729477SBarry Smith if (ierr = VecSet(&zero,xx)) return ierr; 4638a729477SBarry Smith } 464*1eb62cbbSBarry Smith 465*1eb62cbbSBarry Smith /* update outer values from other processors*/ 466*1eb62cbbSBarry Smith 467*1eb62cbbSBarry Smith /* smooth locally */ 4688a729477SBarry Smith return 0; 4698a729477SBarry Smith } 4708a729477SBarry Smith /* -------------------------------------------------------------------*/ 471*1eb62cbbSBarry Smith static struct _MatOps MatOps = {MatiAIJInsertValues, 4728a729477SBarry Smith 0, 0, 473*1eb62cbbSBarry Smith MatiAIJMult,0,0,0, 474*1eb62cbbSBarry Smith 0,0,0,0, 4758a729477SBarry Smith 0,0, 4768a729477SBarry Smith MatiAIJrelax, 4778a729477SBarry Smith 0, 478*1eb62cbbSBarry Smith 0,0,0, 4798a729477SBarry Smith 0, 4808a729477SBarry Smith MatiAIJgetdiag,0,0, 481*1eb62cbbSBarry Smith MatiAIJBeginAssemble,MatiAIJEndAssemble, 482*1eb62cbbSBarry Smith 0, 483*1eb62cbbSBarry Smith 0,MatiZero,MatiZerorows,0, 484*1eb62cbbSBarry Smith 0,0,0,0 }; 4858a729477SBarry Smith 4868a729477SBarry Smith 4878a729477SBarry Smith 4888a729477SBarry Smith /*@ 4898a729477SBarry Smith 490*1eb62cbbSBarry Smith MatCreateMPIAIJ - Creates a sparse parallel matrix 491*1eb62cbbSBarry Smith in AIJ format. 4928a729477SBarry Smith 4938a729477SBarry Smith Input Parameters: 494*1eb62cbbSBarry Smith . comm - MPI communicator 495*1eb62cbbSBarry Smith . m,n - number of local rows and columns (or -1 to have calculated) 496*1eb62cbbSBarry Smith . M,N - global rows and columns (or -1 to have calculated) 497*1eb62cbbSBarry Smith . d_nz - total number nonzeros in diagonal portion of matrix 498*1eb62cbbSBarry Smith . d_nzz - number of nonzeros per row in diagonal portion of matrix or null 4998a729477SBarry Smith . You must leave room for the diagonal entry even if it is zero. 500*1eb62cbbSBarry Smith . o_nz - total number nonzeros in off-diagonal portion of matrix 501*1eb62cbbSBarry Smith . o_nzz - number of nonzeros per row in off-diagonal portion of matrix 502*1eb62cbbSBarry Smith . or null. You must have at least one nonzero per row. 5038a729477SBarry Smith 5048a729477SBarry Smith Output parameters: 5058a729477SBarry Smith . newmat - the matrix 5068a729477SBarry Smith 507*1eb62cbbSBarry Smith Keywords: matrix, aij, compressed row, sparse, parallel 5088a729477SBarry Smith @*/ 509*1eb62cbbSBarry Smith int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N, 510*1eb62cbbSBarry Smith int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat) 5118a729477SBarry Smith { 5128a729477SBarry Smith Mat mat; 513*1eb62cbbSBarry Smith Matimpiaij *aij; 514*1eb62cbbSBarry Smith int ierr, i,rl,len,sum[2],work[2]; 5158a729477SBarry Smith *newmat = 0; 5168a729477SBarry Smith CREATEHEADER(mat,_Mat); 517*1eb62cbbSBarry Smith mat->data = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij); 5188a729477SBarry Smith mat->cookie = MAT_COOKIE; 5198a729477SBarry Smith mat->ops = &MatOps; 5208a729477SBarry Smith mat->destroy = MatiAIJdestroy; 521*1eb62cbbSBarry Smith mat->view = MatiView; 522*1eb62cbbSBarry Smith mat->type = MATAIJMPI; 5238a729477SBarry Smith mat->factor = 0; 5248a729477SBarry Smith mat->row = 0; 5258a729477SBarry Smith mat->col = 0; 526*1eb62cbbSBarry Smith aij->comm = comm; 527*1eb62cbbSBarry Smith aij->insertmode = NotSetValues; 528*1eb62cbbSBarry Smith MPI_Comm_rank(comm,&aij->mytid); 529*1eb62cbbSBarry Smith MPI_Comm_size(comm,&aij->numtids); 530*1eb62cbbSBarry Smith 531*1eb62cbbSBarry Smith if (M == -1 || N == -1) { 532*1eb62cbbSBarry Smith work[0] = m; work[1] = n; 533*1eb62cbbSBarry Smith MPI_Allreduce((void *) work,(void *) sum,1,MPI_INT,MPI_SUM,comm ); 534*1eb62cbbSBarry Smith if (M == -1) M = sum[0]; 535*1eb62cbbSBarry Smith if (N == -1) N = sum[1]; 536*1eb62cbbSBarry Smith } 537*1eb62cbbSBarry Smith if (m == -1) {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);} 538*1eb62cbbSBarry Smith if (n == -1) {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);} 5398a729477SBarry Smith aij->m = m; 5408a729477SBarry Smith aij->n = n; 541*1eb62cbbSBarry Smith aij->N = N; 542*1eb62cbbSBarry Smith aij->M = M; 543*1eb62cbbSBarry Smith 544*1eb62cbbSBarry Smith /* build local table of row and column ownerships */ 545*1eb62cbbSBarry Smith aij->rowners = (int *) MALLOC(2*(aij->numtids+2)*sizeof(int)); 546*1eb62cbbSBarry Smith CHKPTR(aij->rowners); 547*1eb62cbbSBarry Smith aij->cowners = aij->rowners + aij->numtids + 1; 548*1eb62cbbSBarry Smith MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm); 549*1eb62cbbSBarry Smith aij->rowners[0] = 0; 550*1eb62cbbSBarry Smith for ( i=2; i<=aij->numtids; i++ ) { 551*1eb62cbbSBarry Smith aij->rowners[i] += aij->rowners[i-1]; 5528a729477SBarry Smith } 553*1eb62cbbSBarry Smith aij->rstart = aij->rowners[aij->mytid]; 554*1eb62cbbSBarry Smith aij->rend = aij->rowners[aij->mytid+1]; 555*1eb62cbbSBarry Smith MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm); 556*1eb62cbbSBarry Smith aij->cowners[0] = 0; 557*1eb62cbbSBarry Smith for ( i=2; i<=aij->numtids; i++ ) { 558*1eb62cbbSBarry Smith aij->cowners[i] += aij->cowners[i-1]; 5598a729477SBarry Smith } 560*1eb62cbbSBarry Smith aij->cstart = aij->cowners[aij->mytid]; 561*1eb62cbbSBarry Smith aij->cend = aij->cowners[aij->mytid+1]; 5628a729477SBarry Smith 5638a729477SBarry Smith 564*1eb62cbbSBarry Smith ierr = MatCreateSequentialAIJ(m,n,d_nz,d_nnz,&aij->A); CHKERR(ierr); 565*1eb62cbbSBarry Smith ierr = MatCreateSequentialAIJ(m,N,o_nz,o_nnz,&aij->B); CHKERR(ierr); 5668a729477SBarry Smith 567*1eb62cbbSBarry Smith /* build cache for off array entries formed */ 568*1eb62cbbSBarry Smith aij->stash.nmax = CHUNCKSIZE; /* completely arbratray number */ 569*1eb62cbbSBarry Smith aij->stash.n = 0; 570*1eb62cbbSBarry Smith aij->stash.array = (Scalar *) MALLOC( aij->stash.nmax*(2*sizeof(int) + 571*1eb62cbbSBarry Smith sizeof(Scalar))); CHKPTR(aij->stash.array); 572*1eb62cbbSBarry Smith aij->stash.idx = (int *) (aij->stash.array + aij->stash.nmax); 573*1eb62cbbSBarry Smith aij->stash.idy = (int *) (aij->stash.idx + aij->stash.nmax); 5748a729477SBarry Smith 575*1eb62cbbSBarry Smith /* stuff used for matrix vector multiply */ 576*1eb62cbbSBarry Smith aij->lvec = 0; 577*1eb62cbbSBarry Smith aij->Mvctx = 0; 5788a729477SBarry Smith 5798a729477SBarry Smith *newmat = mat; 5808a729477SBarry Smith return 0; 5818a729477SBarry Smith } 582