1 2 #include "mpiaij.h" 3 #include "vec/vecimpl.h" 4 #include "inline/spops.h" 5 6 #define CHUNCKSIZE 100 7 /* 8 This is a simple minded stash. Do a linear search to determine if 9 in stash, if not add to end. 10 */ 11 static int StashValues(Stash *stash,int row,int n, int *idxn, 12 Scalar *values,InsertMode addv) 13 { 14 int i,j,N = stash->n,found,*n_idx, *n_idy; 15 Scalar val,*n_array; 16 17 for ( i=0; i<n; i++ ) { 18 found = 0; 19 val = *values++; 20 for ( j=0; j<N; j++ ) { 21 if ( stash->idx[j] == row && stash->idy[j] == idxn[i]) { 22 /* found a match */ 23 if (addv == AddValues) stash->array[j] += val; 24 else stash->array[j] = val; 25 found = 1; 26 break; 27 } 28 } 29 if (!found) { /* not found so add to end */ 30 if ( stash->n == stash->nmax ) { 31 /* allocate a larger stash */ 32 n_array = (Scalar *) MALLOC( (stash->nmax + CHUNCKSIZE)*( 33 2*sizeof(int) + sizeof(Scalar))); 34 CHKPTR(n_array); 35 n_idx = (int *) (n_array + stash->nmax + CHUNCKSIZE); 36 n_idy = (int *) (n_idx + stash->nmax + CHUNCKSIZE); 37 MEMCPY(n_array,stash->array,stash->nmax*sizeof(Scalar)); 38 MEMCPY(n_idx,stash->idx,stash->nmax*sizeof(int)); 39 MEMCPY(n_idy,stash->idy,stash->nmax*sizeof(int)); 40 if (stash->array) FREE(stash->array); 41 stash->array = n_array; stash->idx = n_idx; stash->idy = n_idy; 42 stash->nmax += CHUNCKSIZE; 43 } 44 stash->array[stash->n] = val; 45 stash->idx[stash->n] = row; 46 stash->idy[stash->n++] = idxn[i]; 47 } 48 } 49 return 0; 50 } 51 52 static int MatiAIJInsertValues(Mat mat,int m,int *idxm,int n, 53 int *idxn,Scalar *v,InsertMode addv) 54 { 55 Matimpiaij *aij = (Matimpiaij *) mat->data; 56 int ierr,i,j, rstart = aij->rstart, rend = aij->rend; 57 int cstart = aij->cstart, cend = aij->cend,row,col; 58 59 if (aij->insertmode != NotSetValues && aij->insertmode != addv) { 60 SETERR(1,"You cannot mix inserts and adds"); 61 } 62 aij->insertmode = addv; 63 for ( i=0; i<m; i++ ) { 64 if (idxm[i] < 0) SETERR(1,"Negative row index"); 65 if (idxm[i] >= aij->M) 
SETERR(1,"Row index too large"); 66 if (idxm[i] >= rstart && idxm[i] < rend) { 67 row = idxm[i] - rstart; 68 for ( j=0; j<n; j++ ) { 69 if (idxn[j] < 0) SETERR(1,"Negative column index"); 70 if (idxn[j] >= aij->N) SETERR(1,"Column index too large"); 71 if (idxn[j] >= cstart && idxn[j] < cend){ 72 col = idxn[j] - cstart; 73 ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr); 74 } 75 else { 76 if (aij->assembled) { 77 SETERR(1,"Cannot insert off diagonal block in already\ 78 assembled matrix. Contact petsc-maint@mcs.anl.gov\ 79 if your need this feature"); 80 } 81 col = idxn[j]; 82 ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr); 83 } 84 } 85 } 86 else { 87 ierr = StashValues(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERR(ierr); 88 } 89 } 90 return 0; 91 } 92 93 /* 94 the assembly code is alot like the code for vectors, we should 95 sometime derive a single assembly code that can be used for 96 either case. 97 */ 98 99 static int MatiAIJBeginAssemble(Mat mat) 100 { 101 Matimpiaij *aij = (Matimpiaij *) mat->data; 102 MPI_Comm comm = mat->comm; 103 int ierr, numtids = aij->numtids, *owners = aij->rowners; 104 int mytid = aij->mytid; 105 MPI_Request *send_waits,*recv_waits; 106 int *nprocs,i,j,n,idx,*procs,nsends,nreceives,nmax,*work; 107 int tag = 50, *owner,*starts,count; 108 InsertMode addv; 109 Scalar *rvalues,*svalues; 110 111 /* make sure all processors are either in INSERTMODE or ADDMODE */ 112 MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,numtids,MPI_INT, 113 MPI_BOR,comm); 114 if (addv == (AddValues|InsertValues)) { 115 SETERR(1,"Some processors have inserted while others have added"); 116 } 117 aij->insertmode = addv; /* in case this processor had no cache */ 118 119 /* first count number of contributors to each processor */ 120 nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs); 121 MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids; 122 owner = (int *) MALLOC( 
(aij->stash.n+1)*sizeof(int) ); CHKPTR(owner); 123 for ( i=0; i<aij->stash.n; i++ ) { 124 idx = aij->stash.idx[i]; 125 for ( j=0; j<numtids; j++ ) { 126 if (idx >= owners[j] && idx < owners[j+1]) { 127 nprocs[j]++; procs[j] = 1; owner[i] = j; break; 128 } 129 } 130 } 131 nsends = 0; for ( i=0; i<numtids; i++ ) { nsends += procs[i];} 132 133 /* inform other processors of number of messages and max length*/ 134 work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work); 135 MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm); 136 nreceives = work[mytid]; 137 MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm); 138 nmax = work[mytid]; 139 FREE(work); 140 141 /* post receives: 142 1) each message will consist of ordered pairs 143 (global index,value) we store the global index as a double 144 to simplify the message passing. 145 2) since we don't know how long each individual message is we 146 allocate the largest needed buffer for each receive. Potentially 147 this is a lot of wasted space. 148 149 150 This could be done better. 
151 */ 152 rvalues = (Scalar *) MALLOC(3*(nreceives+1)*nmax*sizeof(Scalar)); 153 CHKPTR(rvalues); 154 recv_waits = (MPI_Request *) MALLOC((nreceives+1)*sizeof(MPI_Request)); 155 CHKPTR(recv_waits); 156 for ( i=0; i<nreceives; i++ ) { 157 MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPI_SCALAR,MPI_ANY_SOURCE,tag, 158 comm,recv_waits+i); 159 } 160 161 /* do sends: 162 1) starts[i] gives the starting index in svalues for stuff going to 163 the ith processor 164 */ 165 svalues = (Scalar *) MALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) ); 166 CHKPTR(svalues); 167 send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request)); 168 CHKPTR(send_waits); 169 starts = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(starts); 170 starts[0] = 0; 171 for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 172 for ( i=0; i<aij->stash.n; i++ ) { 173 svalues[3*starts[owner[i]]] = (Scalar) aij->stash.idx[i]; 174 svalues[3*starts[owner[i]]+1] = (Scalar) aij->stash.idy[i]; 175 svalues[3*(starts[owner[i]]++)+2] = aij->stash.array[i]; 176 } 177 FREE(owner); 178 starts[0] = 0; 179 for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 180 count = 0; 181 for ( i=0; i<numtids; i++ ) { 182 if (procs[i]) { 183 MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPI_SCALAR,i,tag, 184 comm,send_waits+count++); 185 } 186 } 187 FREE(starts); FREE(nprocs); 188 189 /* Free cache space */ 190 aij->stash.nmax = aij->stash.n = 0; 191 if (aij->stash.array){ FREE(aij->stash.array); aij->stash.array = 0;} 192 193 aij->svalues = svalues; aij->rvalues = rvalues; 194 aij->nsends = nsends; aij->nrecvs = nreceives; 195 aij->send_waits = send_waits; aij->recv_waits = recv_waits; 196 aij->rmax = nmax; 197 198 return 0; 199 } 200 extern int MPIAIJSetUpMultiply(Mat); 201 202 static int MatiAIJEndAssemble(Mat mat) 203 { 204 int ierr; 205 Matimpiaij *aij = (Matimpiaij *) mat->data; 206 207 MPI_Status *send_status,recv_status; 208 int imdex,idx,nrecvs = aij->nrecvs, count = nrecvs, i, n; 
  int row,col;
  Scalar *values,val;
  InsertMode addv = aij->insertmode;

  /* wait on receives, unpacking each message as it arrives */
  while (count) {
    MPI_Waitany(nrecvs,aij->recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space; each message is a run of
       (row,column,value) triples of Scalars */
    values = aij->rvalues + 3*imdex*aij->rmax;
    MPI_Get_count(&recv_status,MPI_SCALAR,&n);
    n = n/3;
    for ( i=0; i<n; i++ ) {
      /* indices travel as Scalars; convert back to ints */
      row = (int) PETSCREAL(values[3*i]) - aij->rstart;
      col = (int) PETSCREAL(values[3*i+1]);
      val = values[3*i+2];
      if (col >= aij->cstart && col < aij->cend) {
        /* diagonal block: local column numbering */
        col -= aij->cstart;
        MatSetValues(aij->A,1,&row,1,&col,&val,addv);
      }
      else {
        if (aij->assembled) {
          SETERR(1,"Cannot insert off diagonal block in already\
 assembled matrix. Contact petsc-maint@mcs.anl.gov\
 if your need this feature");
        }
        /* off-diagonal block keeps global column numbering here;
           NOTE(review): MatSetValues return codes are not checked in
           this loop — presumably intentional, but worth confirming */
        MatSetValues(aij->B,1,&row,1,&col,&val,addv);
      }
    }
    count--;
  }
  FREE(aij->recv_waits); FREE(aij->rvalues);

  /* wait on sends before releasing the send buffers */
  if (aij->nsends) {
    send_status = (MPI_Status *) MALLOC( aij->nsends*sizeof(MPI_Status) );
    CHKPTR(send_status);
    MPI_Waitall(aij->nsends,aij->send_waits,send_status);
    FREE(send_status);
  }
  FREE(aij->send_waits); FREE(aij->svalues);

  aij->insertmode = NotSetValues;
  ierr = MatBeginAssembly(aij->A); CHKERR(ierr);
  ierr = MatEndAssembly(aij->A); CHKERR(ierr);

  if (aij->assembled) {
    /* this is because the present implementation doesn't support
       inserting values off the diagonal block.*/
    return 0;
  }

  /* first assembly: compact B's global column space and build the
     scatter context used by the multiply routines */
  ierr = MPIAIJSetUpMultiply(mat); CHKERR(ierr);
  ierr = MatBeginAssembly(aij->B); CHKERR(ierr);
  ierr = MatEndAssembly(aij->B); CHKERR(ierr);

  aij->assembled = 1;
  return 0;
}

/* MatiZero - Zeros all entries of both the diagonal and off-diagonal
   sequential submatrices, preserving the nonzero structure. */
static int MatiZero(Mat A)
{
  Matimpiaij *l = (Matimpiaij *) A->data;

  MatZeroEntries(l->A); MatZeroEntries(l->B);
  return 0;
}

/* again this uses the same basic strategy as in the assembly and
   scatter create routines, we
   should try to do it systematically
   if we can figure out the proper level of generality. */

/* the code does not do the diagonal entries correctly unless the
   matrix is square and the column and row ownerships are identical.
   This is a BUG. The only way to fix it seems to be to access
   aij->A and aij->B directly and not through the MatZeroRows()
   routine.
*/
/*
   MatiZerorows - Zeros the (globally numbered) rows listed in is.
   Each processor may list any global rows; the row numbers are routed
   to their owning processors with nonblocking sends/receives, then each
   processor zaps its own local rows in both submatrices.
*/
static int MatiZerorows(Mat A,IS is,Scalar *diag)
{
  Matimpiaij *l = (Matimpiaij *) A->data;
  int i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids;
  int *localrows,*procs,*nprocs,j,found,idx,nsends,*work;
  int nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid;
  int *rvalues,tag = 67,count,base,slen,n,len,*source;
  int *lens,imdex,*lrows,*values;
  MPI_Comm comm = A->comm;
  MPI_Request *send_waits,*recv_waits;
  MPI_Status recv_status,*send_status;
  IS istmp;

  if (!l->assembled) SETERR(1,"MatiZerorows: must assmble matrix first");
  ierr = ISGetLocalSize(is,&N); CHKERR(ierr);
  ierr = ISGetIndices(is,&rows); CHKERR(ierr);

  /* first count number of contributors to each processor */
  nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs);
  MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids;
  owner = (int *) MALLOC((N+1)*sizeof(int)); CHKPTR(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    for ( j=0; j<numtids; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERR(1,"Imdex out of range");
  }
  nsends = 0; for ( i=0; i<numtids; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work);
  MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm);
  nrecvs = work[mytid];
  MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm);
  nmax = work[mytid];
  FREE(work);

  /* post receives: each message is a list of global row numbers;
     nmax is an upper bound on any message length */
  rvalues = (int *) MALLOC((nrecvs+1)*nmax*sizeof(int)); /*see note */
  CHKPTR(rvalues);
  recv_waits = (MPI_Request *) MALLOC((nrecvs+1)*sizeof(MPI_Request));
  CHKPTR(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (int *) MALLOC( (N+1)*sizeof(int) ); CHKPTR(svalues);
  send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request));
  CHKPTR(send_waits);
  starts = (int *) MALLOC( (numtids+1)*sizeof(int) ); CHKPTR(starts);
  starts[0] = 0;
  for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  /* starts[] was consumed above; rebuild it for the sends */
  starts[0] = 0;
  for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<numtids; i++ ) {
    if (procs[i]) {
      MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag,
                comm,send_waits+count++);
    }
  }
  FREE(starts);

  /* base converts the received global row numbers to local numbering */
  base = owners[mytid];

  /* wait on receives, recording each message's source and length */
  lens = (int *) MALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTR(lens);
  source = lens + nrecvs;
  count = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    source[imdex] = recv_status.MPI_SOURCE;
    lens[imdex] = n;
    slen += n;
    count--;
  }
  FREE(recv_waits);

  /* move the data into the send scatter */
  lrows = (int *) MALLOC( slen*sizeof(int) ); CHKPTR(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;
    }
  }
  FREE(rvalues); FREE(lens);
  FREE(owner); FREE(nprocs);

  /* actually zap the local rows: diag only applies to the diagonal block */
  ierr = ISCreateSequential(slen,lrows,&istmp); CHKERR(ierr); FREE(lrows);
  ierr = MatZeroRows(l->A,istmp,diag); CHKERR(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERR(ierr);
  ierr = ISDestroy(istmp); CHKERR(ierr);

  /* wait on sends before releasing the send buffers */
  if (nsends) {
    send_status = (MPI_Status *) MALLOC( nsends*sizeof(MPI_Status) );
    CHKPTR(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    FREE(send_status);
  }
  FREE(send_waits); FREE(svalues);

  return 0;
}

/*
   MatiAIJMult - Parallel matrix-vector product y = A x.  The scatter of
   the ghost values into lvec is overlapped with the local (diagonal
   block) multiply, then the off-diagonal contribution is added.
*/
static int MatiAIJMult(Mat aijin,Vec xx,Vec yy)
{
  Matimpiaij *aij = (Matimpiaij *) aijin->data;
  int ierr;
  if (!aij->assembled) SETERR(1,"MatiAIJMul: must assmble matrix first");
  ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,ScatterAll,aij->Mvctx);
  CHKERR(ierr);
  ierr = MatMult(aij->A,xx,yy); CHKERR(ierr);
  ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,ScatterAll,aij->Mvctx);
  CHKERR(ierr);
  ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERR(ierr);
  return 0;
}

/*
   MatiAIJMultadd - Parallel z = A x + y, with the same
   communication/computation overlap as MatiAIJMult.
*/
static int MatiAIJMultadd(Mat aijin,Vec xx,Vec yy,Vec zz)
{
  Matimpiaij *aij = (Matimpiaij *) aijin->data;
  int ierr;
  if (!aij->assembled) SETERR(1,"MatiAIJMul: must assmble matrix first");
  ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,ScatterAll,aij->Mvctx);
  CHKERR(ierr);
  ierr = MatMultAdd(aij->A,xx,yy,zz); CHKERR(ierr);
  ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,ScatterAll,aij->Mvctx);
  CHKERR(ierr);
  ierr = MatMultAdd(aij->B,aij->lvec,zz,zz); CHKERR(ierr);
  return 0;
}

/*
   MatiAIJMultTrans - Parallel transpose product y = A^T x.  The
   off-diagonal contribution is computed first and scattered back (in
   reverse, adding) while the local transpose multiply proceeds.
*/
static int MatiAIJMultTrans(Mat aijin,Vec xx,Vec yy)
{
  Matimpiaij *aij = (Matimpiaij *) aijin->data;
  int ierr;

  if (!aij->assembled) SETERR(1,"MatiAIJMulTrans: must assmble matrix first");
  /* do nondiagonal part */
  ierr = MatMultTrans(aij->B,xx,aij->lvec); CHKERR(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(aij->lvec,0,yy,0,AddValues,
                         ScatterAll|ScatterReverse,aij->Mvctx); CHKERR(ierr);
  /* do local part */
  ierr = MatMultTrans(aij->A,xx,yy); CHKERR(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in yy until the next line, which is true for my implementation*/
  /* but is not perhaps always true. */
  ierr = VecScatterEnd(aij->lvec,0,yy,0,AddValues,ScatterAll|ScatterReverse,
                       aij->Mvctx); CHKERR(ierr);
  return 0;
}

/*
   MatiAIJMultTransadd - Parallel z = A^T x + y, using the same reverse
   scatter scheme as MatiAIJMultTrans.
*/
static int MatiAIJMultTransadd(Mat aijin,Vec xx,Vec yy,Vec zz)
{
  Matimpiaij *aij = (Matimpiaij *) aijin->data;
  int ierr;

  if (!aij->assembled) SETERR(1,"MatiAIJMulTrans: must assmble matrix first");
  /* do nondiagonal part */
  ierr = MatMultTrans(aij->B,xx,aij->lvec); CHKERR(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(aij->lvec,0,zz,0,AddValues,
                         ScatterAll|ScatterReverse,aij->Mvctx); CHKERR(ierr);
  /* do local part */
  ierr = MatMultTransAdd(aij->A,xx,yy,zz); CHKERR(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in yy until the next line, which is true for my implementation*/
  /* but is not perhaps always true.
*/ 477 ierr = VecScatterEnd(aij->lvec,0,zz,0,AddValues,ScatterAll|ScatterReverse, 478 aij->Mvctx); CHKERR(ierr); 479 return 0; 480 } 481 482 /* 483 This only works correctly for square matrices where the subblock A->A is the 484 diagonal block 485 */ 486 static int MatiAIJgetdiag(Mat Ain,Vec v) 487 { 488 Matimpiaij *A = (Matimpiaij *) Ain->data; 489 if (!A->assembled) SETERR(1,"MatiAIJgetdiag: must assmble matrix first"); 490 return MatGetDiagonal(A->A,v); 491 } 492 493 static int MatiAIJdestroy(PetscObject obj) 494 { 495 Mat mat = (Mat) obj; 496 Matimpiaij *aij = (Matimpiaij *) mat->data; 497 int ierr; 498 FREE(aij->rowners); 499 ierr = MatDestroy(aij->A); CHKERR(ierr); 500 ierr = MatDestroy(aij->B); CHKERR(ierr); 501 FREE(aij); FREE(mat); 502 if (aij->lvec) VecDestroy(aij->lvec); 503 if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx); 504 return 0; 505 } 506 507 static int MatiView(PetscObject obj,Viewer viewer) 508 { 509 Mat mat = (Mat) obj; 510 Matimpiaij *aij = (Matimpiaij *) mat->data; 511 int ierr; 512 513 if (!aij->assembled) SETERR(1,"MatiAIJMulTrans: must assmble matrix first"); 514 MPE_Seq_begin(mat->comm,1); 515 printf("[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n", 516 aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart, 517 aij->cend); 518 ierr = MatView(aij->A,viewer); CHKERR(ierr); 519 ierr = MatView(aij->B,viewer); CHKERR(ierr); 520 MPE_Seq_end(mat->comm,1); 521 return 0; 522 } 523 524 extern int MatiAIJmarkdiag(Matiaij *); 525 /* 526 This has to provide several versions. 527 528 1) per sequential 529 2) a) use only local smoothing updating outer values only once. 530 b) local smoothing updating outer values each inner iteration 531 3) color updating out values betwen colors. 
 */
/*
   MatiAIJrelax - SOR/SSOR relaxation sweeps for the parallel AIJ matrix.
   The branch taken depends on the bits in flag: SOR_EISENSTAT applies
   (L+E)^{-1} A (U+E)^{-1} via Eisenstat's trick; the global sweeps use
   the vector pipeline to order processors; the SOR_LOCAL_* sweeps use
   only a one-time scatter of the ghost values.
   NOTE: A->i/A->j (and B's) use 1-based Fortran-style indexing, hence
   the pervasive "- 1" adjustments and the shifted pointers xs, ls, ts.
*/
static int MatiAIJrelax(Mat matin,Vec bb,double omega,int flag,double shift,
                        int its,Vec xx)
{
  Matimpiaij *mat = (Matimpiaij *) matin->data;
  Mat AA = mat->A, BB = mat->B;
  Matiaij *A = (Matiaij *) AA->data, *B = (Matiaij *)BB->data;
  Scalar zero = 0.0,*b,*x,*w,*xs,*ls,d,*v,sum,scale,*t,*ts;
  int ierr,one = 1, tmp, *idx, *diag,mytid;
  int n = mat->n, m = mat->m, i, j;
  Vec tt;

  if (!mat->assembled) SETERR(1,"MatiAIJRelax: must assmble matrix first");

  VecGetArray(xx,&x); VecGetArray(bb,&b); VecGetArray(mat->lvec,&ls);
  xs = x -1; /* shift by one for index start of 1 */
  ls--;
  /* locate the diagonal entry of each row of A (cached in A->diag) */
  if (!A->diag) {if (ierr = MatiAIJmarkdiag(A)) return ierr;}
  diag = A->diag;
  if (flag & SOR_EISENSTAT) {
    /* Let A = L + U + D; where L is lower triangular,
       U is upper triangular, E is diagonal; This routine applies

            (L + E)^{-1} A (U + E)^{-1}

       to a vector efficiently using Eisenstat's trick. This is for
       the case of SSOR preconditioner, so E is D/omega where omega
       is the relaxation factor.
    */
    ierr = VecCreate(xx,&tt); CHKERR(ierr);
    VecGetArray(tt,&t);
    scale = (2.0/omega) - 1.0;
    /* x = (E + U)^{-1} b */
    VecSet(&zero,mat->lvec);
    ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                            mat->Mvctx); CHKERR(ierr);
    for ( i=m-1; i>-1; i-- ) {
      /* strictly upper-triangular part of row i of A */
      n = A->i[i+1] - diag[i] - 1;
      idx = A->j + diag[i];
      v = A->a + diag[i];
      sum = b[i];
      SPARSEDENSEMDOT(sum,xs,v,idx,n);
      d = shift + A->a[diag[i]-1];
      /* whole row i of B (ghost-value contribution) */
      n = B->i[i+1] - B->i[i];
      idx = B->j + B->i[i] - 1;
      v = B->a + B->i[i] - 1;
      SPARSEDENSEMDOT(sum,ls,v,idx,n);
      x[i] = omega*(sum/d);
    }
    ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                          mat->Mvctx); CHKERR(ierr);

    /* t = b - (2*E - D)x */
    v = A->a;
    for ( i=0; i<m; i++ ) { t[i] = b[i] - scale*(v[*diag++ - 1])*x[i]; }

    /* t = (E + L)^{-1}t */
    ts = t - 1; /* shifted by one for index start of a or mat->j*/
    diag = A->diag; /* diag was advanced by the loop above; reset it */
    VecSet(&zero,mat->lvec);
    ierr = VecPipelineBegin(tt,0,mat->lvec,0,InsertValues,PipelineDown,
                            mat->Mvctx); CHKERR(ierr);
    for ( i=0; i<m; i++ ) {
      /* strictly lower-triangular part of row i of A */
      n = diag[i] - A->i[i];
      idx = A->j + A->i[i] - 1;
      v = A->a + A->i[i] - 1;
      sum = t[i];
      SPARSEDENSEMDOT(sum,ts,v,idx,n);
      d = shift + A->a[diag[i]-1];
      n = B->i[i+1] - B->i[i];
      idx = B->j + B->i[i] - 1;
      v = B->a + B->i[i] - 1;
      SPARSEDENSEMDOT(sum,ls,v,idx,n);
      t[i] = omega*(sum/d);
    }
    ierr = VecPipelineEnd(tt,0,mat->lvec,0,InsertValues,PipelineDown,
                          mat->Mvctx); CHKERR(ierr);
    /* x = x + t */
    for ( i=0; i<m; i++ ) { x[i] += t[i]; }
    VecDestroy(tt);
    return 0;
  }

  if ((flag & SOR_SYMMETRIC_SWEEP) == SOR_SYMMETRIC_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      VecSet(&zero,mat->lvec); VecSet(&zero,xx);
    }
    else {
      ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
      CHKERR(ierr);
      ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
      CHKERR(ierr);
    }
    while (its--) {
      /* go down through the rows */
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=0; i<m; i++ ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                            mat->Mvctx); CHKERR(ierr);
      /* come up through the rows */
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                            mat->Mvctx); CHKERR(ierr);
    }
  }
  else if (flag & SOR_FORWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      /* with a zero guess the first sweep only needs the lower triangle */
      VecSet(&zero,mat->lvec);
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=0; i<m; i++ ) {
        n = diag[i] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = omega*(sum/d);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                            mat->Mvctx); CHKERR(ierr);
      its--;
    }
    while (its--) {
      ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
      CHKERR(ierr);
      ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterUp,mat->Mvctx);
      CHKERR(ierr);
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=0; i<m; i++ ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineDown,
                            mat->Mvctx); CHKERR(ierr);
    }
  }
  else if (flag & SOR_BACKWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      /* with a zero guess the first sweep only needs the upper triangle */
      VecSet(&zero,mat->lvec);
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n = A->i[i+1] - diag[i] - 1;
        idx = A->j + diag[i];
        v = A->a + diag[i];
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = omega*(sum/d);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                            mat->Mvctx); CHKERR(ierr);
      its--;
    }
    while (its--) {
      ierr = VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterDown,
                             mat->Mvctx); CHKERR(ierr);
      ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterDown,
                           mat->Mvctx); CHKERR(ierr);
      ierr = VecPipelineBegin(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                              mat->Mvctx); CHKERR(ierr);
      for ( i=m-1; i>-1; i-- ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      ierr = VecPipelineEnd(xx,0,mat->lvec,0,InsertValues,PipelineUp,
                            mat->Mvctx); CHKERR(ierr);
    }
  }
  else if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      /* ghost values are zero: reduces to the sequential relaxation */
      return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
    }
    /* scatter the ghost values once; they are held fixed during sweeps */
    ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    while (its--) {
      /* go down through the rows */
      for ( i=0; i<m; i++ ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
      /* come up through the rows */
      for ( i=m-1; i>-1; i-- ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  else if (flag & SOR_LOCAL_FORWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
    }
    ierr=VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterAll,mat->Mvctx);
    CHKERR(ierr);
    while (its--) {
      for ( i=0; i<m; i++ ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  else if (flag & SOR_LOCAL_BACKWARD_SWEEP){
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      return MatRelax(mat->A,bb,omega,flag,shift,its,xx);
    }
    ierr = VecScatterBegin(xx,0,mat->lvec,0,InsertValues,ScatterAll,
                           mat->Mvctx); CHKERR(ierr);
    ierr = VecScatterEnd(xx,0,mat->lvec,0,InsertValues,ScatterAll,
                         mat->Mvctx); CHKERR(ierr);
    while (its--) {
      for ( i=m-1; i>-1; i-- ) {
        n = A->i[i+1] - A->i[i];
        idx = A->j + A->i[i] - 1;
        v = A->a + A->i[i] - 1;
        sum = b[i];
        SPARSEDENSEMDOT(sum,xs,v,idx,n);
        d = shift + A->a[diag[i]-1];
        n = B->i[i+1] - B->i[i];
        idx = B->j + B->i[i] - 1;
        v = B->a + B->i[i] - 1;
        SPARSEDENSEMDOT(sum,ls,v,idx,n);
        x[i] = (1. - omega)*x[i] + omega*(sum/d + x[i]);
      }
    }
  }
  return 0;
}
/* MatiAIJinsopt - Forwards nonzero-location options to both submatrices;
   column-oriented insertion is not supported in parallel. */
static int MatiAIJinsopt(Mat aijin,int op)
{
  Matimpiaij *aij = (Matimpiaij *) aijin->data;

  if (op == NO_NEW_NONZERO_LOCATIONS) {
    MatSetOption(aij->A,op);
    MatSetOption(aij->B,op);
  }
  else if (op == YES_NEW_NONZERO_LOCATIONS) {
    MatSetOption(aij->A,op);
    MatSetOption(aij->B,op);
  }
  else if (op == COLUMN_ORIENTED) SETERR(1,"Column oriented not supported");
  return 0;
}

/* MatiAIJsize - Returns the global dimensions M, N. */
static int MatiAIJsize(Mat matin,int *m,int *n)
{
  Matimpiaij *mat = (Matimpiaij *) matin->data;
  *m = mat->M; *n = mat->N;
  return 0;
}

/* MatiAIJlocalsize - Returns the local dimensions m, n. */
static int MatiAIJlocalsize(Mat matin,int *m,int *n)
{
  Matimpiaij *mat = (Matimpiaij *) matin->data;
  *m = mat->m; *n = mat->n;
  return 0;
}

/* MatiAIJrange - Returns this processor's owned row range [rstart,rend). */
static int MatiAIJrange(Mat matin,int *m,int *n)
{
  Matimpiaij *mat = (Matimpiaij *) matin->data;
  *m = mat->rstart; *n = mat->rend;
  return 0;
}

static int MatiCopy(Mat,Mat *);

/* -------------------------------------------------------------------*/
/* function table installed into every MPI AIJ matrix header */
static struct _MatOps MatOps = {MatiAIJInsertValues,
       0, 0,
       MatiAIJMult,MatiAIJMultadd,MatiAIJMultTrans,MatiAIJMultTransadd,
       0,0,0,0,
       0,0,
       MatiAIJrelax,
       0,
       0,0,0,
       MatiCopy,
       MatiAIJgetdiag,0,0,
       MatiAIJBeginAssemble,MatiAIJEndAssemble,
       0,
       MatiAIJinsopt,MatiZero,MatiZerorows,0,
       0,0,0,0,
       MatiAIJsize,MatiAIJlocalsize,MatiAIJrange };

/*@

   MatCreateMPIAIJ - Creates a sparse parallel matrix
  in AIJ format.

  Input Parameters:
.  comm - MPI communicator
.  m,n - number of local rows and columns (or -1 to have calculated)
.  M,N - global rows and columns (or -1 to have calculated)
.  d_nz - total number nonzeros in diagonal portion of matrix
.  d_nzz - number of nonzeros per row in diagonal portion of matrix or null
.
You must leave room for the diagonal entry even if it is zero. 916 . o_nz - total number nonzeros in off-diagonal portion of matrix 917 . o_nzz - number of nonzeros per row in off-diagonal portion of matrix 918 . or null. You must have at least one nonzero per row. 919 920 Output parameters: 921 . newmat - the matrix 922 923 Keywords: matrix, aij, compressed row, sparse, parallel 924 @*/ 925 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N, 926 int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat) 927 { 928 Mat mat; 929 Matimpiaij *aij; 930 int ierr, i,rl,len,sum[2],work[2]; 931 *newmat = 0; 932 CREATEHEADER(mat,_Mat); 933 mat->data = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij); 934 mat->cookie = MAT_COOKIE; 935 mat->ops = &MatOps; 936 mat->destroy = MatiAIJdestroy; 937 mat->view = MatiView; 938 mat->type = MATAIJMPI; 939 mat->factor = 0; 940 mat->row = 0; 941 mat->col = 0; 942 943 mat->comm = comm; 944 aij->insertmode = NotSetValues; 945 MPI_Comm_rank(comm,&aij->mytid); 946 MPI_Comm_size(comm,&aij->numtids); 947 948 if (M == -1 || N == -1) { 949 work[0] = m; work[1] = n; 950 MPI_Allreduce((void *) work,(void *) sum,2,MPI_INT,MPI_SUM,comm ); 951 if (M == -1) M = sum[0]; 952 if (N == -1) N = sum[1]; 953 } 954 if (m == -1) {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);} 955 if (n == -1) {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);} 956 aij->m = m; 957 aij->n = n; 958 aij->N = N; 959 aij->M = M; 960 961 /* build local table of row and column ownerships */ 962 aij->rowners = (int *) MALLOC(2*(aij->numtids+2)*sizeof(int)); 963 CHKPTR(aij->rowners); 964 aij->cowners = aij->rowners + aij->numtids + 1; 965 MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm); 966 aij->rowners[0] = 0; 967 for ( i=2; i<=aij->numtids; i++ ) { 968 aij->rowners[i] += aij->rowners[i-1]; 969 } 970 aij->rstart = aij->rowners[aij->mytid]; 971 aij->rend = aij->rowners[aij->mytid+1]; 972 MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm); 973 
aij->cowners[0] = 0; 974 for ( i=2; i<=aij->numtids; i++ ) { 975 aij->cowners[i] += aij->cowners[i-1]; 976 } 977 aij->cstart = aij->cowners[aij->mytid]; 978 aij->cend = aij->cowners[aij->mytid+1]; 979 980 981 ierr = MatCreateSequentialAIJ(m,n,d_nz,d_nnz,&aij->A); CHKERR(ierr); 982 ierr = MatCreateSequentialAIJ(m,N,o_nz,o_nnz,&aij->B); CHKERR(ierr); 983 984 /* build cache for off array entries formed */ 985 aij->stash.nmax = CHUNCKSIZE; /* completely arbratray number */ 986 aij->stash.n = 0; 987 aij->stash.array = (Scalar *) MALLOC( aij->stash.nmax*(2*sizeof(int) + 988 sizeof(Scalar))); CHKPTR(aij->stash.array); 989 aij->stash.idx = (int *) (aij->stash.array + aij->stash.nmax); 990 aij->stash.idy = (int *) (aij->stash.idx + aij->stash.nmax); 991 992 /* stuff used for matrix vector multiply */ 993 aij->lvec = 0; 994 aij->Mvctx = 0; 995 aij->assembled = 0; 996 997 *newmat = mat; 998 return 0; 999 } 1000 1001 static int MatiCopy(Mat matin,Mat *newmat) 1002 { 1003 Mat mat; 1004 Matimpiaij *aij,*oldmat = (Matimpiaij *) matin->data; 1005 int i,rl,len, m = oldmat->m, n = oldmat->n,ierr; 1006 *newmat = 0; 1007 1008 if (!oldmat->assembled) SETERR(1,"Cannot copy unassembled matrix"); 1009 CREATEHEADER(mat,_Mat); 1010 mat->data = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij); 1011 mat->cookie = MAT_COOKIE; 1012 mat->ops = &MatOps; 1013 mat->destroy = MatiAIJdestroy; 1014 mat->view = MatiView; 1015 mat->type = MATAIJSEQ; 1016 mat->factor = matin->factor; 1017 mat->row = 0; 1018 mat->col = 0; 1019 1020 aij->m = oldmat->m; 1021 aij->n = oldmat->n; 1022 aij->M = oldmat->M; 1023 aij->N = oldmat->N; 1024 1025 aij->assembled = 1; 1026 aij->rstart = oldmat->rstart; 1027 aij->rend = oldmat->rend; 1028 aij->cstart = oldmat->cstart; 1029 aij->cend = oldmat->cend; 1030 aij->numtids = oldmat->numtids; 1031 aij->mytid = oldmat->mytid; 1032 aij->insertmode = NotSetValues; 1033 1034 aij->rowners = (int *) MALLOC( (aij->numtids+1)*sizeof(int) ); 1035 CHKPTR(aij->rowners); 1036 
  /* copy the row-ownership table; the new matrix starts with an empty
     stash since the source is already assembled.
     NOTE(review): cowners is not set up for the copy (only numtids+1
     ints are allocated here, vs 2*(numtids+2) in MatCreateMPIAIJ) —
     cstart/cend are copied directly so this presumably suffices, but
     verify nothing else dereferences aij->cowners on a copied matrix */
  MEMCPY(aij->rowners,oldmat->rowners,(aij->numtids+1)*sizeof(int));
  aij->stash.nmax = 0;
  aij->stash.n = 0;
  aij->stash.array= 0;
  mat->comm = matin->comm;

  /* duplicate the multiply work objects and both submatrices */
  ierr = VecCreate(oldmat->lvec,&aij->lvec); CHKERR(ierr);
  ierr = VecScatterCtxCopy(oldmat->Mvctx,&aij->Mvctx); CHKERR(ierr);
  ierr = MatCopy(oldmat->A,&aij->A); CHKERR(ierr);
  ierr = MatCopy(oldmat->B,&aij->B); CHKERR(ierr);
  *newmat = mat;
  return 0;
}