1 2 #include "mpiaij.h" 3 #include "vec/vecimpl.h" 4 5 6 #define CHUNCKSIZE 100 7 /* 8 This is a simple minded stash. Do a linear search to determine if 9 in stash, if not add to end. 10 */ 11 static int StashValues(Stash *stash,int row,int n, int *idxn, 12 Scalar *values,InsertMode addv) 13 { 14 int i,j,N = stash->n,found,*n_idx, *n_idy; 15 Scalar val,*n_array; 16 17 for ( i=0; i<n; i++ ) { 18 found = 0; 19 val = *values++; 20 for ( j=0; j<N; j++ ) { 21 if ( stash->idx[j] == row && stash->idy[j] == idxn[i]) { 22 /* found a match */ 23 if (addv == AddValues) stash->array[j] += val; 24 else stash->array[j] = val; 25 found = 1; 26 break; 27 } 28 } 29 if (!found) { /* not found so add to end */ 30 if ( stash->n == stash->nmax ) { 31 /* allocate a larger stash */ 32 n_array = (Scalar *) MALLOC( (stash->nmax + CHUNCKSIZE)*( 33 2*sizeof(int) + sizeof(Scalar))); 34 CHKPTR(n_array); 35 n_idx = (int *) (n_array + stash->nmax + CHUNCKSIZE); 36 n_idy = (int *) (n_idx + stash->nmax + CHUNCKSIZE); 37 MEMCPY(n_array,stash->array,stash->nmax*sizeof(Scalar)); 38 MEMCPY(n_idx,stash->idx,stash->nmax*sizeof(int)); 39 MEMCPY(n_idy,stash->idy,stash->nmax*sizeof(int)); 40 if (stash->array) FREE(stash->array); 41 stash->array = n_array; stash->idx = n_idx; stash->idy = n_idy; 42 stash->nmax += CHUNCKSIZE; 43 } 44 stash->array[stash->n] = val; 45 stash->idx[stash->n] = row; 46 stash->idy[stash->n++] = idxn[i]; 47 } 48 } 49 return 0; 50 } 51 52 static int MatiAIJInsertValues(Mat mat,int m,int *idxm,int n, 53 int *idxn,Scalar *v,InsertMode addv) 54 { 55 Matimpiaij *aij = (Matimpiaij *) mat->data; 56 int ierr,i,j, rstart = aij->rstart, rend = aij->rend; 57 int cstart = aij->cstart, cend = aij->cend,row,col; 58 59 if (aij->insertmode != NotSetValues && aij->insertmode != addv) { 60 SETERR(1,"You cannot mix inserts and adds"); 61 } 62 aij->insertmode = addv; 63 for ( i=0; i<m; i++ ) { 64 if (idxm[i] < 0 || idxm[i] >= aij->M) { 65 if (aij->outofrange) continue; 66 else SETERR(1,"row index out of range"); 67 } 68 if (idxm[i] >= rstart && idxm[i] < rend) { 69 row = idxm[i] - rstart; 70 for ( j=0; j<n; j++ ) { 71 if (idxn[i] < 0 || idxn[i] >= aij->N) { 72 if (aij->outofrange) continue; 73 else SETERR(1,"column index out of range"); 74 } 75 if (idxn[j] >= cstart && idxn[j] < cend){ 76 col = idxn[j] - cstart; 77 ierr = MatSetValues(aij->A,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr); 78 } 79 else { 80 col = idxn[j]; 81 ierr = MatSetValues(aij->B,1,&row,1,&col,v+i*n+j,addv);CHKERR(ierr); 82 } 83 } 84 } 85 else { 86 ierr = StashValues(&aij->stash,idxm[i],n,idxn,v+i*n,addv);CHKERR(ierr); 87 } 88 } 89 return 0; 90 } 91 92 /* 93 the assembly code is alot like the code for vectors, we should 94 sometime derive a single assembly code that can be used for 95 either case. 96 */ 97 98 static int MatiAIJBeginAssemble(Mat mat) 99 { 100 Matimpiaij *aij = (Matimpiaij *) mat->data; 101 MPI_Comm comm = aij->comm; 102 int ierr, numtids = aij->numtids, *owners = aij->rowners; 103 int mytid = aij->mytid; 104 MPI_Request *send_waits,*recv_waits; 105 int *nprocs,i,j,n,idx,*procs,nsends,nreceives,nmax,*work; 106 int tag = 50, *owner,*starts,count; 107 InsertMode addv; 108 Scalar *rvalues,*svalues; 109 110 /* make sure all processors are either in INSERTMODE or ADDMODE */ 111 MPI_Allreduce((void *) &aij->insertmode,(void *) &addv,numtids,MPI_INT, 112 MPI_BOR,comm); 113 if (addv == (AddValues|InsertValues)) { 114 SETERR(1,"Some processors have inserted while others have added"); 115 } 116 aij->insertmode = addv; /* in case this processor had no cache */ 117 118 /* first count number of contributors to each processor */ 119 nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs); 120 MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids; 121 owner = (int *) MALLOC( (aij->stash.n+1)*sizeof(int) ); CHKPTR(owner); 122 for ( i=0; i<aij->stash.n; i++ ) { 123 idx = aij->stash.idx[i]; 124 for ( j=0; j<numtids; j++ ) { 125 if (idx >= owners[j] && idx < owners[j+1]) { 126 nprocs[j]++; procs[j] = 1; owner[i] = j; break; 127 } 128 } 129 } 130 nsends = 0; for ( i=0; i<numtids; i++ ) { nsends += procs[i];} 131 132 /* inform other processors of number of messages and max length*/ 133 work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work); 134 MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm); 135 nreceives = work[mytid]; 136 MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm); 137 nmax = work[mytid]; 138 FREE(work); 139 140 /* post receives: 141 1) each message will consist of ordered pairs 142 (global index,value) we store the global index as a double 143 to simply the message passing. 144 2) since we don't know how long each individual message is we 145 allocate the largest needed buffer for each receive. Potentially 146 this is a lot of wasted space. 147 148 149 This could be done better. 150 */ 151 rvalues = (Scalar *) MALLOC(3*(nreceives+1)*nmax*sizeof(Scalar)); 152 CHKPTR(rvalues); 153 recv_waits = (MPI_Request *) MALLOC((nreceives+1)*sizeof(MPI_Request)); 154 CHKPTR(recv_waits); 155 for ( i=0; i<nreceives; i++ ) { 156 MPI_Irecv((void *)(rvalues+3*nmax*i),3*nmax,MPI_SCALAR,MPI_ANY_SOURCE,tag, 157 comm,recv_waits+i); 158 } 159 160 /* do sends: 161 1) starts[i] gives the starting index in svalues for stuff going to 162 the ith processor 163 */ 164 svalues = (Scalar *) MALLOC( 3*(aij->stash.n+1)*sizeof(Scalar) ); 165 CHKPTR(svalues); 166 send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request)); 167 CHKPTR(send_waits); 168 starts = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(starts); 169 starts[0] = 0; 170 for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 171 for ( i=0; i<aij->stash.n; i++ ) { 172 svalues[3*starts[owner[i]]] = (Scalar) aij->stash.idx[i]; 173 svalues[3*starts[owner[i]]+1] = (Scalar) aij->stash.idy[i]; 174 svalues[3*(starts[owner[i]]++)+2] = aij->stash.array[i]; 175 } 176 FREE(owner); 177 starts[0] = 0; 178 for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 179 count = 0; 180 for ( i=0; i<numtids; i++ ) { 181 if (procs[i]) { 182 MPI_Isend((void*)(svalues+3*starts[i]),3*nprocs[i],MPI_SCALAR,i,tag, 183 comm,send_waits+count++); 184 } 185 } 186 FREE(starts); FREE(nprocs); 187 188 /* Free cache space */ 189 aij->stash.nmax = aij->stash.n = 0; 190 if (aij->stash.array){ FREE(aij->stash.array); aij->stash.array = 0;} 191 192 aij->svalues = svalues; aij->rvalues = rvalues; 193 aij->nsends = nsends; aij->nrecvs = nreceives; 194 aij->send_waits = send_waits; aij->recv_waits = recv_waits; 195 aij->rmax = nmax; 196 197 return 0; 198 } 199 extern int MPIAIJSetUpMultiply(Mat); 200 201 static int MatiAIJEndAssemble(Mat mat) 202 { 203 int ierr; 204 Matimpiaij *aij = (Matimpiaij *) mat->data; 205 206 MPI_Status *send_status,recv_status; 207 int index,idx,nrecvs = aij->nrecvs, count = nrecvs, i, n; 208 int row,col; 209 Scalar *values,val; 210 InsertMode addv = aij->insertmode; 211 212 /* wait on receives */ 213 while (count) { 214 MPI_Waitany(nrecvs,aij->recv_waits,&index,&recv_status); 215 /* unpack receives into our local space */ 216 values = aij->rvalues + 3*index*aij->rmax; 217 MPI_Get_count(&recv_status,MPI_SCALAR,&n); 218 n = n/3; 219 for ( i=0; i<n; i++ ) { 220 row = (int) PETSCREAL(values[3*i]) - aij->rstart; 221 col = (int) PETSCREAL(values[3*i+1]); 222 val = values[3*i+2]; 223 if (col >= aij->cstart && col < aij->cend) { 224 col -= aij->cstart; 225 MatSetValues(aij->A,1,&row,1,&col,&val,addv); 226 } 227 else { 228 MatSetValues(aij->B,1,&row,1,&col,&val,addv); 229 } 230 } 231 count--; 232 } 233 FREE(aij->recv_waits); FREE(aij->rvalues); 234 235 /* wait on sends */ 236 if (aij->nsends) { 237 send_status = (MPI_Status *) MALLOC( aij->nsends*sizeof(MPI_Status) ); 238 CHKPTR(send_status); 239 MPI_Waitall(aij->nsends,aij->send_waits,send_status); 240 FREE(send_status); 241 } 242 FREE(aij->send_waits); FREE(aij->svalues); 243 244 aij->insertmode = NotSetValues; 245 ierr = MatBeginAssembly(aij->A); CHKERR(ierr); 246 ierr = MatEndAssembly(aij->A); CHKERR(ierr); 247 248 ierr = MPIAIJSetUpMultiply(mat); CHKERR(ierr); 249 ierr = MatBeginAssembly(aij->B); CHKERR(ierr); 250 ierr = MatEndAssembly(aij->B); CHKERR(ierr); 251 return 0; 252 } 253 254 static int MatiZero(Mat A) 255 { 256 Matimpiaij *l = (Matimpiaij *) A->data; 257 258 MatZeroEntries(l->A); MatZeroEntries(l->B); 259 return 0; 260 } 261 262 /* again this uses the same basic stratagy as in the assembly and 263 scatter create routines, we should try to do it systemamatically 264 if we can figure out the proper level of generality. */ 265 266 /* the code does not do the diagonal entries correctly unless the 267 matrix is square and the column and row owerships are identical. 268 This is a BUG. The only way to fix it seems to be to access 269 aij->A and aij->B directly and not through the MatZeroRows() 270 routine. 271 */ 272 static int MatiZerorows(Mat A,IS is,Scalar *diag) 273 { 274 Matimpiaij *l = (Matimpiaij *) A->data; 275 int i,ierr,N, *rows,*owners = l->rowners,numtids = l->numtids; 276 int *localrows,*procs,*nprocs,j,found,idx,nsends,*work; 277 int nmax,*svalues,*starts,*owner,nrecvs,mytid = l->mytid; 278 int *rvalues,tag = 67,count,base,slen,n,len,*source; 279 int *lens,index,*lrows,*values; 280 MPI_Comm comm = l->comm; 281 MPI_Request *send_waits,*recv_waits; 282 MPI_Status recv_status,*send_status; 283 IS istmp; 284 285 ierr = ISGetLocalSize(is,&N); CHKERR(ierr); 286 ierr = ISGetIndices(is,&rows); CHKERR(ierr); 287 288 /* first count number of contributors to each processor */ 289 nprocs = (int *) MALLOC( 2*numtids*sizeof(int) ); CHKPTR(nprocs); 290 MEMSET(nprocs,0,2*numtids*sizeof(int)); procs = nprocs + numtids; 291 owner = (int *) MALLOC((N+1)*sizeof(int)); CHKPTR(owner); /* see note*/ 292 for ( i=0; i<N; i++ ) { 293 idx = rows[i]; 294 found = 0; 295 for ( j=0; j<numtids; j++ ) { 296 if (idx >= owners[j] && idx < owners[j+1]) { 297 nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break; 298 } 299 } 300 if (!found) SETERR(1,"Index out of range"); 301 } 302 nsends = 0; for ( i=0; i<numtids; i++ ) { nsends += procs[i];} 303 304 /* inform other processors of number of messages and max length*/ 305 work = (int *) MALLOC( numtids*sizeof(int) ); CHKPTR(work); 306 MPI_Allreduce((void *) procs,(void *) work,numtids,MPI_INT,MPI_SUM,comm); 307 nrecvs = work[mytid]; 308 MPI_Allreduce((void *) nprocs,(void *) work,numtids,MPI_INT,MPI_MAX,comm); 309 nmax = work[mytid]; 310 FREE(work); 311 312 /* post receives: */ 313 rvalues = (int *) MALLOC((nrecvs+1)*nmax*sizeof(int)); /*see note */ 314 CHKPTR(rvalues); 315 recv_waits = (MPI_Request *) MALLOC((nrecvs+1)*sizeof(MPI_Request)); 316 CHKPTR(recv_waits); 317 for ( i=0; i<nrecvs; i++ ) { 318 MPI_Irecv((void *)(rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag, 319 comm,recv_waits+i); 320 } 321 322 /* do sends: 323 1) starts[i] gives the starting index in svalues for stuff going to 324 the ith processor 325 */ 326 svalues = (int *) MALLOC( (N+1)*sizeof(int) ); CHKPTR(svalues); 327 send_waits = (MPI_Request *) MALLOC( (nsends+1)*sizeof(MPI_Request)); 328 CHKPTR(send_waits); 329 starts = (int *) MALLOC( (numtids+1)*sizeof(int) ); CHKPTR(starts); 330 starts[0] = 0; 331 for ( i=1; i<numtids; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 332 for ( i=0; i<N; i++ ) { 333 svalues[starts[owner[i]]++] = rows[i]; 334 } 335 ISRestoreIndices(is,&rows); 336 337 starts[0] = 0; 338 for ( i=1; i<numtids+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 339 count = 0; 340 for ( i=0; i<numtids; i++ ) { 341 if (procs[i]) { 342 MPI_Isend((void*)(svalues+starts[i]),nprocs[i],MPI_INT,i,tag, 343 comm,send_waits+count++); 344 } 345 } 346 FREE(starts); 347 348 base = owners[mytid]; 349 350 /* wait on receives */ 351 lens = (int *) MALLOC( 2*(nrecvs+1)*sizeof(int) ); CHKPTR(lens); 352 source = lens + nrecvs; 353 count = nrecvs; slen = 0; 354 while (count) { 355 MPI_Waitany(nrecvs,recv_waits,&index,&recv_status); 356 /* unpack receives into our local space */ 357 MPI_Get_count(&recv_status,MPI_INT,&n); 358 source[index] = recv_status.MPI_SOURCE; 359 lens[index] = n; 360 slen += n; 361 count--; 362 } 363 FREE(recv_waits); 364 365 /* move the data into the send scatter */ 366 lrows = (int *) MALLOC( slen*sizeof(int) ); CHKPTR(lrows); 367 count = 0; 368 for ( i=0; i<nrecvs; i++ ) { 369 values = rvalues + i*nmax; 370 for ( j=0; j<lens[i]; j++ ) { 371 lrows[count++] = values[j] - base; 372 } 373 } 374 FREE(rvalues); FREE(lens); 375 FREE(owner); FREE(nprocs); 376 377 /* actually zap the local rows */ 378 ierr = ISCreateSequential(slen,lrows,&istmp); CHKERR(ierr); FREE(lrows); 379 ierr = MatZeroRows(l->A,istmp,diag); CHKERR(ierr); 380 ierr = MatZeroRows(l->B,istmp,0); CHKERR(ierr); 381 ierr = ISDestroy(istmp); CHKERR(ierr); 382 383 /* wait on sends */ 384 if (nsends) { 385 send_status = (MPI_Status *) MALLOC( nsends*sizeof(MPI_Status) ); 386 CHKPTR(send_status); 387 MPI_Waitall(nsends,send_waits,send_status); 388 FREE(send_status); 389 } 390 FREE(send_waits); FREE(svalues); 391 392 393 return 0; 394 } 395 396 static int MatiAIJMult(Mat aijin,Vec xx,Vec yy) 397 { 398 Matimpiaij *aij = (Matimpiaij *) aijin->data; 399 int ierr; 400 401 ierr = VecScatterBegin(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx); 402 CHKERR(ierr); 403 ierr = MatMult(aij->A,xx,yy); CHKERR(ierr); 404 ierr = VecScatterEnd(xx,0,aij->lvec,0,InsertValues,&aij->Mvctx); CHKERR(ierr); 405 ierr = MatMultAdd(aij->B,aij->lvec,yy,yy); CHKERR(ierr); 406 return 0; 407 } 408 409 /* 410 This only works correctly for square matrices where the subblock A->A is the 411 diagonal block 412 */ 413 static int MatiAIJgetdiag(Mat Ain,Vec v) 414 { 415 Matimpiaij *A = (Matimpiaij *) Ain->data; 416 return MatGetDiagonal(A->A,v); 417 } 418 419 static int MatiAIJdestroy(PetscObject obj) 420 { 421 Mat mat = (Mat) obj; 422 Matimpiaij *aij = (Matimpiaij *) mat->data; 423 int ierr; 424 FREE(aij->rowners); 425 ierr = MatDestroy(aij->A); CHKERR(ierr); 426 ierr = MatDestroy(aij->B); CHKERR(ierr); 427 FREE(aij); FREE(mat); 428 if (aij->lvec) VecDestroy(aij->lvec); 429 if (aij->Mvctx) VecScatterCtxDestroy(aij->Mvctx); 430 return 0; 431 } 432 433 static int MatiView(PetscObject obj,Viewer viewer) 434 { 435 Mat mat = (Mat) obj; 436 Matimpiaij *aij = (Matimpiaij *) mat->data; 437 int ierr; 438 439 MPE_Seq_begin(aij->comm,1); 440 printf("[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n", 441 aij->mytid,aij->m,aij->rstart,aij->rend,aij->n,aij->cstart, 442 aij->cend); 443 ierr = MatView(aij->A,viewer); CHKERR(ierr); 444 ierr = MatView(aij->B,viewer); CHKERR(ierr); 445 MPE_Seq_end(aij->comm,1); 446 return 0; 447 } 448 449 /* 450 This has to provide several versions. 451 452 1) per sequential 453 2) a) use only local smoothing updating outer values only once. 454 b) local smoothing updating outer values each inner iteration 455 3) color updating out values betwen colors. (this imples an 456 ordering that is sort of related to the IS argument, it 457 is not clear a IS argument makes the most sense perhaps it 458 should be dropped. 459 */ 460 static int MatiAIJrelax(Mat matin,Vec bb,double omega,int flag,IS is, 461 int its,Vec xx) 462 { 463 Matimpiaij *mat = (Matimpiaij *) matin->data; 464 Scalar zero = 0.0; 465 int ierr,one = 1, tmp, *idx, *diag; 466 int n = mat->n, m = mat->m, i, j; 467 468 if (is) SETERR(1,"No support for ordering in relaxation"); 469 if (flag & SOR_ZERO_INITIAL_GUESS) { 470 if (ierr = VecSet(&zero,xx)) return ierr; 471 } 472 473 /* update outer values from other processors*/ 474 475 /* smooth locally */ 476 return 0; 477 } 478 static int MatiAIJinsopt(Mat aijin,int op) 479 { 480 Matimpiaij *aij = (Matimpiaij *) aijin->data; 481 482 if (op == NO_NEW_NONZERO_LOCATIONS) { 483 MatSetOption(aij->A,op); 484 MatSetOption(aij->B,op); 485 } 486 else if (op == YES_NEW_NONZERO_LOCATIONS) { 487 MatSetOption(aij->A,op); 488 MatSetOption(aij->B,op); 489 } 490 else if (op == ALLOW_OUT_OF_RANGE) aij->outofrange = 1; 491 else if (op == COLUMN_ORIENTED) SETERR(1,"Column oriented not supported"); 492 return 0; 493 } 494 495 static int MatiAIJsize(Mat matin,int *m,int *n) 496 { 497 Matimpiaij *mat = (Matimpiaij *) matin->data; 498 *m = mat->M; *n = mat->N; 499 return 0; 500 } 501 502 static int MatiAIJlocalsize(Mat matin,int *m,int *n) 503 { 504 Matimpiaij *mat = (Matimpiaij *) matin->data; 505 *m = mat->m; *n = mat->n; 506 return 0; 507 } 508 509 static int MatiAIJrange(Mat matin,int *m,int *n) 510 { 511 Matimpiaij *mat = (Matimpiaij *) matin->data; 512 *m = mat->rstart; *n = mat->rend; 513 return 0; 514 } 515 516 /* -------------------------------------------------------------------*/ 517 static struct _MatOps MatOps = {MatiAIJInsertValues, 518 0, 0, 519 MatiAIJMult,0,0,0, 520 0,0,0,0, 521 0,0, 522 MatiAIJrelax, 523 0, 524 0,0,0, 525 0, 526 MatiAIJgetdiag,0,0, 527 MatiAIJBeginAssemble,MatiAIJEndAssemble, 528 0, 529 MatiAIJinsopt,MatiZero,MatiZerorows,0, 530 0,0,0,0, 531 MatiAIJsize,MatiAIJlocalsize,MatiAIJrange }; 532 533 534 535 /*@ 536 537 MatCreateMPIAIJ - Creates a sparse parallel matrix 538 in AIJ format. 539 540 Input Parameters: 541 . comm - MPI communicator 542 . m,n - number of local rows and columns (or -1 to have calculated) 543 . M,N - global rows and columns (or -1 to have calculated) 544 . d_nz - total number nonzeros in diagonal portion of matrix 545 . d_nzz - number of nonzeros per row in diagonal portion of matrix or null 546 . You must leave room for the diagonal entry even if it is zero. 547 . o_nz - total number nonzeros in off-diagonal portion of matrix 548 . o_nzz - number of nonzeros per row in off-diagonal portion of matrix 549 . or null. You must have at least one nonzero per row. 550 551 Output parameters: 552 . newmat - the matrix 553 554 Keywords: matrix, aij, compressed row, sparse, parallel 555 @*/ 556 int MatCreateMPIAIJ(MPI_Comm comm,int m,int n,int M,int N, 557 int d_nz,int *d_nnz, int o_nz,int *o_nnz,Mat *newmat) 558 { 559 Mat mat; 560 Matimpiaij *aij; 561 int ierr, i,rl,len,sum[2],work[2]; 562 *newmat = 0; 563 CREATEHEADER(mat,_Mat); 564 mat->data = (void *) (aij = NEW(Matimpiaij)); CHKPTR(aij); 565 mat->cookie = MAT_COOKIE; 566 mat->ops = &MatOps; 567 mat->destroy = MatiAIJdestroy; 568 mat->view = MatiView; 569 mat->type = MATAIJMPI; 570 mat->factor = 0; 571 mat->row = 0; 572 mat->col = 0; 573 aij->comm = comm; 574 aij->insertmode = NotSetValues; 575 MPI_Comm_rank(comm,&aij->mytid); 576 MPI_Comm_size(comm,&aij->numtids); 577 578 if (M == -1 || N == -1) { 579 work[0] = m; work[1] = n; 580 MPI_Allreduce((void *) work,(void *) sum,1,MPI_INT,MPI_SUM,comm ); 581 if (M == -1) M = sum[0]; 582 if (N == -1) N = sum[1]; 583 } 584 if (m == -1) {m = M/aij->numtids + ((M % aij->numtids) > aij->mytid);} 585 if (n == -1) {n = N/aij->numtids + ((N % aij->numtids) > aij->mytid);} 586 aij->m = m; 587 aij->n = n; 588 aij->N = N; 589 aij->M = M; 590 591 /* build local table of row and column ownerships */ 592 aij->rowners = (int *) MALLOC(2*(aij->numtids+2)*sizeof(int)); 593 CHKPTR(aij->rowners); 594 aij->cowners = aij->rowners + aij->numtids + 1; 595 MPI_Allgather(&m,1,MPI_INT,aij->rowners+1,1,MPI_INT,comm); 596 aij->rowners[0] = 0; 597 for ( i=2; i<=aij->numtids; i++ ) { 598 aij->rowners[i] += aij->rowners[i-1]; 599 } 600 aij->rstart = aij->rowners[aij->mytid]; 601 aij->rend = aij->rowners[aij->mytid+1]; 602 MPI_Allgather(&n,1,MPI_INT,aij->cowners+1,1,MPI_INT,comm); 603 aij->cowners[0] = 0; 604 for ( i=2; i<=aij->numtids; i++ ) { 605 aij->cowners[i] += aij->cowners[i-1]; 606 } 607 aij->cstart = aij->cowners[aij->mytid]; 608 aij->cend = aij->cowners[aij->mytid+1]; 609 610 611 ierr = MatCreateSequentialAIJ(m,n,d_nz,d_nnz,&aij->A); CHKERR(ierr); 612 ierr = MatCreateSequentialAIJ(m,N,o_nz,o_nnz,&aij->B); CHKERR(ierr); 613 614 /* build cache for off array entries formed */ 615 aij->stash.nmax = CHUNCKSIZE; /* completely arbratray number */ 616 aij->stash.n = 0; 617 aij->stash.array = (Scalar *) MALLOC( aij->stash.nmax*(2*sizeof(int) + 618 sizeof(Scalar))); CHKPTR(aij->stash.array); 619 aij->stash.idx = (int *) (aij->stash.array + aij->stash.nmax); 620 aij->stash.idy = (int *) (aij->stash.idx + aij->stash.nmax); 621 622 /* stuff used for matrix vector multiply */ 623 aij->lvec = 0; 624 aij->Mvctx = 0; 625 626 aij->outofrange = 0; 627 628 *newmat = mat; 629 return 0; 630 } 631