#ifndef lint
static char vcid[] = "$Id: mpibaij.c,v 1.2 1996/06/04 21:40:25 balay Exp balay $";
#endif

#include "mpibaij.h"

#include "draw.h"
#include "pinclude/pviewer.h"

extern int MatSetUpMultiply_MPIBAIJ(Mat);
extern int DisAssemble_MPIBAIJ(Mat);

/* local utility routine that creates a mapping from the global block column
   number to the local block column number in the off-diagonal part (B) of the
   local storage of the matrix.  This is done in a non-scalable way since the
   length of colmap equals the number of block columns in the global matrix.
*/
static int CreateColmap_MPIBAIJ_Private(Mat mat)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
  int         nbs = B->nbs,i;

  baij->colmap = (int *) PetscMalloc(baij->Nbs*sizeof(int));CHKPTRQ(baij->colmap);
  PLogObjectMemory(mat,baij->Nbs*sizeof(int));
  PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
  for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i;
  return 0;
}
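
/*
   Illustrative sketch, not part of this file's interface: garray[] maps a
   local block column of B to its global block column, and colmap (built
   above) is the inverse, so colmap[garray[i]] == i for every block column
   present in B.  A hypothetical helper that translates a global point column
   into the corresponding local point column of B, the same translation the
   assembly routines below perform, might look like the following; the name
   GlobalToLocalBCol_Sketch is made up for illustration.
*/
#if 0
static int GlobalToLocalBCol_Sketch(Mat_MPIBAIJ *baij,int gcol)
{
  int bs = baij->bs;
  /* block index translated through colmap, point offset within the block kept */
  return baij->colmap[gcol/bs]*bs + gcol%bs;
}
#endif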

static int MatGetReordering_MPIBAIJ(Mat mat,MatOrdering type,IS *rperm,IS *cperm)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;
  if (baij->size == 1) {
    ierr = MatGetReordering(baij->A,type,rperm,cperm); CHKERRQ(ierr);
  } else SETERRQ(1,"MatGetReordering_MPIBAIJ:not supported in parallel");
  return 0;
}

static int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Scalar      value;
  int         ierr,i,j, rstart = baij->rstart, rend = baij->rend;
  int         cstart = baij->cstart, cend = baij->cend,row,col;
  int         roworiented = baij->roworiented,rstart_orig,rend_orig;
  int         cstart_orig,cend_orig,bs=baij->bs;

  if (baij->insertmode != NOT_SET_VALUES && baij->insertmode != addv) {
    SETERRQ(1,"MatSetValues_MPIBAIJ:Cannot mix inserts and adds");
  }
  baij->insertmode = addv;
  rstart_orig = rstart*bs;
  rend_orig   = rend*bs;
  cstart_orig = cstart*bs;
  cend_orig   = cend*bs;
  for ( i=0; i<m; i++ ) {
    if (im[i] < 0) SETERRQ(1,"MatSetValues_MPIBAIJ:Negative row");
    if (im[i] >= baij->M) SETERRQ(1,"MatSetValues_MPIBAIJ:Row too large");
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      row = im[i] - rstart_orig;
      for ( j=0; j<n; j++ ) {
        if (in[j] < 0) SETERRQ(1,"MatSetValues_MPIBAIJ:Negative column");
        if (in[j] >= baij->N) SETERRQ(1,"MatSetValues_MPIBAIJ:Col too large");
        if (in[j] >= cstart_orig && in[j] < cend_orig){
          col = in[j] - cstart_orig;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          ierr = MatSetValues(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
        }
        else {
          if (mat->was_assembled) {
            if (!baij->colmap) {ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);}
            /* translate the global (point) column through the block colmap */
            col = baij->colmap[in[j]/bs]*bs + in[j]%bs;
            if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
              ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
              col = in[j];
            }
          }
          else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          ierr = MatSetValues(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr);
        }
      }
    }
    else {
      if (roworiented) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      }
      else {
        row = im[i];
        for ( j=0; j<n; j++ ) {
          ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
        }
      }
    }
  }
  return 0;
}

static int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
  int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  /* make sure all processors are either in INSERT_VALUES or ADD_VALUES mode */
  MPI_Allreduce(&baij->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(1,"MatAssemblyBegin_MPIBAIJ:Some processors inserted others added");
  }
  baij->insertmode = addv; /* in case this processor had no cache */

  /* first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner  = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  for ( i=0; i<baij->stash.n; i++ ) {
    idx = baij->stash.idx[i];
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0; for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length */
  work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);
  nreceives = work[rank];
  MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
  nmax = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message consists of ordered triples
          (global row index, global column index, value); the indices are
          stored as Scalars to simplify the message passing.
       2) since we don't know how long each individual message is we
          allocate the largest needed buffer for each receive. Potentially
          this is a lot of wasted space.

       This could be done better.
  */
  rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));
  CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,
              comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  for ( i=0; i<baij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar) baij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar) baij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] = baij->stash.array[i];
  }
  PetscFree(owner);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,
                comm,send_waits+count++);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off processor values %d\n",rank,baij->stash.n);
  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);

  baij->svalues    = svalues;    baij->rvalues    = rvalues;
  baij->nsends     = nsends;     baij->nrecvs     = nreceives;
  baij->send_waits = send_waits; baij->recv_waits = recv_waits;
  baij->rmax       = nmax;

  return 0;
}

static int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
  int         bs=baij->bs,row,col,other_disassembled;
  Scalar      *values,val;
  InsertMode  addv = baij->insertmode;

  /* wait on receives */
  while (count) {
    MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    values = baij->rvalues + 3*imdex*baij->rmax;
    MPI_Get_count(&recv_status,MPIU_SCALAR,&n);
    n = n/3;
    for ( i=0; i<n; i++ ) {
      row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= baij->cstart*bs && col < baij->cend*bs) {
        col -= baij->cstart*bs;
        MatSetValues(baij->A,1,&row,1,&col,&val,addv);
      }
      else {
        if (mat->was_assembled) {
          if (!baij->colmap) {ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);}
          col = baij->colmap[col/bs]*bs + col%bs;
          if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
            ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
            col = (int) PetscReal(values[3*i+1]);
          }
        }
        MatSetValues(baij->B,1,&row,1,&col,&val,addv);
      }
    }
    count--;
  }
  PetscFree(baij->recv_waits); PetscFree(baij->rvalues);

  /* wait on sends */
  if (baij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));
    CHKPTRQ(send_status);
    MPI_Waitall(baij->nsends,baij->send_waits,send_status);
    PetscFree(send_status);
  }
  PetscFree(baij->send_waits); PetscFree(baij->svalues);

  baij->insertmode = NOT_SET_VALUES;
  ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled; if so we must
     also disassemble ourselves, in order that we may reassemble. */
  MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);
  if (mat->was_assembled && !other_disassembled) {
    ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
  }

  if (!mat->was_assembled && mode == FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);

  if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
  return 0;
}
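
/*
   Illustrative sketch, not part of this file's interface: any process may
   call MatSetValues() with global indices it does not own; MatSetValues_MPIBAIJ
   above stashes such entries and the assembly routines ship them to the
   owning process.  The helper name and the values below are made-up
   illustration; A is assumed to be an already-created MPIBAIJ matrix, and
   every process must take part in the (collective) assembly calls.
*/
#if 0
static int SetOffProcessValue_Sketch(Mat A)
{
  int    ierr,rstart,rend,M,N,row,col;
  Scalar v = 1.0;

  ierr = MatGetOwnershipRange(A,&rstart,&rend); CHKERRQ(ierr);
  ierr = MatGetSize(A,&M,&N); CHKERRQ(ierr);
  row  = rend % M;              /* first row owned by the next process */
  col  = row;
  ierr = MatSetValues(A,1,&row,1,&col,&v,ADD_VALUES); CHKERRQ(ierr);
  ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
  return 0;
}
#endif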

static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;

  if (baij->size == 1) {
    ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
  }
  else SETERRQ(1,"MatView_MPIBAIJ_Binary:Only uniprocessor output supported");
  return 0;
}

static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr, format,rank,bs=baij->bs,bs2=baij->bs2;
  FILE        *fd;
  ViewerType  vtype;

  ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
  if (vtype == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
    ierr = ViewerGetFormat(viewer,&format); CHKERRQ(ierr);
    if (format == ASCII_FORMAT_INFO_DETAILED) {
      int nz, nzalloc, mem;
      MPI_Comm_rank(mat->comm,&rank);
      ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
      ierr = MatGetInfo(mat,MAT_LOCAL,&nz,&nzalloc,&mem); CHKERRQ(ierr);
      PetscSequentialPhaseBegin(mat->comm,1);
      fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
              rank,baij->m,nz*bs,nzalloc*bs,baij->bs,mem);
      ierr = MatGetInfo(baij->A,MAT_LOCAL,&nz,&nzalloc,&mem); CHKERRQ(ierr);
      fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,nz*bs);
      ierr = MatGetInfo(baij->B,MAT_LOCAL,&nz,&nzalloc,&mem); CHKERRQ(ierr);
      fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,nz*bs);
      fflush(fd);
      PetscSequentialPhaseEnd(mat->comm,1);
      ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
      return 0;
    }
    else if (format == ASCII_FORMAT_INFO) {
      return 0;
    }
  }

  if (vtype == DRAW_VIEWER) {
    Draw       draw;
    PetscTruth isnull;
    ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
    ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
  }

  if (vtype == ASCII_FILE_VIEWER) {
    ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
    PetscSequentialPhaseBegin(mat->comm,1);
    fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
            baij->cstart*bs,baij->cend*bs);
    ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
    fflush(fd);
    PetscSequentialPhaseEnd(mat->comm,1);
  }
  else {
    int size = baij->size;
    rank = baij->rank;
    if (size == 1) {
      ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
    }
    else {
      /* assemble the entire matrix onto first processor. */
      Mat         A;
      Mat_SeqBAIJ *Aloc;
      int         M = baij->M, N = baij->N,*ai,*aj,col,i,j,k,*rvals;
      int         mbs=baij->mbs;
      Scalar      *a;

      if (!rank) {
        ierr = MatCreateMPIAIJ(mat->comm,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
        CHKERRQ(ierr);
      }
      else {
        ierr = MatCreateMPIAIJ(mat->comm,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);
        CHKERRQ(ierr);
      }
      PLogObjectParent(mat,A);

      /* copy over the A part */
      Aloc  = (Mat_SeqBAIJ*) baij->A->data;
      ai    = Aloc->i; aj = Aloc->j; a = Aloc->a;
      rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);

      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          col = (baij->cstart+aj[j])*bs;
          for ( k=0; k<bs; k++ ) {
            /* insert the kth column of the bs x bs block (blocks stored column-major) */
            ierr = MatSetValues(A,bs,rvals,1,&col,a+j*bs2+k*bs,INSERT_VALUES);CHKERRQ(ierr);
            col++;
          }
        }
      }
      /* copy over the B part */
      Aloc = (Mat_SeqBAIJ*) baij->B->data;
      ai   = Aloc->i; aj = Aloc->j; a = Aloc->a;
      for ( i=0; i<mbs; i++ ) {
        rvals[0] = bs*(baij->rstart + i);
        for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
        for ( j=ai[i]; j<ai[i+1]; j++ ) {
          col = baij->garray[aj[j]]*bs;
          for ( k=0; k<bs; k++ ) {
            ierr = MatSetValues(A,bs,rvals,1,&col,a+j*bs2+k*bs,INSERT_VALUES);CHKERRQ(ierr);
            col++;
          }
        }
      }
      PetscFree(rvals);
      ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
      if (!rank) {
        ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
      }
      ierr = MatDestroy(A); CHKERRQ(ierr);
    }
  }
  return 0;
}

static int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
{
  Mat        mat = (Mat) obj;
  int        ierr;
  ViewerType vtype;

  if (!viewer) {
    viewer = STDOUT_VIEWER_SELF;
  }
  ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
  if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
      vtype == DRAW_VIEWER || vtype == MATLAB_VIEWER) {
    ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
  }
  else if (vtype == BINARY_FILE_VIEWER) {
    return MatView_MPIBAIJ_Binary(mat,viewer);
  }
  return 0;
}
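
/*
   Illustrative sketch, not part of this file's interface: viewing an
   assembled MPIBAIJ matrix through the public MatView().  A null viewer
   falls back to STDOUT_VIEWER_SELF above; binary viewers are routed to
   MatView_MPIBAIJ_Binary, which currently supports only one processor.
   The helper name is made up for illustration.
*/
#if 0
static int ViewExample_Sketch(Mat A,Viewer viewer)
{
  int ierr;
  ierr = MatView(A,viewer); CHKERRQ(ierr);  /* e.g. an ASCII or draw viewer */
  return 0;
}
#endif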

static int MatDestroy_MPIBAIJ(PetscObject obj)
{
  Mat         mat = (Mat) obj;
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;

#if defined(PETSC_LOG)
  PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
#endif

  PetscFree(baij->rowners);
  ierr = MatDestroy(baij->A); CHKERRQ(ierr);
  ierr = MatDestroy(baij->B); CHKERRQ(ierr);
  if (baij->colmap) PetscFree(baij->colmap);
  if (baij->garray) PetscFree(baij->garray);
  if (baij->lvec)   VecDestroy(baij->lvec);
  if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
  if (baij->rowvalues) PetscFree(baij->rowvalues);
  PetscFree(baij);
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);
  return 0;
}

static int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  *m = mat->M; *n = mat->N;
  return 0;
}

static int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  *m = mat->m; *n = mat->N;
  return 0;
}

static int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
  return 0;
}
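
/*
   Illustrative sketch, not part of this file's interface:
   MatGetOwnershipRange() on an MPIBAIJ matrix returns point (unblocked)
   indices, rstart*bs and rend*bs above.  A typical loop over the locally
   owned rows that sets the diagonal might look like the following; the
   helper name is made up for illustration.
*/
#if 0
static int SetLocalDiagonal_Sketch(Mat A,Scalar d)
{
  int ierr,rstart,rend,i;

  ierr = MatGetOwnershipRange(A,&rstart,&rend); CHKERRQ(ierr);
  for ( i=rstart; i<rend; i++ ) {
    ierr = MatSetValues(A,1,&i,1,&i,&d,INSERT_VALUES); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
  return 0;
}
#endif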

/* -------------------------------------------------------------------*/
static struct _MatOps MatOps = {
  MatSetValues_MPIBAIJ,0,0,0,
  0,0,0,0,
  0,0,0,0,
  0,0,0,0,
  0,0,0,0,
  MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,0,
  0,0,MatGetReordering_MPIBAIJ,0,
  0,0,0,MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
  0,0,0,0,
  0,0,0,0,
  0,0,0,0,
  0,0,0,0,
  0};

/*@C
   MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
   (block compressed row).  For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameters
   d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
   performance can be increased by more than a factor of 50.

   Input Parameters:
.  comm - MPI communicator
.  bs - size of block
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated
       if N is given)
.  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
.  N - number of global columns (or PETSC_DECIDE to have calculated
       if n is given)
.  d_nz - number of block nonzeros per block row in diagonal portion of local
          submatrix (same for all local block rows)
.  d_nnz - number of block nonzeros per block row in diagonal portion of local
           submatrix or null (possibly different for each block row). You must leave
           room for the diagonal entry even if it is zero.
.  o_nz - number of block nonzeros per block row in off-diagonal portion of local
          submatrix (same for all local block rows).
.  o_nnz - number of block nonzeros per block row in off-diagonal portion of local
           submatrix or null (possibly different for each block row).

   Output Parameter:
.  A - the matrix

   Notes:
   The user MUST specify either the local or global matrix dimensions
   (possibly both).

   Storage Information:
   For a square global matrix we define each processor's diagonal portion
   to be its local rows and the corresponding columns (a square submatrix);
   each processor's off-diagonal portion encompasses the remainder of the
   local matrix (a rectangular submatrix).

   The user can specify preallocated storage for the diagonal part of
   the local submatrix with either d_nz or d_nnz (not both).  Set
   d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
   memory allocation.  Likewise, specify preallocated storage for the
   off-diagonal part of the local submatrix with o_nz or o_nnz (not both).

   Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
   the figure below we depict these three local rows and all columns (0-11).

$           0 1 2 3 4 5 6 7 8 9 10 11
$          -------------------
$   row 3  | o o o d d d o o o o o o
$   row 4  | o o o d d d o o o o o o
$   row 5  | o o o d d d o o o o o o
$          -------------------
$

   Thus, any entries in the d locations are stored in the d (diagonal)
   submatrix, and any entries in the o locations are stored in the
   o (off-diagonal) submatrix.  Note that the d and the o submatrices are
   stored simply in the MATSEQBAIJ format for compressed row storage.

   Now d_nz should indicate the number of block nonzeros per block row in the d matrix,
   and o_nz should indicate the number of block nonzeros per block row in the o matrix.
   In general, for PDE problems in which most nonzeros are near the diagonal,
   one expects d_nz >> o_nz.  For additional details, see the users manual
   chapter on matrices and the file $(PETSC_DIR)/Performance.

.keywords: matrix, aij, compressed row, sparse, parallel

.seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
@*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                     int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat         B;
  Mat_MPIBAIJ *b;
  int         ierr, i,sum[2],work[2],mbs,nbs,Mbs,Nbs;

  if (bs < 1) SETERRQ(1,"MatCreateMPIBAIJ: invalid block size specified");
  *A = 0;
  PetscHeaderCreate(B,_Mat,MAT_COOKIE,MATMPIBAIJ,comm);
  PLogObjectCreate(B);
  B->data      = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
  B->destroy   = MatDestroy_MPIBAIJ;
  B->view      = MatView_MPIBAIJ;

  B->factor    = 0;
  B->assembled = PETSC_FALSE;

  b->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  if (m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL))
    SETERRQ(1,"MatCreateMPIBAIJ:Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");

  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    if (mbs*bs != m || nbs*bs != n) SETERRQ(1,"MatCreateMPIBAIJ: No of local rows, cols must be divisible by blocksize");
    MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(1,"MatCreateMPIBAIJ: No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(1,"MatCreateMPIBAIJ: No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }

  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* build local table of row and column ownerships */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 1;
  MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart = b->rowners[b->rank];
  b->rend   = b->rowners[b->rank+1];
  MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart = b->cowners[b->rank];
  b->cend   = b->cowners[b->rank+1];

  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(MPI_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off-processor entries */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used for matrix vector multiply */
  b->lvec  = 0;
  b->Mvctx = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  *A = B;
  return 0;
}
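
/*
   Illustrative usage sketch, not part of this file's interface: creating a
   square block AIJ matrix with bs=2 and ten local block rows per process,
   letting PETSc compute the global sizes, and preallocating 5 diagonal and
   2 off-diagonal block nonzeros per block row.  All sizes and preallocation
   figures here are made-up illustration values.
*/
#if 0
static int CreateExample_Sketch(MPI_Comm comm,Mat *A)
{
  int ierr, bs = 2, m = 10*bs;   /* local point rows must be divisible by bs */

  ierr = MatCreateMPIBAIJ(comm,bs,m,m,PETSC_DECIDE,PETSC_DECIDE,
                          5,PETSC_NULL,2,PETSC_NULL,A); CHKERRQ(ierr);
  return 0;
}
#endif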

#include "sys.h"

int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i, nz, ierr, j,rstart, rend, fd;
  Scalar       *vals,*buf;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,maxnz,*cols;
  int          *locrowlens,*sndcounts = 0,*browners = 0,*procsnz = 0, jj,*mycols,*ibuf;
  int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
  int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
  int          dcount,kmax,k,nzcount,tmp;

  ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
  bs2  = bs*bs;

  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,BINARY_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(1,"MatLoad_MPIBAIJ:not matrix object");
  }

  MPI_Bcast(header+1,3,MPI_INT,0,comm);
  M = header[1]; N = header[2];

  if (M != N) SETERRQ(1,"MatLoad_MPIBAIJ:Can only do square matrices");

  /*
     This code adds extra rows to make sure the number of rows is
     divisible by the blocksize
  */
  Mbs        = M/bs;
  extra_rows = bs - M + bs*Mbs;
  if (extra_rows == bs) extra_rows = 0;
  else                  Mbs++;
  if (extra_rows && !rank) {
    PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize");
  }

  /* determine ownership of all rows */
  mbs     = Mbs/size + ((Mbs % size) > rank);
  m       = mbs * bs;
  rowners = (int *) PetscMalloc((size+2)*sizeof(int)); CHKPTRQ(rowners);
  MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) {
    rowners[i] += rowners[i-1];
  }
  rstart = rowners[rank];
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors */
  locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
    for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    browners  = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(browners);
    for ( i=0; i<size; i++ ) {
      sndcounts[i] = (rowners[i+1] - rowners[i])*bs;
      browners[i]  = rowners[i]*bs;   /* scatter displacements are in point rows, not block rows */
    }
    MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);
    PetscFree(sndcounts); PetscFree(browners);
  }
  else {
    MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);
  }

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]*bs; j<rowners[i+1]*bs; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices */
    nz     = procsnz[0];
    ibuf   = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    ierr   = PetscBinaryRead(fd,mycols,nz,BINARY_INT); CHKERRQ(ierr);

    /* read in the parts for all other processors (except the last) and ship them off */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
      MPI_Send(cols,nz,MPI_INT,i,tag,comm);
    }
    /* read in the stuff for the last proc */
    if ( size != 1 ) {
      nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
      ierr = PetscBinaryRead(fd,cols,nz,BINARY_INT); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
      MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);
    }
    PetscFree(cols);
  }
  else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<m; i++ ) {
      nz += locrowlens[i];
    }
    ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* receive message of column indices */
    MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);
    MPI_Get_count(&status,MPI_INT,&maxnz);
    if (maxnz != nz) SETERRQ(1,"MatLoad_MPIBAIJ:something is wrong with file");
  }

  /* loop over local block rows, determining number of diagonal and off-diagonal block entries */
  dlens  = (int *) PetscMalloc( 2*(rend-rstart)*sizeof(int) ); CHKPTRQ(dlens);
  odlens = dlens + (rend-rstart);
  mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
  PetscMemzero(mask,Mbs*sizeof(int));
  masked1 = mask    + Mbs;
  masked2 = masked1 + Mbs;
  rowcount = 0; nzcount = 0;
  for ( i=0; i<mbs; i++ ) {
    dcount  = 0;
    odcount = 0;
    for ( j=0; j<bs; j++ ) {
      kmax = locrowlens[rowcount];
      for ( k=0; k<kmax; k++ ) {
        tmp = mycols[nzcount++]/bs;
        if (!mask[tmp]) {
          mask[tmp] = 1;
          if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
          else masked1[dcount++] = tmp;
        }
      }
      rowcount++;
    }
    dlens[i]  = dcount;
    odlens[i] = odcount;

    /* zero out the mask elements we set */
    for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
    for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
  }

  /* create our matrix */
  ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);CHKERRQ(ierr);
  A = *newmat;
  MatSetOption(A,COLUMNS_SORTED);

  if (!rank) {
    buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);

    /* read in my part of the matrix numerical values */
    nz   = procsnz[0];
    vals = buf;
    ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);

    /* insert into matrix */
    jj = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }

    /* read in the parts for the other processors (except the last) and ship them out */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
      MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);
    }
    /* the last proc */
    if ( size != 1 ){
      nz   = procsnz[size-1] - extra_rows;
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,BINARY_SCALAR); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;   /* pad the added rows with a unit diagonal */
      MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);
    }
    PetscFree(procsnz);
  }
  else {
    /* receive numeric values */
    buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);

    /* receive message of values */
    vals = buf;
    MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);
    MPI_Get_count(&status,MPIU_SCALAR,&maxnz);
    if (maxnz != nz) SETERRQ(1,"MatLoad_MPIBAIJ:something is wrong with file");

    /* insert into matrix */
    jj = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
  }
  PetscFree(locrowlens);
  PetscFree(buf);
  PetscFree(ibuf);
  PetscFree(rowners);
  PetscFree(dlens);
  PetscFree(mask);
  ierr = MatAssemblyBegin(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,FINAL_ASSEMBLY); CHKERRQ(ierr);
  return 0;
}
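
/*
   Illustrative sketch, not part of this file's interface: loading a matrix
   stored in PETSc binary format into the MPIBAIJ format.  The block size is
   taken from the -matload_block_size option read above (default 1).  It is
   assumed here that `viewer` has already been opened as a binary viewer
   elsewhere, and that MatLoad() dispatches to the loader above for the
   MATMPIBAIJ type; the call used to open the viewer is omitted since it may
   differ between releases.
*/
#if 0
static int LoadExample_Sketch(Viewer viewer,Mat *A)
{
  int ierr;

  ierr = MatLoad(viewer,MATMPIBAIJ,A); CHKERRQ(ierr);
  return 0;
}
#endif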