/*
   Defines projective product routines where A is an MPIAIJ matrix:
         C = P^T * A * P
*/

#include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
#include "src/mat/utils/freespace.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
    if (ptap->abi) {ierr = PetscFree(ptap->abi);CHKERRQ(ierr);}
    if (ptap->abj) {ierr = PetscFree(ptap->abj);CHKERRQ(ierr);}

    ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
    /* free ptap only when it was actually attached; otherwise the pointer is uninitialized */
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
    ptap->abi      = PETSC_NULL;
    ptap->abj      = PETSC_NULL;
    ptap->abnz_max = 0;   /* symbolic A*P is not done yet */

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

    /* attach the supporting struct to P for reuse */
    P->ops->destroy = MatDestroy_MPIAIJ_PtAP;
    ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
    ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);

    /* now, compute the symbolic local P^T*A*P */
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  } else if (scall == MAT_REUSE_MATRIX) {
    ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
    if (container) {
      ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
    }

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
  } else {
    SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
  }
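  /*
     Both branches fall through to the numeric phase below.  A typical call
     sequence from user code looks like the following sketch (the matrix
     names are hypothetical; MatPtAP() dispatches here when A and P are
     both MPIAIJ):

        ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
        ... change numerical values of A, keeping its nonzero pattern ...
        ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
  */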
  /* now, compute the numeric local P^T*A*P */
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  P_loc,P_oth;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
  PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
  PetscInt             aN=A->N,am=A->m,pN=P->N;
  PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
  PetscBT              lnkbt;
  PetscInt             prstart,prend,nprow_loc,nprow_oth;
  PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
  MPI_Comm             comm=A->comm;
  Mat                  B_mpi;
  PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
  MPI_Status           *status;
  Mat_Merge_SeqsToMPI  *merge;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }

  /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
  /*--------------------------------------------------*/
  /* get data from P_loc and P_oth */
  P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
  p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
  prend = prstart + pm;

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
  ptJ  = ptj;

  /* allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr  = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow_loc = ptadenserow_loc + aN;
  ptadenserow_oth  = ptasparserow_loc + aN;
  ptasparserow_oth = ptadenserow_oth + aN;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
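  /*
     The linked list lnk (with bit array lnkbt) accumulates the sorted union
     of column indices contributed to one row of C: each PetscLLAdd() below
     merges a set of column indices in sorted order and returns in nlnk the
     number of newly added entries, and PetscLLClean() copies the finished
     row out and resets the list for the next row.
  */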
  /* Set initial free space to fill*nnz(A)*pN/aN.                             */
  /* This should be reasonable if the sparsity of PtAP is similar to that of A. */
  nnz  = adi[am] + aoi[am];
  ierr = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each row of C: */
  for (i=0; i<pN; i++) {
    ptnzi = pti[i+1] - pti[i];
    nprow_loc = 0; nprow_oth = 0;
    /* i-th row of symbolic P_loc^T*A_loc: */
    for (j=0; j<ptnzi; j++) {
      arow = *ptJ++;
      /* diagonal portion of A */
      anzj = adi[arow+1] - adi[arow];
      ajj  = adj + adi[arow];
      for (k=0; k<anzj; k++) {
        col = ajj[k] + prstart;
        if (!ptadenserow_loc[col]) {
          ptadenserow_loc[col]          = -1;
          ptasparserow_loc[nprow_loc++] = col;
        }
      }
      /* off-diagonal portion of A */
      anzj = aoi[arow+1] - aoi[arow];
      ajj  = aoj + aoi[arow];
      for (k=0; k<anzj; k++) {
        col = a->garray[ajj[k]];   /* global col */
        if (col < cstart) {
          col = ajj[k];
        } else if (col >= cend) {
          col = ajj[k] + pm;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        if (!ptadenserow_oth[col]) {
          ptadenserow_oth[col]          = -1;
          ptasparserow_oth[nprow_oth++] = col;
        }
      }
    }

    /* using symbolic info of local PtA, determine symbolic info for row of C: */
    cnzi = 0;
    /* rows of P_loc */
    ptaj = ptasparserow_loc;
    for (j=0; j<nprow_loc; j++) {
      prow  = *ptaj++;
      prow -= prstart;   /* map back to a local row of P_loc */
      pnzj  = pi_loc[prow+1] - pi_loc[prow];
      pjj   = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr  = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }
    /* rows of P_oth */
    ptaj = ptasparserow_oth;
    for (j=0; j<nprow_oth; j++) {
      prow = *ptaj++;
      if (prow >= prend) prow -= pm;   /* map back to a local row of P_oth */
      pnzj  = pi_oth[prow+1] - pi_oth[prow];
      pjj   = pj_oth + pi_oth[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr  = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* if free space is not available, double the amount of total space in the list */
    if (current_space->local_remaining < cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* copy data into free space, and zero out dense rows */
    ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0; j<nprow_loc; j++) {
      ptadenserow_loc[ptasparserow_loc[j]] = 0;
    }
    for (j=0; j<nprow_oth; j++) {
      ptadenserow_oth[ptasparserow_oth[j]] = 0;
    }

    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /* For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
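  /*
     At this point ci[] is the CSR row-pointer array of the sequential
     product C_seq = P_loc^T*A_loc*P over all pN global rows; ci[pN] is its
     total number of nonzeros, and the column indices sit in the free-space
     list, to be made contiguous in cj below.
  */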
  /* clean up */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* nnz is now stored in ci[pN], column indices are in the list of free space */
  /* allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* add C_seq into mpi C */
  /*-----------------------------------*/
  free_space=PETSC_NULL; current_space=PETSC_NULL;

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);

  /* determine row ownership */
  /*---------------------------------------------------------*/
  ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
  ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send and their lengths */
  /*---------------------------------------------------------*/
  ierr  = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr  = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  len   = 0;   /* length of buf_si[] */
  merge->nsend = 0;
  for (proc=0; proc<size; proc++) {
    len_si[proc] = 0;
    if (proc == rank) {
      len_s[proc] = 0;
    } else {
      len_si[proc] = owners[proc+1] - owners[proc] + 1;
      len_s[proc]  = ci[owners[proc+1]] - ci[owners[proc]];   /* num of nonzeros to be sent to [proc] */
    }
    if (len_s[proc]) {
      merge->nsend++;
      nrows = 0;
      for (i=owners[proc]; i<owners[proc+1]; i++) {
        if (ci[i+1] > ci[i]) nrows++;
      }
      len_si[proc] = 2*(nrows+1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for ij-structure */
  /*-------------------------------------------------------------------------*/
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv of j-structure */
  /*-------------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);

  /* post the Isend of j-structure */
  /*--------------------------------*/
  ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
  sj_waits = si_waits + merge->nsend;

  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of j-structure are complete */
  /*------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
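  /*
     The j-structure (column indices) is exchanged first because its receive
     lengths, merge->len_r, are already known from PetscGatherMessageLengths2();
     the i-structure messages are packed into buf_si below using the
     compressed row format described there.
  */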
  /* send and recv i-structure */
  /*---------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);

  ierr   = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;   /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (global)
         [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners[proc]; i<owners[proc+1]; i++) {
      anzi = ci[i+1] - ci[i];
      if (anzi) {
        buf_si_i[nrows+1] = buf_si_i[nrows] + anzi;   /* i-structure */
        buf_si[nrows+1]   = i - owners[proc];         /* local row index */
        nrows++;
      }
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }

  ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);

  ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++) {
    ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(rj_waits);CHKERRQ(ierr);
  ierr = PetscFree(si_waits);CHKERRQ(ierr);
  ierr = PetscFree(ri_waits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  /* compute a local seq matrix in each processor */
  /*----------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr  = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
  len  = ci[owners[rank+1]] - ci[owners[rank]];
  ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each local row */
  ierr    = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextci  = nextrow + merge->nrecv;
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k];   /* beginning of the k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;            /* next row number of the k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1);  /* points to the next i-structure of the k-th received i-structure */
  }
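  /*
     nextrow[k] and nextci[k] are cursors into the k-th received i-structure:
     *nextrow[k] is the next local row for which process merge->id_r[k] sent
     data, and *nextci[k] / *(nextci[k]+1) bracket that row's entries in
     buf_rj[k].  Each row match below advances both cursors by one.
  */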
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  len  = 0;
  for (i=0; i<pn; i++) {
    bnzi = 0;
    /* add local non-zero cols of this proc's C_seq into lnk */
    arow  = owners[rank] + i;
    anzi  = ci[arow+1] - ci[arow];
    cji   = cj + ci[arow];
    ierr  = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
    bnzi += nlnk;
    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) {   /* k-th received message */
      if (i == *nextrow[k]) {          /* i-th row */
        anzi  = *(nextci[k]+1) - *nextci[k];
        cji   = buf_rj[k] + *nextci[k];
        ierr  = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        bnzi += nlnk;
        nextrow[k]++; nextci[k]++;
      }
    }
    if (len < bnzi) len = bnzi;   /* = max(bnzi) */

    /* if free space is not available, double the amount of total space in the list */
    if (current_space->local_remaining < bnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += bnzi;
    current_space->local_used      += bnzi;
    current_space->local_remaining -= bnzi;

    bi[i+1] = bi[i] + bnzi;
  }

  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix B_mpi */
  /*---------------------------------------*/
  ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
  ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -- causes a delay? */

  /* B_mpi is not ready for use - assembly will be done in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  B_mpi->assembled    = PETSC_FALSE;
  B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI;
  merge->bi     = bi;
  merge->bj     = bj;
  merge->ci     = ci;
  merge->cj     = cj;
  merge->buf_ri = buf_ri;
  merge->buf_rj = buf_rj;
  merge->C_seq  = PETSC_NULL;

  /* attach the supporting struct to B_mpi for reuse */
  ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
  ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
  *C = B_mpi;

  PetscFunctionReturn(0);
}
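/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ computes the numeric product in two steps:
   (1) each process forms its local rows of AP = A_loc*[P_loc; P_oth] and
   accumulates C_seq = P_loc^T*AP with sparse outer products, caching the
   symbolic structure of A*P in ptap->abi/abj on the first call; (2) the
   rows of C_seq owned by other processes are shipped using the i/j
   structures stored in merge, and the received values are summed into C.
*/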
#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer cont_merge,cont_ptap;
  PetscInt             flops=0;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
  PetscInt             i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  PetscInt             *cjj;
  MatScalar            *ada=ad->a,*aoa=ao->a,*ada_tmp=ad->a,*aoa_tmp=ao->a,*apa,*paj,*cseqa,*caj;
  MatScalar            *pa_loc,*pA,*pa_oth;
  PetscInt             am=A->m,cN=C->N;
  PetscInt             nextp,*adj_tmp=ad->j,*aoj_tmp=ao->j;
  MPI_Comm             comm=C->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners;
  PetscInt             proc;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj;   /* bi, bj, ba: for C (mpi mat) */
  PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i;
  PetscInt             *cseqi,*cseqj;
  PetscInt             *cseqj_tmp;
  MatScalar            *cseqa_tmp;
  PetscInt             stages[3];
  PetscInt             *apsymi,*apsymj;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister(&stages[0],"SymAP");CHKERRQ(ierr);
  ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
  ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
  if (cont_merge) {
    ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix C does not possess an object container");
  }
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
  if (cont_ptap) {
    ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }

  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->B_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->B_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  cseqi = merge->ci; cseqj = merge->cj;
  ierr = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
  ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);

  /* ---------- Compute symbolic A*P (first call only) ---------- */
  if (!ptap->abnz_max) {
    ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
    ierr = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr);
    ptap->abi = apsymi;
    apsymi[0] = 0;
    ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
    ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
    apj      = (PetscInt *)(apa + cN);
    apjdense = apj + cN;
    /* initial FreeSpace size is 2*nnz(A) */
    ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
    current_space = free_space;

    for (i=0; i<am; i++) {
      apnzj = 0;
      /* diagonal portion of A */
      anzi = adi[i+1] - adi[i];
      for (j=0; j<anzi; j++) {
        prow = *adj++;
        pnzj = pi_loc[prow+1] - pi_loc[prow];
        pjj  = pj_loc + pi_loc[prow];
        paj  = pa_loc + pi_loc[prow];
        for (k=0; k<pnzj; k++) {
          if (!apjdense[pjj[k]]) {
            apjdense[pjj[k]] = -1;
            apj[apnzj++]     = pjj[k];
          }
        }
        flops += 2*pnzj;
        ada++;
      }
      /* off-diagonal portion of A; both branches map *aoj to a local row of P_oth */
      anzi = aoi[i+1] - aoi[i];
      for (j=0; j<anzi; j++) {
        col = a->garray[*aoj];
        if (col < cstart) {
          prow = *aoj;
        } else if (col >= cend) {
          prow = *aoj;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        aoj++;
        pnzj = pi_oth[prow+1] - pi_oth[prow];
        pjj  = pj_oth + pi_oth[prow];
        paj  = pa_oth + pi_oth[prow];
        for (k=0; k<pnzj; k++) {
          if (!apjdense[pjj[k]]) {
            apjdense[pjj[k]] = -1;
            apj[apnzj++]     = pjj[k];
          }
        }
        flops += 2*pnzj;
        aoa++;
      }
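      /*
         apj now holds the apnzj distinct column indices of row i of A*P in
         insertion order; sorting it lets the numeric loops below walk apj
         and each sorted row of P with two monotone pointers (k and nextp).
      */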
      /* sort the j index array for quick sparse axpy */
      ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);

      apsymi[i+1] = apsymi[i] + apnzj;
      if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj;

      /* if free space is not available, double the amount of total space in the list */
      if (current_space->local_remaining < apnzj) {
        ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      }

      /* copy the row structure into free space */
      for (j=0; j<apnzj; j++) current_space->array[j] = apj[j];
      current_space->array           += apnzj;
      current_space->local_used      += apnzj;
      current_space->local_remaining -= apnzj;

      for (j=0; j<apnzj; j++) apjdense[apj[j]] = 0;
    }
    /* allocate space for apsymj, initialize apsymj, and */
    /* destroy list of free space and other temporary array(s) */
    ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr);
    ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr);
    ierr = PetscLogStagePop();CHKERRQ(ierr);
  } else {
    /* symbolic A*P is already available; allocate a work array for one row of A*P */
    ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr);
    ierr = PetscMemzero(apa,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);
  }
  /* ---------- end of Compute symbolic A*P ---------- */

  ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
  apsymi = ptap->abi; apsymj = ptap->abj;
  for (i=0; i<am; i++) {
    /* form the i-th sparse row of A*P */
    apnzj = apsymi[i+1] - apsymi[i];
    apj   = apsymj + apsymi[i];
    /* diagonal portion of A */
    anzi = adi[i+1] - adi[i];
    for (j=0; j<anzi; j++) {
      prow  = *adj_tmp++;
      pnzj  = pi_loc[prow+1] - pi_loc[prow];
      pjj   = pj_loc + pi_loc[prow];
      paj   = pa_loc + pi_loc[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) {   /* col of AP == col of P */
          apa[k] += (*ada_tmp)*paj[nextp++];
        }
      }
      flops += 2*pnzj;
      ada_tmp++;
    }
    /* off-diagonal portion of A; both branches map *aoj_tmp to a local row of P_oth */
    anzi = aoi[i+1] - aoi[i];
    for (j=0; j<anzi; j++) {
      col = a->garray[*aoj_tmp];
      if (col < cstart) {
        prow = *aoj_tmp;
      } else if (col >= cend) {
        prow = *aoj_tmp;
      } else {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      }
      aoj_tmp++;
      pnzj  = pi_oth[prow+1] - pi_oth[prow];
      pjj   = pj_oth + pi_oth[prow];
      paj   = pa_oth + pi_oth[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) {   /* col of AP == col of P */
          apa[k] += (*aoa_tmp)*paj[nextp++];
        }
      }
      flops += 2*pnzj;
      aoa_tmp++;
    }
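    /*
       Row i of A contributes P[i,:]^T * AP[i,:] to C = P^T*(A*P): for every
       nonzero P[i,crow], the sparse row AP[i,:] is scaled by that value and
       added into row crow of C_seq.  Since the symbolic pattern of C_seq was
       built from the same products, cjj is guaranteed to contain apj.
    */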
    /* compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0; j<pnzi; j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cseqj + cseqi[crow];
      caj    = cseqa + cseqi[crow];
      /* perform sparse axpy operation; note cjj includes apj */
      for (k=0; nextap<apnzj; k++) {
        if (cjj[k] == apj[nextap]) caj[k] += (*pA)*apa[nextap++];
      }
      flops += 2*apnzj;
      pA++;
    }
    /* zero the current row info for A*P */
    ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr);
  }

  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  bi     = merge->bi;
  bj     = merge->bj;
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;

  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* send and recv matrix values */
  /*-----------------------------*/
  ierr  = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
  len_s = merge->len_s;
  ierr  = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
  ierr  = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }

  ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  ierr = PetscFree(s_waits);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);

  /* insert mat values of mpimat */
  /*-----------------------------*/
  ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow   = buf_ri_k + merge->nrecv;
  nextcseqi = nextrow + merge->nrecv;

  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k]  = buf_ri[k];   /* beginning of the k-th received i-structure */
    nrows        = *(buf_ri_k[k]);
    nextrow[k]   = buf_ri_k[k]+1;             /* next row number of the k-th received i-structure */
    nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
  }

  /* set values of ba */
  for (i=0; i<C->m; i++) {
    cseqrow = owners[rank] + i;   /* global row index of C_seq */
    bj_i    = bj + bi[i];         /* col indices of the i-th row of C */
    bnzi    = bi[i+1] - bi[i];
    ierr    = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);

    /* add local non-zero vals of this proc's C_seq into ba */
    cseqnzi   = cseqi[cseqrow+1] - cseqi[cseqrow];
    cseqj_tmp = cseqj + cseqi[cseqrow];
    cseqa_tmp = cseqa + cseqi[cseqrow];
    nextcseqj = 0;
    for (j=0; nextcseqj<cseqnzi; j++) {
      if (*(bj_i + j) == cseqj_tmp[nextcseqj]) {   /* bcol == cseqcol */
        ba_i[j] += cseqa_tmp[nextcseqj++];
      }
    }

    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) {   /* k-th received message */
      if (i == *nextrow[k]) {          /* i-th row */
        cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
        cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
        cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
        nextcseqj = 0;
        for (j=0; nextcseqj<cseqnzi; j++) {
          if (*(bj_i + j) == cseqj_tmp[nextcseqj]) {   /* bcol == cseqcol */
            ba_i[j] += cseqa_tmp[nextcseqj++];
          }
        }
        nextrow[k]++; nextcseqi[k]++;
      }
    }
    ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
    flops += 2*bnzi;
  }
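  /*
     Every row inserted above is locally owned (cseqrow runs over this
     process's ownership range of C), so the final assembly below should
     involve no further value communication; it mainly marks C as assembled.
  */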
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  ierr = PetscFree(cseqa);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree(ba_i);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}