/*
  Defines projective product routines where A is an MPIAIJ matrix:
        C = P^T * A * P
*/

#include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
#include "src/mat/utils/freespace.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
    if (ptap->abi) {ierr = PetscFree(ptap->abi);CHKERRQ(ierr);}
    if (ptap->abj) {ierr = PetscFree(ptap->abj);CHKERRQ(ierr);}

    ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
    /* free ptap only when it was actually retrieved above */
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
    ptap->abi = PETSC_NULL; ptap->abj = PETSC_NULL;
    ptap->abnz_max = 0;  /* symbolic A*P is not done yet */

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

    /* attach the supporting struct to P for reuse */
    P->ops->destroy = MatDestroy_MPIAIJ_PtAP;
    ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
    ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);

    /* now, compute symbolic local P^T*A*P */
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); /* numeric product is computed as well */
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  } else if (scall == MAT_REUSE_MATRIX){
    ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
    if (container) {
      ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
    }

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
  } else {
    SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
  }
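
  /*
     For reference, a minimal caller-side sketch of how this routine is
     reached through the public MatPtAP() interface (illustrative only;
     error checking abbreviated, and A, P are assumed to be assembled
     MPIAIJ matrices with compatible layouts):

         Mat C;
         ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);  symbolic + numeric
         ... modify the numerical values of A, keeping its nonzero pattern ...
         ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);    reuses data attached to P
         ierr = MatDestroy(C);CHKERRQ(ierr);
  */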
  /* now, compute numeric local P^T*A*P */
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  P_loc,P_oth;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*pjj;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz;
  PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji;
  PetscInt             aN=A->N,am=A->m,pN=P->N;
  PetscInt             i,j,k,ptnzi,arow,prow,pnzj,cnzi;
  PetscBT              lnkbt;
  PetscInt             prstart,prend;
  MPI_Comm             comm=A->comm;
  Mat                  B_mpi;
  PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
  MPI_Status           *status;
  Mat_Merge_SeqsToMPI  *merge;
  PetscInt             *apsymi,*apsymj,*apj,apnzj,*rmap=p->garray,tnrows;
  /* PetscInt          tnzrows,pcstart=p->cstart,pcend=p->cend; */

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* compute symbolic C_seq = P_loc^T * A_loc * P */
  /*----------------------------------------------*/
  /* get data from P_loc and P_oth */
  P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
  p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
  prend = prstart + pm;

  /* first, compute symbolic AP = A_loc*P */
  ierr = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr);
  ptap->abi = apsymi;
  apsymi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is 2*nnz(A) */
  ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<am; i++) {
    apnzj = 0;
    /* diagonal portion of A */
    anzi = adi[i+1] - adi[i];
    for (j=0; j<anzi; j++){
      prow = *adj++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      apnzj += nlnk;
    }
    /* off-diagonal portion of A */
    anzi = aoi[i+1] - aoi[i];
    for (j=0; j<anzi; j++){
      prow = *aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      apnzj += nlnk;
    }
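
    /* At this point lnk holds the sorted union of the column indices of all
       rows of P (local and off-processor parts) picked out by row i of A;
       each PetscLLAdd() returned in nlnk only the number of columns newly
       inserted, so apnzj now equals nnz of row i of the product A*P. */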
    apsymi[i+1] = apsymi[i] + apnzj;
    if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnzj) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,apnzj,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += apnzj;
    current_space->local_used      += apnzj;
    current_space->local_remaining -= apnzj;
  }
  /* Allocate space for apsymj, initialize apsymj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr);
  apsymj = ptap->abj;
  ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr);

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* then, compute symbolic C_seq = P_loc^T*AP */
  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  /* tnzrows = 0; */
  tnrows = (p->B)->N; /* total num of rows to be sent to other processors
                         >= (num of nonzero rows of C_seq) - pn */

  /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  nnz  = adi[am] + aoi[am];
  ierr = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each row of C: */
  for (i=0; i<pN; i++) {
    cnzi  = 0;
    ptnzi = pti[i+1] - pti[i];
    if (ptnzi){
      j   = ptnzi;
      ptJ = ptj + pti[i+1];
      while (j){ /* cols are assumed to be nearly in increasing order; starting from the end saves computation */
        j--; ptJ--;
        arow  = *ptJ; /* row of AP == col of Pt */
        apnzj = apsymi[arow+1] - apsymi[arow];
        apj   = apsymj + apsymi[arow];
        /* add non-zero cols of AP into the sorted linked list lnk */
        ierr = PetscLLAdd(apnzj,apj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        cnzi += nlnk;
      }

      /* If free space is not available, double the total space in the list */
      if (current_space->local_remaining<cnzi) {
        ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      }

      /* Copy data into free space, and zero out dense rows */
      ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
      current_space->array           += cnzi;
      current_space->local_used      += cnzi;
      current_space->local_remaining -= cnzi;
      /* tnzrows++; */
    } /* if (ptnzi) */
    ci[i+1] = ci[i] + cnzi;
  }
  /* printf("[%d] tnzrows-pn: %d,tnrows: %d, pN: %d; pcstart/end: %d, %d\n",rank,tnzrows-pn,tnrows,pN,pcstart,pcend); */
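
  /* ci/cj now hold the complete symbolic C_seq = P_loc^T*(A_loc*P) in CSR
     form over all pN global rows: ci[i+1]-ci[i] is the nonzero count of
     global row i, and ci[pN] is the total number of nonzeros computed on
     this process. Rows owned by other processes are sent to their owners
     below and merged there. */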
  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* add C_seq into mpi C */
  /*----------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
  ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send and their lengths */
  /*------------------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s        = merge->len_s;
  len          = 0;  /* max length of buf_si[] */
  merge->nsend = 0;

  proc = 0;
  for (i=0; i<tnrows; i++){
    while (rmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;
  }
  for (proc=0; proc<size; proc++){
    len_s[proc] = 0;
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len_s[proc]  = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for ij-structure */
  /*--------------------------------------------------------------------------*/
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv of j-structure */
  /*-------------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);

  /* post the Isend of j-structure */
  /*-------------------------------*/
  ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
  sj_waits = si_waits + merge->nsend;

  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of j-structure are complete */
  /*------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);

  /* send and recv i-structure */
  /*---------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
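
  /* An illustrative i-structure message (hypothetical values): if 2 nonzero
     rows with local indices 3 and 7, carrying 4 and 2 column indices, go to
     [proc], then len_si[proc] = 2*(2+1) = 6 and the message is
         buf_si = [2, 3, 7, 0, 4, 6]
     i.e. nrows, the local row indices, then CSR-style offsets into the
     j-structure message already sent above. */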
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (global)
         [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners[proc]; i<owners[proc+1]; i++){
      anzi = ci[i+1] - ci[i];
      if (anzi) {
        buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
        buf_si[nrows+1]   = i-owners[proc];         /* local row index */
        nrows++;
      }
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }

  ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);

  ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(rj_waits);CHKERRQ(ierr);
  ierr = PetscFree(si_waits);CHKERRQ(ierr);
  ierr = PetscFree(ri_waits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  /* compute a local seq matrix in each processor */
  /*----------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
  free_space=PETSC_NULL; current_space=PETSC_NULL;
  len  = ci[owners[rank+1]] - ci[owners[rank]];
  ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each local row */
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextci  = nextrow + merge->nrecv;
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k];                 /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  len = 0;
  for (i=0; i<pn; i++) {
    bnzi = 0;
    /* add local non-zero cols of this proc's C_seq into lnk */
    arow = owners[rank] + i;
    anzi = ci[arow+1] - ci[arow];
    cji  = cj + ci[arow];
    ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
    bnzi += nlnk;
    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        anzi = *(nextci[k]+1) - *nextci[k];
        cji  = buf_rj[k] + *nextci[k];
        ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        bnzi += nlnk;
        nextrow[k]++; nextci[k]++;
      }
    }
    if (len < bnzi) len = bnzi; /* = max(bnzi) */
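
    /* lnk now holds the sorted union of the locally computed row
       owners[rank]+i of C_seq and every received contribution to that row;
       bnzi is the size of that union and becomes the preallocation count
       for local row i of the final MPI matrix. */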
    /* if free space is not available, make more free space */
    if (current_space->local_remaining<bnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += bnzi;
    current_space->local_used      += bnzi;
    current_space->local_remaining -= bnzi;

    bi[i+1] = bi[i] + bnzi;
  }

  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix B_mpi */
  /*---------------------------------------*/
  ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
  ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); - causes delay? */

  /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
  B_mpi->assembled    = PETSC_FALSE;
  B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI;
  merge->bi     = bi;
  merge->bj     = bj;
  merge->ci     = ci;
  merge->cj     = cj;
  merge->buf_ri = buf_ri;
  merge->buf_rj = buf_rj;
  merge->C_seq  = PETSC_NULL;

  /* attach the supporting struct to B_mpi for reuse */
  ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
  ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
  *C = B_mpi;

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer cont_merge,cont_ptap;

  PetscInt             flops=0;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *adi=ad->i,*aoi=ao->i,*apj,cstart=a->cstart,cend=a->cend,col;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
  PetscInt             i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  PetscInt             *cjj;
  MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj;
  MatScalar            *pa_loc,*pA,*pa_oth;
  PetscInt             am=A->m,cN=C->N;
  PetscInt             nextp,*adj=ad->j,*aoj=ao->j;
  MPI_Comm             comm=C->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners;
  PetscInt             proc;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
  PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i;
  PetscInt             *cseqi,*cseqj;
  PetscInt             *cseqj_tmp;
  MatScalar            *cseqa_tmp;
  PetscInt             stages[2];
  PetscInt             *apsymi,*apsymj;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister(&stages[0],"NumPtAP_local");CHKERRQ(ierr);
  ierr = PetscLogStageRegister(&stages[1],"NumPtAP_Comm");CHKERRQ(ierr);
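
  /* Outline of the numeric phase: (1) for each local row i of A, accumulate
     the values of row i of A*P in the work array apa, aligned with the
     symbolic pattern apsymi/apsymj, and scatter P_loc[i,:]^T * AP[i,:] into
     C_seq by sparse axpy; (2) send the rows of C_seq owned by other
     processes to their owners; (3) merge local and received values and
     insert them into the MPI matrix C. */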
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
  if (cont_merge) {
    ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix C does not possess an object container");
  }
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
  if (cont_ptap) {
    ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }

  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->B_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->B_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  cseqi = merge->ci; cseqj = merge->cj;
  ierr = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
  ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get data from symbolic A*P */
  ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);

  /* compute numeric C_seq = P_loc^T*A_loc*P */
  ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
  apsymi = ptap->abi; apsymj = ptap->abj;
  for (i=0; i<am; i++) {
    /* form i-th sparse row of A*P */
    apnzj = apsymi[i+1] - apsymi[i];
    apj   = apsymj + apsymi[i];
    /* diagonal portion of A */
    anzi = adi[i+1] - adi[i];
    for (j=0; j<anzi; j++) {
      prow = *adj;
      adj++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      paj  = pa_loc + pi_loc[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
          apa[k] += (*ada)*paj[nextp++];
        }
      }
      flops += 2*pnzj;
      ada++;
    }
    /* off-diagonal portion of A */
    anzi = aoi[i+1] - aoi[i];
    for (j=0; j<anzi; j++) {
      col = a->garray[*aoj];
      if (col < cstart){
        prow = *aoj;
      } else if (col >= cend){
        prow = *aoj;
      } else {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      }
      aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
          apa[k] += (*aoa)*paj[nextp++];
        }
      }
      flops += 2*pnzj;
      aoa++;
    }

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0; j<pnzi; j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cseqj + cseqi[crow];
      caj    = cseqa + cseqi[crow];
      /* Perform sparse axpy operation. Note cjj includes apj. */
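      /* Both cjj and apj are sorted, and the symbolic phase guaranteed that
         the pattern of AP row i is a subsequence of the pattern of C_seq
         row crow, so one forward scan suffices: k walks cjj while nextap
         walks apj, and every apj entry is matched exactly once. */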
      for (k=0; nextap<apnzj; k++) {
        if (cjj[k] == apj[nextap]) caj[k] += (*pA)*apa[nextap++];
      }
      flops += 2*apnzj;
      pA++;
    }
    /* zero the current row info for A*P */
    ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr);
  }

  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  bi     = merge->bi;
  bj     = merge->bj;
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;

  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* send and recv matrix values */
  /*-----------------------------*/
  ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
  len_s = merge->len_s;
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }

  ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  ierr = PetscFree(s_waits);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);

  /* insert mat values of mpimat */
  /*-----------------------------*/
  ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow   = buf_ri_k + merge->nrecv;
  nextcseqi = nextrow + merge->nrecv;

  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k]  = buf_ri[k];                 /* beginning of k-th recved i-structure */
    nrows        = *(buf_ri_k[k]);
    nextrow[k]   = buf_ri_k[k]+1;             /* next row number of k-th recved i-structure */
    nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  /* set values of ba */
  for (i=0; i<C->m; i++) {
    cseqrow = owners[rank] + i; /* global row index of C_seq */
    bj_i    = bj + bi[i];       /* col indices of the i-th row of C */
    bnzi    = bi[i+1] - bi[i];
    ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);

    /* add local non-zero vals of this proc's C_seq into ba */
    cseqnzi   = cseqi[cseqrow+1] - cseqi[cseqrow];
    cseqj_tmp = cseqj + cseqi[cseqrow];
    cseqa_tmp = cseqa + cseqi[cseqrow];
    nextcseqj = 0;
    for (j=0; nextcseqj<cseqnzi; j++){
      if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
        ba_i[j] += cseqa_tmp[nextcseqj++];
      }
    }

    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
        cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
        cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
        nextcseqj = 0;
        for (j=0; nextcseqj<cseqnzi; j++){
          if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
            ba_i[j] += cseqa_tmp[nextcseqj++];
          }
        }
        nextrow[k]++; nextcseqi[k]++;
      }
    }
    ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
    flops += 2*bnzi;
  }
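
  /* Every row inserted above is owned by this process (cseqrow runs over
     owners[rank]..owners[rank+1]-1), so the final assembly should trigger
     no further off-process value stashing for these entries. */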
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  ierr = PetscFree(cseqa);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree(ba_i);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}