/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>

/* #define PTAP_PROFILE */

PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI         *ptap=a->ap;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  if (!ptap) {
    /* hack: MatDuplicate() sets oldmat->ops->view on newmat, which is a base Mat class with NULL ptap! */
    A->ops->view = MatView_MPIAIJ;
    ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 1) {
        ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
      }
    }
  }
  ierr = (ptap->view)(A,viewer);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI           *ptap=a->ap;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  if (!ptap) PetscFunctionReturn(0);

  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
  ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
    ierr = PetscFree(ap->i);CHKERRQ(ierr);
    ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
  } else { /* used by alg_ptap */
    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  }
  ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
  if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->bi);CHKERRQ(ierr);
    ierr = PetscFree(merge->bj);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
    ierr = PetscFree(merge->coi);CHKERRQ(ierr);
    ierr = PetscFree(merge->coj);CHKERRQ(ierr);
    ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
    ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
    ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);

  A->ops->destroy = ptap->destroy;
  ierr = PetscFree(a->ap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
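
/*
  Usage sketch (illustrative only, not part of the library; matrix names are
  hypothetical): once C = P^T*A*P will no longer be updated with new values of
  A or P, the Mat_APMPI scratch data attached to C can be released either
  programmatically or from the options database.

    ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    (update C after A changes)
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);    (or run with -mat_freeintermediatedatastructures)

  After the structures are freed, a further MAT_REUSE_MATRIX update of C errors
  out; see the guard at the top of the numeric routines below.
*/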

PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr);
  ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      flg;
  MPI_Comm       comm;
#if !defined(PETSC_HAVE_HYPRE)
  const char     *algTypes[2] = {"scalable","nonscalable"};
  PetscInt       nalg = 2;
#else
  const char     *algTypes[3] = {"scalable","nonscalable","hypre"};
  PetscInt       nalg = 3;
#endif
  PetscInt       pN=P->cmap->N,alg=1; /* set default algorithm */

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

  if (scall == MAT_INITIAL_MATRIX) {
    /* pick an algorithm */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    if (!flg && pN > 100000) { /* may switch to the scalable algorithm as default */
      MatInfo   Ainfo,Pinfo;
      PetscInt  nz_local;
      PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;

      ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr);
      ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr);
      nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

      if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
      ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr);

      if (alg_scalable) {
        alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable algorithm */
      }
    }

    switch (alg) {
    case 1:
      /* do R=P^T locally, then C=R*A*P -- nonscalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#if defined(PETSC_HAVE_HYPRE)
    case 2:
      /* use BoomerAMGBuildCoarseOperator from hypre */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#endif
    default:
      /* do R=P^T locally, then C=R*A*P -- scalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    }

    if (alg == 0 || alg == 1) {
      Mat_MPIAIJ *c  = (Mat_MPIAIJ*)(*C)->data;
      Mat_APMPI  *ap = c->ap;
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr);
      ap->freestruct = PETSC_FALSE;
      ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures","MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
  }

  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
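
/*
  Usage sketch (illustrative only): the dispatcher above selects among the
  registered algorithms at run time; the default is "nonscalable" (alg=1),
  with an automatic switch to "scalable" for large P when the global column
  count exceeds fill*nnz.  From the command line of a hypothetical application:

    ./myapp -matptap_via scalable        (force the memory-scalable variant)
    ./myapp -matptap_via nonscalable     (force the dense-accumulator variant)
    ./myapp -matptap_via hypre           (only when PETSc is configured with hypre)

  or in code, with illustrative matrix names, passing the same fill estimate
  that is forwarded to the symbolic routines:

    ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.5,&C);CHKERRQ(ierr);
*/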

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");
Cannot proceed!"); 240 241 api = ap->i; 242 apj = ap->j; 243 ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr); 244 for (i=0; i<am; i++) { 245 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 246 apnz = api[i+1] - api[i]; 247 apa = ap->a + api[i]; 248 ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr); 249 AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa); 250 ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr); 251 } 252 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr); 253 if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout); 254 255 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 256 /* Always use scalable version since we are in the MPI scalable version */ 257 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr); 258 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr); 259 260 C_loc = ptap->C_loc; 261 C_oth = ptap->C_oth; 262 263 /* add C_loc and Co to to C */ 264 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 265 266 /* C_loc -> C */ 267 cm = C_loc->rmap->N; 268 c_seq = (Mat_SeqAIJ*)C_loc->data; 269 cols = c_seq->j; 270 vals = c_seq->a; 271 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr); 272 273 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 274 /* when there are no off-processor parts. */ 275 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 276 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 277 /* a table, and other, more complex stuff has to be done. 
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);

  /* supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
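
/*
  Background sketch for the symbolic/numeric pair below (notation only, taken
  from the computations already present in this file): with the local part of P
  split into the diagonal block Pd and the off-diagonal block Po, and with
  Rd = Pd^T, Ro = Po^T, the Galerkin product is assembled from

      AP_loc = A_loc * P        (A_loc = [Ad Ao], computed row by row)
      C_loc  = Rd * AP_loc      (contributions to locally owned rows of C)
      C_oth  = Ro * AP_loc      (contributions to rows owned by other processes,
                                 addressed through p->garray and communicated
                                 during assembly)

  so that C = P^T*A*P is obtained by adding C_loc and the communicated C_oth.
*/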

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  Mat                Cmpi,P_loc,P_oth;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;            /* arrays of length comm->size, store send/recv matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc,&ptap->ltog);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  /* ------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi */
  /* ------------------------------------- */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k];                 /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;

  nout = 0;
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
  if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
  if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);

  PetscFunctionReturn(0);
}
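
/*
  The nonscalable variants below follow the same five steps as the scalable
  routines above.  The main differences, visible in the code itself, are:
    - the symbolic phase uses PetscLLCondensedCreate()/PetscLLCondensedAddSorted()
      with a PetscBT of length pN instead of the _Scalable linked list, and
      AP_loc keeps global column indices (no MatSeqAIJCompactOutExtraColumns_SeqAIJ()
      and no ptap->ltog mapping);
    - the numeric phase accumulates each row of A*P into the dense work array
      ptap->apa of length pN (allocated with PetscCalloc1(pN,&ptap->apa)), which
      is fast but needs O(pN) memory per process, hence "nonscalable".
*/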

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  Mat                Cmpi;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscBT            lnkbt;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;            /* arrays of length comm->size, store send/recv matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  /* ------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi */
  /* ------------------------------------- */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k];                 /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;
  ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled    = PETSC_FALSE;
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}
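
/*
  Reuse sketch (illustrative only; matrix names are hypothetical): the symbolic
  data kept in ptap exists precisely so that a changed A (same nonzero pattern)
  can update C cheaply.

    ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);   (symbolic + numeric)
    ... modify the values of A, keeping its nonzero structure ...
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);     (numeric phase only,
                                                                    via C->ops->ptapnumeric)
*/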

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);
  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat() can only be used when C->was_assembled is
     PETSC_FALSE and there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
     in MatSetValues_MPIAIJ_CopyFromCSRFormat() is no longer correct.  In that case the more general
     MatSetValues_MPIAIJ() has to be used, where the column index is looked up from a table. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
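
/*
  End-to-end usage sketch (illustrative, not compiled here): a minimal driver
  that forms a Galerkin coarse operator with the routines in this file.  The
  assembly of A and P is elided, and the fill estimate 2.0 is an arbitrary
  choice for the example.

    #include <petscmat.h>

    int main(int argc,char **argv)
    {
      Mat            A,P,C;
      PetscErrorCode ierr;

      ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;

      (create and assemble a parallel AIJ matrix A of size N x N and an
       interpolation matrix P of size N x n here, e.g. with MatCreateAIJ()
       and MatSetValues())

      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
      ierr = MatView(C,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);

      ierr = MatDestroy(&C);CHKERRQ(ierr);
      ierr = MatDestroy(&P);CHKERRQ(ierr);
      ierr = MatDestroy(&A);CHKERRQ(ierr);
      ierr = PetscFinalize();
      return ierr;
    }

  Run with, for example, "-matptap_via scalable -mat_freeintermediatedatastructures"
  to exercise the memory-scalable path and drop the scratch data after the product.
*/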