/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmapiv.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI         *ptap=a->ap;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  if (!ptap) {
    /* hack: MatDuplicate() sets oldmat->ops->view to newmat which is a base mat class with null ptap! */
    A->ops->view = MatView_MPIAIJ;
    ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 1) {
        ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 2) {
        ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 3) {
        ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      }
    }
  }
  ierr = (ptap->view)(A,viewer);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI           *ptap=a->ap;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  if (!ptap) PetscFunctionReturn(0);

  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
  ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
    ierr = PetscFree(ap->i);CHKERRQ(ierr);
    ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
  } else { /* used by alg_ptap */
    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  }
  ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
  if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->bi);CHKERRQ(ierr);
    ierr = PetscFree(merge->bj);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
    ierr = PetscFree(merge->coi);CHKERRQ(ierr);
    ierr = PetscFree(merge->coj);CHKERRQ(ierr);
    ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
    ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
    ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);

  ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI      *ptap=a->ap;

  PetscFunctionBegin;
  ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr);
  ierr = (*ptap->destroy)(A);CHKERRQ(ierr); /* MatDestroy_MPIAIJ(A) */
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      flg;
  MPI_Comm       comm;
#if !defined(PETSC_HAVE_HYPRE)
  const char     *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"};
  PetscInt       nalg = 4;
#else
  const char     *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"};
  PetscInt       nalg = 5;
#endif
  PetscInt       pN=P->cmap->N,alg=1; /* set default algorithm */

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

  if (scall == MAT_INITIAL_MATRIX) {
    /* pick an algorithm */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
      MatInfo   Ainfo,Pinfo;
      PetscInt  nz_local;
      PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;

      ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr);
      ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr);
      nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

      if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
      ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr);

      if (alg_scalable) {
        alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable one */
      }
    }

    switch (alg) {
    case 1:
      /* do R=P^T locally, then C=R*A*P -- nonscalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A),"MatPtAP_MPIAIJ_MPIAIJ \n");CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
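    /* The all-at-once variants below (cases 2 and 3) never form R = P^T explicitly; they
       accumulate rows of P^T*(A*P) with hash tables and ship the remotely owned
       contributions through a PetscSF (see the *_allatonce routines further down). */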
    case 2:
      /* compute C=P^T*A*P allatonce */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A),"MatPtAP_MPIAIJ_MPIAIJ_allatonce \n");CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    case 3:
      /* compute C=P^T*A*P allatonce, merged version */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A),"MatPtAP_MPIAIJ_MPIAIJ_allatonce_merged \n");CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#if defined(PETSC_HAVE_HYPRE)
    case 4:
      /* use hypre's BoomerAMGBuildCoarseOperator */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A),"MatPtAP_AIJ_AIJ_wHYPRE \n");CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#endif
    default:
      /* do R=P^T locally, then C=R*A*P -- scalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = PetscPrintf(PetscObjectComm((PetscObject)A),"MatPtAP_MPIAIJ_MPIAIJ_scalable \n");CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    }

    if (alg == 0 || alg == 1 || alg == 2 || alg == 3) {
      Mat_MPIAIJ *c  = (Mat_MPIAIJ*)(*C)->data;
      Mat_APMPI  *ap = c->ap;
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr);
      ap->freestruct = PETSC_FALSE;
      ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures","MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
  }

  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
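
/*
   A minimal calling sketch (hypothetical user code; assumes A and P are assembled MATMPIAIJ
   matrices with compatible layouts):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);  // symbolic + numeric
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    // reuse the symbolic data kept in c->ap

   The algorithm can be selected at run time with
     -matptap_via scalable | nonscalable | allatonce | allatonce_merged
   and the intermediate products can be released after the numeric phase with
     -mat_freeintermediatedatastructures
   (after which C can no longer be updated with MAT_REUSE_MATRIX).
*/
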
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap->AP_loc) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  api = ap->i;
  apj = ap->j;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    apnz = api[i+1] - api[i];
    apa  = ap->a + api[i];
    ierr = PetscArrayzero(apa,apnz);CHKERRQ(ierr);
    AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
  if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* Always use the scalable version since we are in the MPI scalable version */
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE   */
  /* and when there are no off-processor parts.                                                                        */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in                  */
  /* MatSetValues_MPIAIJ_CopyFromCSRFormat is no longer true. Then the more complex function MatSetValues_MPIAIJ()     */
  /* has to be used, where the column index is looked up from a table, and other, more complex work has to be done.    */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);

  /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  Mat                Cmpi,P_loc,P_oth;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj;  /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;          /* arrays of length comm->size, used for sending/receiving matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
  const char         *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

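  /* Outline of this symbolic phase (the numbered comments below follow the same steps):
       (0) Rd = Pd^T and Ro = Po^T
       (1) symbolic AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth, stored in (api,apj)
       (2) symbolic C_oth = Ro*AP_loc and C_loc = Rd*AP_loc
       (3)-(4) send the j- and i-structure of C_oth to the ranks owning those rows
       (5) merge C_loc with the received rows to preallocate the parallel matrix Cmpi */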
  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
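
  /* At this point api[] holds the row offsets of AP_loc and the column indices sit in the
     free-space list in global numbering; they are gathered into apj below, and AP_loc is
     then compacted to local column indices through the mapping kept in ptap->ltog. */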
  /* Allocate space for apj and apv, initialize apj, and      */
  /* destroy list of free space and other temporary arrays    */
  ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc,&ptap->ltog);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc]     */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
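  /* The coj messages are posted here and only waited on after the symbolic C_loc = Rd*AP_loc
     product in step (2-2), so the communication overlaps with that local computation. */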
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure    */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k];            /* beginning of k-th recved i-structure       */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;      /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;

  nout = 0;
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
  if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
  if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);

  PetscFunctionReturn(0);
}
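
/*
   The two helpers below compute one row, i, of AP = A*P for the dof-interlaced MPIXAIJ case:
   the symbolic version collects the column pattern, putting columns that fall inside P's owned
   column range [pcstart,pcend) into dht and all others into oht; the numeric version
   accumulates the corresponding values into the hash map hmap. The map argument translates
   off-diagonal column indices of A into row indices of P_oth.
*/
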
PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
  PetscInt       pcstart,pcend,column,offset;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  pcstart  = P->cmap->rstart;
  pcstart *= dof;
  pcend    = P->cmap->rend;
  pcend   *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  for (j=0; j<nzi; j++) {
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = pd->i[row+1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);CHKERRQ(ierr);
    }
  }
  /* off diag P */
  for (j=0; j<nzi; j++) {
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj   = po->j + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);CHKERRQ(ierr);
    }
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row    = aj[j];
      offset = a->garray[row]%dof;
      row    = map[row];
      pnz    = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col=0; col<pnz; col++) {
        column = p_othcols[col] * dof + offset;
        if (column>=pcstart && column<pcend) {
          ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
        } else {
          ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
  PetscScalar    ra,*aa,*pa;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  pcstart  = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j=0; j<nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = pd->i[row+1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    pa   = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }
  for (j=0; j<nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj   = po->j + po->i[row];
    pa   = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row    = aj[j];
      offset = a->garray[row]%dof;
      row    = map[row];
      ra     = aa[j];
      pnz    = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa        = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        ierr = PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);CHKERRQ(ierr);
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}

PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI      *ptap = c->ap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt       offset,ii,pocol;
  const PetscInt *mappingindices;
  IS             map;
  MPI_Comm       comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  }
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);

  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  for (i=0; i<am && pon; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
        ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      } /* End jj */
    } /* End j */
  } /* End i */

  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);

  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart = pcstart*dof;
  pcend   = pcend*dof;
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;

  cmaxr = 0;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && pn; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<nzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
  ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
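
/*
   Merged variant of the all-at-once numeric phase: instead of sweeping the rows of A twice
   (once for the remote contributions driven by Po and once for the local ones driven by Pd),
   each row of AP is computed a single time and immediately scattered into both the remote
   buffers c_rmtj/c_rmta and the local part of C.
*/
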
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI      *ptap = c->ap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt       offset,ii,pocol;
  const PetscInt *mappingindices;
  IS             map;
  MPI_Comm       comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  }
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  for (i=0; i<am && (pon || pn); i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      } /* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    } /* End j */
  } /* End i */

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
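
/*
   All-at-once symbolic phase: per-row hash sets collect the nonzero pattern of P^T*(A*P).
   Rows of C owned by other ranks are counted first (c_rmti/c_rmtc), a PetscSF is built that
   maps those contributions to their owners (c_othi), and the received pattern is merged with
   the locally computed one to obtain the dnz/onz preallocation of Cmpi. The numeric routine
   used later is selected with -matptap_allatonce_via overlapping|merged.
*/
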
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm       comm;
  Mat            Cmpi;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0,offset,ii;
  const PetscInt *mappingindices;
  PetscBool      flg;
  const char     *algTypes[2] = {"overlapping","merged"};
  IS             map;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2;

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  /* This equals the number of offdiag columns in P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash table to merge all columns for C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);

  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
  for (i=0; i<am && pon; i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    /* If the off-diagonal block is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
    }
  }

  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

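  /* This first star forest has one leaf per (dof-interlaced) off-diagonal column of P, rooted
     at the owning rank's local column; it is only used to gather the per-row counts and to
     scatter back the offsets at which each sender may deposit its column indices. */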
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from the leaves:
   * the number of columns other ranks contribute to my rows
   */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* The number of columns received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One to one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);

  ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i=0; i<am && pn; i++) {
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    if (!(htsize+htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
    }
  }

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);

  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(A->rmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSetUp(Cmpi);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
  }
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
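
/*
   Merged variant of the all-at-once symbolic phase: the same machinery as above, but a single
   pass over the rows of A fills both the remote hash sets (hta, shipped to the owning ranks)
   and the local diagonal/off-diagonal hash sets (htd/hto) that feed the preallocation.
*/
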

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm       comm;
  Mat            Cmpi;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto,*htd;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0,offset,ii;
  PetscBool      flg;
  const char     *algTypes[2] = {"merged","overlapping"};
  const PetscInt *mappingindices;
  IS             map;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3;

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);

  /* This equals the number of off-diagonal columns of P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash tables used to merge all columns of C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }

  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (the matrix that needs to be sent to other processors) */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    /* If both the diagonal and off-diagonal portions are empty, skip this row */
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);CHKERRQ(ierr);
    }

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      ierr = PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
    }
  }

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx*dof+offset;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from leaves:
   * the number of columns other ranks contribute to my rows
   */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* The number of columns received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One-to-one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
  ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(A->rmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg  = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
  }
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
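
/*
   Overview of the nonscalable PtAP algorithm implemented next (a restatement of the numbered
   step comments inside MatPtAPSymbolic_MPIAIJ_MPIAIJ, not additional behavior):

     Rd = Pd^T,  Ro = Po^T                                   (step 0)
     AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth                  (step 1)
     C_oth  = Ro*AP_loc   - rows of C owned by other ranks   (step 2-1)
     C_loc  = Rd*AP_loc   - locally owned rows of C          (step 2-2)

   The column indices of C_oth are then communicated (steps 3-4) so that each rank can
   preallocate its own rows of Cmpi (step 5); the values are summed into C by
   MatPtAPNumeric_MPIAIJ_MPIAIJ().
*/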

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  Mat                Cmpi;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscBT            lnkbt;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;            /* arrays of length comm->size, store send/recv matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
  const char         *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* used for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send and their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr  = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr  = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr  = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;
  ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}
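
/*
   Numeric phase sketch (restating the step comments inside MatPtAPNumeric_MPIAIJ_MPIAIJ below):
   for each local row i, AP[i,:] = Ad[i,:]*P_loc + Ao[i,:]*P_oth is accumulated in the dense
   work array ptap->apa and then compressed into AP_loc using the sparsity pattern (api,apj)
   computed by the symbolic phase; C_loc = Rd*AP_loc and C_oth = Ro*AP_loc are formed next and
   their rows are added into C with ADD_VALUES before the final assembly.
*/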

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap->AP_loc) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);
  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*------------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
  }

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and
     when there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat
     no longer holds. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from
     a table, and other, more complex work has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }

  /* C_oth -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated again by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
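
/*
   Reuse note (an inference from the error check and the freestruct test above, not new
   behavior): once the intermediate products are released, for instance by running with

     -mat_freeintermediatedatastructures

   a later MatPtAP(A,P,MAT_REUSE_MATRIX,fill,&C) can no longer update C, because AP_loc and
   the other symbolic products have been destroyed; MatPtAPNumeric_MPIAIJ_MPIAIJ() then raises
   PETSC_ERR_ARG_WRONGSTATE.
*/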