/*
  Defines projective product routines where A is an MPIAIJ matrix:
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmap.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

PETSC_HASH_MAP(HMapIV, PetscInt, PetscScalar, PetscHashInt, PetscHashEqual, -1)

PETSC_STATIC_INLINE PetscErrorCode PetscHMapIVAdd(PetscHMapIV ht,PetscInt key,PetscScalar val)
{
  int      ret;
  khiter_t iter;

  PetscFunctionBeginHot;
  PetscValidPointer(ht,1);
  iter = kh_put(HMapIV,ht,key,&ret);
  PetscHashAssert(ret>=0);
  if (ret) kh_val(ht,iter) = val;
  else kh_val(ht,iter) += val;
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PETSC_UNUSED PetscErrorCode PetscHMapIVAddAndScale(PetscHMapIV ht,PetscHMapIV hd,PetscScalar scale)
{
  int         ret;
  PetscInt    key;
  PetscScalar val;

  PetscFunctionBegin;
  PetscValidPointer(ht,1);
  PetscValidPointer(hd,2);
  kh_foreach(hd,key,val,{ khiter_t i;
    i = kh_put(HMapIV,ht,key,&ret);
    PetscHashAssert(ret>=0);
    if (ret) kh_val(ht,i) = val*scale;
    else kh_val(ht,i) += val*scale;})
  PetscFunctionReturn(0);
}
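/*
  PetscHMapIVAdd() above gives insert-or-accumulate semantics: the first
  insertion of a key stores the value, later insertions of the same key add to
  it. The all-at-once numeric kernels below rely on this to merge duplicate
  column contributions. A minimal sketch (illustration only, not part of this
  file's API):

     PetscHMapIV h;
     PetscHMapIVCreate(&h);
     PetscHMapIVAdd(h,7,1.0);    map is now {7 -> 1.0}
     PetscHMapIVAdd(h,7,2.5);    map is now {7 -> 3.5}
     PetscHMapIVDestroy(&h);
*/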
/* #define PTAP_PROFILE */

PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI         *ptap=a->ap;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  if (!ptap) {
    /* hack: MatDuplicate() sets oldmat->ops->view to newmat, which is a base mat class with null ptap! */
    A->ops->view = MatView_MPIAIJ;
    ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 1) {
        ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 2) {
        ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 3) {
        ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      }
    }
  }
  ierr = (ptap->view)(A,viewer);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI           *ptap=a->ap;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  if (!ptap) PetscFunctionReturn(0);

  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
  ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
    ierr = PetscFree(ap->i);CHKERRQ(ierr);
    ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
  } else { /* used by alg_ptap */
    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  }
  ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
  if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->bi);CHKERRQ(ierr);
    ierr = PetscFree(merge->bj);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
    ierr = PetscFree(merge->coi);CHKERRQ(ierr);
    ierr = PetscFree(merge->coj);CHKERRQ(ierr);
    ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
    ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
    ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);

  ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI      *ptap=a->ap;

  PetscFunctionBegin;
  ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr);
  ierr = (*ptap->destroy)(A);CHKERRQ(ierr); /* MatDestroy_MPIAIJ(A) */
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
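/*
  Driver-level usage goes through MatPtAP(); a minimal sketch (user code, not
  part of this file):

     Mat A,P,C;
     ... assemble A (n x n) and P (n x m) as MATMPIAIJ ...
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);  symbolic + numeric
     ... change the numerical values in A, same pattern ...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    numeric only, reuses symbolic data

  MatPtAP_MPIAIJ_MPIAIJ() below dispatches among the implementations, selected
  at the first (MAT_INITIAL_MATRIX) call with
     -matptap_via <scalable|nonscalable|allatonce|allatonce_merged[|hypre]>
*/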
PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      flg;
  MPI_Comm       comm;
#if !defined(PETSC_HAVE_HYPRE)
  const char     *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"};
  PetscInt       nalg = 4;
#else
  const char     *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"};
  PetscInt       nalg = 5;
#endif
  PetscInt       pN=P->cmap->N,alg=1; /* set default algorithm */

  PetscFunctionBegin;
  /* check if the matrix local sizes are compatible */
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

  if (scall == MAT_INITIAL_MATRIX) {
    /* pick an algorithm */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    if (!flg && pN > 100000) { /* may switch to the scalable algorithm as default */
      MatInfo   Ainfo,Pinfo;
      PetscInt  nz_local;
      PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;

      ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr);
      ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr);
      nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

      if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
      ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr);

      if (alg_scalable) {
        alg = 0; /* the scalable algorithm may be ~50% slower than the nonscalable one */
      }
    }

    switch (alg) {
    case 1:
      /* do R=P^T locally, then C=R*A*P -- nonscalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    case 2:
      /* compute C=P^T*A*P allatonce */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    case 3:
      /* compute C=P^T*A*P allatonce, merged version */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#if defined(PETSC_HAVE_HYPRE)
    case 4:
      /* use boomerAMGBuildCoarseOperator */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#endif
    default:
      /* do R=P^T locally, then C=R*A*P -- scalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    }

    if (alg == 0 || alg == 1 || alg == 2 || alg == 3) {
      Mat_MPIAIJ *c  = (Mat_MPIAIJ*)(*C)->data;
      Mat_APMPI  *ap = c->ap;
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr);
      ap->freestruct = PETSC_FALSE;
      ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures","MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
  }

  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
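/*
  The scalable implementation computes C = P^T*A*P as R*(A*P) with R = P^T
  split into its diagonal and off-diagonal blocks (Rd = Pd^T, Ro = Po^T):

     AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth   (rows of AP owned by this rank)
     C_loc  = Rd*AP_loc                       (contributions to local rows of C)
     C_oth  = Ro*AP_loc                       (contributions to rows of C owned by other ranks)

  The rows of C_oth are then communicated and added into C with
  MatSetValues(...,ADD_VALUES).
*/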
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*------------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - the dominating part */
  /* -------------------------------------------------- */
  /* get data from the symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

  api  = ap->i;
  apj  = ap->j;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
    apnz = api[i+1] - api[i];
    apa  = ap->a + api[i];
    ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr);
    AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
  if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* always use the scalable version since we are in the MPI scalable version */
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth into C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat() can only be used when C->was_assembled is
     PETSC_FALSE and there are no off-processor parts. If was_assembled is true, the statement
         aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
     in MatSetValues_MPIAIJ_CopyFromCSRFormat() is no longer valid, and the more general
     MatSetValues_MPIAIJ() has to be used, where the column index is looked up from a table
     and other, more involved work has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);

  /* C_oth -> C, the off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
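/*
  Symbolic phase of the scalable algorithm. Outline of the steps below:
    (0) Rd = Pd^T, Ro = Po^T
    (1) symbolic AP_loc = Ad*P_loc + Ao*P_oth
    (2) symbolic C_oth = Ro*AP_loc and C_loc = Rd*AP_loc
    (3)-(4) send the i/j structure of C_oth to the owning ranks
    (5) merge C_loc with the received rows to preallocate the parallel matrix Cmpi
  Only the nonzero structure is built here; the values are filled in by
  MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable().
*/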
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  Mat                Cmpi,P_loc,P_oth;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;         /* arrays of length comm->size, holding send/recv matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
  const char         *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create the symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* used when computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
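  /*
    The loop below builds the sparsity pattern of AP row by row: for each
    nonzero A[i,k], the (sorted) column list of row k of P_loc or P_oth is
    merged into the condensed sorted linked list lnk; lnk[0] tracks the number
    of distinct columns collected so far for row i.
  */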
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* copy data into the free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* allocate space for apj and apv, initialize apj, and
     destroy the list of free space and other temporary arrays */
  ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc,&ptap->ltog);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif
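  /*
    The fill ratio reported above is nnz(AP_loc) relative to
    nnz(Ad)+nnz(Ao)+nnz(P_loc). As a hypothetical example, with
    nnz(A_loc)=10^6, nnz(P_loc)=2*10^5 and nnz(AP_loc)=3*10^6 the needed ratio
    is 3e6/1.2e6 = 2.5, so passing fill=2.5 to MatPtAP() would avoid the
    reallocations counted in nspacedouble.
  */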
  /* (2-1) compute symbolic C_oth = Ro*AP_loc */
  /* ------------------------------------------ */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send and their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }
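  /*
    Communication/computation overlap: the nonblocking sends of coj posted
    above stay in flight while the symbolic product C_loc = Rd*AP_loc is
    computed next; the matching waits happen only after that product is
    finished.
  */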
  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr  = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr  = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr  = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add the received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form the outgoing message for the i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
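  /*
    Example of the i-structure message built above (hypothetical numbers): two
    rows with local indices {3,7} carrying {2,5} nonzeros are packed as
       buf_si = [2, 3, 7, 0, 2, 7]
    i.e. nrows first, then the local row indices, then the running offsets into
    the coj message.
  */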
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi */
  /* -------------------------------------- */
  /* set the initial free space to Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of the k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add the received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into the free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;

  nout = 0;
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
  if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
  if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);

  PetscFunctionReturn(0);
}
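/*
  Helper used by the all-at-once algorithms: it collects the column pattern of
  one row of AP = A*P into two hash sets, dht for global columns that fall in
  the local diagonal block of C (indices in [pcstart,pcend)) and oht for
  off-diagonal columns.
*/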
PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHSetI dht,PetscHSetI oht)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
  PetscInt       pcstart,pcend,column;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  pcend   = P->cmap->rend;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  for (j=0; j<nzi; j++) {
    row  = aj[j];
    nzpi = pd->i[row+1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(dht,pj[k]+pcstart);CHKERRQ(ierr);
    }
  }
  for (j=0; j<nzi; j++) {
    row  = aj[j];
    nzpi = po->i[row+1] - po->i[row];
    pj   = po->j + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(oht,p->garray[pj[k]]);CHKERRQ(ierr);
    }
  }

  /* off-diagonal portion: Ao[i,:]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row       = aj[j];
      pnz       = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col=0; col<pnz; col++) {
        column = p_othcols[col];
        if (column>=pcstart && column<pcend) {
          ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
        } else {
          ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}
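/*
  Numeric analogue of the routine above: it accumulates one row of AP into hmap
  via AP[i,col] += A[i,k]*P[k,col], relying on the insert-or-add semantics of
  PetscHMapIVAdd() so that contributions from different k to the same column
  are summed automatically.
*/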
PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt       *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi;
  PetscScalar    ra,*aa,*pa;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];

  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    nzpi = pd->i[row+1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    pa   = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAdd(hmap,pj[k]+pcstart,ra*pa[k]);CHKERRQ(ierr);
    }
  }
  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    nzpi = po->i[row+1] - po->i[row];
    pj   = po->j + po->i[row];
    pa   = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAdd(hmap,p->garray[pj[k]],ra*pa[k]);CHKERRQ(ierr);
    }
  }

  /* off-diagonal portion: Ao[i,:]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row       = aj[j];
      ra        = aa[j];
      pnz       = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa        = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        ierr = PetscHMapIVAdd(hmap,p_othcols[col],ra*pa[col]);CHKERRQ(ierr);
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po,*pd;
  Mat_APMPI      *ptap = c->ap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  MPI_Comm       comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*--------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  }

  po = (Mat_SeqAIJ*)p->B->data;
  pd = (Mat_SeqAIJ*)p->A->data;

  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && pon; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    nzi  = po->i[i+1] - po->i[i];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;
    /*ierr = PetscMemzero(c_rmtc,sizeof(PetscInt)*pon);CHKERRQ(ierr);*/

    /* form C(ii, :) */
    poj = po->j + po->i[i];
    poa = po->a + po->i[i];
    for (j=0; j<nzi; j++) {
      c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]];
      c_rmtaa = c_rmta + ptap->c_rmti[poj[j]];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* if the row is empty */
        if (!c_rmtc[poj[j]]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[poj[j]]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
          } else { /* new element */
            loc = -(loc+1);
            /* move the data backward */
            for (kk=c_rmtc[poj[j]]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            } /* end kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[poj[j]]++;
          }
        }
      } /* end jj */
    } /* end j */
  } /* end i */

  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
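  /*
    Note on the loop above: each remote row buffer c_rmtj/c_rmta is kept sorted
    by column index, so a contribution is merged with a binary search
    (PetscFindInt); a miss returns -(insertion point + 1), and the tail of the
    row is shifted back one slot to make room for the new column.
  */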
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;

  cmaxr = 0;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr); /* release the map used for the remote part before creating a fresh one */
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && pn; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    nzi  = pd->i[i+1] - pd->i[i];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;
    /* form C(ii, :) */
    pdj = pd->j + pd->i[i];
    pda = pd->a + pd->i[i];
    for (j=0; j<nzi; j++) {
      row = pcstart + pdj[j];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }
  }

  ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
  ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* add the contributions from remote ranks */
  for (i = 0; i < pn; i++) {
    row  = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
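/*
  The merged variant below produces the same C as the routine above, but
  computes each row of AP only once and scatters it into both the remote
  buffers (off-diagonal part of P^T) and the local matrix (diagonal part)
  within a single pass over the rows of A.
*/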
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po,*pd;
  Mat_APMPI      *ptap = c->ap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  MPI_Comm       comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*--------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  }

  po = (Mat_SeqAIJ*)p->B->data;
  pd = (Mat_SeqAIJ*)p->A->data;

  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && (pon || pn); i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    nzi  = po->i[i+1] - po->i[i];
    dnzi = pd->i[i+1] - pd->i[i];
    if (!nzi && !dnzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* form the remote C(ii, :) */
    poj = po->j + po->i[i];
    poa = po->a + po->i[i];
    for (j=0; j<nzi; j++) {
      c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]];
      c_rmtaa = c_rmta + ptap->c_rmti[poj[j]];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* if the row is empty */
        if (!c_rmtc[poj[j]]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[poj[j]]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
          } else { /* new element */
            loc = -(loc+1);
            /* move the data backward */
            for (kk=c_rmtc[poj[j]]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            } /* end kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[poj[j]]++;
          }
        }
      } /* end jj */
    } /* end j */

    /* form the local C(ii, :) */
    pdj = pd->j + pd->i[i];
    pda = pd->a + pd->i[i];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      } /* end jj */
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    } /* end j */
  } /* end i */

  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* add the contributions from remote ranks */
  for (i = 0; i < pn; i++) {
    row  = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
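/*
  Symbolic phase shared by the all-at-once variants: a first pass over the rows
  of A sizes the rows of C destined for other ranks (one hash set per
  off-diagonal column of P), a PetscSF ships those rows to their owners, and a
  second, local pass combined with the received column indices yields the
  dnz/onz preallocation of C.
*/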
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm       comm;
  Mat            Cmpi;
  Mat_SeqAIJ     *pd,*po;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0;
  PetscBool      flg;
  const char     *algTypes[2] = {"overlapping","merged"};
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* create the symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  po = (Mat_SeqAIJ*)p->B->data;
  pd = (Mat_SeqAIJ*)p->A->data;

  /* this equals the number of off-diagonal columns of P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* the number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* create a hash table to merge all columns for C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);

  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
  for (i=0; i<am && pon; i++) {
    /* form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    /* if the off-diagonal block is empty, we should not do any calculation */
    nzi = po->i[i+1] - po->i[i];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,ht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    /* if AP is empty, just continue */
    if (!htsize) continue;
    /* form C(ii, :) */
    poj = po->j + po->i[i];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr);
    }
  }

  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off  = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
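  /*
    Two star forests are used here: the per-row sf built above carries row
    counts (its root degrees say how many ranks contribute to each owned row),
    while ptap->sf, built further below with one leaf per matrix entry, moves
    the actual column indices and, in the numeric phase, the values.
  */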
  /* how many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* get information from the leaves: the number of columns other ranks contribute to my rows */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* the number of columns received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0; lidx = 0;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* one-to-one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);

  ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }
  /* work on the local part */
  /* 4) Pass 1: estimate memory for C_loc */
  for (i=0; i<am && pn; i++) {
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    nzi  = pd->i[i+1] - pd->i[i];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    if (!(htsize+htosize)) continue;
    /* form C(ii, :) */
    pdj = pd->j + pd->i[i];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIAppend(hta[pdj[j]],ht);CHKERRQ(ierr);
      ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr);
    }
  }
  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  /* get the remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);

  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diagonal part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
      } else { /* off-diagonal part */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick a numeric algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg  = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
  }
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}
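/*
  Merged-path symbolic phase: the communication structure is identical to
  MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(), but the local (htd/hto) and remote
  (hta) hash sets are filled in one fused pass over the rows of A, matching the
  merged numeric kernel selected via -matptap_allatonce_via.
*/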
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm       comm;
  Mat            Cmpi;
  Mat_SeqAIJ     *pd,*po;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto,*htd;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0;
  PetscBool      flg;
  const char     *algTypes[2] = {"merged","overlapping"};
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3;

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  po = (Mat_SeqAIJ*)p->B->data;
  pd = (Mat_SeqAIJ*)p->A->data;

  /* This equals the number of off-diagonal columns of P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash tables used to merge all the columns of C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (the matrix that needs to be sent to other processors) */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    /* If the off-diagonal part is empty, we should not do any calculation */
    nzi  = po->i[i+1] - po->i[i];
    dnzi = pd->i[i+1] - pd->i[i];
    if (!nzi && !dnzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form the remote C(ii, :) */
    poj = po->j + po->i[i];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr);
      ierr = PetscHSetIAppend(hta[poj[j]],oht);CHKERRQ(ierr);
    }

    /* Form the local C(ii, :) */
    pdj = pd->j + pd->i[i];
    for (j=0; j<dnzi; j++) {
      ierr = PetscHSetIAppend(htd[pdj[j]],ht);CHKERRQ(ierr);
      ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr);
    }
  }

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }
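
  /*
     Each hta[]/htd[]/hto[] entry is a PetscHSetI, so duplicate column indices
     coming from different rows of AP collapse automatically and the final set
     size is exactly the nonzero count needed for preallocation.  A minimal
     stand-alone sketch of the pattern (hypothetical variable names):

       PetscHSetI set;
       PetscInt   n,off = 0,*cols;

       ierr = PetscHSetICreate(&set);CHKERRQ(ierr);
       ierr = PetscHSetIAdd(set,3);CHKERRQ(ierr);
       ierr = PetscHSetIAdd(set,7);CHKERRQ(ierr);
       ierr = PetscHSetIAdd(set,3);CHKERRQ(ierr);             duplicate, ignored
       ierr = PetscHSetIGetSize(set,&n);CHKERRQ(ierr);        n == 2
       ierr = PetscMalloc1(n,&cols);CHKERRQ(ierr);
       ierr = PetscHSetIGetElems(set,&off,cols);CHKERRQ(ierr);  note: unsorted
       ierr = PetscHSetIDestroy(&set);CHKERRQ(ierr);
  */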
  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off  = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder the ranks so that the data handled by gather and scatter are in the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from the leaves:
   * the number of columns other processes contribute to my rows
   */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* Compute the number of columns received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0; lidx = 0;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One-to-one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  /* Get the remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
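
  /*
     The leaf graph built above maps every column slot this rank sends to a
     unique (rank, slot) pair on the owning process, which is why the
     MPIU_REPLACE reduce is a pure one-to-one copy with no actual reduction.
     A minimal sketch of locating the owner of a global index in a PetscLayout
     and filling one PetscSFNode (hypothetical names; gidx illustrative):

       PetscLayout map;               e.g. P->cmap
       PetscInt    owner = 0,lidx = 0,gidx = 42;
       PetscSFNode leaf;

       ierr = PetscLayoutFindOwnerIndex(map,gidx,&owner,&lidx);CHKERRQ(ierr);
       leaf.rank  = owner;            owning MPI rank
       leaf.index = lidx;             index local to that rank
  */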
  ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);

  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diagonal part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
      } else { /* off-diagonal part */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
  }
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}
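
/*
   dnz[]/onz[] give, per local row, the number of nonzeros that fall inside and
   outside this rank's column ownership range [pcstart,pcend); MATMPIAIJ stores
   those two parts as separate SeqAIJ matrices, which is why the counts are kept
   separately.  A minimal preallocation sketch with illustrative per-row counts
   for two local rows (hypothetical values, assuming a multi-rank run so that
   off-diagonal entries exist):

     Mat      C;
     PetscInt dnz[2] = {2,1},onz[2] = {0,3};

     ierr = MatCreate(PETSC_COMM_WORLD,&C);CHKERRQ(ierr);
     ierr = MatSetSizes(C,2,2,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(C,MATMPIAIJ);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(C,0,dnz,0,onz);CHKERRQ(ierr);
*/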
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  Mat                Cmpi;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscBT            lnkbt;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;            /* arrays of length comm->size, storing send/recv matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
  const char         *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create the symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
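
  /*
     lnk is a condensed sorted linked list: in the row-merge loop below,
     PetscLLCondensedAddSorted() merges a sorted index array into it while the
     bit table lnkbt filters duplicates in O(1), and PetscLLCondensedClean()
     drains the merged row into an output array and resets the list.  A minimal
     sketch of merging two sorted rows (hypothetical data; N illustrative):

       PetscInt       *lnk,out[16],n,N = 64;
       PetscBT        lnkbt;
       const PetscInt r1[3] = {0,3,7},r2[2] = {3,5};

       ierr = PetscLLCondensedCreate(16,N,&lnk,&lnkbt);CHKERRQ(ierr);
       ierr = PetscLLCondensedAddSorted(3,r1,lnk,lnkbt);CHKERRQ(ierr);
       ierr = PetscLLCondensedAddSorted(2,r2,lnk,lnkbt);CHKERRQ(ierr);
       n    = lnk[0];                                  4 distinct columns
       ierr = PetscLLCondensedClean(N,n,out,lnk,lnkbt);CHKERRQ(ierr);  out = {0,3,5,7}
       ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  */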
  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif
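
  /*
     MatCreateSeqAIJWithArrays() above wraps the (api,apj,apv) CSR triplet as
     AP_loc without copying; the caller keeps ownership of the arrays, which is
     why this file frees ap->i itself when the intermediate structures are torn
     down.  Minimal stand-alone sketch for a 2x3 matrix (hypothetical data):

       Mat         B;
       PetscInt    bi[3] = {0,2,3},bj[3] = {0,2,1};   CSR row offsets / columns
       PetscScalar ba[3] = {1.0,2.0,3.0};

       ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,2,3,bi,bj,ba,&B);CHKERRQ(ierr);
       bi/bj/ba must stay alive until B is destroyed
  */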
  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send and their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr  = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr  = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr  = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* the receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add the received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form the outgoing message for the i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (owner-local)
         [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
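
  /*
     Each i-structure message therefore decodes on the receiver as: one row
     count, then nrows owner-local row numbers, then an (nrows+1)-entry
     CSR-style offset array into the matching coj message.  A minimal decoding
     sketch (hypothetical buffers buf and bufj standing for one received
     buf_ri/buf_rj pair):

       PetscInt       nrows = buf[0],k,ncols;
       const PetscInt *rows = buf + 1;            owner-local row numbers
       const PetscInt *ioff = buf + nrows + 1;    offsets, ioff[0] == 0
       const PetscInt *cols;

       for (k=0; k<nrows; k++) {
         ncols = ioff[k+1] - ioff[k];             nnz of row rows[k]
         cols  = bufj + ioff[k];                  its column indices
       }
  */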
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi  */
  /* -------------------------------------- */
  /* set the initial free space to Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add the received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;
  ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled    = PETSC_FALSE;
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}
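
/*
   The nonscalable symbolic phase above estimates nnz(AP) as fill*(nnz(A)+nnz(P))
   and doubles the free-space list whenever the guess falls short, so passing a
   good fill value avoids reallocations - that is what the PetscInfo hint
   printed above is for.  A hedged caller sketch (the fill value 3.5 is purely
   illustrative):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,3.5,&C);CHKERRQ(ierr);

   A ratio actually achieved by a previous run (the fill_ratio_needed reported
   with -info) is a reasonable value to feed back in.
*/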
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);
  /* 1) get R = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*------------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - the dominating part */
  /* -------------------------------------------------- */
  /* get data from the symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and
     when there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat
     is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from
     a table, and other, more complex work has to be done.
  */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }

  /* C_oth -> C, the off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
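
/*
   ptap->reuse flips to MAT_REUSE_MATRIX above, so subsequent calls refresh
   Rd/Ro/P_oth in place instead of rebuilding them.  A hedged reuse sketch
   (fill value illustrative):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);   symbolic + numeric
     ... update the numerical values of A (same nonzero pattern) ...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);     numeric only

   With -mat_freeintermediatedatastructures the supporting struct is released
   after the first numeric phase, which saves roughly the memory of C but makes
   the MAT_REUSE_MATRIX call above an error (see the SETERRQ at the top of this
   function).
*/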