1 2 /* 3 Defines projective product routines where A is a MPIAIJ matrix 4 C = P^T * A * P 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <../src/mat/impls/aij/mpi/mpiaij.h> 10 #include <petscbt.h> 11 #include <petsctime.h> 12 #include <petsc/private/hashmap.h> 13 #include <petsc/private/hashseti.h> 14 #include <petscsf.h> 15 16 PETSC_HASH_MAP(HMapIV, PetscInt, PetscScalar, PetscHashInt, PetscHashEqual, -1) 17 18 PETSC_STATIC_INLINE 19 PetscErrorCode PetscHMapIVAdd(PetscHMapIV ht,PetscInt key,PetscScalar val) 20 { 21 int ret; 22 khiter_t iter; 23 PetscFunctionBeginHot; 24 PetscValidPointer(ht,1); 25 iter = kh_put(HMapIV,ht,key,&ret); 26 PetscHashAssert(ret>=0); 27 if (ret) kh_val(ht,iter) = val; 28 else kh_val(ht,iter) += val; 29 PetscFunctionReturn(0); 30 } 31 32 33 PETSC_STATIC_INLINE PETSC_UNUSED 34 PetscErrorCode PetscHMapIVAddAndScale(PetscHMapIV ht,PetscHMapIV hd,PetscScalar scale) 35 { 36 int ret; 37 PetscInt key; 38 PetscScalar val; 39 PetscFunctionBegin; 40 PetscValidPointer(ht,1); 41 PetscValidPointer(hd,2); 42 kh_foreach(hd,key,val,{ khiter_t i; 43 i = kh_put(HMapIV,ht,key,&ret); 44 PetscHashAssert(ret>=0); 45 if (ret) kh_val(ht,i) = val*scale; 46 else kh_val(ht,i) += val*scale;}) 47 PetscFunctionReturn(0); 48 } 49 50 /* #define PTAP_PROFILE */ 51 52 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer) 53 { 54 PetscErrorCode ierr; 55 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 56 Mat_APMPI *ptap=a->ap; 57 PetscBool iascii; 58 PetscViewerFormat format; 59 60 PetscFunctionBegin; 61 if (!ptap) { 62 /* hack: MatDuplicate() sets oldmat->ops->view to newmat which is a base mat class with null ptpa! */ 63 A->ops->view = MatView_MPIAIJ; 64 ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr); 65 PetscFunctionReturn(0); 66 } 67 68 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr); 69 if (iascii) { 70 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr); 71 if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) { 72 if (ptap->algType == 0) { 73 ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr); 74 } else if (ptap->algType == 1) { 75 ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr); 76 } else if (ptap->algType == 2) { 77 ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr); 78 } else if (ptap->algType == 3) { 79 ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr); 80 } 81 } 82 } 83 ierr = (ptap->view)(A,viewer);CHKERRQ(ierr); 84 PetscFunctionReturn(0); 85 } 86 87 PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A) 88 { 89 PetscErrorCode ierr; 90 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 91 Mat_APMPI *ptap=a->ap; 92 Mat_Merge_SeqsToMPI *merge; 93 94 PetscFunctionBegin; 95 if (!ptap) PetscFunctionReturn(0); 96 97 ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr); 98 ierr = PetscFree(ptap->bufa);CHKERRQ(ierr); 99 ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr); 100 ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr); 101 ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */ 102 ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr); 103 ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr); 104 if (ptap->AP_loc) { /* used by alg_rap */ 105 Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data; 106 ierr = PetscFree(ap->i);CHKERRQ(ierr); 107 ierr = 
PetscFree2(ap->j,ap->a);CHKERRQ(ierr); 108 ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr); 109 } else { /* used by alg_ptap */ 110 ierr = PetscFree(ptap->api);CHKERRQ(ierr); 111 ierr = PetscFree(ptap->apj);CHKERRQ(ierr); 112 } 113 ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr); 114 ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr); 115 if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);} 116 117 ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr); 118 119 merge=ptap->merge; 120 if (merge) { /* used by alg_ptap */ 121 ierr = PetscFree(merge->id_r);CHKERRQ(ierr); 122 ierr = PetscFree(merge->len_s);CHKERRQ(ierr); 123 ierr = PetscFree(merge->len_r);CHKERRQ(ierr); 124 ierr = PetscFree(merge->bi);CHKERRQ(ierr); 125 ierr = PetscFree(merge->bj);CHKERRQ(ierr); 126 ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr); 127 ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr); 128 ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr); 129 ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr); 130 ierr = PetscFree(merge->coi);CHKERRQ(ierr); 131 ierr = PetscFree(merge->coj);CHKERRQ(ierr); 132 ierr = PetscFree(merge->owners_co);CHKERRQ(ierr); 133 ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr); 134 ierr = PetscFree(ptap->merge);CHKERRQ(ierr); 135 } 136 ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr); 137 138 ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr); 139 ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr); 140 ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr); 141 PetscFunctionReturn(0); 142 } 143 144 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A) 145 { 146 PetscErrorCode ierr; 147 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 148 Mat_APMPI *ptap=a->ap; 149 150 PetscFunctionBegin; 151 ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr); 152 ierr = (*ptap->destroy)(A);CHKERRQ(ierr); /* MatDestroy_MPIAIJ(A) */ 153 ierr = PetscFree(ptap);CHKERRQ(ierr); 154 PetscFunctionReturn(0); 155 } 156 157 PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C) 158 { 159 PetscErrorCode ierr; 160 PetscBool flg; 161 MPI_Comm comm; 162 #if !defined(PETSC_HAVE_HYPRE) 163 const char *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"}; 164 PetscInt nalg=4; 165 #else 166 const char *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"}; 167 PetscInt nalg=5; 168 #endif 169 PetscInt pN=P->cmap->N,alg=1; /* set default algorithm */ 170 171 PetscFunctionBegin; 172 /* check if matrix local sizes are compatible */ 173 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 174 if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend); 175 if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend); 176 177 if (scall == MAT_INITIAL_MATRIX) { 178 /* pick an algorithm */ 179 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 180 ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 181 ierr = PetscOptionsEnd();CHKERRQ(ierr); 182 183 if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */ 184 MatInfo Ainfo,Pinfo; 185 PetscInt nz_local; 186 
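    /* Added note (illustrative): the test below switches to the memory-scalable algorithm when
       pN > fill*nz_local on any rank, where nz_local is the allocated nonzero count of A plus P on
       this rank. For example, with fill = 2.0 and roughly 40000 local nonzeros, any P with more
       than 100000 global columns (the guard above) also exceeds 80000 and triggers the switch.
       The choice can always be forced with the option -matptap_via <algorithm> read above. */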
PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable; 187 188 ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr); 189 ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr); 190 nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated); 191 192 if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE; 193 ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr); 194 195 if (alg_scalable) { 196 alg = 0; /* scalable algorithm would 50% slower than nonscalable algorithm */ 197 } 198 } 199 200 switch (alg) { 201 case 1: 202 /* do R=P^T locally, then C=R*A*P -- nonscalable */ 203 ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 204 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); 205 ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 206 break; 207 case 2: 208 /* compute C=P^T*A*P allatonce */ 209 ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 210 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr); 211 ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 212 break; 213 case 3: 214 /* compute C=P^T*A*P allatonce */ 215 ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 216 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr); 217 ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 218 break; 219 #if defined(PETSC_HAVE_HYPRE) 220 case 4: 221 /* Use boomerAMGBuildCoarseOperator */ 222 ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 223 ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr); 224 ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 225 break; 226 #endif 227 default: 228 /* do R=P^T locally, then C=R*A*P */ 229 ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 230 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr); 231 ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 232 break; 233 } 234 235 if (alg == 0 || alg == 1 || alg == 2 || alg == 3) { 236 Mat_MPIAIJ *c = (Mat_MPIAIJ*)(*C)->data; 237 Mat_APMPI *ap = c->ap; 238 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr); 239 ap->freestruct = PETSC_FALSE; 240 ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);CHKERRQ(ierr); 241 ierr = PetscOptionsEnd();CHKERRQ(ierr); 242 } 243 } 244 245 ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 246 ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr); 247 ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 248 PetscFunctionReturn(0); 249 } 250 251 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C) 252 { 253 PetscErrorCode ierr; 254 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 255 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 256 Mat_SeqAIJ *ap,*p_loc,*p_oth,*c_seq; 257 Mat_APMPI *ptap = c->ap; 258 Mat AP_loc,C_loc,C_oth; 259 PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout; 260 PetscScalar *apa; 261 const PetscInt *cols; 262 const PetscScalar *vals; 263 264 PetscFunctionBegin; 265 if (!ptap) { 266 MPI_Comm comm; 267 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 268 SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. 
Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc were obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - the dominating part */
  /* --------------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

  api = ap->i;
  apj = ap->j;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    apnz = api[i+1] - api[i];
    apa  = ap->a + api[i];
    ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr);
    AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
  if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* Always use the scalable version since we are in the MPI scalable code path */
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  ierr  = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE */
  /* and when there are no off-processor parts.                                                                      */
  /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in                     */
  /* MatSetValues_MPIAIJ_CopyFromCSRFormat is no longer valid. Then the more general function MatSetValues_MPIAIJ()  */
  /* has to be used, where the column index is looked up from a table and other, more complex bookkeeping is done.
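     Hence the code below clears C->assembled while remembering it in C->was_assembled; in the
     was_assembled case C_loc is added row by row with MatSetValues_MPIAIJ() and ADD_VALUES,
     otherwise the bulk MatSetValues_MPIAIJ_CopyFromCSRFormat() path is taken.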
*/ 335 if (C->assembled) { 336 C->was_assembled = PETSC_TRUE; 337 C->assembled = PETSC_FALSE; 338 } 339 if (C->was_assembled) { 340 for (i=0; i<cm; i++) { 341 ncols = c_seq->i[i+1] - c_seq->i[i]; 342 row = rstart + i; 343 ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 344 cols += ncols; vals += ncols; 345 } 346 } else { 347 ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr); 348 } 349 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr); 350 if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout); 351 352 /* Co -> C, off-processor part */ 353 cm = C_oth->rmap->N; 354 c_seq = (Mat_SeqAIJ*)C_oth->data; 355 cols = c_seq->j; 356 vals = c_seq->a; 357 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr); 358 for (i=0; i<cm; i++) { 359 ncols = c_seq->i[i+1] - c_seq->i[i]; 360 row = p->garray[i]; 361 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 362 cols += ncols; vals += ncols; 363 } 364 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 365 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 366 367 ptap->reuse = MAT_REUSE_MATRIX; 368 369 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr); 370 if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout); 371 372 /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */ 373 if (ptap->freestruct) { 374 ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); 375 } 376 PetscFunctionReturn(0); 377 } 378 379 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C) 380 { 381 PetscErrorCode ierr; 382 Mat_APMPI *ptap; 383 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 384 MPI_Comm comm; 385 PetscMPIInt size,rank; 386 Mat Cmpi,P_loc,P_oth; 387 PetscFreeSpaceList free_space=NULL,current_space=NULL; 388 PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 389 PetscInt *lnk,i,k,pnz,row,nsend; 390 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv; 391 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 392 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 393 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 394 MPI_Request *swaits,*rwaits; 395 MPI_Status *sstatus,rstatus; 396 PetscLayout rowmap; 397 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 398 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 399 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout; 400 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 401 PetscScalar *apv; 402 PetscTable ta; 403 MatType mtype; 404 const char *prefix; 405 #if defined(PETSC_USE_INFO) 406 PetscReal apfill; 407 #endif 408 409 PetscFunctionBegin; 410 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 411 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 412 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 413 414 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 415 416 /* create symbolic parallel matrix Cmpi */ 417 ierr = 
MatCreate(comm,&Cmpi);CHKERRQ(ierr); 418 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 419 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 420 421 /* create struct Mat_APMPI and attached it to C later */ 422 ierr = PetscNew(&ptap);CHKERRQ(ierr); 423 ptap->reuse = MAT_INITIAL_MATRIX; 424 ptap->algType = 0; 425 426 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 427 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr); 428 /* get P_loc by taking all local rows of P */ 429 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr); 430 431 ptap->P_loc = P_loc; 432 ptap->P_oth = P_oth; 433 434 /* (0) compute Rd = Pd^T, Ro = Po^T */ 435 /* --------------------------------- */ 436 ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 437 ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 438 439 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 440 /* ----------------------------------------------------------------- */ 441 p_loc = (Mat_SeqAIJ*)P_loc->data; 442 if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data; 443 444 /* create and initialize a linked list */ 445 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 446 MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta); 447 MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta); 448 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 449 450 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 451 452 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 453 if (ao) { 454 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 455 } else { 456 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr); 457 } 458 current_space = free_space; 459 nspacedouble = 0; 460 461 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 462 api[0] = 0; 463 for (i=0; i<am; i++) { 464 /* diagonal portion: Ad[i,:]*P */ 465 ai = ad->i; pi = p_loc->i; 466 nzi = ai[i+1] - ai[i]; 467 aj = ad->j + ai[i]; 468 for (j=0; j<nzi; j++) { 469 row = aj[j]; 470 pnz = pi[row+1] - pi[row]; 471 Jptr = p_loc->j + pi[row]; 472 /* add non-zero cols of P into the sorted linked list lnk */ 473 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 474 } 475 /* off-diagonal portion: Ao[i,:]*P */ 476 if (ao) { 477 ai = ao->i; pi = p_oth->i; 478 nzi = ai[i+1] - ai[i]; 479 aj = ao->j + ai[i]; 480 for (j=0; j<nzi; j++) { 481 row = aj[j]; 482 pnz = pi[row+1] - pi[row]; 483 Jptr = p_oth->j + pi[row]; 484 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 485 } 486 } 487 apnz = lnk[0]; 488 api[i+1] = api[i] + apnz; 489 490 /* if free space is not available, double the total space in the list */ 491 if (current_space->local_remaining<apnz) { 492 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 493 nspacedouble++; 494 } 495 496 /* Copy data into free space, then initialize lnk */ 497 ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr); 498 499 current_space->array += apnz; 500 current_space->local_used += apnz; 501 current_space->local_remaining -= apnz; 502 } 503 /* Allocate space for apj and apv, initialize apj, and */ 504 /* destroy list of free space and other temporary array(s) 
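     (api, apj, apv) then form the finished CSR structure of AP = A_loc*P; they are passed directly
     to MatCreateSeqAIJWithArrays() below to create AP_loc, so no further copy of the product is made.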
*/ 505 ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 506 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 507 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 508 509 /* Create AP_loc for reuse */ 510 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 511 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr); 512 513 #if defined(PETSC_USE_INFO) 514 if (ao) { 515 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 516 } else { 517 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 518 } 519 ptap->AP_loc->info.mallocs = nspacedouble; 520 ptap->AP_loc->info.fill_ratio_given = fill; 521 ptap->AP_loc->info.fill_ratio_needed = apfill; 522 523 if (api[am]) { 524 ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 525 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 526 } else { 527 ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr); 528 } 529 #endif 530 531 /* (2-1) compute symbolic Co = Ro*AP_loc */ 532 /* ------------------------------------ */ 533 ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr); 534 ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr); 535 ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr); 536 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr); 537 538 /* (3) send coj of C_oth to other processors */ 539 /* ------------------------------------------ */ 540 /* determine row ownership */ 541 ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 542 rowmap->n = pn; 543 rowmap->bs = 1; 544 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 545 owners = rowmap->range; 546 547 /* determine the number of messages to send, their lengths */ 548 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 549 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 550 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 551 552 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 553 coi = c_oth->i; coj = c_oth->j; 554 con = ptap->C_oth->rmap->n; 555 proc = 0; 556 ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr); 557 for (i=0; i<con; i++) { 558 while (prmap[i] >= owners[proc+1]) proc++; 559 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 560 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 561 } 562 563 len = 0; /* max length of buf_si[], see (4) */ 564 owners_co[0] = 0; 565 nsend = 0; 566 for (proc=0; proc<size; proc++) { 567 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 568 if (len_s[proc]) { 569 nsend++; 570 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 571 len += len_si[proc]; 572 } 573 } 574 575 /* determine the number and length of messages to receive for coi and coj */ 576 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 577 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 578 579 /* post the Irecv and Isend of coj */ 580 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 581 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 582 ierr = 
PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 583 for (proc=0, k=0; proc<size; proc++) { 584 if (!len_s[proc]) continue; 585 i = owners_co[proc]; 586 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 587 k++; 588 } 589 590 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 591 /* ---------------------------------------- */ 592 ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr); 593 ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr); 594 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr); 595 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 596 ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr); 597 598 /* receives coj are complete */ 599 for (i=0; i<nrecv; i++) { 600 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 601 } 602 ierr = PetscFree(rwaits);CHKERRQ(ierr); 603 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 604 605 /* add received column indices into ta to update Crmax */ 606 for (k=0; k<nrecv; k++) {/* k-th received message */ 607 Jptr = buf_rj[k]; 608 for (j=0; j<len_r[k]; j++) { 609 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 610 } 611 } 612 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 613 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 614 615 /* (4) send and recv coi */ 616 /*-----------------------*/ 617 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 618 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 619 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 620 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 621 for (proc=0,k=0; proc<size; proc++) { 622 if (!len_s[proc]) continue; 623 /* form outgoing message for i-structure: 624 buf_si[0]: nrows to be sent 625 [1:nrows]: row index (global) 626 [nrows+1:2*nrows+1]: i-structure index 627 */ 628 /*-------------------------------------------*/ 629 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 630 buf_si_i = buf_si + nrows+1; 631 buf_si[0] = nrows; 632 buf_si_i[0] = 0; 633 nrows = 0; 634 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 635 nzi = coi[i+1] - coi[i]; 636 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 637 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 638 nrows++; 639 } 640 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 641 k++; 642 buf_si += len_si[proc]; 643 } 644 for (i=0; i<nrecv; i++) { 645 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 646 } 647 ierr = PetscFree(rwaits);CHKERRQ(ierr); 648 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 649 650 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 651 ierr = PetscFree(len_ri);CHKERRQ(ierr); 652 ierr = PetscFree(swaits);CHKERRQ(ierr); 653 ierr = PetscFree(buf_s);CHKERRQ(ierr); 654 655 /* (5) compute the local portion of Cmpi */ 656 /* ------------------------------------------ */ 657 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 658 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 659 current_space = free_space; 660 661 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 662 for (k=0; k<nrecv; k++) { 663 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 664 nrows = *buf_ri_k[k]; 665 nextrow[k] = buf_ri_k[k] + 1; /* next row number of 
k-th recved i-structure */ 666 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 667 } 668 669 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 670 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 671 for (i=0; i<pn; i++) { 672 /* add C_loc into Cmpi */ 673 nzi = c_loc->i[i+1] - c_loc->i[i]; 674 Jptr = c_loc->j + c_loc->i[i]; 675 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 676 677 /* add received col data into lnk */ 678 for (k=0; k<nrecv; k++) { /* k-th received message */ 679 if (i == *nextrow[k]) { /* i-th row */ 680 nzi = *(nextci[k]+1) - *nextci[k]; 681 Jptr = buf_rj[k] + *nextci[k]; 682 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 683 nextrow[k]++; nextci[k]++; 684 } 685 } 686 nzi = lnk[0]; 687 688 /* copy data into free space, then initialize lnk */ 689 ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr); 690 ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 691 } 692 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 693 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 694 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 695 696 /* local sizes and preallocation */ 697 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 698 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr); 699 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 700 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 701 702 /* members in merge */ 703 ierr = PetscFree(id_r);CHKERRQ(ierr); 704 ierr = PetscFree(len_r);CHKERRQ(ierr); 705 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 706 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 707 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 708 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 709 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 710 711 /* attach the supporting struct to Cmpi for reuse */ 712 c = (Mat_MPIAIJ*)Cmpi->data; 713 c->ap = ptap; 714 ptap->duplicate = Cmpi->ops->duplicate; 715 ptap->destroy = Cmpi->ops->destroy; 716 ptap->view = Cmpi->ops->view; 717 718 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 719 Cmpi->assembled = PETSC_FALSE; 720 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 721 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 722 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 723 Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP; 724 *C = Cmpi; 725 726 nout = 0; 727 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr); 728 if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout); 729 ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr); 730 if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout); 731 732 PetscFunctionReturn(0); 733 } 734 735 PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHSetI dht,PetscHSetI oht) 736 { 737 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 738 Mat_SeqAIJ 
*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data; 739 PetscInt *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k; 740 PetscInt pcstart,pcend,column; 741 PetscErrorCode ierr; 742 743 PetscFunctionBegin; 744 745 pcstart = P->cmap->rstart; 746 pcend = P->cmap->rend; 747 /* diagonal portion: Ad[i,:]*P */ 748 ai = ad->i; 749 nzi = ai[i+1] - ai[i]; 750 aj = ad->j + ai[i]; 751 for (j=0; j<nzi; j++) { 752 row = aj[j]; 753 nzpi = pd->i[row+1] - pd->i[row]; 754 pj = pd->j + pd->i[row]; 755 for (k=0; k<nzpi; k++) { 756 ierr = PetscHSetIAdd(dht,pj[k]+pcstart);CHKERRQ(ierr); 757 } 758 } 759 for (j=0; j<nzi; j++) { 760 row = aj[j]; 761 nzpi = po->i[row+1] - po->i[row]; 762 pj = po->j + po->i[row]; 763 for (k=0; k<nzpi; k++) { 764 ierr = PetscHSetIAdd(oht,p->garray[pj[k]]);CHKERRQ(ierr); 765 } 766 } 767 768 /* off diagonal part: Ao[i, :]*P_oth */ 769 if (ao) { 770 ai = ao->i; 771 pi = p_oth->i; 772 nzi = ai[i+1] - ai[i]; 773 aj = ao->j + ai[i]; 774 for (j=0; j<nzi; j++) { 775 row = aj[j]; 776 pnz = pi[row+1] - pi[row]; 777 p_othcols = p_oth->j + pi[row]; 778 for (col=0; col<pnz; col++) { 779 column = p_othcols[col]; 780 if (column>=pcstart && column<pcend) { 781 ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr); 782 } else { 783 ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr); 784 } 785 } 786 } 787 } /* end if (ao) */ 788 PetscFunctionReturn(0); 789 } 790 791 PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHMapIV hmap) 792 { 793 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 794 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data; 795 PetscInt *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi; 796 PetscScalar ra,*aa,*pa; 797 PetscErrorCode ierr; 798 799 PetscFunctionBegin; 800 pcstart = P->cmap->rstart; 801 /* diagonal portion: Ad[i,:]*P */ 802 ai = ad->i; 803 nzi = ai[i+1] - ai[i]; 804 aj = ad->j + ai[i]; 805 aa = ad->a + ai[i]; 806 807 for (j=0; j<nzi; j++) { 808 ra = aa[j]; 809 row = aj[j]; 810 nzpi = pd->i[row+1] - pd->i[row]; 811 pj = pd->j + pd->i[row]; 812 pa = pd->a + pd->i[row]; 813 for (k=0; k<nzpi; k++) { 814 ierr = PetscHMapIVAdd(hmap,pj[k]+pcstart,ra*pa[k]);CHKERRQ(ierr); 815 } 816 } 817 for (j=0; j<nzi; j++) { 818 ra = aa[j]; 819 row = aj[j]; 820 nzpi = po->i[row+1] - po->i[row]; 821 pj = po->j + po->i[row]; 822 pa = po->a + po->i[row]; 823 for (k=0; k<nzpi; k++) { 824 ierr = PetscHMapIVAdd(hmap,p->garray[pj[k]],ra*pa[k]);CHKERRQ(ierr); 825 } 826 } 827 828 829 /* off diagonal part: Ao[i, :]*P_oth */ 830 if (ao) { 831 ai = ao->i; 832 pi = p_oth->i; 833 nzi = ai[i+1] - ai[i]; 834 aj = ao->j + ai[i]; 835 aa = ao->a + ai[i]; 836 for (j=0; j<nzi; j++) { 837 row = aj[j]; 838 ra = aa[j]; 839 pnz = pi[row+1] - pi[row]; 840 p_othcols = p_oth->j + pi[row]; 841 pa = p_oth->a + pi[row]; 842 for (col=0; col<pnz; col++) { 843 ierr = PetscHMapIVAdd(hmap,p_othcols[col],ra*pa[col]);CHKERRQ(ierr); 844 } 845 } 846 } /* end if (ao) */ 847 PetscFunctionReturn(0); 848 } 849 850 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C) 851 { 852 PetscErrorCode ierr; 853 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 854 Mat_SeqAIJ *cd,*co,*po,*pd; 855 Mat_APMPI *ptap = c->ap; 856 PetscHMapIV hmap; 857 PetscInt 
i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc; 858 PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa; 859 MPI_Comm comm; 860 861 PetscFunctionBegin; 862 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 863 if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'"); 864 865 ierr = MatZeroEntries(C);CHKERRQ(ierr); 866 867 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 868 /*-----------------------------------------------------*/ 869 if (ptap->reuse == MAT_REUSE_MATRIX) { 870 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 871 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 872 } 873 874 po = (Mat_SeqAIJ*) p->B->data; 875 pd = (Mat_SeqAIJ*) p->A->data; 876 877 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 878 879 ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr); 880 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 881 cmaxr = 0; 882 for (i=0; i<pon; i++) { 883 cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]); 884 } 885 ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr); 886 ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr); 887 ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr); 888 for (i=0; i<am && pon; i++) { 889 ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr); 890 nzi = po->i[i+1] - po->i[i]; 891 if (!nzi) continue; 892 ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr); 893 voff = 0; 894 ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr); 895 if (!voff) continue; 896 /*ierr = PetscMemzero(c_rmtc,sizeof(PetscInt)*pon);CHKERRQ(ierr);*/ 897 898 /* Form C(ii, :) */ 899 poj = po->j + po->i[i]; 900 poa = po->a + po->i[i]; 901 for (j=0; j<nzi; j++) { 902 c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]]; 903 c_rmtaa = c_rmta + ptap->c_rmti[poj[j]]; 904 for (jj=0; jj<voff; jj++) { 905 apvaluestmp[jj] = apvalues[jj]*poa[j]; 906 /*If the row is empty */ 907 if (!c_rmtc[poj[j]]) { 908 c_rmtjj[jj] = apindices[jj]; 909 c_rmtaa[jj] = apvaluestmp[jj]; 910 c_rmtc[poj[j]]++; 911 } else { 912 ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr); 913 if (loc>=0){ /* hit */ 914 c_rmtaa[loc] += apvaluestmp[jj]; 915 } else { /* new element */ 916 loc = -(loc+1); 917 /* Move data backward */ 918 for (kk=c_rmtc[poj[j]]; kk>loc; kk--) { 919 c_rmtjj[kk] = c_rmtjj[kk-1]; 920 c_rmtaa[kk] = c_rmtaa[kk-1]; 921 }/* End kk */ 922 c_rmtjj[loc] = apindices[jj]; 923 c_rmtaa[loc] = apvaluestmp[jj]; 924 c_rmtc[poj[j]]++; 925 } 926 } 927 } /* End jj */ 928 } /* End j */ 929 } /* End i */ 930 931 ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr); 932 933 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 934 ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr); 935 936 ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 937 ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 938 ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr); 939 cd = (Mat_SeqAIJ*)(c->A)->data; 940 co = (Mat_SeqAIJ*)(c->B)->data; 941 942 cmaxr = 0; 943 for (i=0; i<pn; i++) { 944 cmaxr = 
PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i])); 945 } 946 ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr); 947 ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr); 948 ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr); 949 for (i=0; i<am && pn; i++) { 950 ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr); 951 nzi = pd->i[i+1] - pd->i[i]; 952 if (!nzi) continue; 953 ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr); 954 voff = 0; 955 ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr); 956 if (!voff) continue; 957 /* Form C(ii, :) */ 958 pdj = pd->j + pd->i[i]; 959 pda = pd->a + pd->i[i]; 960 for (j=0; j<nzi; j++) { 961 row = pcstart + pdj[j]; 962 for (jj=0; jj<voff; jj++) { 963 apvaluestmp[jj] = apvalues[jj]*pda[j]; 964 } 965 ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr); 966 } 967 } 968 969 ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr); 970 ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr); 971 ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr); 972 ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_SUM);CHKERRQ(ierr); 973 ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 974 ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr); 975 976 /* Add contributions from remote */ 977 for (i = 0; i < pn; i++) { 978 row = i + pcstart; 979 ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr); 980 } 981 ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr); 982 983 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 984 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 985 986 ptap->reuse = MAT_REUSE_MATRIX; 987 988 /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */ 989 if (ptap->freestruct) { 990 ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); 991 } 992 PetscFunctionReturn(0); 993 } 994 995 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C) 996 { 997 PetscErrorCode ierr; 998 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 999 Mat_SeqAIJ *cd,*co,*po,*pd; 1000 Mat_APMPI *ptap = c->ap; 1001 PetscHMapIV hmap; 1002 PetscInt i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc; 1003 PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa; 1004 MPI_Comm comm; 1005 1006 PetscFunctionBegin; 1007 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1008 if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. 
Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'"); 1009 1010 ierr = MatZeroEntries(C);CHKERRQ(ierr); 1011 1012 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 1013 /*-----------------------------------------------------*/ 1014 if (ptap->reuse == MAT_REUSE_MATRIX) { 1015 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 1016 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1017 } 1018 1019 po = (Mat_SeqAIJ*) p->B->data; 1020 pd = (Mat_SeqAIJ*) p->A->data; 1021 1022 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 1023 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 1024 1025 ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr); 1026 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 1027 ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr); 1028 cmaxr = 0; 1029 for (i=0; i<pon; i++) { 1030 cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]); 1031 } 1032 cd = (Mat_SeqAIJ*)(c->A)->data; 1033 co = (Mat_SeqAIJ*)(c->B)->data; 1034 for (i=0; i<pn; i++) { 1035 cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i])); 1036 } 1037 ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr); 1038 ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr); 1039 ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr); 1040 for (i=0; i<am && (pon || pn); i++) { 1041 ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr); 1042 nzi = po->i[i+1] - po->i[i]; 1043 dnzi = pd->i[i+1] - pd->i[i]; 1044 if (!nzi && !dnzi) continue; 1045 ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr); 1046 voff = 0; 1047 ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr); 1048 if (!voff) continue; 1049 1050 /* Form remote C(ii, :) */ 1051 poj = po->j + po->i[i]; 1052 poa = po->a + po->i[i]; 1053 for (j=0; j<nzi; j++) { 1054 c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]]; 1055 c_rmtaa = c_rmta + ptap->c_rmti[poj[j]]; 1056 for (jj=0; jj<voff; jj++) { 1057 apvaluestmp[jj] = apvalues[jj]*poa[j]; 1058 /*If the row is empty */ 1059 if (!c_rmtc[poj[j]]) { 1060 c_rmtjj[jj] = apindices[jj]; 1061 c_rmtaa[jj] = apvaluestmp[jj]; 1062 c_rmtc[poj[j]]++; 1063 } else { 1064 ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr); 1065 if (loc>=0){ /* hit */ 1066 c_rmtaa[loc] += apvaluestmp[jj]; 1067 } else { /* new element */ 1068 loc = -(loc+1); 1069 /* Move data backward */ 1070 for (kk=c_rmtc[poj[j]]; kk>loc; kk--) { 1071 c_rmtjj[kk] = c_rmtjj[kk-1]; 1072 c_rmtaa[kk] = c_rmtaa[kk-1]; 1073 }/* End kk */ 1074 c_rmtjj[loc] = apindices[jj]; 1075 c_rmtaa[loc] = apvaluestmp[jj]; 1076 c_rmtc[poj[j]]++; 1077 } 1078 } 1079 } /* End jj */ 1080 } /* End j */ 1081 1082 /* Form local C(ii, :) */ 1083 pdj = pd->j + pd->i[i]; 1084 pda = pd->a + pd->i[i]; 1085 for (j=0; j<dnzi; j++) { 1086 row = pcstart + pdj[j]; 1087 for (jj=0; jj<voff; jj++) { 1088 apvaluestmp[jj] = apvalues[jj]*pda[j]; 1089 }/* End kk */ 1090 ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr); 1091 }/* End j */ 1092 } /* End i */ 1093 1094 ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr); 1095 ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr); 1096 ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr); 1097 1098 ierr = 
PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1099 ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 1100 ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_SUM);CHKERRQ(ierr); 1101 ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr); 1102 ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr); 1103 1104 /* Add contributions from remote */ 1105 for (i = 0; i < pn; i++) { 1106 row = i + pcstart; 1107 ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr); 1108 } 1109 ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr); 1110 1111 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1112 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1113 1114 ptap->reuse = MAT_REUSE_MATRIX; 1115 1116 /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */ 1117 if (ptap->freestruct) { 1118 ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); 1119 } 1120 PetscFunctionReturn(0); 1121 } 1122 1123 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C) 1124 { 1125 Mat_APMPI *ptap; 1126 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c; 1127 MPI_Comm comm; 1128 Mat Cmpi; 1129 Mat_SeqAIJ *pd,*po; 1130 MatType mtype; 1131 PetscSF sf; 1132 PetscSFNode *iremote; 1133 PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves; 1134 const PetscInt *rootdegrees; 1135 PetscHSetI ht,oht,*hta,*hto; 1136 PetscInt pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets; 1137 PetscInt owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj; 1138 PetscInt nalg=2,alg=0; 1139 PetscBool flg; 1140 const char *algTypes[2] = {"overlapping","merged"}; 1141 PetscErrorCode ierr; 1142 1143 PetscFunctionBegin; 1144 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1145 1146 /* Create symbolic parallel matrix Cmpi */ 1147 ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr); 1148 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 1149 ierr = MatGetType(A,&mtype);CHKERRQ(ierr); 1150 ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr); 1151 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1152 1153 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1154 ptap->reuse = MAT_INITIAL_MATRIX; 1155 ptap->algType = 2; 1156 1157 /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1158 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1159 1160 po = (Mat_SeqAIJ*)p->B->data; 1161 pd = (Mat_SeqAIJ*)p->A->data; 1162 1163 /* This equals to the number of offdiag columns in P */ 1164 ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr); 1165 /* offsets */ 1166 ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr); 1167 /* The number of columns we will send to remote ranks */ 1168 ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr); 1169 ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr); 1170 for (i=0; i<pon; i++) { 1171 ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr); 1172 } 1173 ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr); 1174 ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr); 1175 /* Create hash table to merge all columns for C(i, :) */ 1176 ierr = PetscHSetICreate(&ht);CHKERRQ(ierr); 1177 1178 ptap->c_rmti[0] = 0; 1179 /* 2) Pass 1: calculate the size for C_rmt (a matrix need 
to be sent to other processors) */ 1180 for (i=0; i<am && pon; i++) { 1181 /* Form one row of AP */ 1182 ierr = PetscHSetIClear(ht);CHKERRQ(ierr); 1183 /* If the off diag is empty, we should not do any calculation */ 1184 nzi = po->i[i+1] - po->i[i]; 1185 if (!nzi) continue; 1186 1187 ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,ht);CHKERRQ(ierr); 1188 ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr); 1189 /* If AP is empty, just continue */ 1190 if (!htsize) continue; 1191 /* Form C(ii, :) */ 1192 poj = po->j + po->i[i]; 1193 for (j=0; j<nzi; j++) { 1194 ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr); 1195 } 1196 } 1197 1198 for (i=0; i<pon; i++) { 1199 ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr); 1200 ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize; 1201 c_rmtc[i] = htsize; 1202 } 1203 1204 ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr); 1205 1206 for (i=0; i<pon; i++) { 1207 off = 0; 1208 ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr); 1209 ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr); 1210 } 1211 ierr = PetscFree(hta);CHKERRQ(ierr); 1212 1213 ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr); 1214 for (i=0; i<pon; i++) { 1215 owner = 0; lidx = 0; 1216 ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr); 1217 iremote[i].index = lidx; 1218 iremote[i].rank = owner; 1219 } 1220 1221 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 1222 ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1223 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1224 ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr); 1225 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 1226 ierr = PetscSFSetUp(sf);CHKERRQ(ierr); 1227 /* How many neighbors have contributions to my rows? 
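     The PetscSFComputeDegree calls below answer this: the degree of local row i counts the leaves
     (remote off-diagonal columns of P) that target it. The gather that follows collects, for every
     remote contributor to row i, the number of columns it will send; these counts are accumulated
     into the ptap->c_othi offsets, and the matching send offsets are scattered back to the
     contributors as c_rmtoffsets.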
*/ 1228 ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr); 1229 ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr); 1230 rootspacesize = 0; 1231 for (i = 0; i < pn; i++) { 1232 rootspacesize += rootdegrees[i]; 1233 } 1234 ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr); 1235 ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr); 1236 /* Get information from leaves 1237 * Number of columns other people contribute to my rows 1238 * */ 1239 ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr); 1240 ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr); 1241 ierr = PetscFree(c_rmtc);CHKERRQ(ierr); 1242 ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr); 1243 /* The number of columns is received for each row */ 1244 ptap->c_othi[0] = 0; 1245 rootspacesize = 0; 1246 rootspaceoffsets[0] = 0; 1247 for (i = 0; i < pn; i++) { 1248 rcvncols = 0; 1249 for (j = 0; j<rootdegrees[i]; j++) { 1250 rcvncols += rootspace[rootspacesize]; 1251 rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1252 rootspacesize++; 1253 } 1254 ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols; 1255 } 1256 ierr = PetscFree(rootspace);CHKERRQ(ierr); 1257 1258 ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr); 1259 ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr); 1260 ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr); 1261 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 1262 ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr); 1263 1264 ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr); 1265 nleaves = 0; 1266 for (i = 0; i<pon; i++) { 1267 owner = 0; lidx = 0; 1268 ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr); 1269 sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i]; 1270 for (j=0; j<sendncols; j++) { 1271 iremote[nleaves].rank = owner; 1272 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1273 } 1274 } 1275 ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr); 1276 ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr); 1277 1278 ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr); 1279 ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1280 ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr); 1281 /* One to one map */ 1282 ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1283 1284 ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr); 1285 ierr = PetscHSetICreate(&oht);CHKERRQ(ierr); 1286 ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr); 1287 ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr); 1288 for (i=0; i<pn; i++) { 1289 ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr); 1290 ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr); 1291 } 1292 /* Work on local part */ 1293 /* 4) Pass 1: Estimate memory for C_loc */ 1294 for (i=0; i<am && pn; i++) { 1295 ierr = PetscHSetIClear(ht);CHKERRQ(ierr); 1296 ierr = PetscHSetIClear(oht);CHKERRQ(ierr); 1297 nzi = pd->i[i+1] - pd->i[i]; 1298 if (!nzi) continue; 1299 1300 ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr); 1301 ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr); 1302 ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr); 1303 if (!(htsize+htosize)) continue; 1304 /* Form C(ii, :) */ 1305 pdj = pd->j + pd->i[i]; 1306 for (j=0; j<nzi; j++) { 1307 ierr = 
PetscHSetIAppend(hta[pdj[j]],ht);CHKERRQ(ierr); 1308 ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr); 1309 } 1310 } 1311 1312 ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr); 1313 ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr); 1314 1315 /* Get remote data */ 1316 ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); 1317 ierr = PetscFree(c_rmtj);CHKERRQ(ierr); 1318 1319 for (i = 0; i < pn; i++) { 1320 nzi = ptap->c_othi[i+1] - ptap->c_othi[i]; 1321 rdj = c_othj + ptap->c_othi[i]; 1322 for (j = 0; j < nzi; j++) { 1323 col = rdj[j]; 1324 /* diag part */ 1325 if (col>=pcstart && col<pcend) { 1326 ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr); 1327 } else { /* off diag */ 1328 ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr); 1329 } 1330 } 1331 ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr); 1332 dnz[i] = htsize; 1333 ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr); 1334 ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr); 1335 onz[i] = htsize; 1336 ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr); 1337 } 1338 1339 ierr = PetscFree2(hta,hto);CHKERRQ(ierr); 1340 ierr = PetscFree(c_othj);CHKERRQ(ierr); 1341 1342 /* local sizes and preallocation */ 1343 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1344 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr); 1345 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1346 ierr = PetscFree2(dnz,onz);CHKERRQ(ierr); 1347 1348 /* attach the supporting struct to Cmpi for reuse */ 1349 c = (Mat_MPIAIJ*)Cmpi->data; 1350 c->ap = ptap; 1351 ptap->duplicate = Cmpi->ops->duplicate; 1352 ptap->destroy = Cmpi->ops->destroy; 1353 ptap->view = Cmpi->ops->view; 1354 1355 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1356 Cmpi->assembled = PETSC_FALSE; 1357 /* pick an algorithm */ 1358 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 1359 alg = 0; 1360 ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr); 1361 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1362 switch (alg) { 1363 case 0: 1364 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; 1365 break; 1366 case 1: 1367 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; 1368 break; 1369 default: 1370 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n"); 1371 } 1372 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1373 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 1374 Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP; 1375 *C = Cmpi; 1376 PetscFunctionReturn(0); 1377 } 1378 1379 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C) 1380 { 1381 Mat_APMPI *ptap; 1382 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c; 1383 MPI_Comm comm; 1384 Mat Cmpi; 1385 Mat_SeqAIJ *pd,*po; 1386 MatType mtype; 1387 PetscSF sf; 1388 PetscSFNode *iremote; 1389 PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves; 1390 const PetscInt *rootdegrees; 1391 PetscHSetI ht,oht,*hta,*hto,*htd; 1392 PetscInt pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets; 1393 PetscInt owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj; 1394 PetscInt nalg=2,alg=0; 1395 PetscBool flg; 1396 const char *algTypes[2] = {"merged","overlapping"}; 1397 
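  /* Added summary: compared with MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce() above, this "merged"
     variant sweeps the rows of A once, computing each row of AP a single time and feeding it both
     to the remote part (hash sets hta[], sized into ptap->c_rmti and shipped through ptap->sf) and
     to the local diagonal/off-diagonal column sets (hash sets htd[] and hto[]). */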
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm       comm;
  Mat            Cmpi;
  Mat_SeqAIJ     *pd,*po;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto,*htd;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0;
  PetscBool      flg;
  const char     *algTypes[2] = {"merged","overlapping"};
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3;

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  po = (Mat_SeqAIJ*)p->B->data;
  pd = (Mat_SeqAIJ*)p->A->data;

  /* This equals the number of off-diagonal columns of P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash tables to merge all columns for C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    /* If the row is empty in both the diagonal and off-diagonal parts, skip it */
    nzi  = po->i[i+1] - po->i[i];
    dnzi = pd->i[i+1] - pd->i[i];
    if (!nzi && !dnzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[i];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr);
      ierr = PetscHSetIAppend(hta[poj[j]],oht);CHKERRQ(ierr);
    }

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[i];
    for (j=0; j<dnzi; j++) {
      ierr = PetscHSetIAppend(htd[pdj[j]],ht);CHKERRQ(ierr);
      ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr);
    }
  }

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]         = htsize;
  }
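
  /* At this point c_rmtc[] holds, for each off-diagonal column of P (i.e. each row of C owned by another rank
     that this rank contributes to), the number of distinct columns collected above, and ptap->c_rmti[] is its
     CSR-style prefix sum. For example, with pon = 3 and per-row counts {2,0,5} this gives c_rmtc = {2,0,5} and
     c_rmti = {0,2,2,7}, so row k's column indices will occupy c_rmtj[c_rmti[k]..c_rmti[k+1]-1]. */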
  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off  = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from leaves:
   * number of columns other ranks contribute to my rows
   */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* Number of columns received for each of my rows */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0; lidx = 0;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One to one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
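
  /* The received column indices in c_othj[] are now merged, row by row, into the same hash sets that were
     filled during the local pass, and the resulting set sizes become the diagonal (dnz) and off-diagonal (onz)
     preallocation counts for C. */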
  ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);

  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg  = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
  }
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}
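
/* The dnz/onz arrays computed above feed MatMPIAIJSetPreallocation(), which expects per-row counts (exact or
   over-estimated) for the diagonal and off-diagonal blocks. A small, self-contained sketch of the same call on a
   user matrix (hypothetical sizes, not tied to this file):

     Mat      B;
     PetscInt dnnz[2] = {2,1},onnz[2] = {1,0};   // two local rows
     ierr = MatCreate(PETSC_COMM_WORLD,&B);CHKERRQ(ierr);
     ierr = MatSetSizes(B,2,2,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(B,MATMPIAIJ);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(B,0,dnnz,0,onnz);CHKERRQ(ierr);
*/
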
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  Mat_APMPI          *ptap;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm           comm;
  PetscMPIInt        size,rank;
  Mat                Cmpi;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  PetscInt           am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt           *lnk,i,k,pnz,row,nsend;
  PetscBT            lnkbt;
  PetscMPIInt        tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt           **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt           len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt           nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request        *swaits,*rwaits;
  MPI_Status         *sstatus,rstatus;
  PetscLayout        rowmap;
  PetscInt           *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt        *len_r,*id_r;         /* array of length of comm->size, store send/recv matrix values */
  PetscInt           *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ         *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar        *apv;
  PetscTable         ta;
  MatType            mtype;
  const char         *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal          apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
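    /* (CSR bookkeeping: for row i of Ad, ai[i]..ai[i+1]-1 indexes its column entries in aj; e.g. with
       ai = {0,2,5}, row 1 has nzi = 3 entries aj[2], aj[3], aj[4], and each such column selects one row of P
       whose column indices are merged into lnk.) */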
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif
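
  /* When PETSc is configured with PETSC_USE_INFO, running with -info prints the "Fill ratio: given ... needed ..."
     message above; feeding the reported "needed" value back as the fill argument of MatPtAP() avoids the
     reallocations counted in nspacedouble on subsequent runs with a similar nonzero pattern. */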
  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr  = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr  = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr  = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
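
  /* Example of the i-structure message built above: sending nrows = 2 rows with 3 and 1 nonzeros and local row
     indices r0 and r1 gives buf_si = {2, r0, r1, 0, 3, 4}, i.e. the row count, the nrows local row indices, and
     the nrows+1 CSR offsets, for a total of len_si[proc] = 2*(nrows+1) entries. */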
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c     = (Mat_MPIAIJ*)Cmpi->data;
  c->ap = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;
  ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled    = PETSC_FALSE;
  Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view    = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C = Cmpi;
  PetscFunctionReturn(0);
}
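
/* The numeric phase below follows the same splitting as the symbolic one: with Rd = Pd^T and Ro = Po^T
   (P_loc = [Pd Po] split into diagonal and off-diagonal blocks), each rank computes AP_loc = A_loc*P and then
   Rd*AP_loc (contributions to its own rows of C) and Ro*AP_loc (contributions to rows of C owned by other ranks,
   indexed through p->garray), and both are summed into C with ADD_VALUES followed by a final assembly. */
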
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);
  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*------------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }
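
  /* The loop above uses apa as a dense accumulator of length pN: AProw_nonscalable() scatters row i of A*P into
     apa, the precomputed sparsity pattern apj[api[i]..api[i+1]-1] gathers the nonzero values into ap->a, and each
     touched slot of apa is reset to 0.0 so the array is clean for the next row. This dense work array, whose size
     grows with the global column count of P, is why this variant is reported as "nonscalable" by MatView(). */
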
  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is
     PETSC_FALSE and when there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in
     MatSetValues_MPIAIJ_CopyFromCSRFormat is no longer valid. Then the more complex MatSetValues_MPIAIJ() has to
     be used, where the column index is looked up from a table and other, more complex bookkeeping is required. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row   = rstart + i;
      ierr  = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* supporting struct ptap consumes almost the same amount of memory as C=PtAP, release it if C will not be updated by A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
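
/* Usage note (not specific to this routine): the intermediate products kept in ptap (AP_loc, Rd, Ro, C_loc,
   C_oth, ...) are what make MAT_REUSE_MATRIX cheap. They can be released either by calling
   MatFreeIntermediateDataStructures(C) or with the -mat_freeintermediatedatastructures option, both referenced
   in the error message above, but after that a further MatPtAP(A,P,MAT_REUSE_MATRIX,...) on the same C fails
   with PETSC_ERR_ARG_WRONGSTATE. */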