142c57489SHong Zhang /* 242c57489SHong Zhang Defines projective product routines where A is a MPIAIJ matrix 342c57489SHong Zhang C = P^T * A * P 442c57489SHong Zhang */ 542c57489SHong Zhang 642c57489SHong Zhang #include "src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 742c57489SHong Zhang #include "src/mat/utils/freespace.h" 842c57489SHong Zhang #include "src/mat/impls/aij/mpi/mpiaij.h" 942c57489SHong Zhang #include "petscbt.h" 1042c57489SHong Zhang 1142c57489SHong Zhang EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat); 1242c57489SHong Zhang #undef __FUNCT__ 1342c57489SHong Zhang #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP" 1442c57489SHong Zhang PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A) 1542c57489SHong Zhang { 1642c57489SHong Zhang PetscErrorCode ierr; 1742c57489SHong Zhang Mat_MatMatMultMPI *ptap; 1842c57489SHong Zhang PetscObjectContainer container; 1942c57489SHong Zhang 2042c57489SHong Zhang PetscFunctionBegin; 2142c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 2242c57489SHong Zhang if (container) { 2342c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 2442c57489SHong Zhang ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr); 2542c57489SHong Zhang ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr); 2642c57489SHong Zhang ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr); 2742c57489SHong Zhang ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr); 2842c57489SHong Zhang if (ptap->abi) ierr = PetscFree(ptap->abi);CHKERRQ(ierr); 2942c57489SHong Zhang if (ptap->abj) ierr = PetscFree(ptap->abj);CHKERRQ(ierr); 3042c57489SHong Zhang 3142c57489SHong Zhang ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr); 3242c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr); 3342c57489SHong Zhang } 3442c57489SHong Zhang ierr = PetscFree(ptap);CHKERRQ(ierr); 3542c57489SHong Zhang 3642c57489SHong Zhang ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 3742c57489SHong Zhang PetscFunctionReturn(0); 3842c57489SHong Zhang } 3942c57489SHong Zhang 4042c57489SHong Zhang #undef __FUNCT__ 4142c57489SHong Zhang #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ" 4242c57489SHong Zhang PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C) 4342c57489SHong Zhang { 4442c57489SHong Zhang PetscErrorCode ierr; 4542c57489SHong Zhang Mat_MatMatMultMPI *ptap; 4642c57489SHong Zhang PetscObjectContainer container; 4742c57489SHong Zhang 4842c57489SHong Zhang PetscFunctionBegin; 4942c57489SHong Zhang if (scall == MAT_INITIAL_MATRIX){ 5042c57489SHong Zhang ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 5142c57489SHong Zhang ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr); 5242c57489SHong Zhang ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL; 5342c57489SHong Zhang ptap->abnz_max = 0; /* symbolic A*P is not done yet */ 5442c57489SHong Zhang 5542c57489SHong Zhang /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 5642c57489SHong Zhang ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr); 5742c57489SHong Zhang 5842c57489SHong Zhang /* get P_loc by taking all local rows of P */ 5942c57489SHong Zhang ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr); 6042c57489SHong Zhang 6142c57489SHong Zhang /* attach the supporting struct to P for reuse */ 6242c57489SHong Zhang P->ops->destroy = MatDestroy_MPIAIJ_PtAP; 6342c57489SHong Zhang ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr); 6442c57489SHong Zhang ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr); 6542c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr); 6642c57489SHong Zhang 6742c57489SHong Zhang /* now, compute symbolic local P^T*A*P */ 6842c57489SHong Zhang ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */ 6942c57489SHong Zhang ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 7042c57489SHong Zhang } else if (scall == MAT_REUSE_MATRIX){ 7142c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 7242c57489SHong Zhang if (container) { 7342c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 7442c57489SHong Zhang } else { 7542c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 7642c57489SHong Zhang } 7742c57489SHong Zhang 7842c57489SHong Zhang /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 7942c57489SHong Zhang ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr); 8042c57489SHong Zhang 8142c57489SHong Zhang /* get P_loc by taking all local rows of P */ 8242c57489SHong Zhang ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr); 8342c57489SHong Zhang 8442c57489SHong Zhang } else { 8542c57489SHong Zhang SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall); 8642c57489SHong Zhang } 8742c57489SHong Zhang /* now, compute numeric local P^T*A*P */ 8842c57489SHong Zhang ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 8942c57489SHong Zhang ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr); 9042c57489SHong Zhang ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 9142c57489SHong Zhang 9242c57489SHong Zhang PetscFunctionReturn(0); 9342c57489SHong Zhang } 9442c57489SHong Zhang 9542c57489SHong Zhang #undef __FUNCT__ 9642c57489SHong Zhang #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ" 9742c57489SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C) 9842c57489SHong Zhang { 9942c57489SHong Zhang PetscErrorCode ierr; 100*de0260b3SHong Zhang Mat P_loc,P_oth,B_mpi; 101*de0260b3SHong Zhang Mat_MatMatMultMPI *ap; 10242c57489SHong Zhang PetscObjectContainer container; 10342c57489SHong Zhang FreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 10483445d95SHong Zhang Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 10542c57489SHong Zhang Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 10642c57489SHong Zhang Mat_SeqAIJ *p_loc,*p_oth; 107*de0260b3SHong Zhang PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ,*pjj; 10883445d95SHong Zhang PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz; 109*de0260b3SHong Zhang PetscInt nlnk,*lnk,*cdi,*cdj,*owners_co,*coi,*coj,*cji; 110*de0260b3SHong Zhang PetscInt am=A->m,pN=P->N,pm=P->m,pn=P->n; 11183445d95SHong Zhang PetscInt i,j,k,ptnzi,arow,prow,pnzj,cnzi; 11242c57489SHong Zhang PetscBT lnkbt; 11383445d95SHong Zhang PetscInt prstart,prend; 11442c57489SHong Zhang MPI_Comm comm=A->comm; 115*de0260b3SHong Zhang PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri; 11642c57489SHong Zhang PetscInt **buf_rj,**buf_ri,**buf_ri_k; 11742c57489SHong Zhang PetscInt len,proc,*dnz,*onz,*owners; 11842c57489SHong Zhang PetscInt anzi,*bi,*bj,bnzi,nspacedouble=0; 11942c57489SHong Zhang PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 12042c57489SHong Zhang MPI_Request *si_waits,*sj_waits,*ri_waits,*rj_waits; 12142c57489SHong Zhang MPI_Status *status; 12242c57489SHong Zhang Mat_Merge_SeqsToMPI *merge; 123*de0260b3SHong Zhang PetscInt *apsymi,*apsymj,*apj,apnzj,*prmap=p->garray,pon; 12442c57489SHong Zhang 12542c57489SHong Zhang PetscFunctionBegin; 12642c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 12742c57489SHong Zhang if (container) { 128*de0260b3SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ap);CHKERRQ(ierr); 12942c57489SHong Zhang } else { 13042c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 13142c57489SHong Zhang } 13283445d95SHong Zhang ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 13383445d95SHong Zhang ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 13483445d95SHong Zhang 13542c57489SHong Zhang /* get data from P_loc and P_oth */ 136*de0260b3SHong Zhang P_loc=ap->B_loc; P_oth=ap->B_oth; prstart=ap->brstart; 13742c57489SHong Zhang p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data; 13842c57489SHong Zhang pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j; 13942c57489SHong Zhang prend = prstart + pm; 14042c57489SHong Zhang 14183445d95SHong Zhang /* first, compute symbolic AP = A_loc*P */ 142*de0260b3SHong Zhang /*--------------------------------------*/ 143f4a743e1SHong Zhang ierr = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr); 144*de0260b3SHong Zhang ap->abi = apsymi; 145f4a743e1SHong Zhang apsymi[0] = 0; 14683445d95SHong Zhang 14783445d95SHong Zhang /* create and initialize a linked list */ 14883445d95SHong Zhang nlnk = pN+1; 14983445d95SHong Zhang ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 15083445d95SHong Zhang 151f4a743e1SHong Zhang /* Initial FreeSpace size is 2*nnz(A) */ 152*de0260b3SHong Zhang ierr = GetMoreSpace((PetscInt)(fill*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr); 153f4a743e1SHong Zhang current_space = free_space; 154f4a743e1SHong Zhang 155f4a743e1SHong Zhang for (i=0;i<am;i++) { 156f4a743e1SHong Zhang apnzj = 0; 157f4a743e1SHong Zhang /* diagonal portion of A */ 158f4a743e1SHong Zhang anzi = adi[i+1] - adi[i]; 159f4a743e1SHong Zhang for (j=0; j<anzi; j++){ 16083445d95SHong Zhang prow = *adj++; 161f4a743e1SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 162f4a743e1SHong Zhang pjj = pj_loc + pi_loc[prow]; 16383445d95SHong Zhang /* add non-zero cols of P into the sorted linked list lnk */ 16483445d95SHong Zhang ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 16583445d95SHong Zhang apnzj += nlnk; 166f4a743e1SHong Zhang } 167f4a743e1SHong Zhang /* off-diagonal portion of A */ 168f4a743e1SHong Zhang anzi = aoi[i+1] - aoi[i]; 169f4a743e1SHong Zhang for (j=0; j<anzi; j++){ 17083445d95SHong Zhang prow = *aoj++; 171f4a743e1SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 172f4a743e1SHong Zhang pjj = pj_oth + pi_oth[prow]; 17383445d95SHong Zhang ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 17483445d95SHong Zhang apnzj += nlnk; 175f4a743e1SHong Zhang } 176f4a743e1SHong Zhang 177f4a743e1SHong Zhang apsymi[i+1] = apsymi[i] + apnzj; 178*de0260b3SHong Zhang if (ap->abnz_max < apnzj) ap->abnz_max = apnzj; 179f4a743e1SHong Zhang 18083445d95SHong Zhang /* if free space is not available, double the total space in the list */ 181f4a743e1SHong Zhang if (current_space->local_remaining<apnzj) { 182f4a743e1SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 183f4a743e1SHong Zhang } 184f4a743e1SHong Zhang 185f4a743e1SHong Zhang /* Copy data into free space, then initialize lnk */ 18683445d95SHong Zhang ierr = PetscLLClean(pN,pN,apnzj,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 187f4a743e1SHong Zhang current_space->array += apnzj; 188f4a743e1SHong Zhang current_space->local_used += apnzj; 189f4a743e1SHong Zhang current_space->local_remaining -= apnzj; 190f4a743e1SHong Zhang } 191f4a743e1SHong Zhang /* Allocate space for apsymj, initialize apsymj, and */ 192f4a743e1SHong Zhang /* destroy list of free space and other temporary array(s) */ 193*de0260b3SHong Zhang ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ap->abj);CHKERRQ(ierr); 194*de0260b3SHong Zhang apsymj = ap->abj; 195*de0260b3SHong Zhang ierr = MakeSpaceContiguous(&free_space,ap->abj);CHKERRQ(ierr); 196f4a743e1SHong Zhang 197*de0260b3SHong Zhang /* get ij structure of (p->A)^T and (p->B)^T */ 198*de0260b3SHong Zhang /*-------------------------------------------*/ 199*de0260b3SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr); 200*de0260b3SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr); 20142c57489SHong Zhang 202*de0260b3SHong Zhang /* then, compute symbolic Co_seq = (p->B)^T*AP */ 203*de0260b3SHong Zhang /* Allocate coi array, arrays for fill computation and */ 20442c57489SHong Zhang /* free space for accumulating nonzero column info */ 205*de0260b3SHong Zhang pon = (p->B)->n; /* total num of rows to be sent to other processors 20683445d95SHong Zhang >= (num of nonzero rows of C_seq) - pn */ 207*de0260b3SHong Zhang ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr); 208*de0260b3SHong Zhang coi[0] = 0; 20942c57489SHong Zhang 210*de0260b3SHong Zhang /* set initial free space to be 2*pon*max( nnz(AP) per row)- an upper bound */ 211*de0260b3SHong Zhang nnz = 2*pon*ap->abnz_max + 1; 212*de0260b3SHong Zhang ierr = GetMoreSpace(nnz,&free_space); 21342c57489SHong Zhang current_space = free_space; 21442c57489SHong Zhang 215*de0260b3SHong Zhang /* determine symbolic Co=(p->B)^T*AP - send to others */ 216*de0260b3SHong Zhang for (i=0; i<pon; i++) { 21742c57489SHong Zhang cnzi = 0; 218*de0260b3SHong Zhang ptnzi = poti[i+1] - poti[i]; 21983445d95SHong Zhang j = ptnzi; 220*de0260b3SHong Zhang ptJ = potj + poti[i+1]; 22183445d95SHong Zhang while (j){/* assume cols are almost in increasing order, starting from its end saves computation */ 22283445d95SHong Zhang j--; ptJ--; 223*de0260b3SHong Zhang arow = *ptJ; /* row of AP == col of Pot */ 22483445d95SHong Zhang apnzj = apsymi[arow+1] - apsymi[arow]; 22583445d95SHong Zhang apj = apsymj + apsymi[arow]; 22683445d95SHong Zhang /* add non-zero cols of AP into the sorted linked list lnk */ 22783445d95SHong Zhang ierr = PetscLLAdd(apnzj,apj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 22842c57489SHong Zhang cnzi += nlnk; 22942c57489SHong Zhang } 23042c57489SHong Zhang 23183445d95SHong Zhang /* If free space is not available, double the total space in the list */ 23242c57489SHong Zhang if (current_space->local_remaining<cnzi) { 23342c57489SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 23442c57489SHong Zhang } 23542c57489SHong Zhang 23642c57489SHong Zhang /* Copy data into free space, and zero out denserows */ 23742c57489SHong Zhang ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 23842c57489SHong Zhang current_space->array += cnzi; 23942c57489SHong Zhang current_space->local_used += cnzi; 24042c57489SHong Zhang current_space->local_remaining -= cnzi; 241*de0260b3SHong Zhang coi[i+1] = coi[i] + cnzi; 24242c57489SHong Zhang } 243*de0260b3SHong Zhang ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr); 244*de0260b3SHong Zhang ierr = MakeSpaceContiguous(&free_space,coj);CHKERRQ(ierr); 245*de0260b3SHong Zhang 246*de0260b3SHong Zhang /* determine symbolic Cd = (p->A)^T*AP - stay local */ 247*de0260b3SHong Zhang /*------------------------------------------------------*/ 248*de0260b3SHong Zhang /* Allocate ci array, arrays for fill computation and */ 249*de0260b3SHong Zhang /* free space for accumulating nonzero column info */ 250*de0260b3SHong Zhang ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&cdi);CHKERRQ(ierr); 251*de0260b3SHong Zhang cdi[0] = 0; 252*de0260b3SHong Zhang 253*de0260b3SHong Zhang /* Set initial free space to be 2*pn*max( nnz(AP) per row) */ 254*de0260b3SHong Zhang nnz = 2*pn*ap->abnz_max + 1; 255*de0260b3SHong Zhang ierr = GetMoreSpace(nnz,&free_space); 256*de0260b3SHong Zhang current_space = free_space; 257*de0260b3SHong Zhang 258*de0260b3SHong Zhang for (i=0; i<pn; i++) { 259*de0260b3SHong Zhang cnzi = 0; 260*de0260b3SHong Zhang ptnzi = pdti[i+1] - pdti[i]; 261*de0260b3SHong Zhang if (!ptnzi) SETERRQ1(1,"%d ptnzi=0",i); 262*de0260b3SHong Zhang j = ptnzi; 263*de0260b3SHong Zhang ptJ = pdtj + pdti[i+1]; 264*de0260b3SHong Zhang while (j){/* assume cols are almost in increasing order, starting from its end saves computation */ 265*de0260b3SHong Zhang j--; ptJ--; 266*de0260b3SHong Zhang arow = *ptJ; /* row of AP == col of Pt */ 267*de0260b3SHong Zhang apnzj = apsymi[arow+1] - apsymi[arow]; 268*de0260b3SHong Zhang apj = apsymj + apsymi[arow]; 269*de0260b3SHong Zhang /* add non-zero cols of AP into the sorted linked list lnk */ 270*de0260b3SHong Zhang ierr = PetscLLAdd(apnzj,apj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 271*de0260b3SHong Zhang cnzi += nlnk; 272*de0260b3SHong Zhang } 273*de0260b3SHong Zhang 274*de0260b3SHong Zhang /* If free space is not available, double the total space in the list */ 275*de0260b3SHong Zhang if (current_space->local_remaining<cnzi) { 276*de0260b3SHong Zhang printf("[%d] free space is not available, double\n",rank); 277*de0260b3SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 278*de0260b3SHong Zhang } 279*de0260b3SHong Zhang 280*de0260b3SHong Zhang /* Copy data into free space, and zero out denserows */ 281*de0260b3SHong Zhang ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 282*de0260b3SHong Zhang current_space->array += cnzi; 283*de0260b3SHong Zhang current_space->local_used += cnzi; 284*de0260b3SHong Zhang current_space->local_remaining -= cnzi; 285*de0260b3SHong Zhang cdi[i+1] = cdi[i] + cnzi; 286*de0260b3SHong Zhang } 28783445d95SHong Zhang 28842c57489SHong Zhang /* Clean up. */ 289*de0260b3SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr); 290*de0260b3SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr); 29142c57489SHong Zhang 29242c57489SHong Zhang /* Allocate space for cj, initialize cj, and */ 29342c57489SHong Zhang /* destroy list of free space and other temporary array(s) */ 294*de0260b3SHong Zhang ierr = PetscMalloc((cdi[pn]+1)*sizeof(PetscInt),&cdj);CHKERRQ(ierr); 295*de0260b3SHong Zhang ierr = MakeSpaceContiguous(&free_space,cdj);CHKERRQ(ierr); 29642c57489SHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 29742c57489SHong Zhang 29842c57489SHong Zhang /* add C_seq into mpi C */ 29942c57489SHong Zhang /*-----------------------------------*/ 30042c57489SHong Zhang /* determine row ownership */ 30183445d95SHong Zhang ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr); 30242c57489SHong Zhang ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr); 30342c57489SHong Zhang ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr); 30442c57489SHong Zhang ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr); 30542c57489SHong Zhang ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr); 30642c57489SHong Zhang 30742c57489SHong Zhang /* determine the number of messages to send, their lengths */ 30842c57489SHong Zhang /*---------------------------------------------------------*/ 30942c57489SHong Zhang ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr); 31083445d95SHong Zhang ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 31142c57489SHong Zhang ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr); 31242c57489SHong Zhang len_s = merge->len_s; 31342c57489SHong Zhang merge->nsend = 0; 31483445d95SHong Zhang 315*de0260b3SHong Zhang ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr); 316*de0260b3SHong Zhang ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 317*de0260b3SHong Zhang 31883445d95SHong Zhang proc = 0; 319*de0260b3SHong Zhang for (i=0; i<pon; i++){ 320*de0260b3SHong Zhang while (prmap[i] >= owners[proc+1]) proc++; 321*de0260b3SHong Zhang len_si[proc]++; /* num of rows in Co to be sent to [proc] */ 322*de0260b3SHong Zhang len_s[proc] += coi[i+1] - coi[i]; 32383445d95SHong Zhang } 324*de0260b3SHong Zhang 325*de0260b3SHong Zhang len = 0; /* max length of buf_si[] */ 326*de0260b3SHong Zhang owners_co[0] = 0; 32742c57489SHong Zhang for (proc=0; proc<size; proc++){ 328*de0260b3SHong Zhang owners_co[proc+1] = owners_co[proc] + len_si[proc]; 32983445d95SHong Zhang if (len_si[proc]){ 33042c57489SHong Zhang merge->nsend++; 33183445d95SHong Zhang len_si[proc] = 2*(len_si[proc] + 1); 33242c57489SHong Zhang len += len_si[proc]; 33342c57489SHong Zhang } 33442c57489SHong Zhang } 33542c57489SHong Zhang 33642c57489SHong Zhang /* determine the number and length of messages to receive for ij-structure */ 33742c57489SHong Zhang /*-------------------------------------------------------------------------*/ 33842c57489SHong Zhang ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 33942c57489SHong Zhang ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 34042c57489SHong Zhang 34142c57489SHong Zhang /* post the Irecv of j-structure */ 34242c57489SHong Zhang /*-------------------------------*/ 34342c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr); 34442c57489SHong Zhang ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr); 34542c57489SHong Zhang 34642c57489SHong Zhang /* post the Isend of j-structure */ 34742c57489SHong Zhang /*--------------------------------*/ 34842c57489SHong Zhang ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr); 34942c57489SHong Zhang sj_waits = si_waits + merge->nsend; 35042c57489SHong Zhang 35142c57489SHong Zhang for (proc=0, k=0; proc<size; proc++){ 35242c57489SHong Zhang if (!len_s[proc]) continue; 353*de0260b3SHong Zhang i = owners_co[proc]; 354*de0260b3SHong Zhang ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr); 35542c57489SHong Zhang k++; 35642c57489SHong Zhang } 35742c57489SHong Zhang 35842c57489SHong Zhang /* receives and sends of j-structure are complete */ 35942c57489SHong Zhang /*------------------------------------------------*/ 36042c57489SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr); 36142c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr); 36242c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr); 36342c57489SHong Zhang 36442c57489SHong Zhang /* send and recv i-structure */ 36542c57489SHong Zhang /*---------------------------*/ 36642c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr); 36742c57489SHong Zhang ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr); 36842c57489SHong Zhang 36942c57489SHong Zhang ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr); 37042c57489SHong Zhang buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 37142c57489SHong Zhang for (proc=0,k=0; proc<size; proc++){ 37242c57489SHong Zhang if (!len_s[proc]) continue; 37342c57489SHong Zhang /* form outgoing message for i-structure: 37442c57489SHong Zhang buf_si[0]: nrows to be sent 37542c57489SHong Zhang [1:nrows]: row index (global) 37642c57489SHong Zhang [nrows+1:2*nrows+1]: i-structure index 37742c57489SHong Zhang */ 37842c57489SHong Zhang /*-------------------------------------------*/ 37942c57489SHong Zhang nrows = len_si[proc]/2 - 1; 38042c57489SHong Zhang buf_si_i = buf_si + nrows+1; 38142c57489SHong Zhang buf_si[0] = nrows; 38242c57489SHong Zhang buf_si_i[0] = 0; 38342c57489SHong Zhang nrows = 0; 384*de0260b3SHong Zhang for (i=owners_co[proc]; i<owners_co[proc+1]; i++){ 385*de0260b3SHong Zhang anzi = coi[i+1] - coi[i]; 386*de0260b3SHong Zhang if (!anzi) SETERRQ1(1,"i=%d, ani = 0",i); 38742c57489SHong Zhang buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */ 388*de0260b3SHong Zhang buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */ 38942c57489SHong Zhang nrows++; 39042c57489SHong Zhang } 39142c57489SHong Zhang ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr); 39242c57489SHong Zhang k++; 39342c57489SHong Zhang buf_si += len_si[proc]; 39442c57489SHong Zhang } 39542c57489SHong Zhang 39642c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr); 39742c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr); 39842c57489SHong Zhang 39942c57489SHong Zhang ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr); 40042c57489SHong Zhang for (i=0; i<merge->nrecv; i++){ 40142c57489SHong Zhang ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr); 40242c57489SHong Zhang } 40342c57489SHong Zhang 40442c57489SHong Zhang ierr = PetscFree(len_si);CHKERRQ(ierr); 40542c57489SHong Zhang ierr = PetscFree(len_ri);CHKERRQ(ierr); 40642c57489SHong Zhang ierr = PetscFree(rj_waits);CHKERRQ(ierr); 40742c57489SHong Zhang ierr = PetscFree(si_waits);CHKERRQ(ierr); 40842c57489SHong Zhang ierr = PetscFree(ri_waits);CHKERRQ(ierr); 40942c57489SHong Zhang ierr = PetscFree(buf_s);CHKERRQ(ierr); 41042c57489SHong Zhang ierr = PetscFree(status);CHKERRQ(ierr); 41142c57489SHong Zhang 412*de0260b3SHong Zhang /* compute the local portion of C (mpi mat) */ 413*de0260b3SHong Zhang /*------------------------------------------*/ 41442c57489SHong Zhang /* allocate bi array and free space for accumulating nonzero column info */ 41542c57489SHong Zhang ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 41642c57489SHong Zhang bi[0] = 0; 41742c57489SHong Zhang 41842c57489SHong Zhang /* create and initialize a linked list */ 41942c57489SHong Zhang nlnk = pN+1; 42042c57489SHong Zhang ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 42142c57489SHong Zhang 42242c57489SHong Zhang /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */ 42383445d95SHong Zhang free_space=PETSC_NULL; current_space=PETSC_NULL; 424*de0260b3SHong Zhang ierr = GetMoreSpace((PetscInt)(2*cdi[pn]+1),&free_space);CHKERRQ(ierr); 42542c57489SHong Zhang current_space = free_space; 42642c57489SHong Zhang 42742c57489SHong Zhang /* determine symbolic info for each local row */ 42842c57489SHong Zhang ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr); 42942c57489SHong Zhang nextrow = buf_ri_k + merge->nrecv; 43042c57489SHong Zhang nextci = nextrow + merge->nrecv; 43142c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ 43242c57489SHong Zhang buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 43342c57489SHong Zhang nrows = *buf_ri_k[k]; 43442c57489SHong Zhang nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 43542c57489SHong Zhang nextci[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 43642c57489SHong Zhang } 43742c57489SHong Zhang 43842c57489SHong Zhang ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 43942c57489SHong Zhang len = 0; 44042c57489SHong Zhang for (i=0; i<pn; i++) { 44142c57489SHong Zhang bnzi = 0; 44242c57489SHong Zhang /* add local non-zero cols of this proc's C_seq into lnk */ 443*de0260b3SHong Zhang anzi = cdi[i+1] - cdi[i]; 444*de0260b3SHong Zhang cji = cdj + cdi[i]; 44542c57489SHong Zhang ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 44642c57489SHong Zhang bnzi += nlnk; 44742c57489SHong Zhang /* add received col data into lnk */ 44842c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 44942c57489SHong Zhang if (i == *nextrow[k]) { /* i-th row */ 45042c57489SHong Zhang anzi = *(nextci[k]+1) - *nextci[k]; 45142c57489SHong Zhang cji = buf_rj[k] + *nextci[k]; 45242c57489SHong Zhang ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 45342c57489SHong Zhang bnzi += nlnk; 45442c57489SHong Zhang nextrow[k]++; nextci[k]++; 45542c57489SHong Zhang } 45642c57489SHong Zhang } 45742c57489SHong Zhang if (len < bnzi) len = bnzi; /* =max(bnzi) */ 45842c57489SHong Zhang 45942c57489SHong Zhang /* if free space is not available, make more free space */ 46042c57489SHong Zhang if (current_space->local_remaining<bnzi) { 46142c57489SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 46242c57489SHong Zhang nspacedouble++; 46342c57489SHong Zhang } 46442c57489SHong Zhang /* copy data into free space, then initialize lnk */ 46542c57489SHong Zhang ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 46642c57489SHong Zhang ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr); 46742c57489SHong Zhang 46842c57489SHong Zhang current_space->array += bnzi; 46942c57489SHong Zhang current_space->local_used += bnzi; 47042c57489SHong Zhang current_space->local_remaining -= bnzi; 47142c57489SHong Zhang 47242c57489SHong Zhang bi[i+1] = bi[i] + bnzi; 47342c57489SHong Zhang } 474*de0260b3SHong Zhang ierr = PetscFree(cdi);CHKERRQ(ierr); 475*de0260b3SHong Zhang ierr = PetscFree(cdj);CHKERRQ(ierr); 47642c57489SHong Zhang ierr = PetscFree(buf_ri_k);CHKERRQ(ierr); 47742c57489SHong Zhang 47842c57489SHong Zhang ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 47942c57489SHong Zhang ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr); 48042c57489SHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 48142c57489SHong Zhang 48242c57489SHong Zhang /* create symbolic parallel matrix B_mpi */ 48342c57489SHong Zhang /*---------------------------------------*/ 48442c57489SHong Zhang ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr); 48542c57489SHong Zhang ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr); 48642c57489SHong Zhang ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr); 48742c57489SHong Zhang ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 48842c57489SHong Zhang /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -cause delay? */ 48942c57489SHong Zhang 49042c57489SHong Zhang /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */ 49142c57489SHong Zhang B_mpi->assembled = PETSC_FALSE; 49242c57489SHong Zhang B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI; 49342c57489SHong Zhang merge->bi = bi; 49442c57489SHong Zhang merge->bj = bj; 495*de0260b3SHong Zhang merge->coi = coi; 496*de0260b3SHong Zhang merge->coj = coj; 49742c57489SHong Zhang merge->buf_ri = buf_ri; 49842c57489SHong Zhang merge->buf_rj = buf_rj; 499*de0260b3SHong Zhang merge->owners_co = owners_co; 50042c57489SHong Zhang 50142c57489SHong Zhang /* attach the supporting struct to B_mpi for reuse */ 50242c57489SHong Zhang ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr); 50342c57489SHong Zhang ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr); 50442c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr); 50542c57489SHong Zhang *C = B_mpi; 50642c57489SHong Zhang 50742c57489SHong Zhang PetscFunctionReturn(0); 50842c57489SHong Zhang } 50942c57489SHong Zhang 51042c57489SHong Zhang #undef __FUNCT__ 51142c57489SHong Zhang #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ" 51242c57489SHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 51342c57489SHong Zhang { 51442c57489SHong Zhang PetscErrorCode ierr; 51542c57489SHong Zhang Mat_Merge_SeqsToMPI *merge; 516*de0260b3SHong Zhang Mat_MatMatMultMPI *ap; 51742c57489SHong Zhang PetscObjectContainer cont_merge,cont_ptap; 51842c57489SHong Zhang PetscInt flops=0; 519*de0260b3SHong Zhang Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 52042c57489SHong Zhang Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 521*de0260b3SHong Zhang Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data; 52242c57489SHong Zhang Mat_SeqAIJ *p_loc,*p_oth; 523*de0260b3SHong Zhang PetscInt *adi=ad->i,*aoi=ao->i,*adj=ad->j,*aoj=ao->j,*apj,nextp; 52442c57489SHong Zhang PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj; 52542c57489SHong Zhang PetscInt i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow; 52642c57489SHong Zhang PetscInt *cjj; 527*de0260b3SHong Zhang MatScalar *ada=ad->a,*aoa=ao->a,*apa,*paj,*caj; 528*de0260b3SHong Zhang MatScalar *pa_loc,*pa_oth; 529*de0260b3SHong Zhang PetscInt am=A->m,cm=C->m; 53042c57489SHong Zhang MPI_Comm comm=C->comm; 53142c57489SHong Zhang PetscMPIInt size,rank,taga,*len_s; 532*de0260b3SHong Zhang PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextcseqi; 53342c57489SHong Zhang PetscInt **buf_ri,**buf_rj; 534*de0260b3SHong Zhang PetscInt cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi,bj,ba: local array of C(mpi mat) */ 53542c57489SHong Zhang MPI_Request *s_waits,*r_waits; 53642c57489SHong Zhang MPI_Status *status; 537*de0260b3SHong Zhang MatScalar **abuf_r,*ba_i,*cseqa_tmp; 53842c57489SHong Zhang PetscInt *cseqj_tmp; 53942c57489SHong Zhang PetscInt *apsymi,*apsymj; 540*de0260b3SHong Zhang PetscInt pon=(p->B)->n,*coi,*coj; 541*de0260b3SHong Zhang PetscInt *poi=po->i,*poj=po->j,*poJ=poj,*pdi=pd->i,*pdj=pd->j,*pdJ=pdj; 542*de0260b3SHong Zhang MatScalar *coa,*pdA=pd->a,*poA=po->a,*ba; 54342c57489SHong Zhang 54442c57489SHong Zhang PetscFunctionBegin; 54542c57489SHong Zhang ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 54642c57489SHong Zhang ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 54742c57489SHong Zhang 54842c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr); 54942c57489SHong Zhang if (cont_merge) { 55042c57489SHong Zhang ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr); 55142c57489SHong Zhang } else { 55242c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not posses an object container"); 55342c57489SHong Zhang } 55442c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr); 55542c57489SHong Zhang if (cont_ptap) { 556*de0260b3SHong Zhang ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ap);CHKERRQ(ierr); 55742c57489SHong Zhang } else { 55842c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 55942c57489SHong Zhang } 56042c57489SHong Zhang 56142c57489SHong Zhang /* get data from symbolic products */ 562*de0260b3SHong Zhang p_loc=(Mat_SeqAIJ*)(ap->B_loc)->data; 563*de0260b3SHong Zhang p_oth=(Mat_SeqAIJ*)(ap->B_oth)->data; 564*de0260b3SHong Zhang pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; 56542c57489SHong Zhang pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 56642c57489SHong Zhang 567*de0260b3SHong Zhang coi = merge->coi; coj = merge->coj; 568*de0260b3SHong Zhang ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr); 569*de0260b3SHong Zhang ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr); 570*de0260b3SHong Zhang 571*de0260b3SHong Zhang bi = merge->bi; bj = merge->bj; 572*de0260b3SHong Zhang ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr); 573*de0260b3SHong Zhang ierr = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr); 574*de0260b3SHong Zhang ierr = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr); 57542c57489SHong Zhang 576f4a743e1SHong Zhang /* get data from symbolic A*P */ 577*de0260b3SHong Zhang ierr = PetscMalloc((ap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr); 578*de0260b3SHong Zhang ierr = PetscMemzero(apa,ap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr); 57942c57489SHong Zhang 580f4a743e1SHong Zhang /* compute numeric C_seq=P_loc^T*A_loc*P */ 581*de0260b3SHong Zhang apsymi = ap->abi; apsymj = ap->abj; 58242c57489SHong Zhang for (i=0;i<am;i++) { 583f4a743e1SHong Zhang /* form i-th sparse row of A*P */ 584f4a743e1SHong Zhang apnzj = apsymi[i+1] - apsymi[i]; 585f4a743e1SHong Zhang apj = apsymj + apsymi[i]; 58642c57489SHong Zhang /* diagonal portion of A */ 58742c57489SHong Zhang anzi = adi[i+1] - adi[i]; 58842c57489SHong Zhang for (j=0;j<anzi;j++) { 589*de0260b3SHong Zhang prow = *adj++; 59042c57489SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 59142c57489SHong Zhang pjj = pj_loc + pi_loc[prow]; 59242c57489SHong Zhang paj = pa_loc + pi_loc[prow]; 593f4a743e1SHong Zhang nextp = 0; 594f4a743e1SHong Zhang for (k=0; nextp<pnzj; k++) { 595f4a743e1SHong Zhang if (apj[k] == pjj[nextp]) { /* col of AP == col of P */ 596f4a743e1SHong Zhang apa[k] += (*ada)*paj[nextp++]; 59742c57489SHong Zhang } 59842c57489SHong Zhang } 59942c57489SHong Zhang flops += 2*pnzj; 60042c57489SHong Zhang ada++; 60142c57489SHong Zhang } 60242c57489SHong Zhang /* off-diagonal portion of A */ 60342c57489SHong Zhang anzi = aoi[i+1] - aoi[i]; 60442c57489SHong Zhang for (j=0;j<anzi;j++) { 605*de0260b3SHong Zhang prow = *aoj++; 60642c57489SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 60742c57489SHong Zhang pjj = pj_oth + pi_oth[prow]; 60842c57489SHong Zhang paj = pa_oth + pi_oth[prow]; 609f4a743e1SHong Zhang nextp = 0; 610f4a743e1SHong Zhang for (k=0; nextp<pnzj; k++) { 611f4a743e1SHong Zhang if (apj[k] == pjj[nextp]) { /* col of AP == col of P */ 612f4a743e1SHong Zhang apa[k] += (*aoa)*paj[nextp++]; 61342c57489SHong Zhang } 61442c57489SHong Zhang } 61542c57489SHong Zhang flops += 2*pnzj; 61642c57489SHong Zhang aoa++; 61742c57489SHong Zhang } 61842c57489SHong Zhang 619*de0260b3SHong Zhang /* compute outer product (p->A)[i,:]^T*AP[i,:] - stay local */ 620*de0260b3SHong Zhang pnzi = pdi[i+1] - pdi[i]; 62142c57489SHong Zhang for (j=0;j<pnzi;j++) { 62242c57489SHong Zhang nextap = 0; 623*de0260b3SHong Zhang crow = *pdJ++; 624*de0260b3SHong Zhang cjj = bj + bi[crow]; 625*de0260b3SHong Zhang caj = ba + bi[crow]; 62642c57489SHong Zhang for (k=0; nextap<apnzj; k++) { 627*de0260b3SHong Zhang if (cjj[k]==apj[nextap]) caj[k] += (*pdA)*apa[nextap++]; 62842c57489SHong Zhang } 62942c57489SHong Zhang flops += 2*apnzj; 630*de0260b3SHong Zhang pdA++; 63142c57489SHong Zhang } 632*de0260b3SHong Zhang 633*de0260b3SHong Zhang /* compute outer product (p->B)[i,:]^T*AP[i,:] - send to others */ 634*de0260b3SHong Zhang pnzi = poi[i+1] - poi[i]; 635*de0260b3SHong Zhang for (j=0;j<pnzi;j++) { 636*de0260b3SHong Zhang nextap = 0; 637*de0260b3SHong Zhang crow = *poJ++; 638*de0260b3SHong Zhang cjj = coj + coi[crow]; 639*de0260b3SHong Zhang caj = coa + coi[crow]; 640*de0260b3SHong Zhang for (k=0; nextap<apnzj; k++) { 641*de0260b3SHong Zhang if (cjj[k] == apj[nextap]) caj[k] += (*poA)*apa[nextap++]; 642*de0260b3SHong Zhang } 643*de0260b3SHong Zhang flops += 2*apnzj; 644*de0260b3SHong Zhang poA++; 645*de0260b3SHong Zhang } 646*de0260b3SHong Zhang 64742c57489SHong Zhang /* zero the current row info for A*P */ 64842c57489SHong Zhang ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr); 64942c57489SHong Zhang } 65042c57489SHong Zhang ierr = PetscFree(apa);CHKERRQ(ierr); 65142c57489SHong Zhang 652*de0260b3SHong Zhang /* send and recv matrix values */ 653*de0260b3SHong Zhang /*-----------------------------*/ 654*de0260b3SHong Zhang 65542c57489SHong Zhang buf_ri = merge->buf_ri; 65642c57489SHong Zhang buf_rj = merge->buf_rj; 65742c57489SHong Zhang 65842c57489SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr); 65942c57489SHong Zhang 66042c57489SHong Zhang len_s = merge->len_s; 66142c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr); 66242c57489SHong Zhang ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 66342c57489SHong Zhang 66442c57489SHong Zhang ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr); 66542c57489SHong Zhang for (proc=0,k=0; proc<size; proc++){ 66642c57489SHong Zhang if (!len_s[proc]) continue; 667*de0260b3SHong Zhang i = merge->owners_co[proc]; 668*de0260b3SHong Zhang ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 66942c57489SHong Zhang k++; 67042c57489SHong Zhang } 67142c57489SHong Zhang 67242c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr); 67342c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr); 67442c57489SHong Zhang ierr = PetscFree(status);CHKERRQ(ierr); 67542c57489SHong Zhang 67642c57489SHong Zhang ierr = PetscFree(s_waits);CHKERRQ(ierr); 67742c57489SHong Zhang ierr = PetscFree(r_waits);CHKERRQ(ierr); 67842c57489SHong Zhang 67942c57489SHong Zhang /* insert mat values of mpimat */ 68042c57489SHong Zhang /*----------------------------*/ 68142c57489SHong Zhang ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr); 68242c57489SHong Zhang nextrow = buf_ri_k + merge->nrecv; 68342c57489SHong Zhang nextcseqi = nextrow + merge->nrecv; 68442c57489SHong Zhang 68542c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ 68642c57489SHong Zhang buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 68742c57489SHong Zhang nrows = *(buf_ri_k[k]); 68842c57489SHong Zhang nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 68942c57489SHong Zhang nextcseqi[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 69042c57489SHong Zhang } 69142c57489SHong Zhang 692*de0260b3SHong Zhang /* insert local and received values into C */ 693*de0260b3SHong Zhang for (i=0; i<cm; i++) { 69442c57489SHong Zhang cseqrow = owners[rank] + i; /* global row index of C_seq */ 69542c57489SHong Zhang bj_i = bj + bi[i]; /* col indices of the i-th row of C */ 696*de0260b3SHong Zhang ba_i = ba + bi[i]; 69742c57489SHong Zhang bnzi = bi[i+1] - bi[i]; 69842c57489SHong Zhang /* add received vals into ba */ 69942c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 70042c57489SHong Zhang /* i-th row */ 70142c57489SHong Zhang if (i == *nextrow[k]) { 70242c57489SHong Zhang cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k]; 70342c57489SHong Zhang cseqj_tmp = buf_rj[k] + *(nextcseqi[k]); 70442c57489SHong Zhang cseqa_tmp = abuf_r[k] + *(nextcseqi[k]); 70542c57489SHong Zhang nextcseqj = 0; 70642c57489SHong Zhang for (j=0; nextcseqj<cseqnzi; j++){ 707*de0260b3SHong Zhang if (bj_i[j] == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */ 70842c57489SHong Zhang ba_i[j] += cseqa_tmp[nextcseqj++]; 70942c57489SHong Zhang } 71042c57489SHong Zhang } 71142c57489SHong Zhang nextrow[k]++; nextcseqi[k]++; 71242c57489SHong Zhang } 71342c57489SHong Zhang } 71442c57489SHong Zhang ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 715*de0260b3SHong Zhang flops += 2*cseqnzi; 71642c57489SHong Zhang } 71742c57489SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 71842c57489SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 71942c57489SHong Zhang 720*de0260b3SHong Zhang ierr = PetscFree(coa);CHKERRQ(ierr); 721*de0260b3SHong Zhang ierr = PetscFree(ba);CHKERRQ(ierr); 72242c57489SHong Zhang ierr = PetscFree(abuf_r);CHKERRQ(ierr); 72342c57489SHong Zhang ierr = PetscFree(buf_ri_k);CHKERRQ(ierr); 72442c57489SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 72542c57489SHong Zhang 72642c57489SHong Zhang PetscFunctionReturn(0); 72742c57489SHong Zhang } 728