142c57489SHong Zhang /* 242c57489SHong Zhang Defines projective product routines where A is a MPIAIJ matrix 342c57489SHong Zhang C = P^T * A * P 442c57489SHong Zhang */ 542c57489SHong Zhang 642c57489SHong Zhang #include "src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 742c57489SHong Zhang #include "src/mat/utils/freespace.h" 842c57489SHong Zhang #include "src/mat/impls/aij/mpi/mpiaij.h" 942c57489SHong Zhang #include "petscbt.h" 1042c57489SHong Zhang 1142c57489SHong Zhang EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat); 1242c57489SHong Zhang #undef __FUNCT__ 1342c57489SHong Zhang #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP" 1442c57489SHong Zhang PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A) 1542c57489SHong Zhang { 1642c57489SHong Zhang PetscErrorCode ierr; 1742c57489SHong Zhang Mat_MatMatMultMPI *ptap; 1842c57489SHong Zhang PetscObjectContainer container; 1942c57489SHong Zhang 2042c57489SHong Zhang PetscFunctionBegin; 2142c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 2242c57489SHong Zhang if (container) { 2342c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 2442c57489SHong Zhang ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr); 2542c57489SHong Zhang ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr); 2642c57489SHong Zhang ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr); 2742c57489SHong Zhang ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr); 2842c57489SHong Zhang if (ptap->abi) ierr = PetscFree(ptap->abi);CHKERRQ(ierr); 2942c57489SHong Zhang if (ptap->abj) ierr = PetscFree(ptap->abj);CHKERRQ(ierr); 3042c57489SHong Zhang 3142c57489SHong Zhang ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr); 3242c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr); 3342c57489SHong Zhang } 3442c57489SHong Zhang ierr = PetscFree(ptap);CHKERRQ(ierr); 3542c57489SHong Zhang 3642c57489SHong Zhang ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 3742c57489SHong Zhang PetscFunctionReturn(0); 3842c57489SHong Zhang } 3942c57489SHong Zhang 4042c57489SHong Zhang #undef __FUNCT__ 4142c57489SHong Zhang #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ" 4242c57489SHong Zhang PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C) 4342c57489SHong Zhang { 4442c57489SHong Zhang PetscErrorCode ierr; 4542c57489SHong Zhang Mat_MatMatMultMPI *ptap; 4642c57489SHong Zhang PetscObjectContainer container; 4742c57489SHong Zhang 4842c57489SHong Zhang PetscFunctionBegin; 4942c57489SHong Zhang if (scall == MAT_INITIAL_MATRIX){ 5042c57489SHong Zhang ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 5142c57489SHong Zhang ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr); 5242c57489SHong Zhang ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL; 5342c57489SHong Zhang ptap->abnz_max = 0; /* symbolic A*P is not done yet */ 5442c57489SHong Zhang 5542c57489SHong Zhang /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 5642c57489SHong Zhang ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr); 5742c57489SHong Zhang 5842c57489SHong Zhang /* get P_loc by taking all local rows of P */ 5942c57489SHong Zhang ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr); 6042c57489SHong Zhang 6142c57489SHong Zhang /* attach the supporting struct to P for reuse */ 6242c57489SHong Zhang P->ops->destroy = MatDestroy_MPIAIJ_PtAP; 6342c57489SHong Zhang ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr); 6442c57489SHong Zhang ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr); 6542c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr); 6642c57489SHong Zhang 6742c57489SHong Zhang /* now, compute symbolic local P^T*A*P */ 6842c57489SHong Zhang ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */ 6942c57489SHong Zhang ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 7042c57489SHong Zhang } else if (scall == MAT_REUSE_MATRIX){ 7142c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 7242c57489SHong Zhang if (container) { 7342c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 7442c57489SHong Zhang } else { 7542c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 7642c57489SHong Zhang } 7742c57489SHong Zhang 7842c57489SHong Zhang /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 7942c57489SHong Zhang ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr); 8042c57489SHong Zhang 8142c57489SHong Zhang /* get P_loc by taking all local rows of P */ 8242c57489SHong Zhang ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr); 8342c57489SHong Zhang 8442c57489SHong Zhang } else { 8542c57489SHong Zhang SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall); 8642c57489SHong Zhang } 8742c57489SHong Zhang /* now, compute numeric local P^T*A*P */ 8842c57489SHong Zhang ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 8942c57489SHong Zhang ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr); 9042c57489SHong Zhang ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 9142c57489SHong Zhang 9242c57489SHong Zhang PetscFunctionReturn(0); 9342c57489SHong Zhang } 9442c57489SHong Zhang 9542c57489SHong Zhang #undef __FUNCT__ 9642c57489SHong Zhang #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ" 9742c57489SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C) 9842c57489SHong Zhang { 9942c57489SHong Zhang PetscErrorCode ierr; 10042c57489SHong Zhang Mat P_loc,P_oth; 10142c57489SHong Zhang Mat_MatMatMultMPI *ptap; 10242c57489SHong Zhang PetscObjectContainer container; 10342c57489SHong Zhang FreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 10442c57489SHong Zhang Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 10542c57489SHong Zhang Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 10642c57489SHong Zhang Mat_SeqAIJ *p_loc,*p_oth; 10742c57489SHong Zhang PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj; 10842c57489SHong Zhang PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col; 10942c57489SHong Zhang PetscInt pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj; 11042c57489SHong Zhang PetscInt aN=A->N,am=A->m,pN=P->N; 11142c57489SHong Zhang PetscInt i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi; 11242c57489SHong Zhang PetscBT lnkbt; 11342c57489SHong Zhang PetscInt prstart,prend,nprow_loc,nprow_oth; 11442c57489SHong Zhang PetscInt *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth; 11542c57489SHong Zhang 11642c57489SHong Zhang MPI_Comm comm=A->comm; 11742c57489SHong Zhang Mat B_mpi; 11842c57489SHong Zhang PetscMPIInt size,rank,tagi,tagj,*len_s,*len_si,*len_ri; 11942c57489SHong Zhang PetscInt **buf_rj,**buf_ri,**buf_ri_k; 12042c57489SHong Zhang PetscInt len,proc,*dnz,*onz,*owners; 12142c57489SHong Zhang PetscInt anzi,*bi,*bj,bnzi,nspacedouble=0; 12242c57489SHong Zhang PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 12342c57489SHong Zhang MPI_Request *si_waits,*sj_waits,*ri_waits,*rj_waits; 12442c57489SHong Zhang MPI_Status *status; 12542c57489SHong Zhang Mat_Merge_SeqsToMPI *merge; 126*f4a743e1SHong Zhang PetscInt *apsymi,*apj,*apjdense,apnzj; 12742c57489SHong Zhang 12842c57489SHong Zhang PetscFunctionBegin; 12942c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 13042c57489SHong Zhang if (container) { 13142c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 13242c57489SHong Zhang } else { 13342c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 13442c57489SHong Zhang } 13542c57489SHong Zhang /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */ 13642c57489SHong Zhang /*--------------------------------------------------*/ 13742c57489SHong Zhang /* get data from P_loc and P_oth */ 13842c57489SHong Zhang P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart; 13942c57489SHong Zhang p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data; 14042c57489SHong Zhang pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j; 14142c57489SHong Zhang prend = prstart + pm; 14242c57489SHong Zhang 143*f4a743e1SHong Zhang /* ---------- Compute symbolic A*P --------- */ 144*f4a743e1SHong Zhang ierr = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr); 145*f4a743e1SHong Zhang ptap->abi = apsymi; 146*f4a743e1SHong Zhang apsymi[0] = 0; 147*f4a743e1SHong Zhang ierr = PetscMalloc(pN*2*sizeof(PetscInt),&apj);CHKERRQ(ierr); 148*f4a743e1SHong Zhang ierr = PetscMemzero(apj,pN*2*sizeof(PetscInt));CHKERRQ(ierr); 149*f4a743e1SHong Zhang apjdense = apj + pN; 150*f4a743e1SHong Zhang /* Initial FreeSpace size is 2*nnz(A) */ 151*f4a743e1SHong Zhang ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr); 152*f4a743e1SHong Zhang current_space = free_space; 153*f4a743e1SHong Zhang 154*f4a743e1SHong Zhang for (i=0;i<am;i++) { 155*f4a743e1SHong Zhang apnzj = 0; 156*f4a743e1SHong Zhang /* diagonal portion of A */ 157*f4a743e1SHong Zhang anzi = adi[i+1] - adi[i]; 158*f4a743e1SHong Zhang for (j=0;j<anzi;j++) { 159*f4a743e1SHong Zhang prow = *adj; 160*f4a743e1SHong Zhang adj++; 161*f4a743e1SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 162*f4a743e1SHong Zhang pjj = pj_loc + pi_loc[prow]; 163*f4a743e1SHong Zhang for (k=0;k<pnzj;k++) { 164*f4a743e1SHong Zhang if (!apjdense[pjj[k]]) { 165*f4a743e1SHong Zhang apjdense[pjj[k]] = -1; 166*f4a743e1SHong Zhang apj[apnzj++] = pjj[k]; 167*f4a743e1SHong Zhang } 168*f4a743e1SHong Zhang } 169*f4a743e1SHong Zhang } 170*f4a743e1SHong Zhang /* off-diagonal portion of A */ 171*f4a743e1SHong Zhang anzi = aoi[i+1] - aoi[i]; 172*f4a743e1SHong Zhang for (j=0;j<anzi;j++) { 173*f4a743e1SHong Zhang col = a->garray[*aoj]; 174*f4a743e1SHong Zhang if (col < cstart){ 175*f4a743e1SHong Zhang prow = *aoj; 176*f4a743e1SHong Zhang } else if (col >= cend){ 177*f4a743e1SHong Zhang prow = *aoj; 178*f4a743e1SHong Zhang } else { 179*f4a743e1SHong Zhang SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map"); 180*f4a743e1SHong Zhang } 181*f4a743e1SHong Zhang aoj++; 182*f4a743e1SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 183*f4a743e1SHong Zhang pjj = pj_oth + pi_oth[prow]; 184*f4a743e1SHong Zhang for (k=0;k<pnzj;k++) { 185*f4a743e1SHong Zhang if (!apjdense[pjj[k]]) { 186*f4a743e1SHong Zhang apjdense[pjj[k]] = -1; 187*f4a743e1SHong Zhang apj[apnzj++] = pjj[k]; 188*f4a743e1SHong Zhang } 189*f4a743e1SHong Zhang } 190*f4a743e1SHong Zhang } 191*f4a743e1SHong Zhang /* Sort the j index array for quick sparse axpy. */ 192*f4a743e1SHong Zhang ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr); 193*f4a743e1SHong Zhang 194*f4a743e1SHong Zhang apsymi[i+1] = apsymi[i] + apnzj; 195*f4a743e1SHong Zhang if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj; 196*f4a743e1SHong Zhang 197*f4a743e1SHong Zhang /* If free space is not available, make more free space */ 198*f4a743e1SHong Zhang /* Double the amount of total space in the list */ 199*f4a743e1SHong Zhang if (current_space->local_remaining<apnzj) { 200*f4a743e1SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 201*f4a743e1SHong Zhang } 202*f4a743e1SHong Zhang 203*f4a743e1SHong Zhang /* Copy data into free space, then initialize lnk */ 204*f4a743e1SHong Zhang /* ierr = PetscLLClean(bn,bn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); */ 205*f4a743e1SHong Zhang for (j=0; j<apnzj; j++) current_space->array[j] = apj[j]; 206*f4a743e1SHong Zhang current_space->array += apnzj; 207*f4a743e1SHong Zhang current_space->local_used += apnzj; 208*f4a743e1SHong Zhang current_space->local_remaining -= apnzj; 209*f4a743e1SHong Zhang 210*f4a743e1SHong Zhang for (j=0;j<apnzj;j++) apjdense[apj[j]] = 0; 211*f4a743e1SHong Zhang } 212*f4a743e1SHong Zhang /* Allocate space for apsymj, initialize apsymj, and */ 213*f4a743e1SHong Zhang /* destroy list of free space and other temporary array(s) */ 214*f4a743e1SHong Zhang ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr); 215*f4a743e1SHong Zhang ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr); 216*f4a743e1SHong Zhang ierr = PetscFree(apj);CHKERRQ(ierr); 217*f4a743e1SHong Zhang /* -------endof Compute symbolic A*P ---------------------*/ 218*f4a743e1SHong Zhang 21942c57489SHong Zhang /* get ij structure of P_loc^T */ 22042c57489SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr); 22142c57489SHong Zhang ptJ=ptj; 22242c57489SHong Zhang 22342c57489SHong Zhang /* Allocate ci array, arrays for fill computation and */ 22442c57489SHong Zhang /* free space for accumulating nonzero column info */ 22542c57489SHong Zhang ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr); 22642c57489SHong Zhang ci[0] = 0; 22742c57489SHong Zhang 22842c57489SHong Zhang ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr); 22942c57489SHong Zhang ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr); 23042c57489SHong Zhang ptasparserow_loc = ptadenserow_loc + aN; 23142c57489SHong Zhang ptadenserow_oth = ptasparserow_loc + aN; 23242c57489SHong Zhang ptasparserow_oth = ptadenserow_oth + aN; 23342c57489SHong Zhang 23442c57489SHong Zhang /* create and initialize a linked list */ 23542c57489SHong Zhang nlnk = pN+1; 23642c57489SHong Zhang ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 23742c57489SHong Zhang 23842c57489SHong Zhang /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */ 23942c57489SHong Zhang /* This should be reasonable if sparsity of PtAP is similar to that of A. */ 24042c57489SHong Zhang nnz = adi[am] + aoi[am]; 24142c57489SHong Zhang ierr = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space); 24242c57489SHong Zhang current_space = free_space; 24342c57489SHong Zhang 24442c57489SHong Zhang /* determine symbolic info for each row of C: */ 245*f4a743e1SHong Zhang adj = ad->j; aoj = ao->j; 24642c57489SHong Zhang for (i=0;i<pN;i++) { 24742c57489SHong Zhang ptnzi = pti[i+1] - pti[i]; 24842c57489SHong Zhang nprow_loc = 0; nprow_oth = 0; 24942c57489SHong Zhang /* i-th row of symbolic P_loc^T*A_loc: */ 25042c57489SHong Zhang for (j=0;j<ptnzi;j++) { 25142c57489SHong Zhang arow = *ptJ++; 25242c57489SHong Zhang /* diagonal portion of A */ 25342c57489SHong Zhang anzj = adi[arow+1] - adi[arow]; 25442c57489SHong Zhang ajj = adj + adi[arow]; 25542c57489SHong Zhang for (k=0;k<anzj;k++) { 25642c57489SHong Zhang col = ajj[k]+prstart; 25742c57489SHong Zhang if (!ptadenserow_loc[col]) { 25842c57489SHong Zhang ptadenserow_loc[col] = -1; 25942c57489SHong Zhang ptasparserow_loc[nprow_loc++] = col; 26042c57489SHong Zhang } 26142c57489SHong Zhang } 26242c57489SHong Zhang /* off-diagonal portion of A */ 26342c57489SHong Zhang anzj = aoi[arow+1] - aoi[arow]; 26442c57489SHong Zhang ajj = aoj + aoi[arow]; 26542c57489SHong Zhang for (k=0;k<anzj;k++) { 26642c57489SHong Zhang col = a->garray[ajj[k]]; /* global col */ 26742c57489SHong Zhang if (col < cstart){ 26842c57489SHong Zhang col = ajj[k]; 26942c57489SHong Zhang } else if (col >= cend){ 27042c57489SHong Zhang col = ajj[k] + pm; 27142c57489SHong Zhang } else { 27242c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map"); 27342c57489SHong Zhang } 27442c57489SHong Zhang if (!ptadenserow_oth[col]) { 27542c57489SHong Zhang ptadenserow_oth[col] = -1; 27642c57489SHong Zhang ptasparserow_oth[nprow_oth++] = col; 27742c57489SHong Zhang } 27842c57489SHong Zhang } 27942c57489SHong Zhang } 28042c57489SHong Zhang 28142c57489SHong Zhang /* using symbolic info of local PtA, determine symbolic info for row of C: */ 28242c57489SHong Zhang cnzi = 0; 28342c57489SHong Zhang /* rows of P_loc */ 28442c57489SHong Zhang ptaj = ptasparserow_loc; 28542c57489SHong Zhang for (j=0; j<nprow_loc; j++) { 28642c57489SHong Zhang prow = *ptaj++; 28742c57489SHong Zhang prow -= prstart; /* rm */ 28842c57489SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 28942c57489SHong Zhang pjj = pj_loc + pi_loc[prow]; 29042c57489SHong Zhang /* add non-zero cols of P into the sorted linked list lnk */ 29142c57489SHong Zhang ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 29242c57489SHong Zhang cnzi += nlnk; 29342c57489SHong Zhang } 29442c57489SHong Zhang /* rows of P_oth */ 29542c57489SHong Zhang ptaj = ptasparserow_oth; 29642c57489SHong Zhang for (j=0; j<nprow_oth; j++) { 29742c57489SHong Zhang prow = *ptaj++; 29842c57489SHong Zhang if (prow >= prend) prow -= pm; /* rm */ 29942c57489SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 30042c57489SHong Zhang pjj = pj_oth + pi_oth[prow]; 30142c57489SHong Zhang /* add non-zero cols of P into the sorted linked list lnk */ 30242c57489SHong Zhang ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 30342c57489SHong Zhang cnzi += nlnk; 30442c57489SHong Zhang } 30542c57489SHong Zhang 30642c57489SHong Zhang /* If free space is not available, make more free space */ 30742c57489SHong Zhang /* Double the amount of total space in the list */ 30842c57489SHong Zhang if (current_space->local_remaining<cnzi) { 30942c57489SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 31042c57489SHong Zhang } 31142c57489SHong Zhang 31242c57489SHong Zhang /* Copy data into free space, and zero out denserows */ 31342c57489SHong Zhang ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 31442c57489SHong Zhang current_space->array += cnzi; 31542c57489SHong Zhang current_space->local_used += cnzi; 31642c57489SHong Zhang current_space->local_remaining -= cnzi; 31742c57489SHong Zhang 31842c57489SHong Zhang for (j=0;j<nprow_loc; j++) { 31942c57489SHong Zhang ptadenserow_loc[ptasparserow_loc[j]] = 0; 32042c57489SHong Zhang } 32142c57489SHong Zhang for (j=0;j<nprow_oth; j++) { 32242c57489SHong Zhang ptadenserow_oth[ptasparserow_oth[j]] = 0; 32342c57489SHong Zhang } 32442c57489SHong Zhang 32542c57489SHong Zhang /* Aside: Perhaps we should save the pta info for the numerical factorization. */ 32642c57489SHong Zhang /* For now, we will recompute what is needed. */ 32742c57489SHong Zhang ci[i+1] = ci[i] + cnzi; 32842c57489SHong Zhang } 32942c57489SHong Zhang /* Clean up. */ 33042c57489SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr); 33142c57489SHong Zhang 33242c57489SHong Zhang /* nnz is now stored in ci[ptm], column indices are in the list of free space */ 33342c57489SHong Zhang /* Allocate space for cj, initialize cj, and */ 33442c57489SHong Zhang /* destroy list of free space and other temporary array(s) */ 33542c57489SHong Zhang ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr); 33642c57489SHong Zhang ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 33742c57489SHong Zhang ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr); 33842c57489SHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 33942c57489SHong Zhang 34042c57489SHong Zhang /* add C_seq into mpi C */ 34142c57489SHong Zhang /*-----------------------------------*/ 34242c57489SHong Zhang free_space=PETSC_NULL; current_space=PETSC_NULL; 34342c57489SHong Zhang 34442c57489SHong Zhang ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 34542c57489SHong Zhang ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 34642c57489SHong Zhang 34742c57489SHong Zhang ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr); 34842c57489SHong Zhang 34942c57489SHong Zhang 35042c57489SHong Zhang /* determine row ownership */ 35142c57489SHong Zhang /*---------------------------------------------------------*/ 35242c57489SHong Zhang ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr); 35342c57489SHong Zhang ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr); 35442c57489SHong Zhang ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr); 35542c57489SHong Zhang ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr); 35642c57489SHong Zhang 35742c57489SHong Zhang /* determine the number of messages to send, their lengths */ 35842c57489SHong Zhang /*---------------------------------------------------------*/ 35942c57489SHong Zhang ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr); 36042c57489SHong Zhang ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr); 36142c57489SHong Zhang len_s = merge->len_s; 36242c57489SHong Zhang len = 0; /* length of buf_si[] */ 36342c57489SHong Zhang merge->nsend = 0; 36442c57489SHong Zhang for (proc=0; proc<size; proc++){ 36542c57489SHong Zhang len_si[proc] = 0; 36642c57489SHong Zhang if (proc == rank){ 36742c57489SHong Zhang len_s[proc] = 0; 36842c57489SHong Zhang } else { 36942c57489SHong Zhang len_si[proc] = owners[proc+1] - owners[proc] + 1; 37042c57489SHong Zhang len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */ 37142c57489SHong Zhang } 37242c57489SHong Zhang if (len_s[proc]) { 37342c57489SHong Zhang merge->nsend++; 37442c57489SHong Zhang nrows = 0; 37542c57489SHong Zhang for (i=owners[proc]; i<owners[proc+1]; i++){ 37642c57489SHong Zhang if (ci[i+1] > ci[i]) nrows++; 37742c57489SHong Zhang } 37842c57489SHong Zhang len_si[proc] = 2*(nrows+1); 37942c57489SHong Zhang len += len_si[proc]; 38042c57489SHong Zhang } 38142c57489SHong Zhang } 38242c57489SHong Zhang 38342c57489SHong Zhang /* determine the number and length of messages to receive for ij-structure */ 38442c57489SHong Zhang /*-------------------------------------------------------------------------*/ 38542c57489SHong Zhang ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 38642c57489SHong Zhang ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 38742c57489SHong Zhang 38842c57489SHong Zhang /* post the Irecv of j-structure */ 38942c57489SHong Zhang /*-------------------------------*/ 39042c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr); 39142c57489SHong Zhang ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr); 39242c57489SHong Zhang 39342c57489SHong Zhang /* post the Isend of j-structure */ 39442c57489SHong Zhang /*--------------------------------*/ 39542c57489SHong Zhang ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr); 39642c57489SHong Zhang sj_waits = si_waits + merge->nsend; 39742c57489SHong Zhang 39842c57489SHong Zhang for (proc=0, k=0; proc<size; proc++){ 39942c57489SHong Zhang if (!len_s[proc]) continue; 40042c57489SHong Zhang i = owners[proc]; 40142c57489SHong Zhang ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr); 40242c57489SHong Zhang k++; 40342c57489SHong Zhang } 40442c57489SHong Zhang 40542c57489SHong Zhang /* receives and sends of j-structure are complete */ 40642c57489SHong Zhang /*------------------------------------------------*/ 40742c57489SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr); 40842c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr); 40942c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr); 41042c57489SHong Zhang 41142c57489SHong Zhang /* send and recv i-structure */ 41242c57489SHong Zhang /*---------------------------*/ 41342c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr); 41442c57489SHong Zhang ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr); 41542c57489SHong Zhang 41642c57489SHong Zhang ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr); 41742c57489SHong Zhang buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 41842c57489SHong Zhang for (proc=0,k=0; proc<size; proc++){ 41942c57489SHong Zhang if (!len_s[proc]) continue; 42042c57489SHong Zhang /* form outgoing message for i-structure: 42142c57489SHong Zhang buf_si[0]: nrows to be sent 42242c57489SHong Zhang [1:nrows]: row index (global) 42342c57489SHong Zhang [nrows+1:2*nrows+1]: i-structure index 42442c57489SHong Zhang */ 42542c57489SHong Zhang /*-------------------------------------------*/ 42642c57489SHong Zhang nrows = len_si[proc]/2 - 1; 42742c57489SHong Zhang buf_si_i = buf_si + nrows+1; 42842c57489SHong Zhang buf_si[0] = nrows; 42942c57489SHong Zhang buf_si_i[0] = 0; 43042c57489SHong Zhang nrows = 0; 43142c57489SHong Zhang for (i=owners[proc]; i<owners[proc+1]; i++){ 43242c57489SHong Zhang anzi = ci[i+1] - ci[i]; 43342c57489SHong Zhang if (anzi) { 43442c57489SHong Zhang buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */ 43542c57489SHong Zhang buf_si[nrows+1] = i-owners[proc]; /* local row index */ 43642c57489SHong Zhang nrows++; 43742c57489SHong Zhang } 43842c57489SHong Zhang } 43942c57489SHong Zhang ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr); 44042c57489SHong Zhang k++; 44142c57489SHong Zhang buf_si += len_si[proc]; 44242c57489SHong Zhang } 44342c57489SHong Zhang 44442c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr); 44542c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr); 44642c57489SHong Zhang 44742c57489SHong Zhang ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr); 44842c57489SHong Zhang for (i=0; i<merge->nrecv; i++){ 44942c57489SHong Zhang ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr); 45042c57489SHong Zhang } 45142c57489SHong Zhang 45242c57489SHong Zhang ierr = PetscFree(len_si);CHKERRQ(ierr); 45342c57489SHong Zhang ierr = PetscFree(len_ri);CHKERRQ(ierr); 45442c57489SHong Zhang ierr = PetscFree(rj_waits);CHKERRQ(ierr); 45542c57489SHong Zhang ierr = PetscFree(si_waits);CHKERRQ(ierr); 45642c57489SHong Zhang ierr = PetscFree(ri_waits);CHKERRQ(ierr); 45742c57489SHong Zhang ierr = PetscFree(buf_s);CHKERRQ(ierr); 45842c57489SHong Zhang ierr = PetscFree(status);CHKERRQ(ierr); 45942c57489SHong Zhang 46042c57489SHong Zhang /* compute a local seq matrix in each processor */ 46142c57489SHong Zhang /*----------------------------------------------*/ 46242c57489SHong Zhang /* allocate bi array and free space for accumulating nonzero column info */ 46342c57489SHong Zhang ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 46442c57489SHong Zhang bi[0] = 0; 46542c57489SHong Zhang 46642c57489SHong Zhang /* create and initialize a linked list */ 46742c57489SHong Zhang nlnk = pN+1; 46842c57489SHong Zhang ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 46942c57489SHong Zhang 47042c57489SHong Zhang /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */ 47142c57489SHong Zhang len = 0; 47242c57489SHong Zhang len = ci[owners[rank+1]] - ci[owners[rank]]; 47342c57489SHong Zhang ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr); 47442c57489SHong Zhang current_space = free_space; 47542c57489SHong Zhang 47642c57489SHong Zhang /* determine symbolic info for each local row */ 47742c57489SHong Zhang ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr); 47842c57489SHong Zhang nextrow = buf_ri_k + merge->nrecv; 47942c57489SHong Zhang nextci = nextrow + merge->nrecv; 48042c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ 48142c57489SHong Zhang buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 48242c57489SHong Zhang nrows = *buf_ri_k[k]; 48342c57489SHong Zhang nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 48442c57489SHong Zhang nextci[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 48542c57489SHong Zhang } 48642c57489SHong Zhang 48742c57489SHong Zhang ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 48842c57489SHong Zhang len = 0; 48942c57489SHong Zhang for (i=0; i<pn; i++) { 49042c57489SHong Zhang bnzi = 0; 49142c57489SHong Zhang /* add local non-zero cols of this proc's C_seq into lnk */ 49242c57489SHong Zhang arow = owners[rank] + i; 49342c57489SHong Zhang anzi = ci[arow+1] - ci[arow]; 49442c57489SHong Zhang cji = cj + ci[arow]; 49542c57489SHong Zhang ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 49642c57489SHong Zhang bnzi += nlnk; 49742c57489SHong Zhang /* add received col data into lnk */ 49842c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 49942c57489SHong Zhang if (i == *nextrow[k]) { /* i-th row */ 50042c57489SHong Zhang anzi = *(nextci[k]+1) - *nextci[k]; 50142c57489SHong Zhang cji = buf_rj[k] + *nextci[k]; 50242c57489SHong Zhang ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 50342c57489SHong Zhang bnzi += nlnk; 50442c57489SHong Zhang nextrow[k]++; nextci[k]++; 50542c57489SHong Zhang } 50642c57489SHong Zhang } 50742c57489SHong Zhang if (len < bnzi) len = bnzi; /* =max(bnzi) */ 50842c57489SHong Zhang 50942c57489SHong Zhang /* if free space is not available, make more free space */ 51042c57489SHong Zhang if (current_space->local_remaining<bnzi) { 51142c57489SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 51242c57489SHong Zhang nspacedouble++; 51342c57489SHong Zhang } 51442c57489SHong Zhang /* copy data into free space, then initialize lnk */ 51542c57489SHong Zhang ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 51642c57489SHong Zhang ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr); 51742c57489SHong Zhang 51842c57489SHong Zhang current_space->array += bnzi; 51942c57489SHong Zhang current_space->local_used += bnzi; 52042c57489SHong Zhang current_space->local_remaining -= bnzi; 52142c57489SHong Zhang 52242c57489SHong Zhang bi[i+1] = bi[i] + bnzi; 52342c57489SHong Zhang } 52442c57489SHong Zhang 52542c57489SHong Zhang ierr = PetscFree(buf_ri_k);CHKERRQ(ierr); 52642c57489SHong Zhang 52742c57489SHong Zhang ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 52842c57489SHong Zhang ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr); 52942c57489SHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 53042c57489SHong Zhang 53142c57489SHong Zhang /* create symbolic parallel matrix B_mpi */ 53242c57489SHong Zhang /*---------------------------------------*/ 53342c57489SHong Zhang ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr); 53442c57489SHong Zhang ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr); 53542c57489SHong Zhang ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr); 53642c57489SHong Zhang ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 53742c57489SHong Zhang /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -cause delay? */ 53842c57489SHong Zhang 53942c57489SHong Zhang /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */ 54042c57489SHong Zhang B_mpi->assembled = PETSC_FALSE; 54142c57489SHong Zhang B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI; 54242c57489SHong Zhang merge->bi = bi; 54342c57489SHong Zhang merge->bj = bj; 54442c57489SHong Zhang merge->ci = ci; 54542c57489SHong Zhang merge->cj = cj; 54642c57489SHong Zhang merge->buf_ri = buf_ri; 54742c57489SHong Zhang merge->buf_rj = buf_rj; 54842c57489SHong Zhang merge->C_seq = PETSC_NULL; 54942c57489SHong Zhang 55042c57489SHong Zhang /* attach the supporting struct to B_mpi for reuse */ 55142c57489SHong Zhang ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr); 55242c57489SHong Zhang ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr); 55342c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr); 55442c57489SHong Zhang *C = B_mpi; 55542c57489SHong Zhang 55642c57489SHong Zhang PetscFunctionReturn(0); 55742c57489SHong Zhang } 55842c57489SHong Zhang 55942c57489SHong Zhang #undef __FUNCT__ 56042c57489SHong Zhang #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ" 56142c57489SHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 56242c57489SHong Zhang { 56342c57489SHong Zhang PetscErrorCode ierr; 56442c57489SHong Zhang Mat_Merge_SeqsToMPI *merge; 56542c57489SHong Zhang Mat_MatMatMultMPI *ptap; 56642c57489SHong Zhang PetscObjectContainer cont_merge,cont_ptap; 56742c57489SHong Zhang 56842c57489SHong Zhang PetscInt flops=0; 56942c57489SHong Zhang Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 57042c57489SHong Zhang Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 57142c57489SHong Zhang Mat_SeqAIJ *p_loc,*p_oth; 572*f4a743e1SHong Zhang PetscInt *adi=ad->i,*aoi=ao->i,*apj,cstart=a->cstart,cend=a->cend,col; 57342c57489SHong Zhang PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj; 57442c57489SHong Zhang PetscInt i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow; 57542c57489SHong Zhang PetscInt *cjj; 576*f4a743e1SHong Zhang MatScalar *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj; 57742c57489SHong Zhang MatScalar *pa_loc,*pA,*pa_oth; 57842c57489SHong Zhang PetscInt am=A->m,cN=C->N; 579*f4a743e1SHong Zhang PetscInt nextp,*adj=ad->j,*aoj=ao->j; 58042c57489SHong Zhang MPI_Comm comm=C->comm; 58142c57489SHong Zhang PetscMPIInt size,rank,taga,*len_s; 58242c57489SHong Zhang PetscInt *owners; 58342c57489SHong Zhang PetscInt proc; 58442c57489SHong Zhang PetscInt **buf_ri,**buf_rj; 58542c57489SHong Zhang PetscInt cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */ 58642c57489SHong Zhang PetscInt nrows,**buf_ri_k,**nextrow,**nextcseqi; 58742c57489SHong Zhang MPI_Request *s_waits,*r_waits; 58842c57489SHong Zhang MPI_Status *status; 58942c57489SHong Zhang MatScalar **abuf_r,*ba_i; 59042c57489SHong Zhang PetscInt *cseqi,*cseqj; 59142c57489SHong Zhang PetscInt *cseqj_tmp; 59242c57489SHong Zhang MatScalar *cseqa_tmp; 593*f4a743e1SHong Zhang PetscInt stages[2]; 59442c57489SHong Zhang PetscInt *apsymi,*apsymj; 59542c57489SHong Zhang 59642c57489SHong Zhang PetscFunctionBegin; 597*f4a743e1SHong Zhang ierr = PetscLogStageRegister(&stages[0],"NumPtAP_local");CHKERRQ(ierr); 598*f4a743e1SHong Zhang ierr = PetscLogStageRegister(&stages[1],"NumPtAP_Comm");CHKERRQ(ierr); 59942c57489SHong Zhang 60042c57489SHong Zhang ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 60142c57489SHong Zhang ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 60242c57489SHong Zhang 60342c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr); 60442c57489SHong Zhang if (cont_merge) { 60542c57489SHong Zhang ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr); 60642c57489SHong Zhang } else { 60742c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not posses an object container"); 60842c57489SHong Zhang } 60942c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr); 61042c57489SHong Zhang if (cont_ptap) { 61142c57489SHong Zhang ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr); 61242c57489SHong Zhang } else { 61342c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 61442c57489SHong Zhang } 61542c57489SHong Zhang 61642c57489SHong Zhang /* get data from symbolic products */ 61742c57489SHong Zhang p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data; 61842c57489SHong Zhang p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data; 61942c57489SHong Zhang pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc; 62042c57489SHong Zhang pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 62142c57489SHong Zhang 62242c57489SHong Zhang cseqi = merge->ci; cseqj=merge->cj; 62342c57489SHong Zhang ierr = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr); 62442c57489SHong Zhang ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr); 62542c57489SHong Zhang 626*f4a743e1SHong Zhang /* get data from symbolic A*P */ 627*f4a743e1SHong Zhang ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr); 628*f4a743e1SHong Zhang ierr = PetscMemzero(apa,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr); 62942c57489SHong Zhang 630*f4a743e1SHong Zhang /* compute numeric C_seq=P_loc^T*A_loc*P */ 631*f4a743e1SHong Zhang ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr); 632*f4a743e1SHong Zhang apsymi = ptap->abi; apsymj = ptap->abj; 63342c57489SHong Zhang for (i=0;i<am;i++) { 634*f4a743e1SHong Zhang /* form i-th sparse row of A*P */ 635*f4a743e1SHong Zhang apnzj = apsymi[i+1] - apsymi[i]; 636*f4a743e1SHong Zhang apj = apsymj + apsymi[i]; 63742c57489SHong Zhang /* diagonal portion of A */ 63842c57489SHong Zhang anzi = adi[i+1] - adi[i]; 63942c57489SHong Zhang for (j=0;j<anzi;j++) { 64042c57489SHong Zhang prow = *adj; 64142c57489SHong Zhang adj++; 64242c57489SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 64342c57489SHong Zhang pjj = pj_loc + pi_loc[prow]; 64442c57489SHong Zhang paj = pa_loc + pi_loc[prow]; 645*f4a743e1SHong Zhang nextp = 0; 646*f4a743e1SHong Zhang for (k=0; nextp<pnzj; k++) { 647*f4a743e1SHong Zhang if (apj[k] == pjj[nextp]) { /* col of AP == col of P */ 648*f4a743e1SHong Zhang apa[k] += (*ada)*paj[nextp++]; 64942c57489SHong Zhang } 65042c57489SHong Zhang } 65142c57489SHong Zhang flops += 2*pnzj; 65242c57489SHong Zhang ada++; 65342c57489SHong Zhang } 65442c57489SHong Zhang /* off-diagonal portion of A */ 65542c57489SHong Zhang anzi = aoi[i+1] - aoi[i]; 65642c57489SHong Zhang for (j=0;j<anzi;j++) { 65742c57489SHong Zhang col = a->garray[*aoj]; 65842c57489SHong Zhang if (col < cstart){ 65942c57489SHong Zhang prow = *aoj; 66042c57489SHong Zhang } else if (col >= cend){ 66142c57489SHong Zhang prow = *aoj; 66242c57489SHong Zhang } else { 66342c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map"); 66442c57489SHong Zhang } 66542c57489SHong Zhang aoj++; 66642c57489SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 66742c57489SHong Zhang pjj = pj_oth + pi_oth[prow]; 66842c57489SHong Zhang paj = pa_oth + pi_oth[prow]; 669*f4a743e1SHong Zhang nextp = 0; 670*f4a743e1SHong Zhang for (k=0; nextp<pnzj; k++) { 671*f4a743e1SHong Zhang if (apj[k] == pjj[nextp]) { /* col of AP == col of P */ 672*f4a743e1SHong Zhang apa[k] += (*aoa)*paj[nextp++]; 67342c57489SHong Zhang } 67442c57489SHong Zhang } 67542c57489SHong Zhang flops += 2*pnzj; 67642c57489SHong Zhang aoa++; 67742c57489SHong Zhang } 67842c57489SHong Zhang 67942c57489SHong Zhang /* Compute P_loc[i,:]^T*AP[i,:] using outer product */ 68042c57489SHong Zhang pnzi = pi_loc[i+1] - pi_loc[i]; 68142c57489SHong Zhang for (j=0;j<pnzi;j++) { 68242c57489SHong Zhang nextap = 0; 68342c57489SHong Zhang crow = *pJ++; 68442c57489SHong Zhang cjj = cseqj + cseqi[crow]; 68542c57489SHong Zhang caj = cseqa + cseqi[crow]; 68642c57489SHong Zhang /* Perform sparse axpy operation. Note cjj includes apj. */ 68742c57489SHong Zhang for (k=0;nextap<apnzj;k++) { 68842c57489SHong Zhang if (cjj[k]==apj[nextap]) caj[k] += (*pA)*apa[nextap++]; 68942c57489SHong Zhang } 69042c57489SHong Zhang flops += 2*apnzj; 69142c57489SHong Zhang pA++; 69242c57489SHong Zhang } 69342c57489SHong Zhang /* zero the current row info for A*P */ 69442c57489SHong Zhang ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr); 69542c57489SHong Zhang } 69642c57489SHong Zhang 69742c57489SHong Zhang ierr = PetscFree(apa);CHKERRQ(ierr); 69842c57489SHong Zhang ierr = PetscLogStagePop(); 69942c57489SHong Zhang 70042c57489SHong Zhang bi = merge->bi; 70142c57489SHong Zhang bj = merge->bj; 70242c57489SHong Zhang buf_ri = merge->buf_ri; 70342c57489SHong Zhang buf_rj = merge->buf_rj; 70442c57489SHong Zhang 70542c57489SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr); 70642c57489SHong Zhang ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr); 70742c57489SHong Zhang 70842c57489SHong Zhang /* send and recv matrix values */ 70942c57489SHong Zhang /*-----------------------------*/ 710*f4a743e1SHong Zhang ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr); 71142c57489SHong Zhang len_s = merge->len_s; 71242c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr); 71342c57489SHong Zhang ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 71442c57489SHong Zhang 71542c57489SHong Zhang ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr); 71642c57489SHong Zhang for (proc=0,k=0; proc<size; proc++){ 71742c57489SHong Zhang if (!len_s[proc]) continue; 71842c57489SHong Zhang i = owners[proc]; 71942c57489SHong Zhang ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 72042c57489SHong Zhang k++; 72142c57489SHong Zhang } 72242c57489SHong Zhang 72342c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr); 72442c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr); 72542c57489SHong Zhang ierr = PetscFree(status);CHKERRQ(ierr); 72642c57489SHong Zhang 72742c57489SHong Zhang ierr = PetscFree(s_waits);CHKERRQ(ierr); 72842c57489SHong Zhang ierr = PetscFree(r_waits);CHKERRQ(ierr); 72942c57489SHong Zhang 73042c57489SHong Zhang /* insert mat values of mpimat */ 73142c57489SHong Zhang /*----------------------------*/ 73242c57489SHong Zhang ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr); 73342c57489SHong Zhang ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr); 73442c57489SHong Zhang nextrow = buf_ri_k + merge->nrecv; 73542c57489SHong Zhang nextcseqi = nextrow + merge->nrecv; 73642c57489SHong Zhang 73742c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ 73842c57489SHong Zhang buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 73942c57489SHong Zhang nrows = *(buf_ri_k[k]); 74042c57489SHong Zhang nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 74142c57489SHong Zhang nextcseqi[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 74242c57489SHong Zhang } 74342c57489SHong Zhang 74442c57489SHong Zhang /* set values of ba */ 74542c57489SHong Zhang for (i=0; i<C->m; i++) { 74642c57489SHong Zhang cseqrow = owners[rank] + i; /* global row index of C_seq */ 74742c57489SHong Zhang bj_i = bj+bi[i]; /* col indices of the i-th row of C */ 74842c57489SHong Zhang bnzi = bi[i+1] - bi[i]; 74942c57489SHong Zhang ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr); 75042c57489SHong Zhang 75142c57489SHong Zhang /* add local non-zero vals of this proc's C_seq into ba */ 75242c57489SHong Zhang cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow]; 75342c57489SHong Zhang cseqj_tmp = cseqj + cseqi[cseqrow]; 75442c57489SHong Zhang cseqa_tmp = cseqa + cseqi[cseqrow]; 75542c57489SHong Zhang nextcseqj = 0; 75642c57489SHong Zhang for (j=0; nextcseqj<cseqnzi; j++){ 75742c57489SHong Zhang if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */ 75842c57489SHong Zhang ba_i[j] += cseqa_tmp[nextcseqj++]; 75942c57489SHong Zhang } 76042c57489SHong Zhang } 76142c57489SHong Zhang 76242c57489SHong Zhang /* add received vals into ba */ 76342c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 76442c57489SHong Zhang /* i-th row */ 76542c57489SHong Zhang if (i == *nextrow[k]) { 76642c57489SHong Zhang cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k]; 76742c57489SHong Zhang cseqj_tmp = buf_rj[k] + *(nextcseqi[k]); 76842c57489SHong Zhang cseqa_tmp = abuf_r[k] + *(nextcseqi[k]); 76942c57489SHong Zhang nextcseqj = 0; 77042c57489SHong Zhang for (j=0; nextcseqj<cseqnzi; j++){ 77142c57489SHong Zhang if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */ 77242c57489SHong Zhang ba_i[j] += cseqa_tmp[nextcseqj++]; 77342c57489SHong Zhang } 77442c57489SHong Zhang } 77542c57489SHong Zhang nextrow[k]++; nextcseqi[k]++; 77642c57489SHong Zhang } 77742c57489SHong Zhang } 77842c57489SHong Zhang ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 77942c57489SHong Zhang flops += 2*bnzi; 78042c57489SHong Zhang } 78142c57489SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 78242c57489SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 78342c57489SHong Zhang ierr = PetscLogStagePop();CHKERRQ(ierr); 78442c57489SHong Zhang 78542c57489SHong Zhang ierr = PetscFree(cseqa);CHKERRQ(ierr); 78642c57489SHong Zhang ierr = PetscFree(abuf_r);CHKERRQ(ierr); 78742c57489SHong Zhang ierr = PetscFree(ba_i);CHKERRQ(ierr); 78842c57489SHong Zhang ierr = PetscFree(buf_ri_k);CHKERRQ(ierr); 78942c57489SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 79042c57489SHong Zhang 79142c57489SHong Zhang PetscFunctionReturn(0); 79242c57489SHong Zhang } 793