1*42c57489SHong Zhang /* 2*42c57489SHong Zhang Defines projective product routines where A is a MPIAIJ matrix 3*42c57489SHong Zhang C = P^T * A * P 4*42c57489SHong Zhang */ 5*42c57489SHong Zhang 6*42c57489SHong Zhang #include "src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 7*42c57489SHong Zhang #include "src/mat/utils/freespace.h" 8*42c57489SHong Zhang #include "src/mat/impls/aij/mpi/mpiaij.h" 9*42c57489SHong Zhang #include "petscbt.h" 10*42c57489SHong Zhang 11*42c57489SHong Zhang EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat); 12*42c57489SHong Zhang #undef __FUNCT__ 13*42c57489SHong Zhang #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP" 14*42c57489SHong Zhang PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A) 15*42c57489SHong Zhang { 16*42c57489SHong Zhang PetscErrorCode ierr; 17*42c57489SHong Zhang Mat_MatMatMultMPI *ptap; 18*42c57489SHong Zhang PetscObjectContainer container; 19*42c57489SHong Zhang 20*42c57489SHong Zhang PetscFunctionBegin; 21*42c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 22*42c57489SHong Zhang if (container) { 23*42c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 24*42c57489SHong Zhang ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr); 25*42c57489SHong Zhang ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr); 26*42c57489SHong Zhang ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr); 27*42c57489SHong Zhang ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr); 28*42c57489SHong Zhang if (ptap->abi) ierr = PetscFree(ptap->abi);CHKERRQ(ierr); 29*42c57489SHong Zhang if (ptap->abj) ierr = PetscFree(ptap->abj);CHKERRQ(ierr); 30*42c57489SHong Zhang 31*42c57489SHong Zhang ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr); 32*42c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr); 33*42c57489SHong Zhang } 34*42c57489SHong Zhang ierr = PetscFree(ptap);CHKERRQ(ierr); 35*42c57489SHong Zhang 36*42c57489SHong Zhang ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 37*42c57489SHong Zhang PetscFunctionReturn(0); 38*42c57489SHong Zhang } 39*42c57489SHong Zhang 40*42c57489SHong Zhang #undef __FUNCT__ 41*42c57489SHong Zhang #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ" 42*42c57489SHong Zhang PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C) 43*42c57489SHong Zhang { 44*42c57489SHong Zhang PetscErrorCode ierr; 45*42c57489SHong Zhang Mat_MatMatMultMPI *ptap; 46*42c57489SHong Zhang PetscObjectContainer container; 47*42c57489SHong Zhang 48*42c57489SHong Zhang PetscFunctionBegin; 49*42c57489SHong Zhang if (scall == MAT_INITIAL_MATRIX){ 50*42c57489SHong Zhang ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 51*42c57489SHong Zhang ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr); 52*42c57489SHong Zhang ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL; 53*42c57489SHong Zhang ptap->abnz_max = 0; /* symbolic A*P is not done yet */ 54*42c57489SHong Zhang 55*42c57489SHong Zhang /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 56*42c57489SHong Zhang ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr); 57*42c57489SHong Zhang 58*42c57489SHong Zhang /* get P_loc by taking all local rows of P */ 59*42c57489SHong Zhang ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr); 60*42c57489SHong Zhang 61*42c57489SHong Zhang /* attach the supporting struct to P for reuse */ 62*42c57489SHong Zhang P->ops->destroy = MatDestroy_MPIAIJ_PtAP; 63*42c57489SHong Zhang ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr); 64*42c57489SHong Zhang ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr); 65*42c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr); 66*42c57489SHong Zhang 67*42c57489SHong Zhang /* now, compute symbolic local P^T*A*P */ 68*42c57489SHong Zhang ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */ 69*42c57489SHong Zhang ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 70*42c57489SHong Zhang } else if (scall == MAT_REUSE_MATRIX){ 71*42c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 72*42c57489SHong Zhang if (container) { 73*42c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 74*42c57489SHong Zhang } else { 75*42c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 76*42c57489SHong Zhang } 77*42c57489SHong Zhang 78*42c57489SHong Zhang /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 79*42c57489SHong Zhang ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr); 80*42c57489SHong Zhang 81*42c57489SHong Zhang /* get P_loc by taking all local rows of P */ 82*42c57489SHong Zhang ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr); 83*42c57489SHong Zhang 84*42c57489SHong Zhang } else { 85*42c57489SHong Zhang SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall); 86*42c57489SHong Zhang } 87*42c57489SHong Zhang /* now, compute numeric local P^T*A*P */ 88*42c57489SHong Zhang ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 89*42c57489SHong Zhang ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr); 90*42c57489SHong Zhang ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 91*42c57489SHong Zhang 92*42c57489SHong Zhang PetscFunctionReturn(0); 93*42c57489SHong Zhang } 94*42c57489SHong Zhang 95*42c57489SHong Zhang #undef __FUNCT__ 96*42c57489SHong Zhang #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ" 97*42c57489SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C) 98*42c57489SHong Zhang { 99*42c57489SHong Zhang PetscErrorCode ierr; 100*42c57489SHong Zhang Mat P_loc,P_oth; 101*42c57489SHong Zhang Mat_MatMatMultMPI *ptap; 102*42c57489SHong Zhang PetscObjectContainer container; 103*42c57489SHong Zhang FreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 104*42c57489SHong Zhang Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 105*42c57489SHong Zhang Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 106*42c57489SHong Zhang Mat_SeqAIJ *p_loc,*p_oth; 107*42c57489SHong Zhang PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj; 108*42c57489SHong Zhang PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col; 109*42c57489SHong Zhang PetscInt pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj; 110*42c57489SHong Zhang PetscInt aN=A->N,am=A->m,pN=P->N; 111*42c57489SHong Zhang PetscInt i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi; 112*42c57489SHong Zhang PetscBT lnkbt; 113*42c57489SHong Zhang PetscInt prstart,prend,nprow_loc,nprow_oth; 114*42c57489SHong Zhang PetscInt *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth; 115*42c57489SHong Zhang 116*42c57489SHong Zhang MPI_Comm comm=A->comm; 117*42c57489SHong Zhang Mat B_mpi; 118*42c57489SHong Zhang PetscMPIInt size,rank,tagi,tagj,*len_s,*len_si,*len_ri; 119*42c57489SHong Zhang PetscInt **buf_rj,**buf_ri,**buf_ri_k; 120*42c57489SHong Zhang PetscInt len,proc,*dnz,*onz,*owners; 121*42c57489SHong Zhang PetscInt anzi,*bi,*bj,bnzi,nspacedouble=0; 122*42c57489SHong Zhang PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 123*42c57489SHong Zhang MPI_Request *si_waits,*sj_waits,*ri_waits,*rj_waits; 124*42c57489SHong Zhang MPI_Status *status; 125*42c57489SHong Zhang Mat_Merge_SeqsToMPI *merge; 126*42c57489SHong Zhang 127*42c57489SHong Zhang PetscFunctionBegin; 128*42c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr); 129*42c57489SHong Zhang if (container) { 130*42c57489SHong Zhang ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr); 131*42c57489SHong Zhang } else { 132*42c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 133*42c57489SHong Zhang } 134*42c57489SHong Zhang /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */ 135*42c57489SHong Zhang /*--------------------------------------------------*/ 136*42c57489SHong Zhang /* get data from P_loc and P_oth */ 137*42c57489SHong Zhang P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart; 138*42c57489SHong Zhang p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data; 139*42c57489SHong Zhang pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j; 140*42c57489SHong Zhang prend = prstart + pm; 141*42c57489SHong Zhang 142*42c57489SHong Zhang /* get ij structure of P_loc^T */ 143*42c57489SHong Zhang ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr); 144*42c57489SHong Zhang ptJ=ptj; 145*42c57489SHong Zhang 146*42c57489SHong Zhang /* Allocate ci array, arrays for fill computation and */ 147*42c57489SHong Zhang /* free space for accumulating nonzero column info */ 148*42c57489SHong Zhang ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr); 149*42c57489SHong Zhang ci[0] = 0; 150*42c57489SHong Zhang 151*42c57489SHong Zhang ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr); 152*42c57489SHong Zhang ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr); 153*42c57489SHong Zhang ptasparserow_loc = ptadenserow_loc + aN; 154*42c57489SHong Zhang ptadenserow_oth = ptasparserow_loc + aN; 155*42c57489SHong Zhang ptasparserow_oth = ptadenserow_oth + aN; 156*42c57489SHong Zhang 157*42c57489SHong Zhang /* create and initialize a linked list */ 158*42c57489SHong Zhang nlnk = pN+1; 159*42c57489SHong Zhang ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 160*42c57489SHong Zhang 161*42c57489SHong Zhang /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */ 162*42c57489SHong Zhang /* This should be reasonable if sparsity of PtAP is similar to that of A. */ 163*42c57489SHong Zhang nnz = adi[am] + aoi[am]; 164*42c57489SHong Zhang ierr = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space); 165*42c57489SHong Zhang current_space = free_space; 166*42c57489SHong Zhang 167*42c57489SHong Zhang /* determine symbolic info for each row of C: */ 168*42c57489SHong Zhang for (i=0;i<pN;i++) { 169*42c57489SHong Zhang ptnzi = pti[i+1] - pti[i]; 170*42c57489SHong Zhang nprow_loc = 0; nprow_oth = 0; 171*42c57489SHong Zhang /* i-th row of symbolic P_loc^T*A_loc: */ 172*42c57489SHong Zhang for (j=0;j<ptnzi;j++) { 173*42c57489SHong Zhang arow = *ptJ++; 174*42c57489SHong Zhang /* diagonal portion of A */ 175*42c57489SHong Zhang anzj = adi[arow+1] - adi[arow]; 176*42c57489SHong Zhang ajj = adj + adi[arow]; 177*42c57489SHong Zhang for (k=0;k<anzj;k++) { 178*42c57489SHong Zhang col = ajj[k]+prstart; 179*42c57489SHong Zhang if (!ptadenserow_loc[col]) { 180*42c57489SHong Zhang ptadenserow_loc[col] = -1; 181*42c57489SHong Zhang ptasparserow_loc[nprow_loc++] = col; 182*42c57489SHong Zhang } 183*42c57489SHong Zhang } 184*42c57489SHong Zhang /* off-diagonal portion of A */ 185*42c57489SHong Zhang anzj = aoi[arow+1] - aoi[arow]; 186*42c57489SHong Zhang ajj = aoj + aoi[arow]; 187*42c57489SHong Zhang for (k=0;k<anzj;k++) { 188*42c57489SHong Zhang col = a->garray[ajj[k]]; /* global col */ 189*42c57489SHong Zhang if (col < cstart){ 190*42c57489SHong Zhang col = ajj[k]; 191*42c57489SHong Zhang } else if (col >= cend){ 192*42c57489SHong Zhang col = ajj[k] + pm; 193*42c57489SHong Zhang } else { 194*42c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map"); 195*42c57489SHong Zhang } 196*42c57489SHong Zhang if (!ptadenserow_oth[col]) { 197*42c57489SHong Zhang ptadenserow_oth[col] = -1; 198*42c57489SHong Zhang ptasparserow_oth[nprow_oth++] = col; 199*42c57489SHong Zhang } 200*42c57489SHong Zhang } 201*42c57489SHong Zhang } 202*42c57489SHong Zhang 203*42c57489SHong Zhang /* using symbolic info of local PtA, determine symbolic info for row of C: */ 204*42c57489SHong Zhang cnzi = 0; 205*42c57489SHong Zhang /* rows of P_loc */ 206*42c57489SHong Zhang ptaj = ptasparserow_loc; 207*42c57489SHong Zhang for (j=0; j<nprow_loc; j++) { 208*42c57489SHong Zhang prow = *ptaj++; 209*42c57489SHong Zhang prow -= prstart; /* rm */ 210*42c57489SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 211*42c57489SHong Zhang pjj = pj_loc + pi_loc[prow]; 212*42c57489SHong Zhang /* add non-zero cols of P into the sorted linked list lnk */ 213*42c57489SHong Zhang ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 214*42c57489SHong Zhang cnzi += nlnk; 215*42c57489SHong Zhang } 216*42c57489SHong Zhang /* rows of P_oth */ 217*42c57489SHong Zhang ptaj = ptasparserow_oth; 218*42c57489SHong Zhang for (j=0; j<nprow_oth; j++) { 219*42c57489SHong Zhang prow = *ptaj++; 220*42c57489SHong Zhang if (prow >= prend) prow -= pm; /* rm */ 221*42c57489SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 222*42c57489SHong Zhang pjj = pj_oth + pi_oth[prow]; 223*42c57489SHong Zhang /* add non-zero cols of P into the sorted linked list lnk */ 224*42c57489SHong Zhang ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 225*42c57489SHong Zhang cnzi += nlnk; 226*42c57489SHong Zhang } 227*42c57489SHong Zhang 228*42c57489SHong Zhang /* If free space is not available, make more free space */ 229*42c57489SHong Zhang /* Double the amount of total space in the list */ 230*42c57489SHong Zhang if (current_space->local_remaining<cnzi) { 231*42c57489SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 232*42c57489SHong Zhang } 233*42c57489SHong Zhang 234*42c57489SHong Zhang /* Copy data into free space, and zero out denserows */ 235*42c57489SHong Zhang ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 236*42c57489SHong Zhang current_space->array += cnzi; 237*42c57489SHong Zhang current_space->local_used += cnzi; 238*42c57489SHong Zhang current_space->local_remaining -= cnzi; 239*42c57489SHong Zhang 240*42c57489SHong Zhang for (j=0;j<nprow_loc; j++) { 241*42c57489SHong Zhang ptadenserow_loc[ptasparserow_loc[j]] = 0; 242*42c57489SHong Zhang } 243*42c57489SHong Zhang for (j=0;j<nprow_oth; j++) { 244*42c57489SHong Zhang ptadenserow_oth[ptasparserow_oth[j]] = 0; 245*42c57489SHong Zhang } 246*42c57489SHong Zhang 247*42c57489SHong Zhang /* Aside: Perhaps we should save the pta info for the numerical factorization. */ 248*42c57489SHong Zhang /* For now, we will recompute what is needed. */ 249*42c57489SHong Zhang ci[i+1] = ci[i] + cnzi; 250*42c57489SHong Zhang } 251*42c57489SHong Zhang /* Clean up. */ 252*42c57489SHong Zhang ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr); 253*42c57489SHong Zhang 254*42c57489SHong Zhang /* nnz is now stored in ci[ptm], column indices are in the list of free space */ 255*42c57489SHong Zhang /* Allocate space for cj, initialize cj, and */ 256*42c57489SHong Zhang /* destroy list of free space and other temporary array(s) */ 257*42c57489SHong Zhang ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr); 258*42c57489SHong Zhang ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr); 259*42c57489SHong Zhang ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr); 260*42c57489SHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 261*42c57489SHong Zhang 262*42c57489SHong Zhang /* add C_seq into mpi C */ 263*42c57489SHong Zhang /*-----------------------------------*/ 264*42c57489SHong Zhang free_space=PETSC_NULL; current_space=PETSC_NULL; 265*42c57489SHong Zhang 266*42c57489SHong Zhang ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 267*42c57489SHong Zhang ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 268*42c57489SHong Zhang 269*42c57489SHong Zhang ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr); 270*42c57489SHong Zhang 271*42c57489SHong Zhang 272*42c57489SHong Zhang /* determine row ownership */ 273*42c57489SHong Zhang /*---------------------------------------------------------*/ 274*42c57489SHong Zhang ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr); 275*42c57489SHong Zhang ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr); 276*42c57489SHong Zhang ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr); 277*42c57489SHong Zhang ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr); 278*42c57489SHong Zhang 279*42c57489SHong Zhang /* determine the number of messages to send, their lengths */ 280*42c57489SHong Zhang /*---------------------------------------------------------*/ 281*42c57489SHong Zhang ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr); 282*42c57489SHong Zhang ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr); 283*42c57489SHong Zhang len_s = merge->len_s; 284*42c57489SHong Zhang len = 0; /* length of buf_si[] */ 285*42c57489SHong Zhang merge->nsend = 0; 286*42c57489SHong Zhang for (proc=0; proc<size; proc++){ 287*42c57489SHong Zhang len_si[proc] = 0; 288*42c57489SHong Zhang if (proc == rank){ 289*42c57489SHong Zhang len_s[proc] = 0; 290*42c57489SHong Zhang } else { 291*42c57489SHong Zhang len_si[proc] = owners[proc+1] - owners[proc] + 1; 292*42c57489SHong Zhang len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */ 293*42c57489SHong Zhang } 294*42c57489SHong Zhang if (len_s[proc]) { 295*42c57489SHong Zhang merge->nsend++; 296*42c57489SHong Zhang nrows = 0; 297*42c57489SHong Zhang for (i=owners[proc]; i<owners[proc+1]; i++){ 298*42c57489SHong Zhang if (ci[i+1] > ci[i]) nrows++; 299*42c57489SHong Zhang } 300*42c57489SHong Zhang len_si[proc] = 2*(nrows+1); 301*42c57489SHong Zhang len += len_si[proc]; 302*42c57489SHong Zhang } 303*42c57489SHong Zhang } 304*42c57489SHong Zhang 305*42c57489SHong Zhang /* determine the number and length of messages to receive for ij-structure */ 306*42c57489SHong Zhang /*-------------------------------------------------------------------------*/ 307*42c57489SHong Zhang ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 308*42c57489SHong Zhang ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 309*42c57489SHong Zhang 310*42c57489SHong Zhang /* post the Irecv of j-structure */ 311*42c57489SHong Zhang /*-------------------------------*/ 312*42c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr); 313*42c57489SHong Zhang ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr); 314*42c57489SHong Zhang 315*42c57489SHong Zhang /* post the Isend of j-structure */ 316*42c57489SHong Zhang /*--------------------------------*/ 317*42c57489SHong Zhang ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr); 318*42c57489SHong Zhang sj_waits = si_waits + merge->nsend; 319*42c57489SHong Zhang 320*42c57489SHong Zhang for (proc=0, k=0; proc<size; proc++){ 321*42c57489SHong Zhang if (!len_s[proc]) continue; 322*42c57489SHong Zhang i = owners[proc]; 323*42c57489SHong Zhang ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr); 324*42c57489SHong Zhang k++; 325*42c57489SHong Zhang } 326*42c57489SHong Zhang 327*42c57489SHong Zhang /* receives and sends of j-structure are complete */ 328*42c57489SHong Zhang /*------------------------------------------------*/ 329*42c57489SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr); 330*42c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr); 331*42c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr); 332*42c57489SHong Zhang 333*42c57489SHong Zhang /* send and recv i-structure */ 334*42c57489SHong Zhang /*---------------------------*/ 335*42c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr); 336*42c57489SHong Zhang ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr); 337*42c57489SHong Zhang 338*42c57489SHong Zhang ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr); 339*42c57489SHong Zhang buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 340*42c57489SHong Zhang for (proc=0,k=0; proc<size; proc++){ 341*42c57489SHong Zhang if (!len_s[proc]) continue; 342*42c57489SHong Zhang /* form outgoing message for i-structure: 343*42c57489SHong Zhang buf_si[0]: nrows to be sent 344*42c57489SHong Zhang [1:nrows]: row index (global) 345*42c57489SHong Zhang [nrows+1:2*nrows+1]: i-structure index 346*42c57489SHong Zhang */ 347*42c57489SHong Zhang /*-------------------------------------------*/ 348*42c57489SHong Zhang nrows = len_si[proc]/2 - 1; 349*42c57489SHong Zhang buf_si_i = buf_si + nrows+1; 350*42c57489SHong Zhang buf_si[0] = nrows; 351*42c57489SHong Zhang buf_si_i[0] = 0; 352*42c57489SHong Zhang nrows = 0; 353*42c57489SHong Zhang for (i=owners[proc]; i<owners[proc+1]; i++){ 354*42c57489SHong Zhang anzi = ci[i+1] - ci[i]; 355*42c57489SHong Zhang if (anzi) { 356*42c57489SHong Zhang buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */ 357*42c57489SHong Zhang buf_si[nrows+1] = i-owners[proc]; /* local row index */ 358*42c57489SHong Zhang nrows++; 359*42c57489SHong Zhang } 360*42c57489SHong Zhang } 361*42c57489SHong Zhang ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr); 362*42c57489SHong Zhang k++; 363*42c57489SHong Zhang buf_si += len_si[proc]; 364*42c57489SHong Zhang } 365*42c57489SHong Zhang 366*42c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr); 367*42c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr); 368*42c57489SHong Zhang 369*42c57489SHong Zhang ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr); 370*42c57489SHong Zhang for (i=0; i<merge->nrecv; i++){ 371*42c57489SHong Zhang ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr); 372*42c57489SHong Zhang } 373*42c57489SHong Zhang 374*42c57489SHong Zhang ierr = PetscFree(len_si);CHKERRQ(ierr); 375*42c57489SHong Zhang ierr = PetscFree(len_ri);CHKERRQ(ierr); 376*42c57489SHong Zhang ierr = PetscFree(rj_waits);CHKERRQ(ierr); 377*42c57489SHong Zhang ierr = PetscFree(si_waits);CHKERRQ(ierr); 378*42c57489SHong Zhang ierr = PetscFree(ri_waits);CHKERRQ(ierr); 379*42c57489SHong Zhang ierr = PetscFree(buf_s);CHKERRQ(ierr); 380*42c57489SHong Zhang ierr = PetscFree(status);CHKERRQ(ierr); 381*42c57489SHong Zhang 382*42c57489SHong Zhang /* compute a local seq matrix in each processor */ 383*42c57489SHong Zhang /*----------------------------------------------*/ 384*42c57489SHong Zhang /* allocate bi array and free space for accumulating nonzero column info */ 385*42c57489SHong Zhang ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 386*42c57489SHong Zhang bi[0] = 0; 387*42c57489SHong Zhang 388*42c57489SHong Zhang /* create and initialize a linked list */ 389*42c57489SHong Zhang nlnk = pN+1; 390*42c57489SHong Zhang ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 391*42c57489SHong Zhang 392*42c57489SHong Zhang /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */ 393*42c57489SHong Zhang len = 0; 394*42c57489SHong Zhang len = ci[owners[rank+1]] - ci[owners[rank]]; 395*42c57489SHong Zhang ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr); 396*42c57489SHong Zhang current_space = free_space; 397*42c57489SHong Zhang 398*42c57489SHong Zhang /* determine symbolic info for each local row */ 399*42c57489SHong Zhang ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr); 400*42c57489SHong Zhang nextrow = buf_ri_k + merge->nrecv; 401*42c57489SHong Zhang nextci = nextrow + merge->nrecv; 402*42c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ 403*42c57489SHong Zhang buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 404*42c57489SHong Zhang nrows = *buf_ri_k[k]; 405*42c57489SHong Zhang nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 406*42c57489SHong Zhang nextci[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 407*42c57489SHong Zhang } 408*42c57489SHong Zhang 409*42c57489SHong Zhang ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 410*42c57489SHong Zhang len = 0; 411*42c57489SHong Zhang for (i=0; i<pn; i++) { 412*42c57489SHong Zhang bnzi = 0; 413*42c57489SHong Zhang /* add local non-zero cols of this proc's C_seq into lnk */ 414*42c57489SHong Zhang arow = owners[rank] + i; 415*42c57489SHong Zhang anzi = ci[arow+1] - ci[arow]; 416*42c57489SHong Zhang cji = cj + ci[arow]; 417*42c57489SHong Zhang ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 418*42c57489SHong Zhang bnzi += nlnk; 419*42c57489SHong Zhang /* add received col data into lnk */ 420*42c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 421*42c57489SHong Zhang if (i == *nextrow[k]) { /* i-th row */ 422*42c57489SHong Zhang anzi = *(nextci[k]+1) - *nextci[k]; 423*42c57489SHong Zhang cji = buf_rj[k] + *nextci[k]; 424*42c57489SHong Zhang ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr); 425*42c57489SHong Zhang bnzi += nlnk; 426*42c57489SHong Zhang nextrow[k]++; nextci[k]++; 427*42c57489SHong Zhang } 428*42c57489SHong Zhang } 429*42c57489SHong Zhang if (len < bnzi) len = bnzi; /* =max(bnzi) */ 430*42c57489SHong Zhang 431*42c57489SHong Zhang /* if free space is not available, make more free space */ 432*42c57489SHong Zhang if (current_space->local_remaining<bnzi) { 433*42c57489SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 434*42c57489SHong Zhang nspacedouble++; 435*42c57489SHong Zhang } 436*42c57489SHong Zhang /* copy data into free space, then initialize lnk */ 437*42c57489SHong Zhang ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 438*42c57489SHong Zhang ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr); 439*42c57489SHong Zhang 440*42c57489SHong Zhang current_space->array += bnzi; 441*42c57489SHong Zhang current_space->local_used += bnzi; 442*42c57489SHong Zhang current_space->local_remaining -= bnzi; 443*42c57489SHong Zhang 444*42c57489SHong Zhang bi[i+1] = bi[i] + bnzi; 445*42c57489SHong Zhang } 446*42c57489SHong Zhang 447*42c57489SHong Zhang ierr = PetscFree(buf_ri_k);CHKERRQ(ierr); 448*42c57489SHong Zhang 449*42c57489SHong Zhang ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 450*42c57489SHong Zhang ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr); 451*42c57489SHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 452*42c57489SHong Zhang 453*42c57489SHong Zhang /* create symbolic parallel matrix B_mpi */ 454*42c57489SHong Zhang /*---------------------------------------*/ 455*42c57489SHong Zhang ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr); 456*42c57489SHong Zhang ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr); 457*42c57489SHong Zhang ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr); 458*42c57489SHong Zhang ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 459*42c57489SHong Zhang /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -cause delay? */ 460*42c57489SHong Zhang 461*42c57489SHong Zhang /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */ 462*42c57489SHong Zhang B_mpi->assembled = PETSC_FALSE; 463*42c57489SHong Zhang B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI; 464*42c57489SHong Zhang merge->bi = bi; 465*42c57489SHong Zhang merge->bj = bj; 466*42c57489SHong Zhang merge->ci = ci; 467*42c57489SHong Zhang merge->cj = cj; 468*42c57489SHong Zhang merge->buf_ri = buf_ri; 469*42c57489SHong Zhang merge->buf_rj = buf_rj; 470*42c57489SHong Zhang merge->C_seq = PETSC_NULL; 471*42c57489SHong Zhang 472*42c57489SHong Zhang /* attach the supporting struct to B_mpi for reuse */ 473*42c57489SHong Zhang ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr); 474*42c57489SHong Zhang ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr); 475*42c57489SHong Zhang ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr); 476*42c57489SHong Zhang *C = B_mpi; 477*42c57489SHong Zhang 478*42c57489SHong Zhang PetscFunctionReturn(0); 479*42c57489SHong Zhang } 480*42c57489SHong Zhang 481*42c57489SHong Zhang #undef __FUNCT__ 482*42c57489SHong Zhang #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ" 483*42c57489SHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 484*42c57489SHong Zhang { 485*42c57489SHong Zhang PetscErrorCode ierr; 486*42c57489SHong Zhang Mat_Merge_SeqsToMPI *merge; 487*42c57489SHong Zhang Mat_MatMatMultMPI *ptap; 488*42c57489SHong Zhang PetscObjectContainer cont_merge,cont_ptap; 489*42c57489SHong Zhang 490*42c57489SHong Zhang PetscInt flops=0; 491*42c57489SHong Zhang Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 492*42c57489SHong Zhang Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 493*42c57489SHong Zhang Mat_SeqAIJ *p_loc,*p_oth; 494*42c57489SHong Zhang PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col; 495*42c57489SHong Zhang PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj; 496*42c57489SHong Zhang PetscInt i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow; 497*42c57489SHong Zhang PetscInt *cjj; 498*42c57489SHong Zhang MatScalar *ada=ad->a,*aoa=ao->a,*ada_tmp=ad->a,*aoa_tmp=ao->a,*apa,*paj,*cseqa,*caj; 499*42c57489SHong Zhang MatScalar *pa_loc,*pA,*pa_oth; 500*42c57489SHong Zhang PetscInt am=A->m,cN=C->N; 501*42c57489SHong Zhang PetscInt nextp,*adj_tmp=ad->j,*aoj_tmp=ao->j; 502*42c57489SHong Zhang 503*42c57489SHong Zhang MPI_Comm comm=C->comm; 504*42c57489SHong Zhang PetscMPIInt size,rank,taga,*len_s; 505*42c57489SHong Zhang PetscInt *owners; 506*42c57489SHong Zhang PetscInt proc; 507*42c57489SHong Zhang PetscInt **buf_ri,**buf_rj; 508*42c57489SHong Zhang PetscInt cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */ 509*42c57489SHong Zhang PetscInt nrows,**buf_ri_k,**nextrow,**nextcseqi; 510*42c57489SHong Zhang MPI_Request *s_waits,*r_waits; 511*42c57489SHong Zhang MPI_Status *status; 512*42c57489SHong Zhang MatScalar **abuf_r,*ba_i; 513*42c57489SHong Zhang PetscInt *cseqi,*cseqj; 514*42c57489SHong Zhang PetscInt *cseqj_tmp; 515*42c57489SHong Zhang MatScalar *cseqa_tmp; 516*42c57489SHong Zhang PetscInt stages[3]; 517*42c57489SHong Zhang PetscInt *apsymi,*apsymj; 518*42c57489SHong Zhang FreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 519*42c57489SHong Zhang 520*42c57489SHong Zhang PetscFunctionBegin; 521*42c57489SHong Zhang ierr = PetscLogStageRegister(&stages[0],"SymAP");CHKERRQ(ierr); 522*42c57489SHong Zhang ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr); 523*42c57489SHong Zhang ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr); 524*42c57489SHong Zhang 525*42c57489SHong Zhang ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 526*42c57489SHong Zhang ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 527*42c57489SHong Zhang 528*42c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr); 529*42c57489SHong Zhang if (cont_merge) { 530*42c57489SHong Zhang ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr); 531*42c57489SHong Zhang } else { 532*42c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not posses an object container"); 533*42c57489SHong Zhang } 534*42c57489SHong Zhang ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr); 535*42c57489SHong Zhang if (cont_ptap) { 536*42c57489SHong Zhang ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr); 537*42c57489SHong Zhang } else { 538*42c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container"); 539*42c57489SHong Zhang } 540*42c57489SHong Zhang 541*42c57489SHong Zhang /* get data from symbolic products */ 542*42c57489SHong Zhang p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data; 543*42c57489SHong Zhang p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data; 544*42c57489SHong Zhang pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc; 545*42c57489SHong Zhang pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 546*42c57489SHong Zhang 547*42c57489SHong Zhang cseqi = merge->ci; cseqj=merge->cj; 548*42c57489SHong Zhang ierr = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr); 549*42c57489SHong Zhang ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr); 550*42c57489SHong Zhang 551*42c57489SHong Zhang /* ---------- Compute symbolic A*P --------- */ 552*42c57489SHong Zhang if (!ptap->abnz_max){ 553*42c57489SHong Zhang ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr); 554*42c57489SHong Zhang ierr = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr); 555*42c57489SHong Zhang ptap->abi = apsymi; 556*42c57489SHong Zhang apsymi[0] = 0; 557*42c57489SHong Zhang ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr); 558*42c57489SHong Zhang ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr); 559*42c57489SHong Zhang apj = (PetscInt *)(apa + cN); 560*42c57489SHong Zhang apjdense = apj + cN; 561*42c57489SHong Zhang /* Initial FreeSpace size is 2*nnz(A) */ 562*42c57489SHong Zhang ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr); 563*42c57489SHong Zhang current_space = free_space; 564*42c57489SHong Zhang 565*42c57489SHong Zhang for (i=0;i<am;i++) { 566*42c57489SHong Zhang apnzj = 0; 567*42c57489SHong Zhang /* diagonal portion of A */ 568*42c57489SHong Zhang anzi = adi[i+1] - adi[i]; 569*42c57489SHong Zhang for (j=0;j<anzi;j++) { 570*42c57489SHong Zhang prow = *adj; 571*42c57489SHong Zhang adj++; 572*42c57489SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 573*42c57489SHong Zhang pjj = pj_loc + pi_loc[prow]; 574*42c57489SHong Zhang paj = pa_loc + pi_loc[prow]; 575*42c57489SHong Zhang for (k=0;k<pnzj;k++) { 576*42c57489SHong Zhang if (!apjdense[pjj[k]]) { 577*42c57489SHong Zhang apjdense[pjj[k]] = -1; 578*42c57489SHong Zhang apj[apnzj++] = pjj[k]; 579*42c57489SHong Zhang } 580*42c57489SHong Zhang } 581*42c57489SHong Zhang flops += 2*pnzj; 582*42c57489SHong Zhang ada++; 583*42c57489SHong Zhang } 584*42c57489SHong Zhang /* off-diagonal portion of A */ 585*42c57489SHong Zhang anzi = aoi[i+1] - aoi[i]; 586*42c57489SHong Zhang for (j=0;j<anzi;j++) { 587*42c57489SHong Zhang col = a->garray[*aoj]; 588*42c57489SHong Zhang if (col < cstart){ 589*42c57489SHong Zhang prow = *aoj; 590*42c57489SHong Zhang } else if (col >= cend){ 591*42c57489SHong Zhang prow = *aoj; 592*42c57489SHong Zhang } else { 593*42c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map"); 594*42c57489SHong Zhang } 595*42c57489SHong Zhang aoj++; 596*42c57489SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 597*42c57489SHong Zhang pjj = pj_oth + pi_oth[prow]; 598*42c57489SHong Zhang paj = pa_oth + pi_oth[prow]; 599*42c57489SHong Zhang for (k=0;k<pnzj;k++) { 600*42c57489SHong Zhang if (!apjdense[pjj[k]]) { 601*42c57489SHong Zhang apjdense[pjj[k]] = -1; 602*42c57489SHong Zhang apj[apnzj++] = pjj[k]; 603*42c57489SHong Zhang } 604*42c57489SHong Zhang } 605*42c57489SHong Zhang flops += 2*pnzj; 606*42c57489SHong Zhang aoa++; 607*42c57489SHong Zhang } 608*42c57489SHong Zhang /* Sort the j index array for quick sparse axpy. */ 609*42c57489SHong Zhang ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr); 610*42c57489SHong Zhang 611*42c57489SHong Zhang apsymi[i+1] = apsymi[i] + apnzj; 612*42c57489SHong Zhang if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj; 613*42c57489SHong Zhang 614*42c57489SHong Zhang /* If free space is not available, make more free space */ 615*42c57489SHong Zhang /* Double the amount of total space in the list */ 616*42c57489SHong Zhang if (current_space->local_remaining<apnzj) { 617*42c57489SHong Zhang ierr = GetMoreSpace(current_space->total_array_size,¤t_space);CHKERRQ(ierr); 618*42c57489SHong Zhang } 619*42c57489SHong Zhang 620*42c57489SHong Zhang /* Copy data into free space, then initialize lnk */ 621*42c57489SHong Zhang /* ierr = PetscLLClean(bn,bn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); */ 622*42c57489SHong Zhang for (j=0; j<apnzj; j++) current_space->array[j] = apj[j]; 623*42c57489SHong Zhang current_space->array += apnzj; 624*42c57489SHong Zhang current_space->local_used += apnzj; 625*42c57489SHong Zhang current_space->local_remaining -= apnzj; 626*42c57489SHong Zhang 627*42c57489SHong Zhang for (j=0;j<apnzj;j++) apjdense[apj[j]] = 0; 628*42c57489SHong Zhang } 629*42c57489SHong Zhang /* Allocate space for apsymj, initialize apsymj, and */ 630*42c57489SHong Zhang /* destroy list of free space and other temporary array(s) */ 631*42c57489SHong Zhang ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr); 632*42c57489SHong Zhang ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr); 633*42c57489SHong Zhang ierr = PetscLogStagePop(); 634*42c57489SHong Zhang } 635*42c57489SHong Zhang /* -------endof Compute symbolic A*P ---------------------*/ 636*42c57489SHong Zhang else { 637*42c57489SHong Zhang /* get data from symbolic A*P */ 638*42c57489SHong Zhang ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr); 639*42c57489SHong Zhang ierr = PetscMemzero(apa,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr); 640*42c57489SHong Zhang } 641*42c57489SHong Zhang 642*42c57489SHong Zhang ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr); 643*42c57489SHong Zhang apsymi = ptap->abi; apsymj = ptap->abj; 644*42c57489SHong Zhang for (i=0;i<am;i++) { 645*42c57489SHong Zhang /* Form i-th sparse row of A*P */ 646*42c57489SHong Zhang apnzj = apsymi[i+1] - apsymi[i]; 647*42c57489SHong Zhang apj = apsymj + apsymi[i]; 648*42c57489SHong Zhang /* diagonal portion of A */ 649*42c57489SHong Zhang anzi = adi[i+1] - adi[i]; 650*42c57489SHong Zhang for (j=0;j<anzi;j++) { 651*42c57489SHong Zhang prow = *adj_tmp; 652*42c57489SHong Zhang adj_tmp++; 653*42c57489SHong Zhang pnzj = pi_loc[prow+1] - pi_loc[prow]; 654*42c57489SHong Zhang pjj = pj_loc + pi_loc[prow]; 655*42c57489SHong Zhang paj = pa_loc + pi_loc[prow]; 656*42c57489SHong Zhang nextp = 0; 657*42c57489SHong Zhang for (k=0; nextp<pnzj; k++) { 658*42c57489SHong Zhang if (apj[k] == pjj[nextp]) { /* col of AP == col of P */ 659*42c57489SHong Zhang apa[k] += (*ada_tmp)*paj[nextp++]; 660*42c57489SHong Zhang } 661*42c57489SHong Zhang } 662*42c57489SHong Zhang ada_tmp++; 663*42c57489SHong Zhang } 664*42c57489SHong Zhang /* off-diagonal portion of A */ 665*42c57489SHong Zhang anzi = aoi[i+1] - aoi[i]; 666*42c57489SHong Zhang for (j=0;j<anzi;j++) { 667*42c57489SHong Zhang col = a->garray[*aoj_tmp]; 668*42c57489SHong Zhang if (col < cstart){ 669*42c57489SHong Zhang prow = *aoj_tmp; 670*42c57489SHong Zhang } else if (col >= cend){ 671*42c57489SHong Zhang prow = *aoj_tmp; 672*42c57489SHong Zhang } else { 673*42c57489SHong Zhang SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map"); 674*42c57489SHong Zhang } 675*42c57489SHong Zhang aoj_tmp++; 676*42c57489SHong Zhang pnzj = pi_oth[prow+1] - pi_oth[prow]; 677*42c57489SHong Zhang pjj = pj_oth + pi_oth[prow]; 678*42c57489SHong Zhang paj = pa_oth + pi_oth[prow]; 679*42c57489SHong Zhang nextp = 0; 680*42c57489SHong Zhang for (k=0; nextp<pnzj; k++) { 681*42c57489SHong Zhang if (apj[k] == pjj[nextp]) { /* col of AP == col of P */ 682*42c57489SHong Zhang apa[k] += (*aoa_tmp)*paj[nextp++]; 683*42c57489SHong Zhang } 684*42c57489SHong Zhang } 685*42c57489SHong Zhang flops += 2*pnzj; 686*42c57489SHong Zhang aoa_tmp++; 687*42c57489SHong Zhang } 688*42c57489SHong Zhang 689*42c57489SHong Zhang /* Compute P_loc[i,:]^T*AP[i,:] using outer product */ 690*42c57489SHong Zhang pnzi = pi_loc[i+1] - pi_loc[i]; 691*42c57489SHong Zhang for (j=0;j<pnzi;j++) { 692*42c57489SHong Zhang nextap = 0; 693*42c57489SHong Zhang crow = *pJ++; 694*42c57489SHong Zhang cjj = cseqj + cseqi[crow]; 695*42c57489SHong Zhang caj = cseqa + cseqi[crow]; 696*42c57489SHong Zhang /* Perform sparse axpy operation. Note cjj includes apj. */ 697*42c57489SHong Zhang for (k=0;nextap<apnzj;k++) { 698*42c57489SHong Zhang if (cjj[k]==apj[nextap]) caj[k] += (*pA)*apa[nextap++]; 699*42c57489SHong Zhang } 700*42c57489SHong Zhang flops += 2*apnzj; 701*42c57489SHong Zhang pA++; 702*42c57489SHong Zhang } 703*42c57489SHong Zhang /* zero the current row info for A*P */ 704*42c57489SHong Zhang ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr); 705*42c57489SHong Zhang } 706*42c57489SHong Zhang 707*42c57489SHong Zhang ierr = PetscFree(apa);CHKERRQ(ierr); 708*42c57489SHong Zhang ierr = PetscLogStagePop(); 709*42c57489SHong Zhang 710*42c57489SHong Zhang bi = merge->bi; 711*42c57489SHong Zhang bj = merge->bj; 712*42c57489SHong Zhang buf_ri = merge->buf_ri; 713*42c57489SHong Zhang buf_rj = merge->buf_rj; 714*42c57489SHong Zhang 715*42c57489SHong Zhang ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr); 716*42c57489SHong Zhang ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr); 717*42c57489SHong Zhang 718*42c57489SHong Zhang /* send and recv matrix values */ 719*42c57489SHong Zhang /*-----------------------------*/ 720*42c57489SHong Zhang ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr); 721*42c57489SHong Zhang len_s = merge->len_s; 722*42c57489SHong Zhang ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr); 723*42c57489SHong Zhang ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 724*42c57489SHong Zhang 725*42c57489SHong Zhang ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr); 726*42c57489SHong Zhang for (proc=0,k=0; proc<size; proc++){ 727*42c57489SHong Zhang if (!len_s[proc]) continue; 728*42c57489SHong Zhang i = owners[proc]; 729*42c57489SHong Zhang ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 730*42c57489SHong Zhang k++; 731*42c57489SHong Zhang } 732*42c57489SHong Zhang 733*42c57489SHong Zhang ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr); 734*42c57489SHong Zhang ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr); 735*42c57489SHong Zhang ierr = PetscFree(status);CHKERRQ(ierr); 736*42c57489SHong Zhang 737*42c57489SHong Zhang ierr = PetscFree(s_waits);CHKERRQ(ierr); 738*42c57489SHong Zhang ierr = PetscFree(r_waits);CHKERRQ(ierr); 739*42c57489SHong Zhang 740*42c57489SHong Zhang /* insert mat values of mpimat */ 741*42c57489SHong Zhang /*----------------------------*/ 742*42c57489SHong Zhang ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr); 743*42c57489SHong Zhang ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr); 744*42c57489SHong Zhang nextrow = buf_ri_k + merge->nrecv; 745*42c57489SHong Zhang nextcseqi = nextrow + merge->nrecv; 746*42c57489SHong Zhang 747*42c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ 748*42c57489SHong Zhang buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 749*42c57489SHong Zhang nrows = *(buf_ri_k[k]); 750*42c57489SHong Zhang nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 751*42c57489SHong Zhang nextcseqi[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 752*42c57489SHong Zhang } 753*42c57489SHong Zhang 754*42c57489SHong Zhang /* set values of ba */ 755*42c57489SHong Zhang for (i=0; i<C->m; i++) { 756*42c57489SHong Zhang cseqrow = owners[rank] + i; /* global row index of C_seq */ 757*42c57489SHong Zhang bj_i = bj+bi[i]; /* col indices of the i-th row of C */ 758*42c57489SHong Zhang bnzi = bi[i+1] - bi[i]; 759*42c57489SHong Zhang ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr); 760*42c57489SHong Zhang 761*42c57489SHong Zhang /* add local non-zero vals of this proc's C_seq into ba */ 762*42c57489SHong Zhang cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow]; 763*42c57489SHong Zhang cseqj_tmp = cseqj + cseqi[cseqrow]; 764*42c57489SHong Zhang cseqa_tmp = cseqa + cseqi[cseqrow]; 765*42c57489SHong Zhang nextcseqj = 0; 766*42c57489SHong Zhang for (j=0; nextcseqj<cseqnzi; j++){ 767*42c57489SHong Zhang if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */ 768*42c57489SHong Zhang ba_i[j] += cseqa_tmp[nextcseqj++]; 769*42c57489SHong Zhang } 770*42c57489SHong Zhang } 771*42c57489SHong Zhang 772*42c57489SHong Zhang /* add received vals into ba */ 773*42c57489SHong Zhang for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 774*42c57489SHong Zhang /* i-th row */ 775*42c57489SHong Zhang if (i == *nextrow[k]) { 776*42c57489SHong Zhang cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k]; 777*42c57489SHong Zhang cseqj_tmp = buf_rj[k] + *(nextcseqi[k]); 778*42c57489SHong Zhang cseqa_tmp = abuf_r[k] + *(nextcseqi[k]); 779*42c57489SHong Zhang nextcseqj = 0; 780*42c57489SHong Zhang for (j=0; nextcseqj<cseqnzi; j++){ 781*42c57489SHong Zhang if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */ 782*42c57489SHong Zhang ba_i[j] += cseqa_tmp[nextcseqj++]; 783*42c57489SHong Zhang } 784*42c57489SHong Zhang } 785*42c57489SHong Zhang nextrow[k]++; nextcseqi[k]++; 786*42c57489SHong Zhang } 787*42c57489SHong Zhang } 788*42c57489SHong Zhang ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 789*42c57489SHong Zhang flops += 2*bnzi; 790*42c57489SHong Zhang } 791*42c57489SHong Zhang ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 792*42c57489SHong Zhang ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 793*42c57489SHong Zhang ierr = PetscLogStagePop();CHKERRQ(ierr); 794*42c57489SHong Zhang 795*42c57489SHong Zhang ierr = PetscFree(cseqa);CHKERRQ(ierr); 796*42c57489SHong Zhang ierr = PetscFree(abuf_r);CHKERRQ(ierr); 797*42c57489SHong Zhang ierr = PetscFree(ba_i);CHKERRQ(ierr); 798*42c57489SHong Zhang ierr = PetscFree(buf_ri_k);CHKERRQ(ierr); 799*42c57489SHong Zhang ierr = PetscLogFlops(flops);CHKERRQ(ierr); 800*42c57489SHong Zhang 801*42c57489SHong Zhang PetscFunctionReturn(0); 802*42c57489SHong Zhang } 803