xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 28b400f66ebc7ae0049166a2294dfcd3df27e64b)
182412ba7SHong Zhang 
242c57489SHong Zhang /*
342c57489SHong Zhang   Defines projective product routines where A is a MPIAIJ matrix
442c57489SHong Zhang           C = P^T * A * P
542c57489SHong Zhang */
642c57489SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <../src/mat/impls/aij/mpi/mpiaij.h>
10c6db04a5SJed Brown #include <petscbt.h>
118563dfccSBarry Smith #include <petsctime.h>
12694f88d4SFande Kong #include <petsc/private/hashmapiv.h>
134a56b808SFande Kong #include <petsc/private/hashseti.h>
144a56b808SFande Kong #include <petscsf.h>
1542c57489SHong Zhang 
16cf97cf3cSHong Zhang PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
17cf97cf3cSHong Zhang {
18cf97cf3cSHong Zhang   PetscBool         iascii;
19cf97cf3cSHong Zhang   PetscViewerFormat format;
206718818eSStefano Zampini   Mat_APMPI         *ptap;
21cf97cf3cSHong Zhang 
22cf97cf3cSHong Zhang   PetscFunctionBegin;
236718818eSStefano Zampini   MatCheckProduct(A,1);
246718818eSStefano Zampini   ptap = (Mat_APMPI*)A->product->data;
255f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
26cf97cf3cSHong Zhang   if (iascii) {
275f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscViewerGetFormat(viewer,&format));
28cf97cf3cSHong Zhang     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29cf97cf3cSHong Zhang       if (ptap->algType == 0) {
305f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n"));
31cf97cf3cSHong Zhang       } else if (ptap->algType == 1) {
325f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n"));
334a56b808SFande Kong       } else if (ptap->algType == 2) {
345f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n"));
354a56b808SFande Kong       } else if (ptap->algType == 3) {
365f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n"));
37cf97cf3cSHong Zhang       }
38cf97cf3cSHong Zhang     }
39cf97cf3cSHong Zhang   }
40cf97cf3cSHong Zhang   PetscFunctionReturn(0);
41cf97cf3cSHong Zhang }
42cf97cf3cSHong Zhang 
436718818eSStefano Zampini PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44cc6cb787SHong Zhang {
456718818eSStefano Zampini   Mat_APMPI           *ptap = (Mat_APMPI*)data;
4660552ceaSHong Zhang   Mat_Merge_SeqsToMPI *merge;
47cc6cb787SHong Zhang 
48cc6cb787SHong Zhang   PetscFunctionBegin;
495f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(ptap->startsj_s,ptap->startsj_r));
505f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(ptap->bufa));
515f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->P_loc));
525f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->P_oth));
535f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
545f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->Rd));
555f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->Ro));
568403a639SHong Zhang   if (ptap->AP_loc) { /* used by alg_rap */
57681d504bSHong Zhang     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
585f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(ap->i));
595f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree2(ap->j,ap->a));
605f80ce2aSJacob Faibussowitsch     CHKERRQ(MatDestroy(&ptap->AP_loc));
618403a639SHong Zhang   } else { /* used by alg_ptap */
625f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(ptap->api));
635f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(ptap->apj));
64681d504bSHong Zhang   }
655f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->C_loc));
665f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->C_oth));
675f80ce2aSJacob Faibussowitsch   if (ptap->apa) CHKERRQ(PetscFree(ptap->apa));
68a560ef98SHong Zhang 
695f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&ptap->Pt));
7060552ceaSHong Zhang 
7160552ceaSHong Zhang   merge = ptap->merge;
728403a639SHong Zhang   if (merge) { /* used by alg_ptap */
735f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->id_r));
745f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->len_s));
755f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->len_r));
765f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->bi));
775f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->bj));
785f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->buf_ri[0]));
795f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->buf_ri));
805f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->buf_rj[0]));
815f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->buf_rj));
825f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->coi));
835f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->coj));
845f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(merge->owners_co));
855f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutDestroy(&merge->rowmap));
865f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(ptap->merge));
87bf0cc555SLisandro Dalcin   }
885f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingDestroy(&ptap->ltog));
894a56b808SFande Kong 
905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFDestroy(&ptap->sf));
915f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(ptap->c_othi));
925f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(ptap->c_rmti));
935f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(ptap));
94cc6cb787SHong Zhang   PetscFunctionReturn(0);
95cc6cb787SHong Zhang }
96cc6cb787SHong Zhang 
97cf742fccSHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
98cf742fccSHong Zhang {
996718818eSStefano Zampini   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
100cf742fccSHong Zhang   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
10192ec70a1SHong Zhang   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1026718818eSStefano Zampini   Mat_APMPI         *ptap;
103cf742fccSHong Zhang   Mat               AP_loc,C_loc,C_oth;
104a3bb6f32SFande Kong   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
105cf742fccSHong Zhang   PetscScalar       *apa;
106cf742fccSHong Zhang   const PetscInt    *cols;
107cf742fccSHong Zhang   const PetscScalar *vals;
108cf742fccSHong Zhang 
109cf742fccSHong Zhang   PetscFunctionBegin;
1106718818eSStefano Zampini   MatCheckProduct(C,3);
1116718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
112*28b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
113*28b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
11480da8df7SHong Zhang 
1155f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries(C));
116cf742fccSHong Zhang 
117cf742fccSHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
118cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1195f80ce2aSJacob Faibussowitsch     CHKERRQ(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
1205f80ce2aSJacob Faibussowitsch     CHKERRQ(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
121cf742fccSHong Zhang   }
122cf742fccSHong Zhang 
123cf742fccSHong Zhang   /* 2) get AP_loc */
124cf742fccSHong Zhang   AP_loc = ptap->AP_loc;
125cf742fccSHong Zhang   ap = (Mat_SeqAIJ*)AP_loc->data;
126cf742fccSHong Zhang 
127cf742fccSHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128cf742fccSHong Zhang   /*-----------------------------------------------------*/
129cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
130cf742fccSHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1315f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
1325f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
133cf742fccSHong Zhang   }
134cf742fccSHong Zhang 
135cf742fccSHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
136cf742fccSHong Zhang   /* ---------------------------------------------- */
137cf742fccSHong Zhang   /* get data from symbolic products */
138cf742fccSHong Zhang   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
13952f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
14052f7967eSHong Zhang 
141cf742fccSHong Zhang   api   = ap->i;
142cf742fccSHong Zhang   apj   = ap->j;
1435f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj));
144cf742fccSHong Zhang   for (i=0; i<am; i++) {
145cf742fccSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
146cf742fccSHong Zhang     apnz = api[i+1] - api[i];
147b4e8d1b6SHong Zhang     apa = ap->a + api[i];
1485f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscArrayzero(apa,apnz));
149b4e8d1b6SHong Zhang     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
150cf742fccSHong Zhang   }
1515f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj));
1522c71b3e2SJacob Faibussowitsch   PetscCheckFalse(api[AP_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,api[AP_loc->rmap->n],nout);
153cf742fccSHong Zhang 
154cf742fccSHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
155154d0d78SFande Kong   /* Always use scalable version since we are in the MPI scalable version */
1565f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc));
1575f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth));
158cf742fccSHong Zhang 
159cf742fccSHong Zhang   C_loc = ptap->C_loc;
160cf742fccSHong Zhang   C_oth = ptap->C_oth;
161cf742fccSHong Zhang 
162cf742fccSHong Zhang   /* add C_loc and Co to to C */
1635f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRange(C,&rstart,&rend));
164cf742fccSHong Zhang 
165cf742fccSHong Zhang   /* C_loc -> C */
166cf742fccSHong Zhang   cm    = C_loc->rmap->N;
167cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ*)C_loc->data;
168cf742fccSHong Zhang   cols = c_seq->j;
169cf742fccSHong Zhang   vals = c_seq->a;
1705f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j));
171edeac6deSandi selinger 
172e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
173edeac6deSandi selinger   /* when there are no off-processor parts.  */
1741de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1751de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1761de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
177edeac6deSandi selinger   if (C->assembled) {
178edeac6deSandi selinger     C->was_assembled = PETSC_TRUE;
179edeac6deSandi selinger     C->assembled     = PETSC_FALSE;
180edeac6deSandi selinger   }
181edeac6deSandi selinger   if (C->was_assembled) {
182cf742fccSHong Zhang     for (i=0; i<cm; i++) {
183cf742fccSHong Zhang       ncols = c_seq->i[i+1] - c_seq->i[i];
184cf742fccSHong Zhang       row = rstart + i;
1855f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
186cf742fccSHong Zhang       cols += ncols; vals += ncols;
187cf742fccSHong Zhang     }
188edeac6deSandi selinger   } else {
1895f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
190edeac6deSandi selinger   }
1915f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j));
1922c71b3e2SJacob Faibussowitsch   PetscCheckFalse(c_seq->i[C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
193cf742fccSHong Zhang 
194cf742fccSHong Zhang   /* Co -> C, off-processor part */
195cf742fccSHong Zhang   cm = C_oth->rmap->N;
196cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ*)C_oth->data;
197cf742fccSHong Zhang   cols = c_seq->j;
198cf742fccSHong Zhang   vals = c_seq->a;
1995f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j));
200cf742fccSHong Zhang   for (i=0; i<cm; i++) {
201cf742fccSHong Zhang     ncols = c_seq->i[i+1] - c_seq->i[i];
202cf742fccSHong Zhang     row = p->garray[i];
2035f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
204cf742fccSHong Zhang     cols += ncols; vals += ncols;
205cf742fccSHong Zhang   }
2065f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
2075f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
208cf742fccSHong Zhang 
209cf742fccSHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
21080da8df7SHong Zhang 
2115f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j));
2122c71b3e2SJacob Faibussowitsch   PetscCheckFalse(c_seq->i[C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
213cf742fccSHong Zhang   PetscFunctionReturn(0);
214cf742fccSHong Zhang }
215cf742fccSHong Zhang 
2164222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
2170d3441aeSHong Zhang {
2180d3441aeSHong Zhang   PetscErrorCode      ierr;
2193cdca5ebSHong Zhang   Mat_APMPI           *ptap;
2206718818eSStefano Zampini   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
2212259aa2eSHong Zhang   MPI_Comm            comm;
2222259aa2eSHong Zhang   PetscMPIInt         size,rank;
2234222ddf1SHong Zhang   Mat                 P_loc,P_oth;
22415a3b8e2SHong Zhang   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
225d0e9a2f7SHong Zhang   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
226d0e9a2f7SHong Zhang   PetscInt            *lnk,i,k,pnz,row,nsend;
227ec4bef21SJose E. Roman   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
228ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted=0;
22915a3b8e2SHong Zhang   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
2307c70b0e9SStefano Zampini   const PetscInt      *owners;
2317c70b0e9SStefano Zampini   PetscInt            len,proc,*dnz,*onz,nzi,nspacedouble;
23215a3b8e2SHong Zhang   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
23315a3b8e2SHong Zhang   MPI_Request         *swaits,*rwaits;
23415a3b8e2SHong Zhang   MPI_Status          *sstatus,rstatus;
235445158ffSHong Zhang   PetscLayout         rowmap;
236445158ffSHong Zhang   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
237445158ffSHong Zhang   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
238a3bb6f32SFande Kong   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
23952f7967eSHong Zhang   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
24067a12041SHong Zhang   PetscScalar         *apv;
24178d30b94SHong Zhang   PetscTable          ta;
242b92f168fSBarry Smith   MatType             mtype;
243e83e5f86SFande Kong   const char          *prefix;
244aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
2458cb82516SHong Zhang   PetscReal           apfill;
246aa690a28SHong Zhang #endif
24767a12041SHong Zhang 
24867a12041SHong Zhang   PetscFunctionBegin;
2496718818eSStefano Zampini   MatCheckProduct(Cmpi,4);
250*28b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
2515f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
2525f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(comm,&size));
2535f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_rank(comm,&rank));
254ae5f0867Sstefano_zampini 
25552f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
25652f7967eSHong Zhang 
257ae5f0867Sstefano_zampini   /* create symbolic parallel matrix Cmpi */
2585f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetType(A,&mtype));
2595f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(Cmpi,mtype));
260ae5f0867Sstefano_zampini 
2613cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
2625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscNew(&ptap));
263e953e47cSHong Zhang   ptap->reuse = MAT_INITIAL_MATRIX;
264cf97cf3cSHong Zhang   ptap->algType = 0;
265e953e47cSHong Zhang 
266e953e47cSHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
2675f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth));
268e953e47cSHong Zhang   /* get P_loc by taking all local rows of P */
2695f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc));
27076db11f6SHong Zhang 
27176db11f6SHong Zhang   ptap->P_loc = P_loc;
27276db11f6SHong Zhang   ptap->P_oth = P_oth;
273e953e47cSHong Zhang 
274e953e47cSHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
275e953e47cSHong Zhang   /* --------------------------------- */
2765f80ce2aSJacob Faibussowitsch   CHKERRQ(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
2775f80ce2aSJacob Faibussowitsch   CHKERRQ(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));
278e953e47cSHong Zhang 
279e953e47cSHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
280e953e47cSHong Zhang   /* ----------------------------------------------------------------- */
28176db11f6SHong Zhang   p_loc  = (Mat_SeqAIJ*)P_loc->data;
28252f7967eSHong Zhang   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
283e953e47cSHong Zhang 
284e953e47cSHong Zhang   /* create and initialize a linked list */
2855f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
28676db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
28776db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
2885f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */
289e953e47cSHong Zhang 
2905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
291e953e47cSHong Zhang 
292e953e47cSHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
29352f7967eSHong Zhang   if (ao) {
2945f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
29552f7967eSHong Zhang   } else {
2965f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
29752f7967eSHong Zhang   }
298e953e47cSHong Zhang   current_space = free_space;
299e953e47cSHong Zhang   nspacedouble  = 0;
300e953e47cSHong Zhang 
3015f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(am+1,&api));
302e953e47cSHong Zhang   api[0] = 0;
303e953e47cSHong Zhang   for (i=0; i<am; i++) {
304e953e47cSHong Zhang     /* diagonal portion: Ad[i,:]*P */
305e953e47cSHong Zhang     ai = ad->i; pi = p_loc->i;
306e953e47cSHong Zhang     nzi = ai[i+1] - ai[i];
307e953e47cSHong Zhang     aj  = ad->j + ai[i];
308e953e47cSHong Zhang     for (j=0; j<nzi; j++) {
309e953e47cSHong Zhang       row  = aj[j];
310e953e47cSHong Zhang       pnz  = pi[row+1] - pi[row];
311e953e47cSHong Zhang       Jptr = p_loc->j + pi[row];
312e953e47cSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
3135f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
314e953e47cSHong Zhang     }
315e953e47cSHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
31652f7967eSHong Zhang     if (ao) {
317e953e47cSHong Zhang       ai = ao->i; pi = p_oth->i;
318e953e47cSHong Zhang       nzi = ai[i+1] - ai[i];
319e953e47cSHong Zhang       aj  = ao->j + ai[i];
320e953e47cSHong Zhang       for (j=0; j<nzi; j++) {
321e953e47cSHong Zhang         row  = aj[j];
322e953e47cSHong Zhang         pnz  = pi[row+1] - pi[row];
323e953e47cSHong Zhang         Jptr = p_oth->j + pi[row];
3245f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
325e953e47cSHong Zhang       }
32652f7967eSHong Zhang     }
327e953e47cSHong Zhang     apnz     = lnk[0];
328e953e47cSHong Zhang     api[i+1] = api[i] + apnz;
329e953e47cSHong Zhang 
330e953e47cSHong Zhang     /* if free space is not available, double the total space in the list */
331e953e47cSHong Zhang     if (current_space->local_remaining<apnz) {
3325f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
333e953e47cSHong Zhang       nspacedouble++;
334e953e47cSHong Zhang     }
335e953e47cSHong Zhang 
336e953e47cSHong Zhang     /* Copy data into free space, then initialize lnk */
3375f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk));
338e953e47cSHong Zhang 
339e953e47cSHong Zhang     current_space->array           += apnz;
340e953e47cSHong Zhang     current_space->local_used      += apnz;
341e953e47cSHong Zhang     current_space->local_remaining -= apnz;
342e953e47cSHong Zhang   }
343e953e47cSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
344e953e47cSHong Zhang   /* destroy list of free space and other temporary array(s) */
3455f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc2(api[am],&apj,api[am],&apv));
3465f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFreeSpaceContiguous(&free_space,apj));
3475f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLCondensedDestroy_Scalable(lnk));
348e953e47cSHong Zhang 
349e953e47cSHong Zhang   /* Create AP_loc for reuse */
3505f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
3515f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
352e953e47cSHong Zhang 
353e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
35452f7967eSHong Zhang   if (ao) {
355e953e47cSHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
35652f7967eSHong Zhang   } else {
35752f7967eSHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
35852f7967eSHong Zhang   }
359e953e47cSHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
360e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
361e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
362e953e47cSHong Zhang 
363e953e47cSHong Zhang   if (api[am]) {
3645f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
3655f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
366e953e47cSHong Zhang   } else {
3675f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n"));
368e953e47cSHong Zhang   }
369e953e47cSHong Zhang #endif
370e953e47cSHong Zhang 
371e953e47cSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
3724222ddf1SHong Zhang   /* -------------------------------------- */
3735f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
3745f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOptionsPrefix(A,&prefix));
3755f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOptionsPrefix(ptap->C_oth,prefix));
3765f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_"));
3774222ddf1SHong Zhang 
3785f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
3795f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetAlgorithm(ptap->C_oth,"sorted"));
3805f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(ptap->C_oth,fill));
3815f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(ptap->C_oth));
3825f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSymbolic(ptap->C_oth));
383e953e47cSHong Zhang 
384e953e47cSHong Zhang   /* (3) send coj of C_oth to other processors  */
385e953e47cSHong Zhang   /* ------------------------------------------ */
386e953e47cSHong Zhang   /* determine row ownership */
3875f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutCreate(comm,&rowmap));
3885f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetLocalSize(rowmap, pn));
3895f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetBlockSize(rowmap, 1));
3905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(rowmap));
3915f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutGetRanges(rowmap,&owners));
392e953e47cSHong Zhang 
393e953e47cSHong Zhang   /* determine the number of messages to send, their lengths */
3945f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
3955f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(len_s,size));
3965f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(len_si,size));
397e953e47cSHong Zhang 
398e953e47cSHong Zhang   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
399e953e47cSHong Zhang   coi   = c_oth->i; coj = c_oth->j;
400e953e47cSHong Zhang   con   = ptap->C_oth->rmap->n;
401e953e47cSHong Zhang   proc  = 0;
4025f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj));
403e953e47cSHong Zhang   for (i=0; i<con; i++) {
404e953e47cSHong Zhang     while (prmap[i] >= owners[proc+1]) proc++;
405e953e47cSHong Zhang     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
406e953e47cSHong Zhang     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
407e953e47cSHong Zhang   }
408e953e47cSHong Zhang 
409e953e47cSHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
410e953e47cSHong Zhang   owners_co[0] = 0;
411e953e47cSHong Zhang   nsend        = 0;
412e953e47cSHong Zhang   for (proc=0; proc<size; proc++) {
413e953e47cSHong Zhang     owners_co[proc+1] = owners_co[proc] + len_si[proc];
414e953e47cSHong Zhang     if (len_s[proc]) {
415e953e47cSHong Zhang       nsend++;
416e953e47cSHong Zhang       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
417e953e47cSHong Zhang       len         += len_si[proc];
418e953e47cSHong Zhang     }
419e953e47cSHong Zhang   }
420e953e47cSHong Zhang 
421e953e47cSHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
4225f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
4235f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));
424e953e47cSHong Zhang 
425e953e47cSHong Zhang   /* post the Irecv and Isend of coj */
4265f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCommGetNewTag(comm,&tagj));
4275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
4285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nsend+1,&swaits));
429e953e47cSHong Zhang   for (proc=0, k=0; proc<size; proc++) {
430e953e47cSHong Zhang     if (!len_s[proc]) continue;
431e953e47cSHong Zhang     i    = owners_co[proc];
4325f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
433e953e47cSHong Zhang     k++;
434e953e47cSHong Zhang   }
435e953e47cSHong Zhang 
436e953e47cSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
437e953e47cSHong Zhang   /* ---------------------------------------- */
4385f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
4395f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
4405f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetAlgorithm(ptap->C_loc,"default"));
4415f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(ptap->C_loc,fill));
4424222ddf1SHong Zhang 
4435f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOptionsPrefix(ptap->C_loc,prefix));
4445f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_"));
4454222ddf1SHong Zhang 
4465f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(ptap->C_loc));
4475f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSymbolic(ptap->C_loc));
4484222ddf1SHong Zhang 
449e953e47cSHong Zhang   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
4505f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j));
451e953e47cSHong Zhang 
452e953e47cSHong Zhang   /* receives coj are complete */
453e953e47cSHong Zhang   for (i=0; i<nrecv; i++) {
4545f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
455e953e47cSHong Zhang   }
4565f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rwaits));
4575f80ce2aSJacob Faibussowitsch   if (nsend) CHKERRMPI(MPI_Waitall(nsend,swaits,sstatus));
458e953e47cSHong Zhang 
459e953e47cSHong Zhang   /* add received column indices into ta to update Crmax */
460e953e47cSHong Zhang   for (k=0; k<nrecv; k++) {/* k-th received message */
461e953e47cSHong Zhang     Jptr = buf_rj[k];
462e953e47cSHong Zhang     for (j=0; j<len_r[k]; j++) {
4635f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
464e953e47cSHong Zhang     }
465e953e47cSHong Zhang   }
4665f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableGetCount(ta,&Crmax));
4675f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableDestroy(&ta));
468e953e47cSHong Zhang 
469e953e47cSHong Zhang   /* (4) send and recv coi */
470e953e47cSHong Zhang   /*-----------------------*/
4715f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCommGetNewTag(comm,&tagi));
4725f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
4735f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(len+1,&buf_s));
474e953e47cSHong Zhang   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
475e953e47cSHong Zhang   for (proc=0,k=0; proc<size; proc++) {
476e953e47cSHong Zhang     if (!len_s[proc]) continue;
477e953e47cSHong Zhang     /* form outgoing message for i-structure:
478e953e47cSHong Zhang          buf_si[0]:                 nrows to be sent
479e953e47cSHong Zhang                [1:nrows]:           row index (global)
480e953e47cSHong Zhang                [nrows+1:2*nrows+1]: i-structure index
481e953e47cSHong Zhang     */
482e953e47cSHong Zhang     /*-------------------------------------------*/
483e953e47cSHong Zhang     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
484e953e47cSHong Zhang     buf_si_i    = buf_si + nrows+1;
485e953e47cSHong Zhang     buf_si[0]   = nrows;
486e953e47cSHong Zhang     buf_si_i[0] = 0;
487e953e47cSHong Zhang     nrows       = 0;
488e953e47cSHong Zhang     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
489e953e47cSHong Zhang       nzi = coi[i+1] - coi[i];
490e953e47cSHong Zhang       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
491e953e47cSHong Zhang       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
492e953e47cSHong Zhang       nrows++;
493e953e47cSHong Zhang     }
4945f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
495e953e47cSHong Zhang     k++;
496e953e47cSHong Zhang     buf_si += len_si[proc];
497e953e47cSHong Zhang   }
498e953e47cSHong Zhang   for (i=0; i<nrecv; i++) {
4995f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
500e953e47cSHong Zhang   }
5015f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rwaits));
5025f80ce2aSJacob Faibussowitsch   if (nsend) CHKERRMPI(MPI_Waitall(nsend,swaits,sstatus));
503e953e47cSHong Zhang 
5045f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree4(len_s,len_si,sstatus,owners_co));
5055f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(len_ri));
5065f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(swaits));
5075f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_s));
508b4e8d1b6SHong Zhang 
509e953e47cSHong Zhang   /* (5) compute the local portion of Cmpi      */
510e953e47cSHong Zhang   /* ------------------------------------------ */
511e953e47cSHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
5125f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFreeSpaceGet(Crmax,&free_space));
513e953e47cSHong Zhang   current_space = free_space;
514e953e47cSHong Zhang 
5155f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
516e953e47cSHong Zhang   for (k=0; k<nrecv; k++) {
517e953e47cSHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
518e953e47cSHong Zhang     nrows       = *buf_ri_k[k];
519e953e47cSHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
520a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
521e953e47cSHong Zhang   }
522e953e47cSHong Zhang 
523e953e47cSHong Zhang   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
5245f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
525e953e47cSHong Zhang   for (i=0; i<pn; i++) {
526e953e47cSHong Zhang     /* add C_loc into Cmpi */
527e953e47cSHong Zhang     nzi  = c_loc->i[i+1] - c_loc->i[i];
528e953e47cSHong Zhang     Jptr = c_loc->j + c_loc->i[i];
5295f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));
530e953e47cSHong Zhang 
531e953e47cSHong Zhang     /* add received col data into lnk */
532e953e47cSHong Zhang     for (k=0; k<nrecv; k++) { /* k-th received message */
533e953e47cSHong Zhang       if (i == *nextrow[k]) { /* i-th row */
534e953e47cSHong Zhang         nzi  = *(nextci[k]+1) - *nextci[k];
535e953e47cSHong Zhang         Jptr = buf_rj[k] + *nextci[k];
5365f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));
537e953e47cSHong Zhang         nextrow[k]++; nextci[k]++;
538e953e47cSHong Zhang       }
539e953e47cSHong Zhang     }
540e953e47cSHong Zhang     nzi = lnk[0];
541e953e47cSHong Zhang 
542e953e47cSHong Zhang     /* copy data into free space, then initialize lnk */
5435f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk));
5445f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
545e953e47cSHong Zhang   }
5465f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree3(buf_ri_k,nextrow,nextci));
5475f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLCondensedDestroy_Scalable(lnk));
5485f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFreeSpaceDestroy(free_space));
549e953e47cSHong Zhang 
550e953e47cSHong Zhang   /* local sizes and preallocation */
5515f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
552ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
5535f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
5545f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
555ac94c67aSTristan Konolige   }
5565f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
557e953e47cSHong Zhang   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
558e953e47cSHong Zhang 
559e953e47cSHong Zhang   /* members in merge */
5605f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(id_r));
5615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(len_r));
5625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_ri[0]));
5635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_ri));
5645f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_rj[0]));
5655f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_rj));
5665f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutDestroy(&rowmap));
567e953e47cSHong Zhang 
568a3bb6f32SFande Kong   nout = 0;
5695f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j));
5702c71b3e2SJacob Faibussowitsch   PetscCheckFalse(c_oth->i[ptap->C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_oth->i[ptap->C_oth->rmap->n],nout);
5715f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j));
5722c71b3e2SJacob Faibussowitsch   PetscCheckFalse(c_loc->i[ptap->C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_loc->i[ptap->C_loc->rmap->n],nout);
573a3bb6f32SFande Kong 
5746718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
5756718818eSStefano Zampini   Cmpi->product->data    = ptap;
5766718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
5776718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
5786718818eSStefano Zampini 
5796718818eSStefano Zampini   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
5806718818eSStefano Zampini   Cmpi->assembled        = PETSC_FALSE;
5816718818eSStefano Zampini   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
582e953e47cSHong Zhang   PetscFunctionReturn(0);
583e953e47cSHong Zhang }
584e953e47cSHong Zhang 
5859fbee547SJacob Faibussowitsch static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
5864a56b808SFande Kong {
5874a56b808SFande Kong   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
5884a56b808SFande Kong   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
5894a56b808SFande Kong   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
590bc8e477aSFande Kong   PetscInt             pcstart,pcend,column,offset;
5914a56b808SFande Kong 
5924a56b808SFande Kong   PetscFunctionBegin;
5934a56b808SFande Kong   pcstart = P->cmap->rstart;
594bc8e477aSFande Kong   pcstart *= dof;
5954a56b808SFande Kong   pcend   = P->cmap->rend;
596bc8e477aSFande Kong   pcend   *= dof;
5974a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
5984a56b808SFande Kong   ai = ad->i;
5994a56b808SFande Kong   nzi = ai[i+1] - ai[i];
6004a56b808SFande Kong   aj  = ad->j + ai[i];
6014a56b808SFande Kong   for (j=0; j<nzi; j++) {
6024a56b808SFande Kong     row  = aj[j];
603bc8e477aSFande Kong     offset = row%dof;
604bc8e477aSFande Kong     row   /= dof;
6054a56b808SFande Kong     nzpi = pd->i[row+1] - pd->i[row];
6064a56b808SFande Kong     pj  = pd->j + pd->i[row];
6074a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6085f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart));
6094a56b808SFande Kong     }
6104a56b808SFande Kong   }
611bc8e477aSFande Kong   /* off diag P */
6124a56b808SFande Kong   for (j=0; j<nzi; j++) {
6134a56b808SFande Kong     row  = aj[j];
614bc8e477aSFande Kong     offset = row%dof;
615bc8e477aSFande Kong     row   /= dof;
6164a56b808SFande Kong     nzpi = po->i[row+1] - po->i[row];
6174a56b808SFande Kong     pj  = po->j + po->i[row];
6184a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6195f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset));
6204a56b808SFande Kong     }
6214a56b808SFande Kong   }
6224a56b808SFande Kong 
6234a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6244a56b808SFande Kong   if (ao) {
6254a56b808SFande Kong     ai = ao->i;
6264a56b808SFande Kong     pi = p_oth->i;
6274a56b808SFande Kong     nzi = ai[i+1] - ai[i];
6284a56b808SFande Kong     aj  = ao->j + ai[i];
6294a56b808SFande Kong     for (j=0; j<nzi; j++) {
6304a56b808SFande Kong       row  = aj[j];
631bc8e477aSFande Kong       offset = a->garray[row]%dof;
632bc8e477aSFande Kong       row  = map[row];
6334a56b808SFande Kong       pnz  = pi[row+1] - pi[row];
6344a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6354a56b808SFande Kong       for (col=0; col<pnz; col++) {
636bc8e477aSFande Kong         column = p_othcols[col] * dof + offset;
6374a56b808SFande Kong         if (column>=pcstart && column<pcend) {
6385f80ce2aSJacob Faibussowitsch           CHKERRQ(PetscHSetIAdd(dht,column));
6394a56b808SFande Kong         } else {
6405f80ce2aSJacob Faibussowitsch           CHKERRQ(PetscHSetIAdd(oht,column));
6414a56b808SFande Kong         }
6424a56b808SFande Kong       }
6434a56b808SFande Kong     }
6444a56b808SFande Kong   } /* end if (ao) */
6454a56b808SFande Kong   PetscFunctionReturn(0);
6464a56b808SFande Kong }
6474a56b808SFande Kong 
6489fbee547SJacob Faibussowitsch static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
6494a56b808SFande Kong {
6504a56b808SFande Kong   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
6514a56b808SFande Kong   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
652bc8e477aSFande Kong   PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
6534a56b808SFande Kong   PetscScalar          ra,*aa,*pa;
6544a56b808SFande Kong 
6554a56b808SFande Kong   PetscFunctionBegin;
6564a56b808SFande Kong   pcstart = P->cmap->rstart;
657bc8e477aSFande Kong   pcstart *= dof;
658bc8e477aSFande Kong 
6594a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6604a56b808SFande Kong   ai  = ad->i;
6614a56b808SFande Kong   nzi = ai[i+1] - ai[i];
6624a56b808SFande Kong   aj  = ad->j + ai[i];
6634a56b808SFande Kong   aa  = ad->a + ai[i];
6644a56b808SFande Kong   for (j=0; j<nzi; j++) {
6654a56b808SFande Kong     ra   = aa[j];
6664a56b808SFande Kong     row  = aj[j];
667bc8e477aSFande Kong     offset = row%dof;
668bc8e477aSFande Kong     row    /= dof;
6694a56b808SFande Kong     nzpi = pd->i[row+1] - pd->i[row];
6704a56b808SFande Kong     pj = pd->j + pd->i[row];
6714a56b808SFande Kong     pa = pd->a + pd->i[row];
6724a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6735f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]));
6744a56b808SFande Kong     }
6755f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogFlops(2.0*nzpi));
6764a56b808SFande Kong   }
6774a56b808SFande Kong   for (j=0; j<nzi; j++) {
6784a56b808SFande Kong     ra   = aa[j];
6794a56b808SFande Kong     row  = aj[j];
680bc8e477aSFande Kong     offset = row%dof;
681bc8e477aSFande Kong     row   /= dof;
6824a56b808SFande Kong     nzpi = po->i[row+1] - po->i[row];
6834a56b808SFande Kong     pj = po->j + po->i[row];
6844a56b808SFande Kong     pa = po->a + po->i[row];
6854a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6865f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]));
6874a56b808SFande Kong     }
6885f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogFlops(2.0*nzpi));
6894a56b808SFande Kong   }
6904a56b808SFande Kong 
6914a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6924a56b808SFande Kong   if (ao) {
6934a56b808SFande Kong     ai = ao->i;
6944a56b808SFande Kong     pi = p_oth->i;
6954a56b808SFande Kong     nzi = ai[i+1] - ai[i];
6964a56b808SFande Kong     aj  = ao->j + ai[i];
6974a56b808SFande Kong     aa  = ao->a + ai[i];
6984a56b808SFande Kong     for (j=0; j<nzi; j++) {
6994a56b808SFande Kong       row  = aj[j];
700bc8e477aSFande Kong       offset = a->garray[row]%dof;
701bc8e477aSFande Kong       row    = map[row];
7024a56b808SFande Kong       ra   = aa[j];
7034a56b808SFande Kong       pnz  = pi[row+1] - pi[row];
7044a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
7054a56b808SFande Kong       pa   = p_oth->a + pi[row];
7064a56b808SFande Kong       for (col=0; col<pnz; col++) {
7075f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]));
7084a56b808SFande Kong       }
7095f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogFlops(2.0*pnz));
7104a56b808SFande Kong     }
7114a56b808SFande Kong   } /* end if (ao) */
712bb5ddf68SFande Kong 
7134a56b808SFande Kong   PetscFunctionReturn(0);
7144a56b808SFande Kong }
7154a56b808SFande Kong 
716bc8e477aSFande Kong PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
7175c65b9ecSFande Kong 
718bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
7194a56b808SFande Kong {
7204a56b808SFande Kong   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
721bc8e477aSFande Kong   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
7226718818eSStefano Zampini   Mat_APMPI         *ptap;
7234a56b808SFande Kong   PetscHMapIV       hmap;
7244a56b808SFande Kong   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
7254a56b808SFande Kong   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
726bc8e477aSFande Kong   PetscInt          offset,ii,pocol;
727bc8e477aSFande Kong   const PetscInt    *mappingindices;
728bc8e477aSFande Kong   IS                map;
7294a56b808SFande Kong 
7304a56b808SFande Kong   PetscFunctionBegin;
7316718818eSStefano Zampini   MatCheckProduct(C,4);
7326718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
733*28b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
734*28b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
7354a56b808SFande Kong 
7365f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries(C));
7374a56b808SFande Kong 
7384a56b808SFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
7394a56b808SFande Kong   /*-----------------------------------------------------*/
7404a56b808SFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
7414a56b808SFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
7425f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
7434a56b808SFande Kong   }
7445f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
7454a56b808SFande Kong 
7465f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(p->B,NULL,&pon));
747bc8e477aSFande Kong   pon *= dof;
7485f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
7495f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(A,&am,NULL));
7504a56b808SFande Kong   cmaxr = 0;
7514a56b808SFande Kong   for (i=0; i<pon; i++) {
7524a56b808SFande Kong     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
7534a56b808SFande Kong   }
7545f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
7555f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVCreate(&hmap));
7565f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVResize(hmap,cmaxr));
7575f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetIndices(map,&mappingindices));
7584a56b808SFande Kong   for (i=0; i<am && pon; i++) {
7595f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHMapIVClear(hmap));
760bc8e477aSFande Kong     offset = i%dof;
761bc8e477aSFande Kong     ii     = i/dof;
762bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
7634a56b808SFande Kong     if (!nzi) continue;
7645f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
7654a56b808SFande Kong     voff = 0;
7665f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
7674a56b808SFande Kong     if (!voff) continue;
7684a56b808SFande Kong 
7694a56b808SFande Kong     /* Form C(ii, :) */
770bc8e477aSFande Kong     poj = po->j + po->i[ii];
771bc8e477aSFande Kong     poa = po->a + po->i[ii];
7724a56b808SFande Kong     for (j=0; j<nzi; j++) {
773bc8e477aSFande Kong       pocol = poj[j]*dof+offset;
774bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
775bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
7764a56b808SFande Kong       for (jj=0; jj<voff; jj++) {
7774a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj]*poa[j];
7784a56b808SFande Kong         /*If the row is empty */
779bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
7804a56b808SFande Kong           c_rmtjj[jj] = apindices[jj];
7814a56b808SFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
782bc8e477aSFande Kong           c_rmtc[pocol]++;
7834a56b808SFande Kong         } else {
7845f80ce2aSJacob Faibussowitsch           CHKERRQ(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
7854a56b808SFande Kong           if (loc>=0){ /* hit */
7864a56b808SFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
7875f80ce2aSJacob Faibussowitsch             CHKERRQ(PetscLogFlops(1.0));
7884a56b808SFande Kong           } else { /* new element */
7894a56b808SFande Kong             loc = -(loc+1);
7904a56b808SFande Kong             /* Move data backward */
791bc8e477aSFande Kong             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
7924a56b808SFande Kong               c_rmtjj[kk] = c_rmtjj[kk-1];
7934a56b808SFande Kong               c_rmtaa[kk] = c_rmtaa[kk-1];
7944a56b808SFande Kong             }/* End kk */
7954a56b808SFande Kong             c_rmtjj[loc] = apindices[jj];
7964a56b808SFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
797bc8e477aSFande Kong             c_rmtc[pocol]++;
7984a56b808SFande Kong           }
7994a56b808SFande Kong         }
8005f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogFlops(voff));
8014a56b808SFande Kong       } /* End jj */
8024a56b808SFande Kong     } /* End j */
8034a56b808SFande Kong   } /* End i */
8044a56b808SFande Kong 
8055f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
8065f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVDestroy(&hmap));
8074a56b808SFande Kong 
8085f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(P,NULL,&pn));
809bc8e477aSFande Kong   pn *= dof;
8105f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));
8114a56b808SFande Kong 
8125f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
8135f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
8145f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
815bc8e477aSFande Kong   pcstart = pcstart*dof;
816bc8e477aSFande Kong   pcend   = pcend*dof;
8174a56b808SFande Kong   cd = (Mat_SeqAIJ*)(c->A)->data;
8184a56b808SFande Kong   co = (Mat_SeqAIJ*)(c->B)->data;
8194a56b808SFande Kong 
8204a56b808SFande Kong   cmaxr = 0;
8214a56b808SFande Kong   for (i=0; i<pn; i++) {
8224a56b808SFande Kong     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
8234a56b808SFande Kong   }
8245f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ));
8255f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVCreate(&hmap));
8265f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVResize(hmap,cmaxr));
8274a56b808SFande Kong   for (i=0; i<am && pn; i++) {
8285f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHMapIVClear(hmap));
829bc8e477aSFande Kong     offset = i%dof;
830bc8e477aSFande Kong     ii     = i/dof;
831bc8e477aSFande Kong     nzi = pd->i[ii+1] - pd->i[ii];
8324a56b808SFande Kong     if (!nzi) continue;
8335f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
8344a56b808SFande Kong     voff = 0;
8355f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
8364a56b808SFande Kong     if (!voff) continue;
8374a56b808SFande Kong     /* Form C(ii, :) */
838bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
839bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
8404a56b808SFande Kong     for (j=0; j<nzi; j++) {
841bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
8424a56b808SFande Kong       for (jj=0; jj<voff; jj++) {
8434a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj]*pda[j];
8444a56b808SFande Kong       }
8455f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogFlops(voff));
8465f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
8474a56b808SFande Kong     }
8484a56b808SFande Kong   }
8495f80ce2aSJacob Faibussowitsch   CHKERRQ(ISRestoreIndices(map,&mappingindices));
8505f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRangeColumn(C,&ccstart,&ccend));
8515f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ));
8525f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVDestroy(&hmap));
8535f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
8545f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
8555f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(c_rmtj,c_rmta));
856bc8e477aSFande Kong 
857bc8e477aSFande Kong   /* Add contributions from remote */
858bc8e477aSFande Kong   for (i = 0; i < pn; i++) {
859bc8e477aSFande Kong     row = i + pcstart;
8605f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
861bc8e477aSFande Kong   }
8625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(c_othj,c_otha));
863bc8e477aSFande Kong 
8645f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
8655f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
866bc8e477aSFande Kong 
867bc8e477aSFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
868bc8e477aSFande Kong   PetscFunctionReturn(0);
869bc8e477aSFande Kong }
870bc8e477aSFande Kong 
871bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
872bc8e477aSFande Kong {
873bc8e477aSFande Kong   PetscFunctionBegin;
87434bcad68SFande Kong 
8755f80ce2aSJacob Faibussowitsch   CHKERRQ(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C));
876bc8e477aSFande Kong   PetscFunctionReturn(0);
877bc8e477aSFande Kong }
878bc8e477aSFande Kong 
879bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
880bc8e477aSFande Kong {
881bc8e477aSFande Kong   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
882bc8e477aSFande Kong   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
8836718818eSStefano Zampini   Mat_APMPI         *ptap;
884bc8e477aSFande Kong   PetscHMapIV       hmap;
885bc8e477aSFande Kong   PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
886bc8e477aSFande Kong   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
887bc8e477aSFande Kong   PetscInt          offset,ii,pocol;
888bc8e477aSFande Kong   const PetscInt    *mappingindices;
889bc8e477aSFande Kong   IS                map;
890bc8e477aSFande Kong 
891bc8e477aSFande Kong   PetscFunctionBegin;
8926718818eSStefano Zampini   MatCheckProduct(C,4);
8936718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
894*28b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
895*28b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
896bc8e477aSFande Kong 
8975f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries(C));
898bc8e477aSFande Kong 
899bc8e477aSFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
900bc8e477aSFande Kong   /*-----------------------------------------------------*/
901bc8e477aSFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
902bc8e477aSFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
9035f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
904bc8e477aSFande Kong   }
9055f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
9065f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(p->B,NULL,&pon));
907bc8e477aSFande Kong   pon *= dof;
9085f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(P,NULL,&pn));
909bc8e477aSFande Kong   pn  *= dof;
910bc8e477aSFande Kong 
9115f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
9125f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(A,&am,NULL));
9135f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
914bc8e477aSFande Kong   pcstart *= dof;
915bc8e477aSFande Kong   pcend   *= dof;
916bc8e477aSFande Kong   cmaxr = 0;
917bc8e477aSFande Kong   for (i=0; i<pon; i++) {
918bc8e477aSFande Kong     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
919bc8e477aSFande Kong   }
920bc8e477aSFande Kong   cd = (Mat_SeqAIJ*)(c->A)->data;
921bc8e477aSFande Kong   co = (Mat_SeqAIJ*)(c->B)->data;
922bc8e477aSFande Kong   for (i=0; i<pn; i++) {
923bc8e477aSFande Kong     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
924bc8e477aSFande Kong   }
9255f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
9265f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVCreate(&hmap));
9275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVResize(hmap,cmaxr));
9285f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetIndices(map,&mappingindices));
929bc8e477aSFande Kong   for (i=0; i<am && (pon || pn); i++) {
9305f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHMapIVClear(hmap));
931bc8e477aSFande Kong     offset = i%dof;
932bc8e477aSFande Kong     ii     = i/dof;
933bc8e477aSFande Kong     nzi  = po->i[ii+1] - po->i[ii];
934bc8e477aSFande Kong     dnzi = pd->i[ii+1] - pd->i[ii];
935bc8e477aSFande Kong     if (!nzi && !dnzi) continue;
9365f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
937bc8e477aSFande Kong     voff = 0;
9385f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
939bc8e477aSFande Kong     if (!voff) continue;
940bc8e477aSFande Kong 
941bc8e477aSFande Kong     /* Form remote C(ii, :) */
942bc8e477aSFande Kong     poj = po->j + po->i[ii];
943bc8e477aSFande Kong     poa = po->a + po->i[ii];
944bc8e477aSFande Kong     for (j=0; j<nzi; j++) {
945bc8e477aSFande Kong       pocol = poj[j]*dof+offset;
946bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
947bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
948bc8e477aSFande Kong       for (jj=0; jj<voff; jj++) {
949bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj]*poa[j];
950bc8e477aSFande Kong         /*If the row is empty */
951bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
952bc8e477aSFande Kong           c_rmtjj[jj] = apindices[jj];
953bc8e477aSFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
954bc8e477aSFande Kong           c_rmtc[pocol]++;
955bc8e477aSFande Kong         } else {
9565f80ce2aSJacob Faibussowitsch           CHKERRQ(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
957bc8e477aSFande Kong           if (loc>=0){ /* hit */
958bc8e477aSFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
9595f80ce2aSJacob Faibussowitsch             CHKERRQ(PetscLogFlops(1.0));
960bc8e477aSFande Kong           } else { /* new element */
961bc8e477aSFande Kong             loc = -(loc+1);
962bc8e477aSFande Kong             /* Move data backward */
963bc8e477aSFande Kong             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
964bc8e477aSFande Kong               c_rmtjj[kk] = c_rmtjj[kk-1];
965bc8e477aSFande Kong               c_rmtaa[kk] = c_rmtaa[kk-1];
966bc8e477aSFande Kong             }/* End kk */
967bc8e477aSFande Kong             c_rmtjj[loc] = apindices[jj];
968bc8e477aSFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
969bc8e477aSFande Kong             c_rmtc[pocol]++;
970bc8e477aSFande Kong           }
971bc8e477aSFande Kong         }
972bc8e477aSFande Kong       } /* End jj */
9735f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogFlops(voff));
974bc8e477aSFande Kong     } /* End j */
975bc8e477aSFande Kong 
976bc8e477aSFande Kong     /* Form local C(ii, :) */
977bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
978bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
979bc8e477aSFande Kong     for (j=0; j<dnzi; j++) {
980bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
981bc8e477aSFande Kong       for (jj=0; jj<voff; jj++) {
982bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj]*pda[j];
983bc8e477aSFande Kong       }/* End kk */
9845f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogFlops(voff));
9855f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
986bc8e477aSFande Kong     }/* End j */
987bc8e477aSFande Kong   } /* End i */
988bc8e477aSFande Kong 
9895f80ce2aSJacob Faibussowitsch   CHKERRQ(ISRestoreIndices(map,&mappingindices));
9905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
9915f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHMapIVDestroy(&hmap));
9925f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));
993bc8e477aSFande Kong 
9945f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
9955f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
9965f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
9975f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
9985f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(c_rmtj,c_rmta));
9994a56b808SFande Kong 
10004a56b808SFande Kong   /* Add contributions from remote */
10014a56b808SFande Kong   for (i = 0; i < pn; i++) {
10024a56b808SFande Kong     row = i + pcstart;
10035f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
10044a56b808SFande Kong   }
10055f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(c_othj,c_otha));
10064a56b808SFande Kong 
10075f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
10085f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
10094a56b808SFande Kong 
10104a56b808SFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
10114a56b808SFande Kong   PetscFunctionReturn(0);
10124a56b808SFande Kong }
10134a56b808SFande Kong 
10144a56b808SFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
10154a56b808SFande Kong {
10164a56b808SFande Kong   PetscFunctionBegin;
101734bcad68SFande Kong 
10185f80ce2aSJacob Faibussowitsch   CHKERRQ(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C));
10194a56b808SFande Kong   PetscFunctionReturn(0);
10204a56b808SFande Kong }
10214a56b808SFande Kong 
10226718818eSStefano Zampini /* TODO: move algorithm selection to MatProductSetFromOptions */
10234222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
10244a56b808SFande Kong {
10254a56b808SFande Kong   Mat_APMPI           *ptap;
10266718818eSStefano Zampini   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
10274a56b808SFande Kong   MPI_Comm            comm;
1028bc8e477aSFande Kong   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
10294a56b808SFande Kong   MatType             mtype;
10304a56b808SFande Kong   PetscSF             sf;
10314a56b808SFande Kong   PetscSFNode         *iremote;
10324a56b808SFande Kong   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
10334a56b808SFande Kong   const PetscInt      *rootdegrees;
10344a56b808SFande Kong   PetscHSetI          ht,oht,*hta,*hto;
10354a56b808SFande Kong   PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1036131c27b5Sprj-   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1037bc8e477aSFande Kong   PetscInt            nalg=2,alg=0,offset,ii;
1038131c27b5Sprj-   PetscMPIInt         owner;
1039bc8e477aSFande Kong   const PetscInt      *mappingindices;
10404a56b808SFande Kong   PetscBool           flg;
10414a56b808SFande Kong   const char          *algTypes[2] = {"overlapping","merged"};
1042bc8e477aSFande Kong   IS                  map;
10434a56b808SFande Kong   PetscErrorCode      ierr;
10444a56b808SFande Kong 
10454a56b808SFande Kong   PetscFunctionBegin;
10466718818eSStefano Zampini   MatCheckProduct(Cmpi,5);
1047*28b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
10485f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
104934bcad68SFande Kong 
10504a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
10515f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(P,NULL,&pn));
1052bc8e477aSFande Kong   pn *= dof;
10535f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetType(A,&mtype));
10545f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(Cmpi,mtype));
10555f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
10564a56b808SFande Kong 
10575f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscNew(&ptap));
10584a56b808SFande Kong   ptap->reuse = MAT_INITIAL_MATRIX;
10594a56b808SFande Kong   ptap->algType = 2;
10604a56b808SFande Kong 
10614a56b808SFande Kong   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
10625f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
10635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
10644a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
10655f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(p->B,NULL,&pon));
1066bc8e477aSFande Kong   pon *= dof;
10674a56b808SFande Kong   /* offsets */
10685f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon+1,&ptap->c_rmti));
10694a56b808SFande Kong   /* The number of columns we will send to remote ranks */
10705f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&c_rmtc));
10715f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&hta));
10724a56b808SFande Kong   for (i=0; i<pon; i++) {
10735f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetICreate(&hta[i]));
10744a56b808SFande Kong   }
10755f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(A,&am,NULL));
10765f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRange(A,&arstart,&arend));
10774a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
10785f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetICreate(&ht));
10794a56b808SFande Kong 
10805f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetIndices(map,&mappingindices));
10814a56b808SFande Kong   ptap->c_rmti[0] = 0;
10824a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
10834a56b808SFande Kong   for (i=0; i<am && pon; i++) {
10844a56b808SFande Kong     /* Form one row of AP */
10855f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIClear(ht));
1086bc8e477aSFande Kong     offset = i%dof;
1087bc8e477aSFande Kong     ii     = i/dof;
10884a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1089bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
10904a56b808SFande Kong     if (!nzi) continue;
10914a56b808SFande Kong 
10925f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht));
10935f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(ht,&htsize));
10944a56b808SFande Kong     /* If AP is empty, just continue */
10954a56b808SFande Kong     if (!htsize) continue;
10964a56b808SFande Kong     /* Form C(ii, :) */
1097bc8e477aSFande Kong     poj = po->j + po->i[ii];
10984a56b808SFande Kong     for (j=0; j<nzi; j++) {
10995f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
11004a56b808SFande Kong     }
11014a56b808SFande Kong   }
11024a56b808SFande Kong 
11034a56b808SFande Kong   for (i=0; i<pon; i++) {
11045f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(hta[i],&htsize));
11054a56b808SFande Kong     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
11064a56b808SFande Kong     c_rmtc[i] = htsize;
11074a56b808SFande Kong   }
11084a56b808SFande Kong 
11095f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));
11104a56b808SFande Kong 
11114a56b808SFande Kong   for (i=0; i<pon; i++) {
11124a56b808SFande Kong     off = 0;
11135f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
11145f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIDestroy(&hta[i]));
11154a56b808SFande Kong   }
11165f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(hta));
11174a56b808SFande Kong 
11185f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&iremote));
11194a56b808SFande Kong   for (i=0; i<pon; i++) {
1120ef7a94f2SFande Kong     owner = 0; lidx = 0;
1121bc8e477aSFande Kong     offset = i%dof;
1122bc8e477aSFande Kong     ii     = i/dof;
11235f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
1124bc8e477aSFande Kong     iremote[i].index = lidx*dof + offset;
11254a56b808SFande Kong     iremote[i].rank  = owner;
11264a56b808SFande Kong   }
11274a56b808SFande Kong 
11285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFCreate(comm,&sf));
11295f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
11304a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
11315f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetRankOrder(sf,PETSC_TRUE));
11325f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetFromOptions(sf));
11335f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetUp(sf));
11344a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
11355f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFComputeDegreeBegin(sf,&rootdegrees));
11365f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFComputeDegreeEnd(sf,&rootdegrees));
11374a56b808SFande Kong   rootspacesize = 0;
11384a56b808SFande Kong   for (i = 0; i < pn; i++) {
11394a56b808SFande Kong     rootspacesize += rootdegrees[i];
11404a56b808SFande Kong   }
11415f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(rootspacesize,&rootspace));
11425f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
11434a56b808SFande Kong   /* Get information from leaves
11444a56b808SFande Kong    * Number of columns other people contribute to my rows
11454a56b808SFande Kong    * */
11465f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
11475f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
11485f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_rmtc));
11495f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc1(pn+1,&ptap->c_othi));
11504a56b808SFande Kong   /* The number of columns is received for each row */
11514a56b808SFande Kong   ptap->c_othi[0] = 0;
11524a56b808SFande Kong   rootspacesize = 0;
11534a56b808SFande Kong   rootspaceoffsets[0] = 0;
11544a56b808SFande Kong   for (i = 0; i < pn; i++) {
11554a56b808SFande Kong     rcvncols = 0;
11564a56b808SFande Kong     for (j = 0; j<rootdegrees[i]; j++) {
11574a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
11584a56b808SFande Kong       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
11594a56b808SFande Kong       rootspacesize++;
11604a56b808SFande Kong     }
11614a56b808SFande Kong     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
11624a56b808SFande Kong   }
11635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rootspace));
11644a56b808SFande Kong 
11655f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&c_rmtoffsets));
11665f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
11675f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
11685f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFDestroy(&sf));
11695f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rootspaceoffsets));
11704a56b808SFande Kong 
11715f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc1(ptap->c_rmti[pon],&iremote));
11724a56b808SFande Kong   nleaves = 0;
11734a56b808SFande Kong   for (i = 0; i<pon; i++) {
1174bc8e477aSFande Kong     owner = 0;
1175bc8e477aSFande Kong     ii = i/dof;
11765f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
11774a56b808SFande Kong     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
11784a56b808SFande Kong     for (j=0; j<sendncols; j++) {
11794a56b808SFande Kong       iremote[nleaves].rank = owner;
11804a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
11814a56b808SFande Kong     }
11824a56b808SFande Kong   }
11835f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_rmtoffsets));
11845f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc1(ptap->c_othi[pn],&c_othj));
11854a56b808SFande Kong 
11865f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFCreate(comm,&ptap->sf));
11875f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
11885f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetFromOptions(ptap->sf));
11894a56b808SFande Kong   /* One to one map */
11905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
11914a56b808SFande Kong 
11925f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc2(pn,&dnz,pn,&onz));
11935f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetICreate(&oht));
11945f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
1195bc8e477aSFande Kong   pcstart *= dof;
1196bc8e477aSFande Kong   pcend   *= dof;
11975f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc2(pn,&hta,pn,&hto));
11984a56b808SFande Kong   for (i=0; i<pn; i++) {
11995f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetICreate(&hta[i]));
12005f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetICreate(&hto[i]));
12014a56b808SFande Kong   }
12024a56b808SFande Kong   /* Work on local part */
12034a56b808SFande Kong   /* 4) Pass 1: Estimate memory for C_loc */
12044a56b808SFande Kong   for (i=0; i<am && pn; i++) {
12055f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIClear(ht));
12065f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIClear(oht));
1207bc8e477aSFande Kong     offset = i%dof;
1208bc8e477aSFande Kong     ii     = i/dof;
1209bc8e477aSFande Kong     nzi = pd->i[ii+1] - pd->i[ii];
12104a56b808SFande Kong     if (!nzi) continue;
12114a56b808SFande Kong 
12125f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
12135f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(ht,&htsize));
12145f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(oht,&htosize));
12154a56b808SFande Kong     if (!(htsize+htosize)) continue;
12164a56b808SFande Kong     /* Form C(ii, :) */
1217bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
12184a56b808SFande Kong     for (j=0; j<nzi; j++) {
12195f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht));
12205f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
12214a56b808SFande Kong     }
12224a56b808SFande Kong   }
12234a56b808SFande Kong 
12245f80ce2aSJacob Faibussowitsch   CHKERRQ(ISRestoreIndices(map,&mappingindices));
1225bc8e477aSFande Kong 
12265f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetIDestroy(&ht));
12275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetIDestroy(&oht));
12284a56b808SFande Kong 
12294a56b808SFande Kong   /* Get remote data */
12305f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
12315f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_rmtj));
12324a56b808SFande Kong 
12334a56b808SFande Kong   for (i = 0; i < pn; i++) {
12344a56b808SFande Kong     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
12354a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
12364a56b808SFande Kong     for (j = 0; j < nzi; j++) {
12374a56b808SFande Kong       col = rdj[j];
12384a56b808SFande Kong       /* diag part */
12394a56b808SFande Kong       if (col>=pcstart && col<pcend) {
12405f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscHSetIAdd(hta[i],col));
12414a56b808SFande Kong       } else { /* off diag */
12425f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscHSetIAdd(hto[i],col));
12434a56b808SFande Kong       }
12444a56b808SFande Kong     }
12455f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(hta[i],&htsize));
12464a56b808SFande Kong     dnz[i] = htsize;
12475f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIDestroy(&hta[i]));
12485f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(hto[i],&htsize));
12494a56b808SFande Kong     onz[i] = htsize;
12505f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIDestroy(&hto[i]));
12514a56b808SFande Kong   }
12524a56b808SFande Kong 
12535f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(hta,hto));
12545f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_othj));
12554a56b808SFande Kong 
12564a56b808SFande Kong   /* local sizes and preallocation */
12575f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
12585f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
12595f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
12605f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetUp(Cmpi));
12615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(dnz,onz));
12624a56b808SFande Kong 
12634a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
12646718818eSStefano Zampini   Cmpi->product->data    = ptap;
12656718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
12666718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
12674a56b808SFande Kong 
12684a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
12694a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
12704a56b808SFande Kong   /* pick an algorithm */
12714a56b808SFande Kong   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
12724a56b808SFande Kong   alg = 0;
12735f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
12744a56b808SFande Kong   ierr = PetscOptionsEnd();CHKERRQ(ierr);
12754a56b808SFande Kong   switch (alg) {
12764a56b808SFande Kong     case 0:
12774a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
12784a56b808SFande Kong       break;
12794a56b808SFande Kong     case 1:
12804a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
12814a56b808SFande Kong       break;
12824a56b808SFande Kong     default:
1283546078acSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
12844a56b808SFande Kong   }
12854a56b808SFande Kong   PetscFunctionReturn(0);
12864a56b808SFande Kong }
12874a56b808SFande Kong 
12884222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1289bc8e477aSFande Kong {
1290bc8e477aSFande Kong   PetscFunctionBegin;
12915f80ce2aSJacob Faibussowitsch   CHKERRQ(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C));
1292bc8e477aSFande Kong   PetscFunctionReturn(0);
1293bc8e477aSFande Kong }
1294bc8e477aSFande Kong 
12954222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
12964a56b808SFande Kong {
12974a56b808SFande Kong   Mat_APMPI           *ptap;
12986718818eSStefano Zampini   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
12994a56b808SFande Kong   MPI_Comm            comm;
1300bc8e477aSFande Kong   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
13014a56b808SFande Kong   MatType             mtype;
13024a56b808SFande Kong   PetscSF             sf;
13034a56b808SFande Kong   PetscSFNode         *iremote;
13044a56b808SFande Kong   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
13054a56b808SFande Kong   const PetscInt      *rootdegrees;
13064a56b808SFande Kong   PetscHSetI          ht,oht,*hta,*hto,*htd;
13074a56b808SFande Kong   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1308131c27b5Sprj-   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1309bc8e477aSFande Kong   PetscInt            nalg=2,alg=0,offset,ii;
1310131c27b5Sprj-   PetscMPIInt         owner;
13114a56b808SFande Kong   PetscBool           flg;
13124a56b808SFande Kong   const char          *algTypes[2] = {"merged","overlapping"};
1313bc8e477aSFande Kong   const PetscInt      *mappingindices;
1314bc8e477aSFande Kong   IS                  map;
13154a56b808SFande Kong   PetscErrorCode      ierr;
13164a56b808SFande Kong 
13174a56b808SFande Kong   PetscFunctionBegin;
13186718818eSStefano Zampini   MatCheckProduct(Cmpi,5);
1319*28b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
13205f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
132134bcad68SFande Kong 
13224a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
13235f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(P,NULL,&pn));
1324bc8e477aSFande Kong   pn *= dof;
13255f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetType(A,&mtype));
13265f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(Cmpi,mtype));
13275f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
13284a56b808SFande Kong 
13295f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscNew(&ptap));
13304a56b808SFande Kong   ptap->reuse = MAT_INITIAL_MATRIX;
13314a56b808SFande Kong   ptap->algType = 3;
13324a56b808SFande Kong 
13334a56b808SFande Kong   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
13345f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
13355f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
13364a56b808SFande Kong 
13374a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
13385f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(p->B,NULL,&pon));
1339bc8e477aSFande Kong   pon *= dof;
13404a56b808SFande Kong   /* offsets */
13415f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon+1,&ptap->c_rmti));
13424a56b808SFande Kong   /* The number of columns we will send to remote ranks */
13435f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&c_rmtc));
13445f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&hta));
13454a56b808SFande Kong   for (i=0; i<pon; i++) {
13465f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetICreate(&hta[i]));
13474a56b808SFande Kong   }
13485f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(A,&am,NULL));
13495f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRange(A,&arstart,&arend));
13504a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
13515f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetICreate(&ht));
13525f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetICreate(&oht));
13535f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc2(pn,&htd,pn,&hto));
13544a56b808SFande Kong   for (i=0; i<pn; i++) {
13555f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetICreate(&htd[i]));
13565f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetICreate(&hto[i]));
13574a56b808SFande Kong   }
1358bc8e477aSFande Kong 
13595f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetIndices(map,&mappingindices));
13604a56b808SFande Kong   ptap->c_rmti[0] = 0;
13614a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
13624a56b808SFande Kong   for (i=0; i<am && (pon || pn); i++) {
13634a56b808SFande Kong     /* Form one row of AP */
13645f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIClear(ht));
13655f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIClear(oht));
1366bc8e477aSFande Kong     offset = i%dof;
1367bc8e477aSFande Kong     ii     = i/dof;
13684a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1369bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
1370bc8e477aSFande Kong     dnzi = pd->i[ii+1] - pd->i[ii];
13714a56b808SFande Kong     if (!nzi && !dnzi) continue;
13724a56b808SFande Kong 
13735f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
13745f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(ht,&htsize));
13755f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(oht,&htosize));
13764a56b808SFande Kong     /* If AP is empty, just continue */
13774a56b808SFande Kong     if (!(htsize+htosize)) continue;
13784a56b808SFande Kong 
13794a56b808SFande Kong     /* Form remote C(ii, :) */
1380bc8e477aSFande Kong     poj = po->j + po->i[ii];
13814a56b808SFande Kong     for (j=0; j<nzi; j++) {
13825f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
13835f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIUpdate(hta[poj[j]*dof+offset],oht));
13844a56b808SFande Kong     }
13854a56b808SFande Kong 
13864a56b808SFande Kong     /* Form local C(ii, :) */
1387bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
13884a56b808SFande Kong     for (j=0; j<dnzi; j++) {
13895f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht));
13905f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
13914a56b808SFande Kong     }
13924a56b808SFande Kong   }
13934a56b808SFande Kong 
13945f80ce2aSJacob Faibussowitsch   CHKERRQ(ISRestoreIndices(map,&mappingindices));
1395bc8e477aSFande Kong 
13965f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetIDestroy(&ht));
13975f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscHSetIDestroy(&oht));
13984a56b808SFande Kong 
13994a56b808SFande Kong   for (i=0; i<pon; i++) {
14005f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(hta[i],&htsize));
14014a56b808SFande Kong     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
14024a56b808SFande Kong     c_rmtc[i] = htsize;
14034a56b808SFande Kong   }
14044a56b808SFande Kong 
14055f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));
14064a56b808SFande Kong 
14074a56b808SFande Kong   for (i=0; i<pon; i++) {
14084a56b808SFande Kong     off = 0;
14095f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
14105f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIDestroy(&hta[i]));
14114a56b808SFande Kong   }
14125f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(hta));
14134a56b808SFande Kong 
14145f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&iremote));
14154a56b808SFande Kong   for (i=0; i<pon; i++) {
1416ef7a94f2SFande Kong     owner = 0; lidx = 0;
1417bc8e477aSFande Kong     offset = i%dof;
1418bc8e477aSFande Kong     ii     = i/dof;
14195f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
1420bc8e477aSFande Kong     iremote[i].index = lidx*dof+offset;
14214a56b808SFande Kong     iremote[i].rank  = owner;
14224a56b808SFande Kong   }
14234a56b808SFande Kong 
14245f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFCreate(comm,&sf));
14255f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
14264a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
14275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetRankOrder(sf,PETSC_TRUE));
14285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetFromOptions(sf));
14295f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetUp(sf));
14304a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
14315f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFComputeDegreeBegin(sf,&rootdegrees));
14325f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFComputeDegreeEnd(sf,&rootdegrees));
14334a56b808SFande Kong   rootspacesize = 0;
14344a56b808SFande Kong   for (i = 0; i < pn; i++) {
14354a56b808SFande Kong     rootspacesize += rootdegrees[i];
14364a56b808SFande Kong   }
14375f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(rootspacesize,&rootspace));
14385f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
14394a56b808SFande Kong   /* Get information from leaves
14404a56b808SFande Kong    * Number of columns other people contribute to my rows
14414a56b808SFande Kong    * */
14425f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
14435f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
14445f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_rmtc));
14455f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pn+1,&ptap->c_othi));
14464a56b808SFande Kong   /* The number of columns is received for each row */
14474a56b808SFande Kong   ptap->c_othi[0]     = 0;
14484a56b808SFande Kong   rootspacesize       = 0;
14494a56b808SFande Kong   rootspaceoffsets[0] = 0;
14504a56b808SFande Kong   for (i = 0; i < pn; i++) {
14514a56b808SFande Kong     rcvncols = 0;
14524a56b808SFande Kong     for (j = 0; j<rootdegrees[i]; j++) {
14534a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
14544a56b808SFande Kong       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
14554a56b808SFande Kong       rootspacesize++;
14564a56b808SFande Kong     }
14574a56b808SFande Kong     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
14584a56b808SFande Kong   }
14595f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rootspace));
14604a56b808SFande Kong 
14615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(pon,&c_rmtoffsets));
14625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
14635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
14645f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFDestroy(&sf));
14655f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rootspaceoffsets));
14664a56b808SFande Kong 
14675f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc1(ptap->c_rmti[pon],&iremote));
14684a56b808SFande Kong   nleaves = 0;
14694a56b808SFande Kong   for (i = 0; i<pon; i++) {
1470071fcb05SBarry Smith     owner = 0;
1471bc8e477aSFande Kong     ii    = i/dof;
14725f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
14734a56b808SFande Kong     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
14744a56b808SFande Kong     for (j=0; j<sendncols; j++) {
14754a56b808SFande Kong       iremote[nleaves].rank    = owner;
14764a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
14774a56b808SFande Kong     }
14784a56b808SFande Kong   }
14795f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_rmtoffsets));
14805f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc1(ptap->c_othi[pn],&c_othj));
14814a56b808SFande Kong 
14825f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFCreate(comm,&ptap->sf));
14835f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
14845f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFSetFromOptions(ptap->sf));
14854a56b808SFande Kong   /* One to one map */
14865f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
14874a56b808SFande Kong   /* Get remote data */
14885f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
14895f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_rmtj));
14905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc2(pn,&dnz,pn,&onz));
14915f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
1492bc8e477aSFande Kong   pcstart *= dof;
1493bc8e477aSFande Kong   pcend   *= dof;
14944a56b808SFande Kong   for (i = 0; i < pn; i++) {
14954a56b808SFande Kong     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
14964a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
14974a56b808SFande Kong     for (j = 0; j < nzi; j++) {
14984a56b808SFande Kong       col =  rdj[j];
14994a56b808SFande Kong       /* diag part */
15004a56b808SFande Kong       if (col>=pcstart && col<pcend) {
15015f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscHSetIAdd(htd[i],col));
15024a56b808SFande Kong       } else { /* off diag */
15035f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscHSetIAdd(hto[i],col));
15044a56b808SFande Kong       }
15054a56b808SFande Kong     }
15065f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(htd[i],&htsize));
15074a56b808SFande Kong     dnz[i] = htsize;
15085f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIDestroy(&htd[i]));
15095f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIGetSize(hto[i],&htsize));
15104a56b808SFande Kong     onz[i] = htsize;
15115f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscHSetIDestroy(&hto[i]));
15124a56b808SFande Kong   }
15134a56b808SFande Kong 
15145f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(htd,hto));
15155f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(c_othj));
15164a56b808SFande Kong 
15174a56b808SFande Kong   /* local sizes and preallocation */
15185f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
15195f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
15205f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
15215f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree2(dnz,onz));
15224a56b808SFande Kong 
15234a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
15246718818eSStefano Zampini   Cmpi->product->data    = ptap;
15256718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
15266718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
15274a56b808SFande Kong 
15284a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
15294a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
15304a56b808SFande Kong   /* pick an algorithm */
15314a56b808SFande Kong   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
15324a56b808SFande Kong   alg = 0;
15335f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
15344a56b808SFande Kong   ierr = PetscOptionsEnd();CHKERRQ(ierr);
15354a56b808SFande Kong   switch (alg) {
15364a56b808SFande Kong     case 0:
15374a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
15384a56b808SFande Kong       break;
15394a56b808SFande Kong     case 1:
15404a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
15414a56b808SFande Kong       break;
15424a56b808SFande Kong     default:
1543546078acSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
15444a56b808SFande Kong   }
15454a56b808SFande Kong   PetscFunctionReturn(0);
15464a56b808SFande Kong }
15474a56b808SFande Kong 
15484222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1549bc8e477aSFande Kong {
1550bc8e477aSFande Kong   PetscFunctionBegin;
15515f80ce2aSJacob Faibussowitsch   CHKERRQ(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C));
1552bc8e477aSFande Kong   PetscFunctionReturn(0);
1553bc8e477aSFande Kong }
1554bc8e477aSFande Kong 
15554222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1556e953e47cSHong Zhang {
1557e953e47cSHong Zhang   PetscErrorCode      ierr;
15583cdca5ebSHong Zhang   Mat_APMPI           *ptap;
15596718818eSStefano Zampini   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1560e953e47cSHong Zhang   MPI_Comm            comm;
1561e953e47cSHong Zhang   PetscMPIInt         size,rank;
1562e953e47cSHong Zhang   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1563e953e47cSHong Zhang   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1564e953e47cSHong Zhang   PetscInt            *lnk,i,k,pnz,row,nsend;
1565e953e47cSHong Zhang   PetscBT             lnkbt;
1566ec4bef21SJose E. Roman   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
1567ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted=0;
1568e953e47cSHong Zhang   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1569e953e47cSHong Zhang   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1570e953e47cSHong Zhang   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1571e953e47cSHong Zhang   MPI_Request         *swaits,*rwaits;
1572e953e47cSHong Zhang   MPI_Status          *sstatus,rstatus;
1573e953e47cSHong Zhang   PetscLayout         rowmap;
1574e953e47cSHong Zhang   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1575e953e47cSHong Zhang   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
1576e953e47cSHong Zhang   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1577ec07b8f8SHong Zhang   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1578e953e47cSHong Zhang   PetscScalar         *apv;
1579e953e47cSHong Zhang   PetscTable          ta;
1580b92f168fSBarry Smith   MatType             mtype;
1581e83e5f86SFande Kong   const char          *prefix;
1582e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
1583e953e47cSHong Zhang   PetscReal           apfill;
1584e953e47cSHong Zhang #endif
1585e953e47cSHong Zhang 
1586e953e47cSHong Zhang   PetscFunctionBegin;
15876718818eSStefano Zampini   MatCheckProduct(Cmpi,4);
1588*28b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
15895f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
15905f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(comm,&size));
15915f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_rank(comm,&rank));
1592e953e47cSHong Zhang 
159352f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1594ec07b8f8SHong Zhang 
1595e953e47cSHong Zhang   /* create symbolic parallel matrix Cmpi */
15965f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetType(A,&mtype));
15975f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(Cmpi,mtype));
1598e953e47cSHong Zhang 
1599e953e47cSHong Zhang   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1600e953e47cSHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1601e953e47cSHong Zhang 
16023cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
16035f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscNew(&ptap));
160415a3b8e2SHong Zhang   ptap->reuse = MAT_INITIAL_MATRIX;
1605a4ffb656SHong Zhang   ptap->algType = 1;
160615a3b8e2SHong Zhang 
160715a3b8e2SHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
16085f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
160915a3b8e2SHong Zhang   /* get P_loc by taking all local rows of P */
16105f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc));
161115a3b8e2SHong Zhang 
161267a12041SHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
161367a12041SHong Zhang   /* --------------------------------- */
16145f80ce2aSJacob Faibussowitsch   CHKERRQ(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
16155f80ce2aSJacob Faibussowitsch   CHKERRQ(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));
161615a3b8e2SHong Zhang 
161767a12041SHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
161867a12041SHong Zhang   /* ----------------------------------------------------------------- */
161967a12041SHong Zhang   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
162052f7967eSHong Zhang   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1621445158ffSHong Zhang 
16229a6dcab7SHong Zhang   /* create and initialize a linked list */
16235f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
16244b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
16254b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
16265f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */
1627d0e9a2f7SHong Zhang   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
162878d30b94SHong Zhang 
16295f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));
1630445158ffSHong Zhang 
16318cb82516SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1632ec07b8f8SHong Zhang   if (ao) {
16335f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
1634ec07b8f8SHong Zhang   } else {
16355f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
1636ec07b8f8SHong Zhang   }
1637445158ffSHong Zhang   current_space = free_space;
163867a12041SHong Zhang   nspacedouble  = 0;
163967a12041SHong Zhang 
16405f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(am+1,&api));
164167a12041SHong Zhang   api[0] = 0;
1642445158ffSHong Zhang   for (i=0; i<am; i++) {
164367a12041SHong Zhang     /* diagonal portion: Ad[i,:]*P */
164467a12041SHong Zhang     ai = ad->i; pi = p_loc->i;
164567a12041SHong Zhang     nzi = ai[i+1] - ai[i];
164667a12041SHong Zhang     aj  = ad->j + ai[i];
1647445158ffSHong Zhang     for (j=0; j<nzi; j++) {
1648445158ffSHong Zhang       row  = aj[j];
164967a12041SHong Zhang       pnz  = pi[row+1] - pi[row];
165067a12041SHong Zhang       Jptr = p_loc->j + pi[row];
1651445158ffSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
16525f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
1653445158ffSHong Zhang     }
165467a12041SHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
1655ec07b8f8SHong Zhang     if (ao) {
165667a12041SHong Zhang       ai = ao->i; pi = p_oth->i;
165767a12041SHong Zhang       nzi = ai[i+1] - ai[i];
165867a12041SHong Zhang       aj  = ao->j + ai[i];
1659445158ffSHong Zhang       for (j=0; j<nzi; j++) {
1660445158ffSHong Zhang         row  = aj[j];
166167a12041SHong Zhang         pnz  = pi[row+1] - pi[row];
166267a12041SHong Zhang         Jptr = p_oth->j + pi[row];
16635f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
1664445158ffSHong Zhang       }
1665ec07b8f8SHong Zhang     }
1666445158ffSHong Zhang     apnz     = lnk[0];
1667445158ffSHong Zhang     api[i+1] = api[i] + apnz;
1668445158ffSHong Zhang     if (ap_rmax < apnz) ap_rmax = apnz;
1669445158ffSHong Zhang 
1670445158ffSHong Zhang     /* if free space is not available, double the total space in the list */
1671445158ffSHong Zhang     if (current_space->local_remaining<apnz) {
16725f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
1673445158ffSHong Zhang       nspacedouble++;
1674445158ffSHong Zhang     }
1675445158ffSHong Zhang 
1676445158ffSHong Zhang     /* Copy data into free space, then initialize lnk */
16775f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt));
1678445158ffSHong Zhang 
1679445158ffSHong Zhang     current_space->array           += apnz;
1680445158ffSHong Zhang     current_space->local_used      += apnz;
1681445158ffSHong Zhang     current_space->local_remaining -= apnz;
1682445158ffSHong Zhang   }
1683681d504bSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
1684445158ffSHong Zhang   /* destroy list of free space and other temporary array(s) */
16855f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc2(api[am],&apj,api[am],&apv));
16865f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFreeSpaceContiguous(&free_space,apj));
16875f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLDestroy(lnk,lnkbt));
16889a6dcab7SHong Zhang 
1689aa690a28SHong Zhang   /* Create AP_loc for reuse */
16905f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
16915f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name));
1692aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
1693ec07b8f8SHong Zhang   if (ao) {
1694aa690a28SHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1695ec07b8f8SHong Zhang   } else {
1696ec07b8f8SHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1697ec07b8f8SHong Zhang   }
1698aa690a28SHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
1699aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
1700aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
1701aa690a28SHong Zhang 
1702aa690a28SHong Zhang   if (api[am]) {
17035f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
17045f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
1705aa690a28SHong Zhang   } else {
17065f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n"));
1707aa690a28SHong Zhang   }
1708aa690a28SHong Zhang #endif
1709aa690a28SHong Zhang 
1710681d504bSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1711681d504bSHong Zhang   /* ------------------------------------ */
17125f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOptionsPrefix(A,&prefix));
17135f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOptionsPrefix(ptap->Ro,prefix));
17145f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_"));
17155f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
17165f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOptionsPrefix(Cmpi,&prefix));
17175f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOptionsPrefix(ptap->C_oth,prefix));
17185f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_"));
17195f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
17205f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetAlgorithm(ptap->C_oth,"default"));
17215f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(ptap->C_oth,fill));
17225f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(ptap->C_oth));
17235f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSymbolic(ptap->C_oth));
172415a3b8e2SHong Zhang 
172567a12041SHong Zhang   /* (3) send coj of C_oth to other processors  */
172667a12041SHong Zhang   /* ------------------------------------------ */
172715a3b8e2SHong Zhang   /* determine row ownership */
17285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutCreate(comm,&rowmap));
1729445158ffSHong Zhang   rowmap->n  = pn;
1730445158ffSHong Zhang   rowmap->bs = 1;
17315f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(rowmap));
1732445158ffSHong Zhang   owners = rowmap->range;
173315a3b8e2SHong Zhang 
173415a3b8e2SHong Zhang   /* determine the number of messages to send, their lengths */
17355f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
17365f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(len_s,size));
17375f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscArrayzero(len_si,size));
173815a3b8e2SHong Zhang 
173967a12041SHong Zhang   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
174067a12041SHong Zhang   coi   = c_oth->i; coj = c_oth->j;
174167a12041SHong Zhang   con   = ptap->C_oth->rmap->n;
174215a3b8e2SHong Zhang   proc  = 0;
174367a12041SHong Zhang   for (i=0; i<con; i++) {
174415a3b8e2SHong Zhang     while (prmap[i] >= owners[proc+1]) proc++;
174515a3b8e2SHong Zhang     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
174615a3b8e2SHong Zhang     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
174715a3b8e2SHong Zhang   }
174815a3b8e2SHong Zhang 
174915a3b8e2SHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
175015a3b8e2SHong Zhang   owners_co[0] = 0;
175167a12041SHong Zhang   nsend        = 0;
175215a3b8e2SHong Zhang   for (proc=0; proc<size; proc++) {
175315a3b8e2SHong Zhang     owners_co[proc+1] = owners_co[proc] + len_si[proc];
175415a3b8e2SHong Zhang     if (len_s[proc]) {
1755445158ffSHong Zhang       nsend++;
175615a3b8e2SHong Zhang       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
175715a3b8e2SHong Zhang       len         += len_si[proc];
175815a3b8e2SHong Zhang     }
175915a3b8e2SHong Zhang   }
176015a3b8e2SHong Zhang 
176115a3b8e2SHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
17625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
17635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));
176415a3b8e2SHong Zhang 
176515a3b8e2SHong Zhang   /* post the Irecv and Isend of coj */
17665f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCommGetNewTag(comm,&tagj));
17675f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
17685f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nsend+1,&swaits));
176915a3b8e2SHong Zhang   for (proc=0, k=0; proc<size; proc++) {
177015a3b8e2SHong Zhang     if (!len_s[proc]) continue;
177115a3b8e2SHong Zhang     i    = owners_co[proc];
17725f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
177315a3b8e2SHong Zhang     k++;
177415a3b8e2SHong Zhang   }
177515a3b8e2SHong Zhang 
1776681d504bSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1777681d504bSHong Zhang   /* ---------------------------------------- */
17785f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOptionsPrefix(ptap->Rd,prefix));
17795f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAppendOptionsPrefix(ptap->Rd,"inner_diag_"));
17805f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
17815f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOptionsPrefix(Cmpi,&prefix));
17825f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOptionsPrefix(ptap->C_loc,prefix));
17835f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_"));
17845f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
17855f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetAlgorithm(ptap->C_loc,"default"));
17865f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(ptap->C_loc,fill));
17875f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(ptap->C_loc));
17885f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSymbolic(ptap->C_loc));
17894222ddf1SHong Zhang 
1790681d504bSHong Zhang   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1791681d504bSHong Zhang 
1792681d504bSHong Zhang   /* receives coj are complete */
1793445158ffSHong Zhang   for (i=0; i<nrecv; i++) {
17945f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
179515a3b8e2SHong Zhang   }
17965f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rwaits));
17975f80ce2aSJacob Faibussowitsch   if (nsend) CHKERRMPI(MPI_Waitall(nsend,swaits,sstatus));
179815a3b8e2SHong Zhang 
179978d30b94SHong Zhang   /* add received column indices into ta to update Crmax */
180078d30b94SHong Zhang   for (k=0; k<nrecv; k++) {/* k-th received message */
180178d30b94SHong Zhang     Jptr = buf_rj[k];
180278d30b94SHong Zhang     for (j=0; j<len_r[k]; j++) {
18035f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
180478d30b94SHong Zhang     }
180578d30b94SHong Zhang   }
18065f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableGetCount(ta,&Crmax));
18075f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableDestroy(&ta));
18089a6dcab7SHong Zhang 
180915a3b8e2SHong Zhang   /* (4) send and recv coi */
181015a3b8e2SHong Zhang   /*-----------------------*/
18115f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCommGetNewTag(comm,&tagi));
18125f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
18135f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(len+1,&buf_s));
181415a3b8e2SHong Zhang   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
181515a3b8e2SHong Zhang   for (proc=0,k=0; proc<size; proc++) {
181615a3b8e2SHong Zhang     if (!len_s[proc]) continue;
181715a3b8e2SHong Zhang     /* form outgoing message for i-structure:
181815a3b8e2SHong Zhang          buf_si[0]:                 nrows to be sent
181915a3b8e2SHong Zhang                [1:nrows]:           row index (global)
182015a3b8e2SHong Zhang                [nrows+1:2*nrows+1]: i-structure index
182115a3b8e2SHong Zhang     */
182215a3b8e2SHong Zhang     /*-------------------------------------------*/
182315a3b8e2SHong Zhang     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
182415a3b8e2SHong Zhang     buf_si_i    = buf_si + nrows+1;
182515a3b8e2SHong Zhang     buf_si[0]   = nrows;
182615a3b8e2SHong Zhang     buf_si_i[0] = 0;
182715a3b8e2SHong Zhang     nrows       = 0;
182815a3b8e2SHong Zhang     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
182915a3b8e2SHong Zhang       nzi = coi[i+1] - coi[i];
183015a3b8e2SHong Zhang       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
183115a3b8e2SHong Zhang       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
183215a3b8e2SHong Zhang       nrows++;
183315a3b8e2SHong Zhang     }
18345f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
183515a3b8e2SHong Zhang     k++;
183615a3b8e2SHong Zhang     buf_si += len_si[proc];
183715a3b8e2SHong Zhang   }
1838681d504bSHong Zhang   for (i=0; i<nrecv; i++) {
18395f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
184015a3b8e2SHong Zhang   }
18415f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(rwaits));
18425f80ce2aSJacob Faibussowitsch   if (nsend) CHKERRMPI(MPI_Waitall(nsend,swaits,sstatus));
184315a3b8e2SHong Zhang 
18445f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree4(len_s,len_si,sstatus,owners_co));
18455f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(len_ri));
18465f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(swaits));
18475f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_s));
1848a4ffb656SHong Zhang 
184967a12041SHong Zhang   /* (5) compute the local portion of Cmpi      */
185067a12041SHong Zhang   /* ------------------------------------------ */
185178d30b94SHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
18525f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFreeSpaceGet(Crmax,&free_space));
185315a3b8e2SHong Zhang   current_space = free_space;
185415a3b8e2SHong Zhang 
18555f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
1856445158ffSHong Zhang   for (k=0; k<nrecv; k++) {
185715a3b8e2SHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
185815a3b8e2SHong Zhang     nrows       = *buf_ri_k[k];
185915a3b8e2SHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1860a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
186115a3b8e2SHong Zhang   }
1862445158ffSHong Zhang 
186378d30b94SHong Zhang   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
18645f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));
186515a3b8e2SHong Zhang   for (i=0; i<pn; i++) {
186667a12041SHong Zhang     /* add C_loc into Cmpi */
186767a12041SHong Zhang     nzi  = c_loc->i[i+1] - c_loc->i[i];
186867a12041SHong Zhang     Jptr = c_loc->j + c_loc->i[i];
18695f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));
187015a3b8e2SHong Zhang 
187115a3b8e2SHong Zhang     /* add received col data into lnk */
1872445158ffSHong Zhang     for (k=0; k<nrecv; k++) { /* k-th received message */
187315a3b8e2SHong Zhang       if (i == *nextrow[k]) { /* i-th row */
187415a3b8e2SHong Zhang         nzi  = *(nextci[k]+1) - *nextci[k];
187515a3b8e2SHong Zhang         Jptr = buf_rj[k] + *nextci[k];
18765f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));
187715a3b8e2SHong Zhang         nextrow[k]++; nextci[k]++;
187815a3b8e2SHong Zhang       }
187915a3b8e2SHong Zhang     }
1880d0e9a2f7SHong Zhang     nzi = lnk[0];
18818cb82516SHong Zhang 
188215a3b8e2SHong Zhang     /* copy data into free space, then initialize lnk */
18835f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt));
18845f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
188515a3b8e2SHong Zhang   }
18865f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree3(buf_ri_k,nextrow,nextci));
18875f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLLDestroy(lnk,lnkbt));
18885f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFreeSpaceDestroy(free_space));
188980bb4639SHong Zhang 
1890ae5f0867Sstefano_zampini   /* local sizes and preallocation */
18915f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
1892ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
18935f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
18945f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
1895ac94c67aSTristan Konolige   }
18965f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
189715a3b8e2SHong Zhang   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
189815a3b8e2SHong Zhang 
1899445158ffSHong Zhang   /* members in merge */
19005f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(id_r));
19015f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(len_r));
19025f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_ri[0]));
19035f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_ri));
19045f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_rj[0]));
19055f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(buf_rj));
19065f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutDestroy(&rowmap));
190715a3b8e2SHong Zhang 
19085f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscCalloc1(pN,&ptap->apa));
19092259aa2eSHong Zhang 
19106718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
19116718818eSStefano Zampini   Cmpi->product->data    = ptap;
19126718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
19136718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
19146718818eSStefano Zampini 
19151a47ec54SHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
19161a47ec54SHong Zhang   Cmpi->assembled = PETSC_FALSE;
19170d3441aeSHong Zhang   PetscFunctionReturn(0);
19180d3441aeSHong Zhang }
19190d3441aeSHong Zhang 
1920aa690a28SHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
19210d3441aeSHong Zhang {
19226718818eSStefano Zampini   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
19230d3441aeSHong Zhang   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
19242dd9e643SHong Zhang   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
19256718818eSStefano Zampini   Mat_APMPI         *ptap;
19269ce11a7cSHong Zhang   Mat               AP_loc,C_loc,C_oth;
19270d3441aeSHong Zhang   PetscInt          i,rstart,rend,cm,ncols,row;
19288cb82516SHong Zhang   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
19298cb82516SHong Zhang   PetscScalar       *apa;
19300d3441aeSHong Zhang   const PetscInt    *cols;
19310d3441aeSHong Zhang   const PetscScalar *vals;
19320d3441aeSHong Zhang 
19330d3441aeSHong Zhang   PetscFunctionBegin;
19346718818eSStefano Zampini   MatCheckProduct(C,3);
19356718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
1936*28b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
1937*28b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
1938a9899c97SHong Zhang 
19395f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries(C));
1940e497d3c8SHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
1941748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
19425f80ce2aSJacob Faibussowitsch     CHKERRQ(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
19435f80ce2aSJacob Faibussowitsch     CHKERRQ(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
1944748c7196SHong Zhang   }
19450d3441aeSHong Zhang 
1946e497d3c8SHong Zhang   /* 2) get AP_loc */
19470d3441aeSHong Zhang   AP_loc = ptap->AP_loc;
19488cb82516SHong Zhang   ap = (Mat_SeqAIJ*)AP_loc->data;
19490d3441aeSHong Zhang 
1950e497d3c8SHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
19510d3441aeSHong Zhang   /*-----------------------------------------------------*/
1952748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1953748c7196SHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
19545f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
19555f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
1956e497d3c8SHong Zhang   }
19570d3441aeSHong Zhang 
19588cb82516SHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
19598cb82516SHong Zhang   /* ---------------------------------------------- */
19600d3441aeSHong Zhang   /* get data from symbolic products */
19618cb82516SHong Zhang   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
19622dd9e643SHong Zhang   if (ptap->P_oth) {
19638cb82516SHong Zhang     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
19642dd9e643SHong Zhang   }
19658cb82516SHong Zhang   apa   = ptap->apa;
1966681d504bSHong Zhang   api   = ap->i;
1967681d504bSHong Zhang   apj   = ap->j;
1968e497d3c8SHong Zhang   for (i=0; i<am; i++) {
1969445158ffSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1970e497d3c8SHong Zhang     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1971e497d3c8SHong Zhang     apnz = api[i+1] - api[i];
1972e497d3c8SHong Zhang     for (j=0; j<apnz; j++) {
1973e497d3c8SHong Zhang       col = apj[j+api[i]];
1974e497d3c8SHong Zhang       ap->a[j+ap->i[i]] = apa[col];
1975e497d3c8SHong Zhang       apa[col] = 0.0;
19760d3441aeSHong Zhang     }
19770d3441aeSHong Zhang   }
1978976452c9SRichard Tran Mills   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
19795f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectStateIncrease((PetscObject)AP_loc));
19800d3441aeSHong Zhang 
19818cb82516SHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
19825f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductNumeric(ptap->C_loc));
19835f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductNumeric(ptap->C_oth));
19849ce11a7cSHong Zhang   C_loc = ptap->C_loc;
19859ce11a7cSHong Zhang   C_oth = ptap->C_oth;
19860d3441aeSHong Zhang 
19870d3441aeSHong Zhang   /* add C_loc and Co to to C */
19885f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOwnershipRange(C,&rstart,&rend));
19890d3441aeSHong Zhang 
19900d3441aeSHong Zhang   /* C_loc -> C */
19910d3441aeSHong Zhang   cm    = C_loc->rmap->N;
19929ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ*)C_loc->data;
19938cb82516SHong Zhang   cols = c_seq->j;
19948cb82516SHong Zhang   vals = c_seq->a;
1995904d1e70Sandi selinger 
1996e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1997904d1e70Sandi selinger   /* when there are no off-processor parts.  */
19981de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
19991de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
20001de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
2001904d1e70Sandi selinger   if (C->assembled) {
2002904d1e70Sandi selinger     C->was_assembled = PETSC_TRUE;
2003904d1e70Sandi selinger     C->assembled     = PETSC_FALSE;
2004904d1e70Sandi selinger   }
2005904d1e70Sandi selinger   if (C->was_assembled) {
20060d3441aeSHong Zhang     for (i=0; i<cm; i++) {
20079ce11a7cSHong Zhang       ncols = c_seq->i[i+1] - c_seq->i[i];
20080d3441aeSHong Zhang       row = rstart + i;
20095f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
20108cb82516SHong Zhang       cols += ncols; vals += ncols;
20110d3441aeSHong Zhang     }
2012904d1e70Sandi selinger   } else {
20135f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
2014904d1e70Sandi selinger   }
20150d3441aeSHong Zhang 
20160d3441aeSHong Zhang   /* Co -> C, off-processor part */
20179ce11a7cSHong Zhang   cm = C_oth->rmap->N;
20189ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ*)C_oth->data;
20198cb82516SHong Zhang   cols = c_seq->j;
20208cb82516SHong Zhang   vals = c_seq->a;
20219ce11a7cSHong Zhang   for (i=0; i<cm; i++) {
20229ce11a7cSHong Zhang     ncols = c_seq->i[i+1] - c_seq->i[i];
20230d3441aeSHong Zhang     row = p->garray[i];
20245f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
20258cb82516SHong Zhang     cols += ncols; vals += ncols;
20260d3441aeSHong Zhang   }
2027904d1e70Sandi selinger 
20285f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
20295f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
20300d3441aeSHong Zhang 
2031748c7196SHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
20320d3441aeSHong Zhang   PetscFunctionReturn(0);
20330d3441aeSHong Zhang }
20344222ddf1SHong Zhang 
20354222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
20364222ddf1SHong Zhang {
20374222ddf1SHong Zhang   Mat_Product         *product = C->product;
20384222ddf1SHong Zhang   Mat                 A=product->A,P=product->B;
20394222ddf1SHong Zhang   MatProductAlgorithm alg=product->alg;
20404222ddf1SHong Zhang   PetscReal           fill=product->fill;
20414222ddf1SHong Zhang   PetscBool           flg;
20424222ddf1SHong Zhang 
20434222ddf1SHong Zhang   PetscFunctionBegin;
20444222ddf1SHong Zhang   /* scalable: do R=P^T locally, then C=R*A*P */
20455f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrcmp(alg,"scalable",&flg));
20464222ddf1SHong Zhang   if (flg) {
20475f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C));
20484222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20494222ddf1SHong Zhang     goto next;
20504222ddf1SHong Zhang   }
20514222ddf1SHong Zhang 
20524222ddf1SHong Zhang   /* nonscalable: do R=P^T locally, then C=R*A*P */
20535f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrcmp(alg,"nonscalable",&flg));
20544222ddf1SHong Zhang   if (flg) {
20555f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C));
20564222ddf1SHong Zhang     goto next;
20574222ddf1SHong Zhang   }
20584222ddf1SHong Zhang 
20594222ddf1SHong Zhang   /* allatonce */
20605f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrcmp(alg,"allatonce",&flg));
20614222ddf1SHong Zhang   if (flg) {
20625f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C));
20634222ddf1SHong Zhang     goto next;
20644222ddf1SHong Zhang   }
20654222ddf1SHong Zhang 
20664222ddf1SHong Zhang   /* allatonce_merged */
20675f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrcmp(alg,"allatonce_merged",&flg));
20684222ddf1SHong Zhang   if (flg) {
20695f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C));
20704222ddf1SHong Zhang     goto next;
20714222ddf1SHong Zhang   }
20724222ddf1SHong Zhang 
20734e84afc0SStefano Zampini   /* backend general code */
20745f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrcmp(alg,"backend",&flg));
20754e84afc0SStefano Zampini   if (flg) {
20765f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSymbolic_MPIAIJBACKEND(C));
20774e84afc0SStefano Zampini     PetscFunctionReturn(0);
20784e84afc0SStefano Zampini   }
20794e84afc0SStefano Zampini 
20804222ddf1SHong Zhang   /* hypre */
20814222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
20825f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrcmp(alg,"hypre",&flg));
20834222ddf1SHong Zhang   if (flg) {
20845f80ce2aSJacob Faibussowitsch     CHKERRQ(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C));
20854222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20864222ddf1SHong Zhang     PetscFunctionReturn(0);
20874222ddf1SHong Zhang   }
20884222ddf1SHong Zhang #endif
20894222ddf1SHong Zhang   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
20904222ddf1SHong Zhang 
20914222ddf1SHong Zhang next:
20924222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_PtAP;
20934222ddf1SHong Zhang   PetscFunctionReturn(0);
20944222ddf1SHong Zhang }
2095