xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision d0609ced746bc51b019815ca91d747429db24893)
182412ba7SHong Zhang 
242c57489SHong Zhang /*
342c57489SHong Zhang   Defines projective product routines where A is a MPIAIJ matrix
442c57489SHong Zhang           C = P^T * A * P
542c57489SHong Zhang */
642c57489SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <../src/mat/impls/aij/mpi/mpiaij.h>
10c6db04a5SJed Brown #include <petscbt.h>
118563dfccSBarry Smith #include <petsctime.h>
12694f88d4SFande Kong #include <petsc/private/hashmapiv.h>
134a56b808SFande Kong #include <petsc/private/hashseti.h>
144a56b808SFande Kong #include <petscsf.h>
1542c57489SHong Zhang 
16cf97cf3cSHong Zhang PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
17cf97cf3cSHong Zhang {
18cf97cf3cSHong Zhang   PetscBool         iascii;
19cf97cf3cSHong Zhang   PetscViewerFormat format;
206718818eSStefano Zampini   Mat_APMPI         *ptap;
21cf97cf3cSHong Zhang 
22cf97cf3cSHong Zhang   PetscFunctionBegin;
236718818eSStefano Zampini   MatCheckProduct(A,1);
246718818eSStefano Zampini   ptap = (Mat_APMPI*)A->product->data;
259566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
26cf97cf3cSHong Zhang   if (iascii) {
279566063dSJacob Faibussowitsch     PetscCall(PetscViewerGetFormat(viewer,&format));
28cf97cf3cSHong Zhang     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29cf97cf3cSHong Zhang       if (ptap->algType == 0) {
309566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n"));
31cf97cf3cSHong Zhang       } else if (ptap->algType == 1) {
329566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n"));
334a56b808SFande Kong       } else if (ptap->algType == 2) {
349566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n"));
354a56b808SFande Kong       } else if (ptap->algType == 3) {
369566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n"));
37cf97cf3cSHong Zhang       }
38cf97cf3cSHong Zhang     }
39cf97cf3cSHong Zhang   }
40cf97cf3cSHong Zhang   PetscFunctionReturn(0);
41cf97cf3cSHong Zhang }
42cf97cf3cSHong Zhang 
436718818eSStefano Zampini PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44cc6cb787SHong Zhang {
456718818eSStefano Zampini   Mat_APMPI           *ptap = (Mat_APMPI*)data;
4660552ceaSHong Zhang   Mat_Merge_SeqsToMPI *merge;
47cc6cb787SHong Zhang 
48cc6cb787SHong Zhang   PetscFunctionBegin;
499566063dSJacob Faibussowitsch   PetscCall(PetscFree2(ptap->startsj_s,ptap->startsj_r));
509566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->bufa));
519566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_loc));
529566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_oth));
539566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
549566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Rd));
559566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Ro));
568403a639SHong Zhang   if (ptap->AP_loc) { /* used by alg_rap */
57681d504bSHong Zhang     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
589566063dSJacob Faibussowitsch     PetscCall(PetscFree(ap->i));
599566063dSJacob Faibussowitsch     PetscCall(PetscFree2(ap->j,ap->a));
609566063dSJacob Faibussowitsch     PetscCall(MatDestroy(&ptap->AP_loc));
618403a639SHong Zhang   } else { /* used by alg_ptap */
629566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->api));
639566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->apj));
64681d504bSHong Zhang   }
659566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_loc));
669566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_oth));
679566063dSJacob Faibussowitsch   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68a560ef98SHong Zhang 
699566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Pt));
7060552ceaSHong Zhang 
7160552ceaSHong Zhang   merge = ptap->merge;
728403a639SHong Zhang   if (merge) { /* used by alg_ptap */
739566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->id_r));
749566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_s));
759566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_r));
769566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bi));
779566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bj));
789566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri[0]));
799566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri));
809566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj[0]));
819566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj));
829566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coi));
839566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coj));
849566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->owners_co));
859566063dSJacob Faibussowitsch     PetscCall(PetscLayoutDestroy(&merge->rowmap));
869566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->merge));
87bf0cc555SLisandro Dalcin   }
889566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
894a56b808SFande Kong 
909566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&ptap->sf));
919566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_othi));
929566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_rmti));
939566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap));
94cc6cb787SHong Zhang   PetscFunctionReturn(0);
95cc6cb787SHong Zhang }
96cc6cb787SHong Zhang 
97cf742fccSHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
98cf742fccSHong Zhang {
996718818eSStefano Zampini   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
100cf742fccSHong Zhang   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
10192ec70a1SHong Zhang   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1026718818eSStefano Zampini   Mat_APMPI         *ptap;
103cf742fccSHong Zhang   Mat               AP_loc,C_loc,C_oth;
104a3bb6f32SFande Kong   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
105cf742fccSHong Zhang   PetscScalar       *apa;
106cf742fccSHong Zhang   const PetscInt    *cols;
107cf742fccSHong Zhang   const PetscScalar *vals;
108cf742fccSHong Zhang 
109cf742fccSHong Zhang   PetscFunctionBegin;
1106718818eSStefano Zampini   MatCheckProduct(C,3);
1116718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
11228b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
11328b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
11480da8df7SHong Zhang 
1159566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
116cf742fccSHong Zhang 
117cf742fccSHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
118cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1199566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
1209566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
121cf742fccSHong Zhang   }
122cf742fccSHong Zhang 
123cf742fccSHong Zhang   /* 2) get AP_loc */
124cf742fccSHong Zhang   AP_loc = ptap->AP_loc;
125cf742fccSHong Zhang   ap = (Mat_SeqAIJ*)AP_loc->data;
126cf742fccSHong Zhang 
127cf742fccSHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128cf742fccSHong Zhang   /*-----------------------------------------------------*/
129cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
130cf742fccSHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1319566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
1329566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
133cf742fccSHong Zhang   }
134cf742fccSHong Zhang 
135cf742fccSHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
136cf742fccSHong Zhang   /* ---------------------------------------------- */
137cf742fccSHong Zhang   /* get data from symbolic products */
138cf742fccSHong Zhang   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
13952f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
14052f7967eSHong Zhang 
141cf742fccSHong Zhang   api   = ap->i;
142cf742fccSHong Zhang   apj   = ap->j;
1439566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj));
144cf742fccSHong Zhang   for (i=0; i<am; i++) {
145cf742fccSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
146cf742fccSHong Zhang     apnz = api[i+1] - api[i];
147b4e8d1b6SHong Zhang     apa = ap->a + api[i];
1489566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(apa,apnz));
149b4e8d1b6SHong Zhang     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
150cf742fccSHong Zhang   }
1519566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj));
15208401ef6SPierre Jolivet   PetscCheck(api[AP_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,api[AP_loc->rmap->n],nout);
153cf742fccSHong Zhang 
154cf742fccSHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
155154d0d78SFande Kong   /* Always use scalable version since we are in the MPI scalable version */
1569566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc));
1579566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth));
158cf742fccSHong Zhang 
159cf742fccSHong Zhang   C_loc = ptap->C_loc;
160cf742fccSHong Zhang   C_oth = ptap->C_oth;
161cf742fccSHong Zhang 
162cf742fccSHong Zhang   /* add C_loc and Co to to C */
1639566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C,&rstart,&rend));
164cf742fccSHong Zhang 
165cf742fccSHong Zhang   /* C_loc -> C */
166cf742fccSHong Zhang   cm    = C_loc->rmap->N;
167cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ*)C_loc->data;
168cf742fccSHong Zhang   cols = c_seq->j;
169cf742fccSHong Zhang   vals = c_seq->a;
1709566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j));
171edeac6deSandi selinger 
172e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
173edeac6deSandi selinger   /* when there are no off-processor parts.  */
1741de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1751de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1761de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
177edeac6deSandi selinger   if (C->assembled) {
178edeac6deSandi selinger     C->was_assembled = PETSC_TRUE;
179edeac6deSandi selinger     C->assembled     = PETSC_FALSE;
180edeac6deSandi selinger   }
181edeac6deSandi selinger   if (C->was_assembled) {
182cf742fccSHong Zhang     for (i=0; i<cm; i++) {
183cf742fccSHong Zhang       ncols = c_seq->i[i+1] - c_seq->i[i];
184cf742fccSHong Zhang       row = rstart + i;
1859566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
186cf742fccSHong Zhang       cols += ncols; vals += ncols;
187cf742fccSHong Zhang     }
188edeac6deSandi selinger   } else {
1899566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
190edeac6deSandi selinger   }
1919566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j));
19208401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
193cf742fccSHong Zhang 
194cf742fccSHong Zhang   /* Co -> C, off-processor part */
195cf742fccSHong Zhang   cm = C_oth->rmap->N;
196cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ*)C_oth->data;
197cf742fccSHong Zhang   cols = c_seq->j;
198cf742fccSHong Zhang   vals = c_seq->a;
1999566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j));
200cf742fccSHong Zhang   for (i=0; i<cm; i++) {
201cf742fccSHong Zhang     ncols = c_seq->i[i+1] - c_seq->i[i];
202cf742fccSHong Zhang     row = p->garray[i];
2039566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
204cf742fccSHong Zhang     cols += ncols; vals += ncols;
205cf742fccSHong Zhang   }
2069566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
2079566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
208cf742fccSHong Zhang 
209cf742fccSHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
21080da8df7SHong Zhang 
2119566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j));
21208401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_oth->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
213cf742fccSHong Zhang   PetscFunctionReturn(0);
214cf742fccSHong Zhang }
215cf742fccSHong Zhang 
2164222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
2170d3441aeSHong Zhang {
2183cdca5ebSHong Zhang   Mat_APMPI           *ptap;
2196718818eSStefano Zampini   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
2202259aa2eSHong Zhang   MPI_Comm            comm;
2212259aa2eSHong Zhang   PetscMPIInt         size,rank;
2224222ddf1SHong Zhang   Mat                 P_loc,P_oth;
22315a3b8e2SHong Zhang   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
224d0e9a2f7SHong Zhang   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
225d0e9a2f7SHong Zhang   PetscInt            *lnk,i,k,pnz,row,nsend;
226ec4bef21SJose E. Roman   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
227ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted=0;
22815a3b8e2SHong Zhang   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
2297c70b0e9SStefano Zampini   const PetscInt      *owners;
2307c70b0e9SStefano Zampini   PetscInt            len,proc,*dnz,*onz,nzi,nspacedouble;
23115a3b8e2SHong Zhang   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
23215a3b8e2SHong Zhang   MPI_Request         *swaits,*rwaits;
23315a3b8e2SHong Zhang   MPI_Status          *sstatus,rstatus;
234445158ffSHong Zhang   PetscLayout         rowmap;
235445158ffSHong Zhang   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
236445158ffSHong Zhang   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
237a3bb6f32SFande Kong   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
23852f7967eSHong Zhang   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
23967a12041SHong Zhang   PetscScalar         *apv;
24078d30b94SHong Zhang   PetscTable          ta;
241b92f168fSBarry Smith   MatType             mtype;
242e83e5f86SFande Kong   const char          *prefix;
243aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
2448cb82516SHong Zhang   PetscReal           apfill;
245aa690a28SHong Zhang #endif
24667a12041SHong Zhang 
24767a12041SHong Zhang   PetscFunctionBegin;
2486718818eSStefano Zampini   MatCheckProduct(Cmpi,4);
24928b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
2509566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
2519566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm,&size));
2529566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm,&rank));
253ae5f0867Sstefano_zampini 
25452f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
25552f7967eSHong Zhang 
256ae5f0867Sstefano_zampini   /* create symbolic parallel matrix Cmpi */
2579566063dSJacob Faibussowitsch   PetscCall(MatGetType(A,&mtype));
2589566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi,mtype));
259ae5f0867Sstefano_zampini 
2603cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
2619566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
262e953e47cSHong Zhang   ptap->reuse = MAT_INITIAL_MATRIX;
263cf97cf3cSHong Zhang   ptap->algType = 0;
264e953e47cSHong Zhang 
265e953e47cSHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
2669566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth));
267e953e47cSHong Zhang   /* get P_loc by taking all local rows of P */
2689566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc));
26976db11f6SHong Zhang 
27076db11f6SHong Zhang   ptap->P_loc = P_loc;
27176db11f6SHong Zhang   ptap->P_oth = P_oth;
272e953e47cSHong Zhang 
273e953e47cSHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
274e953e47cSHong Zhang   /* --------------------------------- */
2759566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
2769566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));
277e953e47cSHong Zhang 
278e953e47cSHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
279e953e47cSHong Zhang   /* ----------------------------------------------------------------- */
28076db11f6SHong Zhang   p_loc  = (Mat_SeqAIJ*)P_loc->data;
28152f7967eSHong Zhang   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
282e953e47cSHong Zhang 
283e953e47cSHong Zhang   /* create and initialize a linked list */
2849566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
28576db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
28676db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
2879566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */
288e953e47cSHong Zhang 
2899566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
290e953e47cSHong Zhang 
291e953e47cSHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
29252f7967eSHong Zhang   if (ao) {
2939566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
29452f7967eSHong Zhang   } else {
2959566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
29652f7967eSHong Zhang   }
297e953e47cSHong Zhang   current_space = free_space;
298e953e47cSHong Zhang   nspacedouble  = 0;
299e953e47cSHong Zhang 
3009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+1,&api));
301e953e47cSHong Zhang   api[0] = 0;
302e953e47cSHong Zhang   for (i=0; i<am; i++) {
303e953e47cSHong Zhang     /* diagonal portion: Ad[i,:]*P */
304e953e47cSHong Zhang     ai = ad->i; pi = p_loc->i;
305e953e47cSHong Zhang     nzi = ai[i+1] - ai[i];
306e953e47cSHong Zhang     aj  = ad->j + ai[i];
307e953e47cSHong Zhang     for (j=0; j<nzi; j++) {
308e953e47cSHong Zhang       row  = aj[j];
309e953e47cSHong Zhang       pnz  = pi[row+1] - pi[row];
310e953e47cSHong Zhang       Jptr = p_loc->j + pi[row];
311e953e47cSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
3129566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
313e953e47cSHong Zhang     }
314e953e47cSHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
31552f7967eSHong Zhang     if (ao) {
316e953e47cSHong Zhang       ai = ao->i; pi = p_oth->i;
317e953e47cSHong Zhang       nzi = ai[i+1] - ai[i];
318e953e47cSHong Zhang       aj  = ao->j + ai[i];
319e953e47cSHong Zhang       for (j=0; j<nzi; j++) {
320e953e47cSHong Zhang         row  = aj[j];
321e953e47cSHong Zhang         pnz  = pi[row+1] - pi[row];
322e953e47cSHong Zhang         Jptr = p_oth->j + pi[row];
3239566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
324e953e47cSHong Zhang       }
32552f7967eSHong Zhang     }
326e953e47cSHong Zhang     apnz     = lnk[0];
327e953e47cSHong Zhang     api[i+1] = api[i] + apnz;
328e953e47cSHong Zhang 
329e953e47cSHong Zhang     /* if free space is not available, double the total space in the list */
330e953e47cSHong Zhang     if (current_space->local_remaining<apnz) {
3319566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
332e953e47cSHong Zhang       nspacedouble++;
333e953e47cSHong Zhang     }
334e953e47cSHong Zhang 
335e953e47cSHong Zhang     /* Copy data into free space, then initialize lnk */
3369566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk));
337e953e47cSHong Zhang 
338e953e47cSHong Zhang     current_space->array           += apnz;
339e953e47cSHong Zhang     current_space->local_used      += apnz;
340e953e47cSHong Zhang     current_space->local_remaining -= apnz;
341e953e47cSHong Zhang   }
342e953e47cSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
343e953e47cSHong Zhang   /* destroy list of free space and other temporary array(s) */
3449566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(api[am],&apj,api[am],&apv));
3459566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space,apj));
3469566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
347e953e47cSHong Zhang 
348e953e47cSHong Zhang   /* Create AP_loc for reuse */
3499566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
3509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
351e953e47cSHong Zhang 
352e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
35352f7967eSHong Zhang   if (ao) {
354e953e47cSHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
35552f7967eSHong Zhang   } else {
35652f7967eSHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
35752f7967eSHong Zhang   }
358e953e47cSHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
359e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
360e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
361e953e47cSHong Zhang 
362e953e47cSHong Zhang   if (api[am]) {
3639566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
3649566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
365e953e47cSHong Zhang   } else {
3669566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n"));
367e953e47cSHong Zhang   }
368e953e47cSHong Zhang #endif
369e953e47cSHong Zhang 
370e953e47cSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
3714222ddf1SHong Zhang   /* -------------------------------------- */
3729566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
3739566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A,&prefix));
3749566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix));
3759566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_"));
3764222ddf1SHong Zhang 
3779566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
3789566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth,"sorted"));
3799566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth,fill));
3809566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
3819566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
382e953e47cSHong Zhang 
383e953e47cSHong Zhang   /* (3) send coj of C_oth to other processors  */
384e953e47cSHong Zhang   /* ------------------------------------------ */
385e953e47cSHong Zhang   /* determine row ownership */
3869566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm,&rowmap));
3879566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
3889566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
3899566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
3909566063dSJacob Faibussowitsch   PetscCall(PetscLayoutGetRanges(rowmap,&owners));
391e953e47cSHong Zhang 
392e953e47cSHong Zhang   /* determine the number of messages to send, their lengths */
3939566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
3949566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s,size));
3959566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si,size));
396e953e47cSHong Zhang 
397e953e47cSHong Zhang   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
398e953e47cSHong Zhang   coi   = c_oth->i; coj = c_oth->j;
399e953e47cSHong Zhang   con   = ptap->C_oth->rmap->n;
400e953e47cSHong Zhang   proc  = 0;
4019566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj));
402e953e47cSHong Zhang   for (i=0; i<con; i++) {
403e953e47cSHong Zhang     while (prmap[i] >= owners[proc+1]) proc++;
404e953e47cSHong Zhang     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
405e953e47cSHong Zhang     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
406e953e47cSHong Zhang   }
407e953e47cSHong Zhang 
408e953e47cSHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
409e953e47cSHong Zhang   owners_co[0] = 0;
410e953e47cSHong Zhang   nsend        = 0;
411e953e47cSHong Zhang   for (proc=0; proc<size; proc++) {
412e953e47cSHong Zhang     owners_co[proc+1] = owners_co[proc] + len_si[proc];
413e953e47cSHong Zhang     if (len_s[proc]) {
414e953e47cSHong Zhang       nsend++;
415e953e47cSHong Zhang       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
416e953e47cSHong Zhang       len         += len_si[proc];
417e953e47cSHong Zhang     }
418e953e47cSHong Zhang   }
419e953e47cSHong Zhang 
420e953e47cSHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
4219566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
4229566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));
423e953e47cSHong Zhang 
424e953e47cSHong Zhang   /* post the Irecv and Isend of coj */
4259566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm,&tagj));
4269566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
4279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend+1,&swaits));
428e953e47cSHong Zhang   for (proc=0, k=0; proc<size; proc++) {
429e953e47cSHong Zhang     if (!len_s[proc]) continue;
430e953e47cSHong Zhang     i    = owners_co[proc];
4319566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
432e953e47cSHong Zhang     k++;
433e953e47cSHong Zhang   }
434e953e47cSHong Zhang 
435e953e47cSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
436e953e47cSHong Zhang   /* ---------------------------------------- */
4379566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
4389566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
4399566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default"));
4409566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc,fill));
4414222ddf1SHong Zhang 
4429566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix));
4439566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_"));
4444222ddf1SHong Zhang 
4459566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
4469566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
4474222ddf1SHong Zhang 
448e953e47cSHong Zhang   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
4499566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j));
450e953e47cSHong Zhang 
451e953e47cSHong Zhang   /* receives coj are complete */
452e953e47cSHong Zhang   for (i=0; i<nrecv; i++) {
4539566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
454e953e47cSHong Zhang   }
4559566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
4569566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));
457e953e47cSHong Zhang 
458e953e47cSHong Zhang   /* add received column indices into ta to update Crmax */
459e953e47cSHong Zhang   for (k=0; k<nrecv; k++) {/* k-th received message */
460e953e47cSHong Zhang     Jptr = buf_rj[k];
461e953e47cSHong Zhang     for (j=0; j<len_r[k]; j++) {
4629566063dSJacob Faibussowitsch       PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
463e953e47cSHong Zhang     }
464e953e47cSHong Zhang   }
4659566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta,&Crmax));
4669566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
467e953e47cSHong Zhang 
468e953e47cSHong Zhang   /* (4) send and recv coi */
469e953e47cSHong Zhang   /*-----------------------*/
4709566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm,&tagi));
4719566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
4729566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len+1,&buf_s));
473e953e47cSHong Zhang   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
474e953e47cSHong Zhang   for (proc=0,k=0; proc<size; proc++) {
475e953e47cSHong Zhang     if (!len_s[proc]) continue;
476e953e47cSHong Zhang     /* form outgoing message for i-structure:
477e953e47cSHong Zhang          buf_si[0]:                 nrows to be sent
478e953e47cSHong Zhang                [1:nrows]:           row index (global)
479e953e47cSHong Zhang                [nrows+1:2*nrows+1]: i-structure index
480e953e47cSHong Zhang     */
481e953e47cSHong Zhang     /*-------------------------------------------*/
482e953e47cSHong Zhang     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
483e953e47cSHong Zhang     buf_si_i    = buf_si + nrows+1;
484e953e47cSHong Zhang     buf_si[0]   = nrows;
485e953e47cSHong Zhang     buf_si_i[0] = 0;
486e953e47cSHong Zhang     nrows       = 0;
487e953e47cSHong Zhang     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
488e953e47cSHong Zhang       nzi = coi[i+1] - coi[i];
489e953e47cSHong Zhang       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
490e953e47cSHong Zhang       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
491e953e47cSHong Zhang       nrows++;
492e953e47cSHong Zhang     }
4939566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
494e953e47cSHong Zhang     k++;
495e953e47cSHong Zhang     buf_si += len_si[proc];
496e953e47cSHong Zhang   }
497e953e47cSHong Zhang   for (i=0; i<nrecv; i++) {
4989566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
499e953e47cSHong Zhang   }
5009566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
5019566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));
502e953e47cSHong Zhang 
5039566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co));
5049566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
5059566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
5069566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
507b4e8d1b6SHong Zhang 
508e953e47cSHong Zhang   /* (5) compute the local portion of Cmpi      */
509e953e47cSHong Zhang   /* ------------------------------------------ */
510e953e47cSHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
5119566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax,&free_space));
512e953e47cSHong Zhang   current_space = free_space;
513e953e47cSHong Zhang 
5149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
515e953e47cSHong Zhang   for (k=0; k<nrecv; k++) {
516e953e47cSHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
517e953e47cSHong Zhang     nrows       = *buf_ri_k[k];
518e953e47cSHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
519a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
520e953e47cSHong Zhang   }
521e953e47cSHong Zhang 
522*d0609cedSBarry Smith   MatPreallocateBegin(comm,pn,pn,dnz,onz);
5239566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
524e953e47cSHong Zhang   for (i=0; i<pn; i++) {
525e953e47cSHong Zhang     /* add C_loc into Cmpi */
526e953e47cSHong Zhang     nzi  = c_loc->i[i+1] - c_loc->i[i];
527e953e47cSHong Zhang     Jptr = c_loc->j + c_loc->i[i];
5289566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));
529e953e47cSHong Zhang 
530e953e47cSHong Zhang     /* add received col data into lnk */
531e953e47cSHong Zhang     for (k=0; k<nrecv; k++) { /* k-th received message */
532e953e47cSHong Zhang       if (i == *nextrow[k]) { /* i-th row */
533e953e47cSHong Zhang         nzi  = *(nextci[k]+1) - *nextci[k];
534e953e47cSHong Zhang         Jptr = buf_rj[k] + *nextci[k];
5359566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));
536e953e47cSHong Zhang         nextrow[k]++; nextci[k]++;
537e953e47cSHong Zhang       }
538e953e47cSHong Zhang     }
539e953e47cSHong Zhang     nzi = lnk[0];
540e953e47cSHong Zhang 
541e953e47cSHong Zhang     /* copy data into free space, then initialize lnk */
5429566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk));
5439566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
544e953e47cSHong Zhang   }
5459566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k,nextrow,nextci));
5469566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
5479566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
548e953e47cSHong Zhang 
549e953e47cSHong Zhang   /* local sizes and preallocation */
5509566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
551ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
5529566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
5539566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
554ac94c67aSTristan Konolige   }
5559566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
556*d0609cedSBarry Smith   MatPreallocateEnd(dnz,onz);
557e953e47cSHong Zhang 
558e953e47cSHong Zhang   /* members in merge */
5599566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
5609566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
5619566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
5629566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
5639566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
5649566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
5659566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
566e953e47cSHong Zhang 
567a3bb6f32SFande Kong   nout = 0;
5689566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j));
56908401ef6SPierre Jolivet   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_oth->i[ptap->C_oth->rmap->n],nout);
5709566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j));
57108401ef6SPierre Jolivet   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_loc->i[ptap->C_loc->rmap->n],nout);
572a3bb6f32SFande Kong 
5736718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
5746718818eSStefano Zampini   Cmpi->product->data    = ptap;
5756718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
5766718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
5776718818eSStefano Zampini 
5786718818eSStefano Zampini   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
5796718818eSStefano Zampini   Cmpi->assembled        = PETSC_FALSE;
5806718818eSStefano Zampini   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
581e953e47cSHong Zhang   PetscFunctionReturn(0);
582e953e47cSHong Zhang }
583e953e47cSHong Zhang 
5849fbee547SJacob Faibussowitsch static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
5854a56b808SFande Kong {
5864a56b808SFande Kong   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
5874a56b808SFande Kong   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
5884a56b808SFande Kong   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
589bc8e477aSFande Kong   PetscInt             pcstart,pcend,column,offset;
5904a56b808SFande Kong 
5914a56b808SFande Kong   PetscFunctionBegin;
5924a56b808SFande Kong   pcstart = P->cmap->rstart;
593bc8e477aSFande Kong   pcstart *= dof;
5944a56b808SFande Kong   pcend   = P->cmap->rend;
595bc8e477aSFande Kong   pcend   *= dof;
5964a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
5974a56b808SFande Kong   ai = ad->i;
5984a56b808SFande Kong   nzi = ai[i+1] - ai[i];
5994a56b808SFande Kong   aj  = ad->j + ai[i];
6004a56b808SFande Kong   for (j=0; j<nzi; j++) {
6014a56b808SFande Kong     row  = aj[j];
602bc8e477aSFande Kong     offset = row%dof;
603bc8e477aSFande Kong     row   /= dof;
6044a56b808SFande Kong     nzpi = pd->i[row+1] - pd->i[row];
6054a56b808SFande Kong     pj  = pd->j + pd->i[row];
6064a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6079566063dSJacob Faibussowitsch       PetscCall(PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart));
6084a56b808SFande Kong     }
6094a56b808SFande Kong   }
610bc8e477aSFande Kong   /* off diag P */
6114a56b808SFande Kong   for (j=0; j<nzi; j++) {
6124a56b808SFande Kong     row  = aj[j];
613bc8e477aSFande Kong     offset = row%dof;
614bc8e477aSFande Kong     row   /= dof;
6154a56b808SFande Kong     nzpi = po->i[row+1] - po->i[row];
6164a56b808SFande Kong     pj  = po->j + po->i[row];
6174a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6189566063dSJacob Faibussowitsch       PetscCall(PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset));
6194a56b808SFande Kong     }
6204a56b808SFande Kong   }
6214a56b808SFande Kong 
6224a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6234a56b808SFande Kong   if (ao) {
6244a56b808SFande Kong     ai = ao->i;
6254a56b808SFande Kong     pi = p_oth->i;
6264a56b808SFande Kong     nzi = ai[i+1] - ai[i];
6274a56b808SFande Kong     aj  = ao->j + ai[i];
6284a56b808SFande Kong     for (j=0; j<nzi; j++) {
6294a56b808SFande Kong       row  = aj[j];
630bc8e477aSFande Kong       offset = a->garray[row]%dof;
631bc8e477aSFande Kong       row  = map[row];
6324a56b808SFande Kong       pnz  = pi[row+1] - pi[row];
6334a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6344a56b808SFande Kong       for (col=0; col<pnz; col++) {
635bc8e477aSFande Kong         column = p_othcols[col] * dof + offset;
6364a56b808SFande Kong         if (column>=pcstart && column<pcend) {
6379566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(dht,column));
6384a56b808SFande Kong         } else {
6399566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(oht,column));
6404a56b808SFande Kong         }
6414a56b808SFande Kong       }
6424a56b808SFande Kong     }
6434a56b808SFande Kong   } /* end if (ao) */
6444a56b808SFande Kong   PetscFunctionReturn(0);
6454a56b808SFande Kong }
6464a56b808SFande Kong 
6479fbee547SJacob Faibussowitsch static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
6484a56b808SFande Kong {
6494a56b808SFande Kong   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
6504a56b808SFande Kong   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
651bc8e477aSFande Kong   PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
6524a56b808SFande Kong   PetscScalar          ra,*aa,*pa;
6534a56b808SFande Kong 
6544a56b808SFande Kong   PetscFunctionBegin;
6554a56b808SFande Kong   pcstart = P->cmap->rstart;
656bc8e477aSFande Kong   pcstart *= dof;
657bc8e477aSFande Kong 
6584a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6594a56b808SFande Kong   ai  = ad->i;
6604a56b808SFande Kong   nzi = ai[i+1] - ai[i];
6614a56b808SFande Kong   aj  = ad->j + ai[i];
6624a56b808SFande Kong   aa  = ad->a + ai[i];
6634a56b808SFande Kong   for (j=0; j<nzi; j++) {
6644a56b808SFande Kong     ra   = aa[j];
6654a56b808SFande Kong     row  = aj[j];
666bc8e477aSFande Kong     offset = row%dof;
667bc8e477aSFande Kong     row    /= dof;
6684a56b808SFande Kong     nzpi = pd->i[row+1] - pd->i[row];
6694a56b808SFande Kong     pj = pd->j + pd->i[row];
6704a56b808SFande Kong     pa = pd->a + pd->i[row];
6714a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6729566063dSJacob Faibussowitsch       PetscCall(PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]));
6734a56b808SFande Kong     }
6749566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0*nzpi));
6754a56b808SFande Kong   }
6764a56b808SFande Kong   for (j=0; j<nzi; j++) {
6774a56b808SFande Kong     ra   = aa[j];
6784a56b808SFande Kong     row  = aj[j];
679bc8e477aSFande Kong     offset = row%dof;
680bc8e477aSFande Kong     row   /= dof;
6814a56b808SFande Kong     nzpi = po->i[row+1] - po->i[row];
6824a56b808SFande Kong     pj = po->j + po->i[row];
6834a56b808SFande Kong     pa = po->a + po->i[row];
6844a56b808SFande Kong     for (k=0; k<nzpi; k++) {
6859566063dSJacob Faibussowitsch       PetscCall(PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]));
6864a56b808SFande Kong     }
6879566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0*nzpi));
6884a56b808SFande Kong   }
6894a56b808SFande Kong 
6904a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6914a56b808SFande Kong   if (ao) {
6924a56b808SFande Kong     ai = ao->i;
6934a56b808SFande Kong     pi = p_oth->i;
6944a56b808SFande Kong     nzi = ai[i+1] - ai[i];
6954a56b808SFande Kong     aj  = ao->j + ai[i];
6964a56b808SFande Kong     aa  = ao->a + ai[i];
6974a56b808SFande Kong     for (j=0; j<nzi; j++) {
6984a56b808SFande Kong       row  = aj[j];
699bc8e477aSFande Kong       offset = a->garray[row]%dof;
700bc8e477aSFande Kong       row    = map[row];
7014a56b808SFande Kong       ra   = aa[j];
7024a56b808SFande Kong       pnz  = pi[row+1] - pi[row];
7034a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
7044a56b808SFande Kong       pa   = p_oth->a + pi[row];
7054a56b808SFande Kong       for (col=0; col<pnz; col++) {
7069566063dSJacob Faibussowitsch         PetscCall(PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]));
7074a56b808SFande Kong       }
7089566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0*pnz));
7094a56b808SFande Kong     }
7104a56b808SFande Kong   } /* end if (ao) */
711bb5ddf68SFande Kong 
7124a56b808SFande Kong   PetscFunctionReturn(0);
7134a56b808SFande Kong }
7144a56b808SFande Kong 
715bc8e477aSFande Kong PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
7165c65b9ecSFande Kong 
717bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
7184a56b808SFande Kong {
7194a56b808SFande Kong   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
720bc8e477aSFande Kong   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
7216718818eSStefano Zampini   Mat_APMPI         *ptap;
7224a56b808SFande Kong   PetscHMapIV       hmap;
7234a56b808SFande Kong   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
7244a56b808SFande Kong   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
725bc8e477aSFande Kong   PetscInt          offset,ii,pocol;
726bc8e477aSFande Kong   const PetscInt    *mappingindices;
727bc8e477aSFande Kong   IS                map;
7284a56b808SFande Kong 
7294a56b808SFande Kong   PetscFunctionBegin;
7306718818eSStefano Zampini   MatCheckProduct(C,4);
7316718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
73228b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
73328b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
7344a56b808SFande Kong 
7359566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
7364a56b808SFande Kong 
7374a56b808SFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
7384a56b808SFande Kong   /*-----------------------------------------------------*/
7394a56b808SFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
7404a56b808SFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
7419566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
7424a56b808SFande Kong   }
7439566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
7444a56b808SFande Kong 
7459566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B,NULL,&pon));
746bc8e477aSFande Kong   pon *= dof;
7479566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
7489566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A,&am,NULL));
7494a56b808SFande Kong   cmaxr = 0;
7504a56b808SFande Kong   for (i=0; i<pon; i++) {
7514a56b808SFande Kong     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
7524a56b808SFande Kong   }
7539566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
7549566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
7559566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap,cmaxr));
7569566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map,&mappingindices));
7574a56b808SFande Kong   for (i=0; i<am && pon; i++) {
7589566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
759bc8e477aSFande Kong     offset = i%dof;
760bc8e477aSFande Kong     ii     = i/dof;
761bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
7624a56b808SFande Kong     if (!nzi) continue;
7639566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
7644a56b808SFande Kong     voff = 0;
7659566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
7664a56b808SFande Kong     if (!voff) continue;
7674a56b808SFande Kong 
7684a56b808SFande Kong     /* Form C(ii, :) */
769bc8e477aSFande Kong     poj = po->j + po->i[ii];
770bc8e477aSFande Kong     poa = po->a + po->i[ii];
7714a56b808SFande Kong     for (j=0; j<nzi; j++) {
772bc8e477aSFande Kong       pocol = poj[j]*dof+offset;
773bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
774bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
7754a56b808SFande Kong       for (jj=0; jj<voff; jj++) {
7764a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj]*poa[j];
7774a56b808SFande Kong         /*If the row is empty */
778bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
7794a56b808SFande Kong           c_rmtjj[jj] = apindices[jj];
7804a56b808SFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
781bc8e477aSFande Kong           c_rmtc[pocol]++;
7824a56b808SFande Kong         } else {
7839566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
7844a56b808SFande Kong           if (loc>=0){ /* hit */
7854a56b808SFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
7869566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
7874a56b808SFande Kong           } else { /* new element */
7884a56b808SFande Kong             loc = -(loc+1);
7894a56b808SFande Kong             /* Move data backward */
790bc8e477aSFande Kong             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
7914a56b808SFande Kong               c_rmtjj[kk] = c_rmtjj[kk-1];
7924a56b808SFande Kong               c_rmtaa[kk] = c_rmtaa[kk-1];
7934a56b808SFande Kong             }/* End kk */
7944a56b808SFande Kong             c_rmtjj[loc] = apindices[jj];
7954a56b808SFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
796bc8e477aSFande Kong             c_rmtc[pocol]++;
7974a56b808SFande Kong           }
7984a56b808SFande Kong         }
7999566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(voff));
8004a56b808SFande Kong       } /* End jj */
8014a56b808SFande Kong     } /* End j */
8024a56b808SFande Kong   } /* End i */
8034a56b808SFande Kong 
8049566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
8059566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
8064a56b808SFande Kong 
8079566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P,NULL,&pn));
808bc8e477aSFande Kong   pn *= dof;
8099566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));
8104a56b808SFande Kong 
8119566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
8129566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
8139566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
814bc8e477aSFande Kong   pcstart = pcstart*dof;
815bc8e477aSFande Kong   pcend   = pcend*dof;
8164a56b808SFande Kong   cd = (Mat_SeqAIJ*)(c->A)->data;
8174a56b808SFande Kong   co = (Mat_SeqAIJ*)(c->B)->data;
8184a56b808SFande Kong 
8194a56b808SFande Kong   cmaxr = 0;
8204a56b808SFande Kong   for (i=0; i<pn; i++) {
8214a56b808SFande Kong     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
8224a56b808SFande Kong   }
8239566063dSJacob Faibussowitsch   PetscCall(PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ));
8249566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
8259566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap,cmaxr));
8264a56b808SFande Kong   for (i=0; i<am && pn; i++) {
8279566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
828bc8e477aSFande Kong     offset = i%dof;
829bc8e477aSFande Kong     ii     = i/dof;
830bc8e477aSFande Kong     nzi = pd->i[ii+1] - pd->i[ii];
8314a56b808SFande Kong     if (!nzi) continue;
8329566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
8334a56b808SFande Kong     voff = 0;
8349566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
8354a56b808SFande Kong     if (!voff) continue;
8364a56b808SFande Kong     /* Form C(ii, :) */
837bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
838bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
8394a56b808SFande Kong     for (j=0; j<nzi; j++) {
840bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
8414a56b808SFande Kong       for (jj=0; jj<voff; jj++) {
8424a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj]*pda[j];
8434a56b808SFande Kong       }
8449566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
8459566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
8464a56b808SFande Kong     }
8474a56b808SFande Kong   }
8489566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map,&mappingindices));
8499566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(C,&ccstart,&ccend));
8509566063dSJacob Faibussowitsch   PetscCall(PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ));
8519566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
8529566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
8539566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
8549566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj,c_rmta));
855bc8e477aSFande Kong 
856bc8e477aSFande Kong   /* Add contributions from remote */
857bc8e477aSFande Kong   for (i = 0; i < pn; i++) {
858bc8e477aSFande Kong     row = i + pcstart;
8599566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
860bc8e477aSFande Kong   }
8619566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj,c_otha));
862bc8e477aSFande Kong 
8639566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
8649566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
865bc8e477aSFande Kong 
866bc8e477aSFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
867bc8e477aSFande Kong   PetscFunctionReturn(0);
868bc8e477aSFande Kong }
869bc8e477aSFande Kong 
870bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
871bc8e477aSFande Kong {
872bc8e477aSFande Kong   PetscFunctionBegin;
87334bcad68SFande Kong 
8749566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C));
875bc8e477aSFande Kong   PetscFunctionReturn(0);
876bc8e477aSFande Kong }
877bc8e477aSFande Kong 
878bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
879bc8e477aSFande Kong {
880bc8e477aSFande Kong   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
881bc8e477aSFande Kong   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
8826718818eSStefano Zampini   Mat_APMPI         *ptap;
883bc8e477aSFande Kong   PetscHMapIV       hmap;
884bc8e477aSFande Kong   PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
885bc8e477aSFande Kong   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
886bc8e477aSFande Kong   PetscInt          offset,ii,pocol;
887bc8e477aSFande Kong   const PetscInt    *mappingindices;
888bc8e477aSFande Kong   IS                map;
889bc8e477aSFande Kong 
890bc8e477aSFande Kong   PetscFunctionBegin;
8916718818eSStefano Zampini   MatCheckProduct(C,4);
8926718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
89328b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
89428b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
895bc8e477aSFande Kong 
8969566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
897bc8e477aSFande Kong 
898bc8e477aSFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
899bc8e477aSFande Kong   /*-----------------------------------------------------*/
900bc8e477aSFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
901bc8e477aSFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
9029566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
903bc8e477aSFande Kong   }
9049566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
9059566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B,NULL,&pon));
906bc8e477aSFande Kong   pon *= dof;
9079566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P,NULL,&pn));
908bc8e477aSFande Kong   pn  *= dof;
909bc8e477aSFande Kong 
9109566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
9119566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A,&am,NULL));
9129566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
913bc8e477aSFande Kong   pcstart *= dof;
914bc8e477aSFande Kong   pcend   *= dof;
915bc8e477aSFande Kong   cmaxr = 0;
916bc8e477aSFande Kong   for (i=0; i<pon; i++) {
917bc8e477aSFande Kong     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
918bc8e477aSFande Kong   }
919bc8e477aSFande Kong   cd = (Mat_SeqAIJ*)(c->A)->data;
920bc8e477aSFande Kong   co = (Mat_SeqAIJ*)(c->B)->data;
921bc8e477aSFande Kong   for (i=0; i<pn; i++) {
922bc8e477aSFande Kong     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
923bc8e477aSFande Kong   }
9249566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
9259566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
9269566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap,cmaxr));
9279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map,&mappingindices));
928bc8e477aSFande Kong   for (i=0; i<am && (pon || pn); i++) {
9299566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
930bc8e477aSFande Kong     offset = i%dof;
931bc8e477aSFande Kong     ii     = i/dof;
932bc8e477aSFande Kong     nzi  = po->i[ii+1] - po->i[ii];
933bc8e477aSFande Kong     dnzi = pd->i[ii+1] - pd->i[ii];
934bc8e477aSFande Kong     if (!nzi && !dnzi) continue;
9359566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
936bc8e477aSFande Kong     voff = 0;
9379566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
938bc8e477aSFande Kong     if (!voff) continue;
939bc8e477aSFande Kong 
940bc8e477aSFande Kong     /* Form remote C(ii, :) */
941bc8e477aSFande Kong     poj = po->j + po->i[ii];
942bc8e477aSFande Kong     poa = po->a + po->i[ii];
943bc8e477aSFande Kong     for (j=0; j<nzi; j++) {
944bc8e477aSFande Kong       pocol = poj[j]*dof+offset;
945bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
946bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
947bc8e477aSFande Kong       for (jj=0; jj<voff; jj++) {
948bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj]*poa[j];
949bc8e477aSFande Kong         /*If the row is empty */
950bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
951bc8e477aSFande Kong           c_rmtjj[jj] = apindices[jj];
952bc8e477aSFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
953bc8e477aSFande Kong           c_rmtc[pocol]++;
954bc8e477aSFande Kong         } else {
9559566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
956bc8e477aSFande Kong           if (loc>=0){ /* hit */
957bc8e477aSFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
9589566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
959bc8e477aSFande Kong           } else { /* new element */
960bc8e477aSFande Kong             loc = -(loc+1);
961bc8e477aSFande Kong             /* Move data backward */
962bc8e477aSFande Kong             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
963bc8e477aSFande Kong               c_rmtjj[kk] = c_rmtjj[kk-1];
964bc8e477aSFande Kong               c_rmtaa[kk] = c_rmtaa[kk-1];
965bc8e477aSFande Kong             }/* End kk */
966bc8e477aSFande Kong             c_rmtjj[loc] = apindices[jj];
967bc8e477aSFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
968bc8e477aSFande Kong             c_rmtc[pocol]++;
969bc8e477aSFande Kong           }
970bc8e477aSFande Kong         }
971bc8e477aSFande Kong       } /* End jj */
9729566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
973bc8e477aSFande Kong     } /* End j */
974bc8e477aSFande Kong 
975bc8e477aSFande Kong     /* Form local C(ii, :) */
976bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
977bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
978bc8e477aSFande Kong     for (j=0; j<dnzi; j++) {
979bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
980bc8e477aSFande Kong       for (jj=0; jj<voff; jj++) {
981bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj]*pda[j];
982bc8e477aSFande Kong       }/* End kk */
9839566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
9849566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
985bc8e477aSFande Kong     }/* End j */
986bc8e477aSFande Kong   } /* End i */
987bc8e477aSFande Kong 
9889566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map,&mappingindices));
9899566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
9909566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
9919566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));
992bc8e477aSFande Kong 
9939566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
9949566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
9959566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
9969566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
9979566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj,c_rmta));
9984a56b808SFande Kong 
9994a56b808SFande Kong   /* Add contributions from remote */
10004a56b808SFande Kong   for (i = 0; i < pn; i++) {
10014a56b808SFande Kong     row = i + pcstart;
10029566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
10034a56b808SFande Kong   }
10049566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj,c_otha));
10054a56b808SFande Kong 
10069566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
10079566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
10084a56b808SFande Kong 
10094a56b808SFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
10104a56b808SFande Kong   PetscFunctionReturn(0);
10114a56b808SFande Kong }
10124a56b808SFande Kong 
10134a56b808SFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
10144a56b808SFande Kong {
10154a56b808SFande Kong   PetscFunctionBegin;
101634bcad68SFande Kong 
10179566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C));
10184a56b808SFande Kong   PetscFunctionReturn(0);
10194a56b808SFande Kong }
10204a56b808SFande Kong 
10216718818eSStefano Zampini /* TODO: move algorithm selection to MatProductSetFromOptions */
10224222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
10234a56b808SFande Kong {
10244a56b808SFande Kong   Mat_APMPI           *ptap;
10256718818eSStefano Zampini   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
10264a56b808SFande Kong   MPI_Comm            comm;
1027bc8e477aSFande Kong   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
10284a56b808SFande Kong   MatType             mtype;
10294a56b808SFande Kong   PetscSF             sf;
10304a56b808SFande Kong   PetscSFNode         *iremote;
10314a56b808SFande Kong   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
10324a56b808SFande Kong   const PetscInt      *rootdegrees;
10334a56b808SFande Kong   PetscHSetI          ht,oht,*hta,*hto;
10344a56b808SFande Kong   PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1035131c27b5Sprj-   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1036bc8e477aSFande Kong   PetscInt            nalg=2,alg=0,offset,ii;
1037131c27b5Sprj-   PetscMPIInt         owner;
1038bc8e477aSFande Kong   const PetscInt      *mappingindices;
10394a56b808SFande Kong   PetscBool           flg;
10404a56b808SFande Kong   const char          *algTypes[2] = {"overlapping","merged"};
1041bc8e477aSFande Kong   IS                  map;
10424a56b808SFande Kong 
10434a56b808SFande Kong   PetscFunctionBegin;
10446718818eSStefano Zampini   MatCheckProduct(Cmpi,5);
104528b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
10469566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
104734bcad68SFande Kong 
10484a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
10499566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P,NULL,&pn));
1050bc8e477aSFande Kong   pn *= dof;
10519566063dSJacob Faibussowitsch   PetscCall(MatGetType(A,&mtype));
10529566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi,mtype));
10539566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
10544a56b808SFande Kong 
10559566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
10564a56b808SFande Kong   ptap->reuse = MAT_INITIAL_MATRIX;
10574a56b808SFande Kong   ptap->algType = 2;
10584a56b808SFande Kong 
10594a56b808SFande Kong   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
10609566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
10619566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
10624a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
10639566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B,NULL,&pon));
1064bc8e477aSFande Kong   pon *= dof;
10654a56b808SFande Kong   /* offsets */
10669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
10674a56b808SFande Kong   /* The number of columns we will send to remote ranks */
10689566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&c_rmtc));
10699566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&hta));
10704a56b808SFande Kong   for (i=0; i<pon; i++) {
10719566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hta[i]));
10724a56b808SFande Kong   }
10739566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A,&am,NULL));
10749566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
10754a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
10769566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
10774a56b808SFande Kong 
10789566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map,&mappingindices));
10794a56b808SFande Kong   ptap->c_rmti[0] = 0;
10804a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
10814a56b808SFande Kong   for (i=0; i<am && pon; i++) {
10824a56b808SFande Kong     /* Form one row of AP */
10839566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
1084bc8e477aSFande Kong     offset = i%dof;
1085bc8e477aSFande Kong     ii     = i/dof;
10864a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1087bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
10884a56b808SFande Kong     if (!nzi) continue;
10894a56b808SFande Kong 
10909566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht));
10919566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht,&htsize));
10924a56b808SFande Kong     /* If AP is empty, just continue */
10934a56b808SFande Kong     if (!htsize) continue;
10944a56b808SFande Kong     /* Form C(ii, :) */
1095bc8e477aSFande Kong     poj = po->j + po->i[ii];
10964a56b808SFande Kong     for (j=0; j<nzi; j++) {
10979566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
10984a56b808SFande Kong     }
10994a56b808SFande Kong   }
11004a56b808SFande Kong 
11014a56b808SFande Kong   for (i=0; i<pon; i++) {
11029566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i],&htsize));
11034a56b808SFande Kong     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
11044a56b808SFande Kong     c_rmtc[i] = htsize;
11054a56b808SFande Kong   }
11064a56b808SFande Kong 
11079566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));
11084a56b808SFande Kong 
11094a56b808SFande Kong   for (i=0; i<pon; i++) {
11104a56b808SFande Kong     off = 0;
11119566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
11129566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
11134a56b808SFande Kong   }
11149566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
11154a56b808SFande Kong 
11169566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&iremote));
11174a56b808SFande Kong   for (i=0; i<pon; i++) {
1118ef7a94f2SFande Kong     owner = 0; lidx = 0;
1119bc8e477aSFande Kong     offset = i%dof;
1120bc8e477aSFande Kong     ii     = i/dof;
11219566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
1122bc8e477aSFande Kong     iremote[i].index = lidx*dof + offset;
11234a56b808SFande Kong     iremote[i].rank  = owner;
11244a56b808SFande Kong   }
11254a56b808SFande Kong 
11269566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm,&sf));
11279566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
11284a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
11299566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
11309566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
11319566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
11324a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
11339566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
11349566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
11354a56b808SFande Kong   rootspacesize = 0;
11364a56b808SFande Kong   for (i = 0; i < pn; i++) {
11374a56b808SFande Kong     rootspacesize += rootdegrees[i];
11384a56b808SFande Kong   }
11399566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize,&rootspace));
11409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
11414a56b808SFande Kong   /* Get information from leaves
11424a56b808SFande Kong    * Number of columns other people contribute to my rows
11434a56b808SFande Kong    * */
11449566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
11459566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
11469566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
11479566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pn+1,&ptap->c_othi));
11484a56b808SFande Kong   /* The number of columns is received for each row */
11494a56b808SFande Kong   ptap->c_othi[0] = 0;
11504a56b808SFande Kong   rootspacesize = 0;
11514a56b808SFande Kong   rootspaceoffsets[0] = 0;
11524a56b808SFande Kong   for (i = 0; i < pn; i++) {
11534a56b808SFande Kong     rcvncols = 0;
11544a56b808SFande Kong     for (j = 0; j<rootdegrees[i]; j++) {
11554a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
11564a56b808SFande Kong       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
11574a56b808SFande Kong       rootspacesize++;
11584a56b808SFande Kong     }
11594a56b808SFande Kong     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
11604a56b808SFande Kong   }
11619566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
11624a56b808SFande Kong 
11639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
11649566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
11659566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
11669566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
11679566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
11684a56b808SFande Kong 
11699566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
11704a56b808SFande Kong   nleaves = 0;
11714a56b808SFande Kong   for (i = 0; i<pon; i++) {
1172bc8e477aSFande Kong     owner = 0;
1173bc8e477aSFande Kong     ii = i/dof;
11749566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
11754a56b808SFande Kong     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
11764a56b808SFande Kong     for (j=0; j<sendncols; j++) {
11774a56b808SFande Kong       iremote[nleaves].rank = owner;
11784a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
11794a56b808SFande Kong     }
11804a56b808SFande Kong   }
11819566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
11829566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));
11834a56b808SFande Kong 
11849566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm,&ptap->sf));
11859566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
11869566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
11874a56b808SFande Kong   /* One to one map */
11889566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
11894a56b808SFande Kong 
11909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
11919566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
11929566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
1193bc8e477aSFande Kong   pcstart *= dof;
1194bc8e477aSFande Kong   pcend   *= dof;
11959566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn,&hta,pn,&hto));
11964a56b808SFande Kong   for (i=0; i<pn; i++) {
11979566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hta[i]));
11989566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
11994a56b808SFande Kong   }
12004a56b808SFande Kong   /* Work on local part */
12014a56b808SFande Kong   /* 4) Pass 1: Estimate memory for C_loc */
12024a56b808SFande Kong   for (i=0; i<am && pn; i++) {
12039566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
12049566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1205bc8e477aSFande Kong     offset = i%dof;
1206bc8e477aSFande Kong     ii     = i/dof;
1207bc8e477aSFande Kong     nzi = pd->i[ii+1] - pd->i[ii];
12084a56b808SFande Kong     if (!nzi) continue;
12094a56b808SFande Kong 
12109566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
12119566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht,&htsize));
12129566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht,&htosize));
12134a56b808SFande Kong     if (!(htsize+htosize)) continue;
12144a56b808SFande Kong     /* Form C(ii, :) */
1215bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
12164a56b808SFande Kong     for (j=0; j<nzi; j++) {
12179566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht));
12189566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
12194a56b808SFande Kong     }
12204a56b808SFande Kong   }
12214a56b808SFande Kong 
12229566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map,&mappingindices));
1223bc8e477aSFande Kong 
12249566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
12259566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
12264a56b808SFande Kong 
12274a56b808SFande Kong   /* Get remote data */
12289566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
12299566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
12304a56b808SFande Kong 
12314a56b808SFande Kong   for (i = 0; i < pn; i++) {
12324a56b808SFande Kong     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
12334a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
12344a56b808SFande Kong     for (j = 0; j < nzi; j++) {
12354a56b808SFande Kong       col = rdj[j];
12364a56b808SFande Kong       /* diag part */
12374a56b808SFande Kong       if (col>=pcstart && col<pcend) {
12389566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hta[i],col));
12394a56b808SFande Kong       } else { /* off diag */
12409566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i],col));
12414a56b808SFande Kong       }
12424a56b808SFande Kong     }
12439566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i],&htsize));
12444a56b808SFande Kong     dnz[i] = htsize;
12459566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
12469566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i],&htsize));
12474a56b808SFande Kong     onz[i] = htsize;
12489566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
12494a56b808SFande Kong   }
12504a56b808SFande Kong 
12519566063dSJacob Faibussowitsch   PetscCall(PetscFree2(hta,hto));
12529566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
12534a56b808SFande Kong 
12544a56b808SFande Kong   /* local sizes and preallocation */
12559566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
12569566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
12579566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
12589566063dSJacob Faibussowitsch   PetscCall(MatSetUp(Cmpi));
12599566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz,onz));
12604a56b808SFande Kong 
12614a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
12626718818eSStefano Zampini   Cmpi->product->data    = ptap;
12636718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
12646718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
12654a56b808SFande Kong 
12664a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
12674a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
12684a56b808SFande Kong   /* pick an algorithm */
1269*d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
12704a56b808SFande Kong   alg = 0;
12719566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
1272*d0609cedSBarry Smith   PetscOptionsEnd();
12734a56b808SFande Kong   switch (alg) {
12744a56b808SFande Kong     case 0:
12754a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
12764a56b808SFande Kong       break;
12774a56b808SFande Kong     case 1:
12784a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
12794a56b808SFande Kong       break;
12804a56b808SFande Kong     default:
1281546078acSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
12824a56b808SFande Kong   }
12834a56b808SFande Kong   PetscFunctionReturn(0);
12844a56b808SFande Kong }
12854a56b808SFande Kong 
12864222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1287bc8e477aSFande Kong {
1288bc8e477aSFande Kong   PetscFunctionBegin;
12899566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C));
1290bc8e477aSFande Kong   PetscFunctionReturn(0);
1291bc8e477aSFande Kong }
1292bc8e477aSFande Kong 
12934222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
12944a56b808SFande Kong {
12954a56b808SFande Kong   Mat_APMPI           *ptap;
12966718818eSStefano Zampini   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
12974a56b808SFande Kong   MPI_Comm            comm;
1298bc8e477aSFande Kong   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
12994a56b808SFande Kong   MatType             mtype;
13004a56b808SFande Kong   PetscSF             sf;
13014a56b808SFande Kong   PetscSFNode         *iremote;
13024a56b808SFande Kong   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
13034a56b808SFande Kong   const PetscInt      *rootdegrees;
13044a56b808SFande Kong   PetscHSetI          ht,oht,*hta,*hto,*htd;
13054a56b808SFande Kong   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1306131c27b5Sprj-   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1307bc8e477aSFande Kong   PetscInt            nalg=2,alg=0,offset,ii;
1308131c27b5Sprj-   PetscMPIInt         owner;
13094a56b808SFande Kong   PetscBool           flg;
13104a56b808SFande Kong   const char          *algTypes[2] = {"merged","overlapping"};
1311bc8e477aSFande Kong   const PetscInt      *mappingindices;
1312bc8e477aSFande Kong   IS                  map;
13134a56b808SFande Kong 
13144a56b808SFande Kong   PetscFunctionBegin;
13156718818eSStefano Zampini   MatCheckProduct(Cmpi,5);
131628b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
13179566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
131834bcad68SFande Kong 
13194a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
13209566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P,NULL,&pn));
1321bc8e477aSFande Kong   pn *= dof;
13229566063dSJacob Faibussowitsch   PetscCall(MatGetType(A,&mtype));
13239566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi,mtype));
13249566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
13254a56b808SFande Kong 
13269566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
13274a56b808SFande Kong   ptap->reuse = MAT_INITIAL_MATRIX;
13284a56b808SFande Kong   ptap->algType = 3;
13294a56b808SFande Kong 
13304a56b808SFande Kong   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
13319566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
13329566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
13334a56b808SFande Kong 
13344a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
13359566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B,NULL,&pon));
1336bc8e477aSFande Kong   pon *= dof;
13374a56b808SFande Kong   /* offsets */
13389566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
13394a56b808SFande Kong   /* The number of columns we will send to remote ranks */
13409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&c_rmtc));
13419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&hta));
13424a56b808SFande Kong   for (i=0; i<pon; i++) {
13439566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hta[i]));
13444a56b808SFande Kong   }
13459566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A,&am,NULL));
13469566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
13474a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
13489566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
13499566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
13509566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn,&htd,pn,&hto));
13514a56b808SFande Kong   for (i=0; i<pn; i++) {
13529566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&htd[i]));
13539566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
13544a56b808SFande Kong   }
1355bc8e477aSFande Kong 
13569566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map,&mappingindices));
13574a56b808SFande Kong   ptap->c_rmti[0] = 0;
13584a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
13594a56b808SFande Kong   for (i=0; i<am && (pon || pn); i++) {
13604a56b808SFande Kong     /* Form one row of AP */
13619566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
13629566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1363bc8e477aSFande Kong     offset = i%dof;
1364bc8e477aSFande Kong     ii     = i/dof;
13654a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1366bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
1367bc8e477aSFande Kong     dnzi = pd->i[ii+1] - pd->i[ii];
13684a56b808SFande Kong     if (!nzi && !dnzi) continue;
13694a56b808SFande Kong 
13709566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
13719566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht,&htsize));
13729566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht,&htosize));
13734a56b808SFande Kong     /* If AP is empty, just continue */
13744a56b808SFande Kong     if (!(htsize+htosize)) continue;
13754a56b808SFande Kong 
13764a56b808SFande Kong     /* Form remote C(ii, :) */
1377bc8e477aSFande Kong     poj = po->j + po->i[ii];
13784a56b808SFande Kong     for (j=0; j<nzi; j++) {
13799566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
13809566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],oht));
13814a56b808SFande Kong     }
13824a56b808SFande Kong 
13834a56b808SFande Kong     /* Form local C(ii, :) */
1384bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
13854a56b808SFande Kong     for (j=0; j<dnzi; j++) {
13869566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht));
13879566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
13884a56b808SFande Kong     }
13894a56b808SFande Kong   }
13904a56b808SFande Kong 
13919566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map,&mappingindices));
1392bc8e477aSFande Kong 
13939566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
13949566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
13954a56b808SFande Kong 
13964a56b808SFande Kong   for (i=0; i<pon; i++) {
13979566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i],&htsize));
13984a56b808SFande Kong     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
13994a56b808SFande Kong     c_rmtc[i] = htsize;
14004a56b808SFande Kong   }
14014a56b808SFande Kong 
14029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));
14034a56b808SFande Kong 
14044a56b808SFande Kong   for (i=0; i<pon; i++) {
14054a56b808SFande Kong     off = 0;
14069566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
14079566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
14084a56b808SFande Kong   }
14099566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
14104a56b808SFande Kong 
14119566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&iremote));
14124a56b808SFande Kong   for (i=0; i<pon; i++) {
1413ef7a94f2SFande Kong     owner = 0; lidx = 0;
1414bc8e477aSFande Kong     offset = i%dof;
1415bc8e477aSFande Kong     ii     = i/dof;
14169566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
1417bc8e477aSFande Kong     iremote[i].index = lidx*dof+offset;
14184a56b808SFande Kong     iremote[i].rank  = owner;
14194a56b808SFande Kong   }
14204a56b808SFande Kong 
14219566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm,&sf));
14229566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
14234a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
14249566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
14259566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
14269566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
14274a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
14289566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
14299566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
14304a56b808SFande Kong   rootspacesize = 0;
14314a56b808SFande Kong   for (i = 0; i < pn; i++) {
14324a56b808SFande Kong     rootspacesize += rootdegrees[i];
14334a56b808SFande Kong   }
14349566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize,&rootspace));
14359566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
14364a56b808SFande Kong   /* Get information from leaves
14374a56b808SFande Kong    * Number of columns other people contribute to my rows
14384a56b808SFande Kong    * */
14399566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
14409566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
14419566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
14429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pn+1,&ptap->c_othi));
14434a56b808SFande Kong   /* The number of columns is received for each row */
14444a56b808SFande Kong   ptap->c_othi[0]     = 0;
14454a56b808SFande Kong   rootspacesize       = 0;
14464a56b808SFande Kong   rootspaceoffsets[0] = 0;
14474a56b808SFande Kong   for (i = 0; i < pn; i++) {
14484a56b808SFande Kong     rcvncols = 0;
14494a56b808SFande Kong     for (j = 0; j<rootdegrees[i]; j++) {
14504a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
14514a56b808SFande Kong       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
14524a56b808SFande Kong       rootspacesize++;
14534a56b808SFande Kong     }
14544a56b808SFande Kong     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
14554a56b808SFande Kong   }
14569566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
14574a56b808SFande Kong 
14589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
14599566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
14609566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
14619566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
14629566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
14634a56b808SFande Kong 
14649566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
14654a56b808SFande Kong   nleaves = 0;
14664a56b808SFande Kong   for (i = 0; i<pon; i++) {
1467071fcb05SBarry Smith     owner = 0;
1468bc8e477aSFande Kong     ii    = i/dof;
14699566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
14704a56b808SFande Kong     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
14714a56b808SFande Kong     for (j=0; j<sendncols; j++) {
14724a56b808SFande Kong       iremote[nleaves].rank    = owner;
14734a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
14744a56b808SFande Kong     }
14754a56b808SFande Kong   }
14769566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
14779566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));
14784a56b808SFande Kong 
14799566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm,&ptap->sf));
14809566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
14819566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
14824a56b808SFande Kong   /* One to one map */
14839566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
14844a56b808SFande Kong   /* Get remote data */
14859566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
14869566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
14879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
14889566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
1489bc8e477aSFande Kong   pcstart *= dof;
1490bc8e477aSFande Kong   pcend   *= dof;
14914a56b808SFande Kong   for (i = 0; i < pn; i++) {
14924a56b808SFande Kong     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
14934a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
14944a56b808SFande Kong     for (j = 0; j < nzi; j++) {
14954a56b808SFande Kong       col =  rdj[j];
14964a56b808SFande Kong       /* diag part */
14974a56b808SFande Kong       if (col>=pcstart && col<pcend) {
14989566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(htd[i],col));
14994a56b808SFande Kong       } else { /* off diag */
15009566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i],col));
15014a56b808SFande Kong       }
15024a56b808SFande Kong     }
15039566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(htd[i],&htsize));
15044a56b808SFande Kong     dnz[i] = htsize;
15059566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&htd[i]));
15069566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i],&htsize));
15074a56b808SFande Kong     onz[i] = htsize;
15089566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
15094a56b808SFande Kong   }
15104a56b808SFande Kong 
15119566063dSJacob Faibussowitsch   PetscCall(PetscFree2(htd,hto));
15129566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
15134a56b808SFande Kong 
15144a56b808SFande Kong   /* local sizes and preallocation */
15159566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
15169566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
15179566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
15189566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz,onz));
15194a56b808SFande Kong 
15204a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
15216718818eSStefano Zampini   Cmpi->product->data    = ptap;
15226718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
15236718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
15244a56b808SFande Kong 
15254a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
15264a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
15274a56b808SFande Kong   /* pick an algorithm */
1528*d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
15294a56b808SFande Kong   alg = 0;
15309566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
1531*d0609cedSBarry Smith   PetscOptionsEnd();
15324a56b808SFande Kong   switch (alg) {
15334a56b808SFande Kong     case 0:
15344a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
15354a56b808SFande Kong       break;
15364a56b808SFande Kong     case 1:
15374a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
15384a56b808SFande Kong       break;
15394a56b808SFande Kong     default:
1540546078acSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
15414a56b808SFande Kong   }
15424a56b808SFande Kong   PetscFunctionReturn(0);
15434a56b808SFande Kong }
15444a56b808SFande Kong 
15454222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1546bc8e477aSFande Kong {
1547bc8e477aSFande Kong   PetscFunctionBegin;
15489566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C));
1549bc8e477aSFande Kong   PetscFunctionReturn(0);
1550bc8e477aSFande Kong }
1551bc8e477aSFande Kong 
15524222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1553e953e47cSHong Zhang {
15543cdca5ebSHong Zhang   Mat_APMPI           *ptap;
15556718818eSStefano Zampini   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1556e953e47cSHong Zhang   MPI_Comm            comm;
1557e953e47cSHong Zhang   PetscMPIInt         size,rank;
1558e953e47cSHong Zhang   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1559e953e47cSHong Zhang   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1560e953e47cSHong Zhang   PetscInt            *lnk,i,k,pnz,row,nsend;
1561e953e47cSHong Zhang   PetscBT             lnkbt;
1562ec4bef21SJose E. Roman   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
1563ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted=0;
1564e953e47cSHong Zhang   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1565e953e47cSHong Zhang   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1566e953e47cSHong Zhang   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1567e953e47cSHong Zhang   MPI_Request         *swaits,*rwaits;
1568e953e47cSHong Zhang   MPI_Status          *sstatus,rstatus;
1569e953e47cSHong Zhang   PetscLayout         rowmap;
1570e953e47cSHong Zhang   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1571e953e47cSHong Zhang   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
1572e953e47cSHong Zhang   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1573ec07b8f8SHong Zhang   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1574e953e47cSHong Zhang   PetscScalar         *apv;
1575e953e47cSHong Zhang   PetscTable          ta;
1576b92f168fSBarry Smith   MatType             mtype;
1577e83e5f86SFande Kong   const char          *prefix;
1578e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
1579e953e47cSHong Zhang   PetscReal           apfill;
1580e953e47cSHong Zhang #endif
1581e953e47cSHong Zhang 
1582e953e47cSHong Zhang   PetscFunctionBegin;
15836718818eSStefano Zampini   MatCheckProduct(Cmpi,4);
158428b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
15859566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
15869566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm,&size));
15879566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm,&rank));
1588e953e47cSHong Zhang 
158952f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1590ec07b8f8SHong Zhang 
1591e953e47cSHong Zhang   /* create symbolic parallel matrix Cmpi */
15929566063dSJacob Faibussowitsch   PetscCall(MatGetType(A,&mtype));
15939566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi,mtype));
1594e953e47cSHong Zhang 
1595e953e47cSHong Zhang   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1596e953e47cSHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1597e953e47cSHong Zhang 
15983cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
15999566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
160015a3b8e2SHong Zhang   ptap->reuse = MAT_INITIAL_MATRIX;
1601a4ffb656SHong Zhang   ptap->algType = 1;
160215a3b8e2SHong Zhang 
160315a3b8e2SHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
16049566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
160515a3b8e2SHong Zhang   /* get P_loc by taking all local rows of P */
16069566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc));
160715a3b8e2SHong Zhang 
160867a12041SHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
160967a12041SHong Zhang   /* --------------------------------- */
16109566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
16119566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));
161215a3b8e2SHong Zhang 
161367a12041SHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
161467a12041SHong Zhang   /* ----------------------------------------------------------------- */
161567a12041SHong Zhang   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
161652f7967eSHong Zhang   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1617445158ffSHong Zhang 
16189a6dcab7SHong Zhang   /* create and initialize a linked list */
16199566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
16204b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
16214b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
16229566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */
1623d0e9a2f7SHong Zhang   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
162478d30b94SHong Zhang 
16259566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));
1626445158ffSHong Zhang 
16278cb82516SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1628ec07b8f8SHong Zhang   if (ao) {
16299566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
1630ec07b8f8SHong Zhang   } else {
16319566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
1632ec07b8f8SHong Zhang   }
1633445158ffSHong Zhang   current_space = free_space;
163467a12041SHong Zhang   nspacedouble  = 0;
163567a12041SHong Zhang 
16369566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+1,&api));
163767a12041SHong Zhang   api[0] = 0;
1638445158ffSHong Zhang   for (i=0; i<am; i++) {
163967a12041SHong Zhang     /* diagonal portion: Ad[i,:]*P */
164067a12041SHong Zhang     ai = ad->i; pi = p_loc->i;
164167a12041SHong Zhang     nzi = ai[i+1] - ai[i];
164267a12041SHong Zhang     aj  = ad->j + ai[i];
1643445158ffSHong Zhang     for (j=0; j<nzi; j++) {
1644445158ffSHong Zhang       row  = aj[j];
164567a12041SHong Zhang       pnz  = pi[row+1] - pi[row];
164667a12041SHong Zhang       Jptr = p_loc->j + pi[row];
1647445158ffSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
16489566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
1649445158ffSHong Zhang     }
165067a12041SHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
1651ec07b8f8SHong Zhang     if (ao) {
165267a12041SHong Zhang       ai = ao->i; pi = p_oth->i;
165367a12041SHong Zhang       nzi = ai[i+1] - ai[i];
165467a12041SHong Zhang       aj  = ao->j + ai[i];
1655445158ffSHong Zhang       for (j=0; j<nzi; j++) {
1656445158ffSHong Zhang         row  = aj[j];
165767a12041SHong Zhang         pnz  = pi[row+1] - pi[row];
165867a12041SHong Zhang         Jptr = p_oth->j + pi[row];
16599566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
1660445158ffSHong Zhang       }
1661ec07b8f8SHong Zhang     }
1662445158ffSHong Zhang     apnz     = lnk[0];
1663445158ffSHong Zhang     api[i+1] = api[i] + apnz;
1664445158ffSHong Zhang     if (ap_rmax < apnz) ap_rmax = apnz;
1665445158ffSHong Zhang 
1666445158ffSHong Zhang     /* if free space is not available, double the total space in the list */
1667445158ffSHong Zhang     if (current_space->local_remaining<apnz) {
16689566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
1669445158ffSHong Zhang       nspacedouble++;
1670445158ffSHong Zhang     }
1671445158ffSHong Zhang 
1672445158ffSHong Zhang     /* Copy data into free space, then initialize lnk */
16739566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt));
1674445158ffSHong Zhang 
1675445158ffSHong Zhang     current_space->array           += apnz;
1676445158ffSHong Zhang     current_space->local_used      += apnz;
1677445158ffSHong Zhang     current_space->local_remaining -= apnz;
1678445158ffSHong Zhang   }
1679681d504bSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
1680445158ffSHong Zhang   /* destroy list of free space and other temporary array(s) */
16819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(api[am],&apj,api[am],&apv));
16829566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space,apj));
16839566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk,lnkbt));
16849a6dcab7SHong Zhang 
1685aa690a28SHong Zhang   /* Create AP_loc for reuse */
16869566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
16879566063dSJacob Faibussowitsch   PetscCall(MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name));
1688aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
1689ec07b8f8SHong Zhang   if (ao) {
1690aa690a28SHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1691ec07b8f8SHong Zhang   } else {
1692ec07b8f8SHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1693ec07b8f8SHong Zhang   }
1694aa690a28SHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
1695aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
1696aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
1697aa690a28SHong Zhang 
1698aa690a28SHong Zhang   if (api[am]) {
16999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
17009566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
1701aa690a28SHong Zhang   } else {
17029566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n"));
1703aa690a28SHong Zhang   }
1704aa690a28SHong Zhang #endif
1705aa690a28SHong Zhang 
1706681d504bSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1707681d504bSHong Zhang   /* ------------------------------------ */
17089566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A,&prefix));
17099566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Ro,prefix));
17109566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_"));
17119566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
17129566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi,&prefix));
17139566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix));
17149566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_"));
17159566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
17169566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth,"default"));
17179566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth,fill));
17189566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
17199566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
172015a3b8e2SHong Zhang 
172167a12041SHong Zhang   /* (3) send coj of C_oth to other processors  */
172267a12041SHong Zhang   /* ------------------------------------------ */
172315a3b8e2SHong Zhang   /* determine row ownership */
17249566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm,&rowmap));
1725445158ffSHong Zhang   rowmap->n  = pn;
1726445158ffSHong Zhang   rowmap->bs = 1;
17279566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
1728445158ffSHong Zhang   owners = rowmap->range;
172915a3b8e2SHong Zhang 
173015a3b8e2SHong Zhang   /* determine the number of messages to send, their lengths */
17319566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
17329566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s,size));
17339566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si,size));
173415a3b8e2SHong Zhang 
173567a12041SHong Zhang   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
173667a12041SHong Zhang   coi   = c_oth->i; coj = c_oth->j;
173767a12041SHong Zhang   con   = ptap->C_oth->rmap->n;
173815a3b8e2SHong Zhang   proc  = 0;
173967a12041SHong Zhang   for (i=0; i<con; i++) {
174015a3b8e2SHong Zhang     while (prmap[i] >= owners[proc+1]) proc++;
174115a3b8e2SHong Zhang     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
174215a3b8e2SHong Zhang     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
174315a3b8e2SHong Zhang   }
174415a3b8e2SHong Zhang 
174515a3b8e2SHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
174615a3b8e2SHong Zhang   owners_co[0] = 0;
174767a12041SHong Zhang   nsend        = 0;
174815a3b8e2SHong Zhang   for (proc=0; proc<size; proc++) {
174915a3b8e2SHong Zhang     owners_co[proc+1] = owners_co[proc] + len_si[proc];
175015a3b8e2SHong Zhang     if (len_s[proc]) {
1751445158ffSHong Zhang       nsend++;
175215a3b8e2SHong Zhang       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
175315a3b8e2SHong Zhang       len         += len_si[proc];
175415a3b8e2SHong Zhang     }
175515a3b8e2SHong Zhang   }
175615a3b8e2SHong Zhang 
175715a3b8e2SHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
17589566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
17599566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));
176015a3b8e2SHong Zhang 
176115a3b8e2SHong Zhang   /* post the Irecv and Isend of coj */
17629566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm,&tagj));
17639566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
17649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend+1,&swaits));
176515a3b8e2SHong Zhang   for (proc=0, k=0; proc<size; proc++) {
176615a3b8e2SHong Zhang     if (!len_s[proc]) continue;
176715a3b8e2SHong Zhang     i    = owners_co[proc];
17689566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
176915a3b8e2SHong Zhang     k++;
177015a3b8e2SHong Zhang   }
177115a3b8e2SHong Zhang 
1772681d504bSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1773681d504bSHong Zhang   /* ---------------------------------------- */
17749566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Rd,prefix));
17759566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Rd,"inner_diag_"));
17769566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
17779566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi,&prefix));
17789566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix));
17799566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_"));
17809566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
17819566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default"));
17829566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc,fill));
17839566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
17849566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
17854222ddf1SHong Zhang 
1786681d504bSHong Zhang   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1787681d504bSHong Zhang 
1788681d504bSHong Zhang   /* receives coj are complete */
1789445158ffSHong Zhang   for (i=0; i<nrecv; i++) {
17909566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
179115a3b8e2SHong Zhang   }
17929566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
17939566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));
179415a3b8e2SHong Zhang 
179578d30b94SHong Zhang   /* add received column indices into ta to update Crmax */
179678d30b94SHong Zhang   for (k=0; k<nrecv; k++) {/* k-th received message */
179778d30b94SHong Zhang     Jptr = buf_rj[k];
179878d30b94SHong Zhang     for (j=0; j<len_r[k]; j++) {
17999566063dSJacob Faibussowitsch       PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
180078d30b94SHong Zhang     }
180178d30b94SHong Zhang   }
18029566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta,&Crmax));
18039566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
18049a6dcab7SHong Zhang 
180515a3b8e2SHong Zhang   /* (4) send and recv coi */
180615a3b8e2SHong Zhang   /*-----------------------*/
18079566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm,&tagi));
18089566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
18099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len+1,&buf_s));
181015a3b8e2SHong Zhang   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
181115a3b8e2SHong Zhang   for (proc=0,k=0; proc<size; proc++) {
181215a3b8e2SHong Zhang     if (!len_s[proc]) continue;
181315a3b8e2SHong Zhang     /* form outgoing message for i-structure:
181415a3b8e2SHong Zhang          buf_si[0]:                 nrows to be sent
181515a3b8e2SHong Zhang                [1:nrows]:           row index (global)
181615a3b8e2SHong Zhang                [nrows+1:2*nrows+1]: i-structure index
181715a3b8e2SHong Zhang     */
181815a3b8e2SHong Zhang     /*-------------------------------------------*/
181915a3b8e2SHong Zhang     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
182015a3b8e2SHong Zhang     buf_si_i    = buf_si + nrows+1;
182115a3b8e2SHong Zhang     buf_si[0]   = nrows;
182215a3b8e2SHong Zhang     buf_si_i[0] = 0;
182315a3b8e2SHong Zhang     nrows       = 0;
182415a3b8e2SHong Zhang     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
182515a3b8e2SHong Zhang       nzi = coi[i+1] - coi[i];
182615a3b8e2SHong Zhang       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
182715a3b8e2SHong Zhang       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
182815a3b8e2SHong Zhang       nrows++;
182915a3b8e2SHong Zhang     }
18309566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
183115a3b8e2SHong Zhang     k++;
183215a3b8e2SHong Zhang     buf_si += len_si[proc];
183315a3b8e2SHong Zhang   }
1834681d504bSHong Zhang   for (i=0; i<nrecv; i++) {
18359566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
183615a3b8e2SHong Zhang   }
18379566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
18389566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));
183915a3b8e2SHong Zhang 
18409566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co));
18419566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
18429566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
18439566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
1844a4ffb656SHong Zhang 
184567a12041SHong Zhang   /* (5) compute the local portion of Cmpi      */
184667a12041SHong Zhang   /* ------------------------------------------ */
184778d30b94SHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
18489566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax,&free_space));
184915a3b8e2SHong Zhang   current_space = free_space;
185015a3b8e2SHong Zhang 
18519566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
1852445158ffSHong Zhang   for (k=0; k<nrecv; k++) {
185315a3b8e2SHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
185415a3b8e2SHong Zhang     nrows       = *buf_ri_k[k];
185515a3b8e2SHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1856a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
185715a3b8e2SHong Zhang   }
1858445158ffSHong Zhang 
1859*d0609cedSBarry Smith   MatPreallocateBegin(comm,pn,pn,dnz,onz);
18609566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));
186115a3b8e2SHong Zhang   for (i=0; i<pn; i++) {
186267a12041SHong Zhang     /* add C_loc into Cmpi */
186367a12041SHong Zhang     nzi  = c_loc->i[i+1] - c_loc->i[i];
186467a12041SHong Zhang     Jptr = c_loc->j + c_loc->i[i];
18659566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));
186615a3b8e2SHong Zhang 
186715a3b8e2SHong Zhang     /* add received col data into lnk */
1868445158ffSHong Zhang     for (k=0; k<nrecv; k++) { /* k-th received message */
186915a3b8e2SHong Zhang       if (i == *nextrow[k]) { /* i-th row */
187015a3b8e2SHong Zhang         nzi  = *(nextci[k]+1) - *nextci[k];
187115a3b8e2SHong Zhang         Jptr = buf_rj[k] + *nextci[k];
18729566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));
187315a3b8e2SHong Zhang         nextrow[k]++; nextci[k]++;
187415a3b8e2SHong Zhang       }
187515a3b8e2SHong Zhang     }
1876d0e9a2f7SHong Zhang     nzi = lnk[0];
18778cb82516SHong Zhang 
187815a3b8e2SHong Zhang     /* copy data into free space, then initialize lnk */
18799566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt));
18809566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
188115a3b8e2SHong Zhang   }
18829566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k,nextrow,nextci));
18839566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk,lnkbt));
18849566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
188580bb4639SHong Zhang 
1886ae5f0867Sstefano_zampini   /* local sizes and preallocation */
18879566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
1888ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
18899566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
18909566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
1891ac94c67aSTristan Konolige   }
18929566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
1893*d0609cedSBarry Smith   MatPreallocateEnd(dnz,onz);
189415a3b8e2SHong Zhang 
1895445158ffSHong Zhang   /* members in merge */
18969566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
18979566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
18989566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
18999566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
19009566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
19019566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
19029566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
190315a3b8e2SHong Zhang 
19049566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pN,&ptap->apa));
19052259aa2eSHong Zhang 
19066718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
19076718818eSStefano Zampini   Cmpi->product->data    = ptap;
19086718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
19096718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
19106718818eSStefano Zampini 
19111a47ec54SHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
19121a47ec54SHong Zhang   Cmpi->assembled = PETSC_FALSE;
19130d3441aeSHong Zhang   PetscFunctionReturn(0);
19140d3441aeSHong Zhang }
19150d3441aeSHong Zhang 
1916aa690a28SHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
19170d3441aeSHong Zhang {
19186718818eSStefano Zampini   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
19190d3441aeSHong Zhang   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
19202dd9e643SHong Zhang   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
19216718818eSStefano Zampini   Mat_APMPI         *ptap;
19229ce11a7cSHong Zhang   Mat               AP_loc,C_loc,C_oth;
19230d3441aeSHong Zhang   PetscInt          i,rstart,rend,cm,ncols,row;
19248cb82516SHong Zhang   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
19258cb82516SHong Zhang   PetscScalar       *apa;
19260d3441aeSHong Zhang   const PetscInt    *cols;
19270d3441aeSHong Zhang   const PetscScalar *vals;
19280d3441aeSHong Zhang 
19290d3441aeSHong Zhang   PetscFunctionBegin;
19306718818eSStefano Zampini   MatCheckProduct(C,3);
19316718818eSStefano Zampini   ptap = (Mat_APMPI*)C->product->data;
193228b400f6SJacob Faibussowitsch   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
193328b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
1934a9899c97SHong Zhang 
19359566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
1936e497d3c8SHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
1937748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
19389566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
19399566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
1940748c7196SHong Zhang   }
19410d3441aeSHong Zhang 
1942e497d3c8SHong Zhang   /* 2) get AP_loc */
19430d3441aeSHong Zhang   AP_loc = ptap->AP_loc;
19448cb82516SHong Zhang   ap = (Mat_SeqAIJ*)AP_loc->data;
19450d3441aeSHong Zhang 
1946e497d3c8SHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
19470d3441aeSHong Zhang   /*-----------------------------------------------------*/
1948748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1949748c7196SHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
19509566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
19519566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
1952e497d3c8SHong Zhang   }
19530d3441aeSHong Zhang 
19548cb82516SHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
19558cb82516SHong Zhang   /* ---------------------------------------------- */
19560d3441aeSHong Zhang   /* get data from symbolic products */
19578cb82516SHong Zhang   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
19582dd9e643SHong Zhang   if (ptap->P_oth) {
19598cb82516SHong Zhang     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
19602dd9e643SHong Zhang   }
19618cb82516SHong Zhang   apa   = ptap->apa;
1962681d504bSHong Zhang   api   = ap->i;
1963681d504bSHong Zhang   apj   = ap->j;
1964e497d3c8SHong Zhang   for (i=0; i<am; i++) {
1965445158ffSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1966e497d3c8SHong Zhang     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1967e497d3c8SHong Zhang     apnz = api[i+1] - api[i];
1968e497d3c8SHong Zhang     for (j=0; j<apnz; j++) {
1969e497d3c8SHong Zhang       col = apj[j+api[i]];
1970e497d3c8SHong Zhang       ap->a[j+ap->i[i]] = apa[col];
1971e497d3c8SHong Zhang       apa[col] = 0.0;
19720d3441aeSHong Zhang     }
19730d3441aeSHong Zhang   }
1974976452c9SRichard Tran Mills   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
19759566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
19760d3441aeSHong Zhang 
19778cb82516SHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
19789566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_loc));
19799566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_oth));
19809ce11a7cSHong Zhang   C_loc = ptap->C_loc;
19819ce11a7cSHong Zhang   C_oth = ptap->C_oth;
19820d3441aeSHong Zhang 
19830d3441aeSHong Zhang   /* add C_loc and Co to to C */
19849566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C,&rstart,&rend));
19850d3441aeSHong Zhang 
19860d3441aeSHong Zhang   /* C_loc -> C */
19870d3441aeSHong Zhang   cm    = C_loc->rmap->N;
19889ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ*)C_loc->data;
19898cb82516SHong Zhang   cols = c_seq->j;
19908cb82516SHong Zhang   vals = c_seq->a;
1991904d1e70Sandi selinger 
1992e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1993904d1e70Sandi selinger   /* when there are no off-processor parts.  */
19941de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
19951de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
19961de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
1997904d1e70Sandi selinger   if (C->assembled) {
1998904d1e70Sandi selinger     C->was_assembled = PETSC_TRUE;
1999904d1e70Sandi selinger     C->assembled     = PETSC_FALSE;
2000904d1e70Sandi selinger   }
2001904d1e70Sandi selinger   if (C->was_assembled) {
20020d3441aeSHong Zhang     for (i=0; i<cm; i++) {
20039ce11a7cSHong Zhang       ncols = c_seq->i[i+1] - c_seq->i[i];
20040d3441aeSHong Zhang       row = rstart + i;
20059566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
20068cb82516SHong Zhang       cols += ncols; vals += ncols;
20070d3441aeSHong Zhang     }
2008904d1e70Sandi selinger   } else {
20099566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
2010904d1e70Sandi selinger   }
20110d3441aeSHong Zhang 
20120d3441aeSHong Zhang   /* Co -> C, off-processor part */
20139ce11a7cSHong Zhang   cm = C_oth->rmap->N;
20149ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ*)C_oth->data;
20158cb82516SHong Zhang   cols = c_seq->j;
20168cb82516SHong Zhang   vals = c_seq->a;
20179ce11a7cSHong Zhang   for (i=0; i<cm; i++) {
20189ce11a7cSHong Zhang     ncols = c_seq->i[i+1] - c_seq->i[i];
20190d3441aeSHong Zhang     row = p->garray[i];
20209566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
20218cb82516SHong Zhang     cols += ncols; vals += ncols;
20220d3441aeSHong Zhang   }
2023904d1e70Sandi selinger 
20249566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
20259566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
20260d3441aeSHong Zhang 
2027748c7196SHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
20280d3441aeSHong Zhang   PetscFunctionReturn(0);
20290d3441aeSHong Zhang }
20304222ddf1SHong Zhang 
20314222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
20324222ddf1SHong Zhang {
20334222ddf1SHong Zhang   Mat_Product         *product = C->product;
20344222ddf1SHong Zhang   Mat                 A=product->A,P=product->B;
20354222ddf1SHong Zhang   MatProductAlgorithm alg=product->alg;
20364222ddf1SHong Zhang   PetscReal           fill=product->fill;
20374222ddf1SHong Zhang   PetscBool           flg;
20384222ddf1SHong Zhang 
20394222ddf1SHong Zhang   PetscFunctionBegin;
20404222ddf1SHong Zhang   /* scalable: do R=P^T locally, then C=R*A*P */
20419566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"scalable",&flg));
20424222ddf1SHong Zhang   if (flg) {
20439566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C));
20444222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20454222ddf1SHong Zhang     goto next;
20464222ddf1SHong Zhang   }
20474222ddf1SHong Zhang 
20484222ddf1SHong Zhang   /* nonscalable: do R=P^T locally, then C=R*A*P */
20499566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"nonscalable",&flg));
20504222ddf1SHong Zhang   if (flg) {
20519566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C));
20524222ddf1SHong Zhang     goto next;
20534222ddf1SHong Zhang   }
20544222ddf1SHong Zhang 
20554222ddf1SHong Zhang   /* allatonce */
20569566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"allatonce",&flg));
20574222ddf1SHong Zhang   if (flg) {
20589566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C));
20594222ddf1SHong Zhang     goto next;
20604222ddf1SHong Zhang   }
20614222ddf1SHong Zhang 
20624222ddf1SHong Zhang   /* allatonce_merged */
20639566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"allatonce_merged",&flg));
20644222ddf1SHong Zhang   if (flg) {
20659566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C));
20664222ddf1SHong Zhang     goto next;
20674222ddf1SHong Zhang   }
20684222ddf1SHong Zhang 
20694e84afc0SStefano Zampini   /* backend general code */
20709566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"backend",&flg));
20714e84afc0SStefano Zampini   if (flg) {
20729566063dSJacob Faibussowitsch     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
20734e84afc0SStefano Zampini     PetscFunctionReturn(0);
20744e84afc0SStefano Zampini   }
20754e84afc0SStefano Zampini 
20764222ddf1SHong Zhang   /* hypre */
20774222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
20789566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"hypre",&flg));
20794222ddf1SHong Zhang   if (flg) {
20809566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C));
20814222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20824222ddf1SHong Zhang     PetscFunctionReturn(0);
20834222ddf1SHong Zhang   }
20844222ddf1SHong Zhang #endif
20854222ddf1SHong Zhang   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
20864222ddf1SHong Zhang 
20874222ddf1SHong Zhang next:
20884222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_PtAP;
20894222ddf1SHong Zhang   PetscFunctionReturn(0);
20904222ddf1SHong Zhang }
2091