xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 976452c91c9a7c4d1061ecd805d3be3275ad7847)
182412ba7SHong Zhang 
242c57489SHong Zhang /*
342c57489SHong Zhang   Defines projective product routines where A is a MPIAIJ matrix
442c57489SHong Zhang           C = P^T * A * P
542c57489SHong Zhang */
642c57489SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <../src/mat/impls/aij/mpi/mpiaij.h>
10c6db04a5SJed Brown #include <petscbt.h>
118563dfccSBarry Smith #include <petsctime.h>
12694f88d4SFande Kong #include <petsc/private/hashmapiv.h>
134a56b808SFande Kong #include <petsc/private/hashseti.h>
144a56b808SFande Kong #include <petscsf.h>
1542c57489SHong Zhang 
1624ecddacSHong Zhang 
17cf97cf3cSHong Zhang PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
18cf97cf3cSHong Zhang {
19cf97cf3cSHong Zhang   PetscErrorCode    ierr;
20cf97cf3cSHong Zhang   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
213cdca5ebSHong Zhang   Mat_APMPI         *ptap=a->ap;
22cf97cf3cSHong Zhang   PetscBool         iascii;
23cf97cf3cSHong Zhang   PetscViewerFormat format;
24cf97cf3cSHong Zhang 
25cf97cf3cSHong Zhang   PetscFunctionBegin;
26baa1ba78SHong Zhang   if (!ptap) {
27baa1ba78SHong Zhang     /* hack: MatDuplicate() sets oldmat->ops->view to newmat which is a base mat class with null ptpa! */
28baa1ba78SHong Zhang     A->ops->view = MatView_MPIAIJ;
29baa1ba78SHong Zhang     ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr);
30baa1ba78SHong Zhang     PetscFunctionReturn(0);
31baa1ba78SHong Zhang   }
32baa1ba78SHong Zhang 
33cf97cf3cSHong Zhang   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
34cf97cf3cSHong Zhang   if (iascii) {
35cf97cf3cSHong Zhang     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
36cf97cf3cSHong Zhang     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
37cf97cf3cSHong Zhang       if (ptap->algType == 0) {
38cf97cf3cSHong Zhang         ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
39cf97cf3cSHong Zhang       } else if (ptap->algType == 1) {
40cf97cf3cSHong Zhang         ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
414a56b808SFande Kong       } else if (ptap->algType == 2) {
424a56b808SFande Kong         ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
434a56b808SFande Kong       } else if (ptap->algType == 3) {
444a56b808SFande Kong         ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
45cf97cf3cSHong Zhang       }
46cf97cf3cSHong Zhang     }
47cf97cf3cSHong Zhang   }
48cf97cf3cSHong Zhang   ierr = (ptap->view)(A,viewer);CHKERRQ(ierr);
49cf97cf3cSHong Zhang   PetscFunctionReturn(0);
50cf97cf3cSHong Zhang }
51cf97cf3cSHong Zhang 
524624976aSHong Zhang PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
53cc6cb787SHong Zhang {
54cc6cb787SHong Zhang   PetscErrorCode      ierr;
55f8487c73SHong Zhang   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
563cdca5ebSHong Zhang   Mat_APMPI           *ptap=a->ap;
5760552ceaSHong Zhang   Mat_Merge_SeqsToMPI *merge;
58cc6cb787SHong Zhang 
59cc6cb787SHong Zhang   PetscFunctionBegin;
60dd4011a9SHong Zhang   if (!ptap) PetscFunctionReturn(0);
61dd4011a9SHong Zhang 
62b7f45c76SHong Zhang   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
63f8487c73SHong Zhang   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
64a1a4e84aSHong Zhang   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
65a1a4e84aSHong Zhang   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
66c5af039cSHong Zhang   ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
6705d62848SHong Zhang   ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
6805d62848SHong Zhang   ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
698403a639SHong Zhang   if (ptap->AP_loc) { /* used by alg_rap */
70681d504bSHong Zhang     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
71681d504bSHong Zhang     ierr = PetscFree(ap->i);CHKERRQ(ierr);
72681d504bSHong Zhang     ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
730d3441aeSHong Zhang     ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
748403a639SHong Zhang   } else { /* used by alg_ptap */
758403a639SHong Zhang     ierr = PetscFree(ptap->api);CHKERRQ(ierr);
768403a639SHong Zhang     ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
77681d504bSHong Zhang   }
782259aa2eSHong Zhang   ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
792259aa2eSHong Zhang   ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
80414894bdSHong Zhang   if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}
81a560ef98SHong Zhang 
8260552ceaSHong Zhang   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
8360552ceaSHong Zhang 
8460552ceaSHong Zhang   merge=ptap->merge;
858403a639SHong Zhang   if (merge) { /* used by alg_ptap */
86cc6cb787SHong Zhang     ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
87cc6cb787SHong Zhang     ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
88cc6cb787SHong Zhang     ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
89cc6cb787SHong Zhang     ierr = PetscFree(merge->bi);CHKERRQ(ierr);
90cc6cb787SHong Zhang     ierr = PetscFree(merge->bj);CHKERRQ(ierr);
91c05d87d6SBarry Smith     ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
92cc6cb787SHong Zhang     ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
93c05d87d6SBarry Smith     ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
94cc6cb787SHong Zhang     ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
95445158ffSHong Zhang     ierr = PetscFree(merge->coi);CHKERRQ(ierr);
96445158ffSHong Zhang     ierr = PetscFree(merge->coj);CHKERRQ(ierr);
9705b42c5fSBarry Smith     ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
986bf464f9SBarry Smith     ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
99f8487c73SHong Zhang     ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
100bf0cc555SLisandro Dalcin   }
101a3bb6f32SFande Kong   ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);
1024a56b808SFande Kong 
1034a56b808SFande Kong   ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
1044a56b808SFande Kong   ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
1054a56b808SFande Kong   ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
1063697aca6SHong Zhang   PetscFunctionReturn(0);
107c8b0795cSMark F. Adams }
108dce485f0SBarry Smith 
1093697aca6SHong Zhang PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
1103697aca6SHong Zhang {
1113697aca6SHong Zhang   PetscErrorCode ierr;
1129b3d8a68SHong Zhang   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1139b3d8a68SHong Zhang   Mat_APMPI      *ptap=a->ap;
1143697aca6SHong Zhang 
1153697aca6SHong Zhang   PetscFunctionBegin;
116dd4011a9SHong Zhang   ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr);
1179b3d8a68SHong Zhang   ierr = (*ptap->destroy)(A);CHKERRQ(ierr); /* MatDestroy_MPIAIJ(A) */
1189b3d8a68SHong Zhang   ierr = PetscFree(ptap);CHKERRQ(ierr);
119cc6cb787SHong Zhang   PetscFunctionReturn(0);
120cc6cb787SHong Zhang }
121cc6cb787SHong Zhang 
122cf742fccSHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
123cf742fccSHong Zhang {
124cf742fccSHong Zhang   PetscErrorCode    ierr;
125cf742fccSHong Zhang   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
126cf742fccSHong Zhang   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
12792ec70a1SHong Zhang   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1283cdca5ebSHong Zhang   Mat_APMPI         *ptap = c->ap;
129cf742fccSHong Zhang   Mat               AP_loc,C_loc,C_oth;
130a3bb6f32SFande Kong   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
131cf742fccSHong Zhang   PetscScalar       *apa;
132cf742fccSHong Zhang   const PetscInt    *cols;
133cf742fccSHong Zhang   const PetscScalar *vals;
134cf742fccSHong Zhang 
135cf742fccSHong Zhang   PetscFunctionBegin;
136fa2a5dcfSHong Zhang   if (!ptap->AP_loc) {
13780da8df7SHong Zhang     MPI_Comm comm;
13880da8df7SHong Zhang     ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
13980da8df7SHong Zhang     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
14080da8df7SHong Zhang   }
14180da8df7SHong Zhang 
142cf742fccSHong Zhang   ierr = MatZeroEntries(C);CHKERRQ(ierr);
143cf742fccSHong Zhang 
144cf742fccSHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
145cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
146419ecdd9Sandi selinger     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
147419ecdd9Sandi selinger     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
148cf742fccSHong Zhang   }
149cf742fccSHong Zhang 
150cf742fccSHong Zhang   /* 2) get AP_loc */
151cf742fccSHong Zhang   AP_loc = ptap->AP_loc;
152cf742fccSHong Zhang   ap = (Mat_SeqAIJ*)AP_loc->data;
153cf742fccSHong Zhang 
154cf742fccSHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
155cf742fccSHong Zhang   /*-----------------------------------------------------*/
156cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
157cf742fccSHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
158cf742fccSHong Zhang     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
159cf742fccSHong Zhang     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
160cf742fccSHong Zhang   }
161cf742fccSHong Zhang 
162cf742fccSHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
163cf742fccSHong Zhang   /* ---------------------------------------------- */
164cf742fccSHong Zhang   /* get data from symbolic products */
165cf742fccSHong Zhang   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
16652f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
16752f7967eSHong Zhang 
168cf742fccSHong Zhang   api   = ap->i;
169cf742fccSHong Zhang   apj   = ap->j;
170a3bb6f32SFande Kong   ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
171cf742fccSHong Zhang   for (i=0; i<am; i++) {
172cf742fccSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
173cf742fccSHong Zhang     apnz = api[i+1] - api[i];
174b4e8d1b6SHong Zhang     apa = ap->a + api[i];
175580bdb30SBarry Smith     ierr = PetscArrayzero(apa,apnz);CHKERRQ(ierr);
176b4e8d1b6SHong Zhang     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
177cf742fccSHong Zhang   }
178a3bb6f32SFande Kong   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
179a3bb6f32SFande Kong   if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);
180cf742fccSHong Zhang 
181cf742fccSHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
182154d0d78SFande Kong   /* Always use scalable version since we are in the MPI scalable version */
183cf742fccSHong Zhang   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
184cf742fccSHong Zhang   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
185cf742fccSHong Zhang 
186cf742fccSHong Zhang   C_loc = ptap->C_loc;
187cf742fccSHong Zhang   C_oth = ptap->C_oth;
188cf742fccSHong Zhang 
189cf742fccSHong Zhang   /* add C_loc and Co to to C */
190cf742fccSHong Zhang   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
191cf742fccSHong Zhang 
192cf742fccSHong Zhang   /* C_loc -> C */
193cf742fccSHong Zhang   cm    = C_loc->rmap->N;
194cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ*)C_loc->data;
195cf742fccSHong Zhang   cols = c_seq->j;
196cf742fccSHong Zhang   vals = c_seq->a;
197a3bb6f32SFande Kong   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
198edeac6deSandi selinger 
199e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
200edeac6deSandi selinger   /* when there are no off-processor parts.  */
2011de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
2021de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
2031de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
204edeac6deSandi selinger   if (C->assembled) {
205edeac6deSandi selinger     C->was_assembled = PETSC_TRUE;
206edeac6deSandi selinger     C->assembled     = PETSC_FALSE;
207edeac6deSandi selinger   }
208edeac6deSandi selinger   if (C->was_assembled) {
209cf742fccSHong Zhang     for (i=0; i<cm; i++) {
210cf742fccSHong Zhang       ncols = c_seq->i[i+1] - c_seq->i[i];
211cf742fccSHong Zhang       row = rstart + i;
212edeac6deSandi selinger       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
213cf742fccSHong Zhang       cols += ncols; vals += ncols;
214cf742fccSHong Zhang     }
215edeac6deSandi selinger   } else {
216e9ede7d0Sandi selinger     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
217edeac6deSandi selinger   }
218a3bb6f32SFande Kong   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
219a3bb6f32SFande Kong   if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);
220cf742fccSHong Zhang 
221cf742fccSHong Zhang   /* Co -> C, off-processor part */
222cf742fccSHong Zhang   cm = C_oth->rmap->N;
223cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ*)C_oth->data;
224cf742fccSHong Zhang   cols = c_seq->j;
225cf742fccSHong Zhang   vals = c_seq->a;
226a3bb6f32SFande Kong   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
227cf742fccSHong Zhang   for (i=0; i<cm; i++) {
228cf742fccSHong Zhang     ncols = c_seq->i[i+1] - c_seq->i[i];
229cf742fccSHong Zhang     row = p->garray[i];
230cf742fccSHong Zhang     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
231cf742fccSHong Zhang     cols += ncols; vals += ncols;
232cf742fccSHong Zhang   }
233cf742fccSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
234cf742fccSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
235cf742fccSHong Zhang 
236cf742fccSHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
23780da8df7SHong Zhang 
238a3bb6f32SFande Kong   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
239a3bb6f32SFande Kong   if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);
240a3bb6f32SFande Kong 
24180da8df7SHong Zhang   /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */
2427d0a89b7SHong Zhang   if (ptap->freestruct) {
24380da8df7SHong Zhang     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
24480da8df7SHong Zhang   }
245cf742fccSHong Zhang   PetscFunctionReturn(0);
246cf742fccSHong Zhang }
247cf742fccSHong Zhang 
2484222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
2490d3441aeSHong Zhang {
2500d3441aeSHong Zhang   PetscErrorCode      ierr;
2513cdca5ebSHong Zhang   Mat_APMPI           *ptap;
252681d504bSHong Zhang   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
2532259aa2eSHong Zhang   MPI_Comm            comm;
2542259aa2eSHong Zhang   PetscMPIInt         size,rank;
2554222ddf1SHong Zhang   Mat                 P_loc,P_oth;
25615a3b8e2SHong Zhang   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
257d0e9a2f7SHong Zhang   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
258d0e9a2f7SHong Zhang   PetscInt            *lnk,i,k,pnz,row,nsend;
259f671be37SHong Zhang   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
26015a3b8e2SHong Zhang   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
261d0e9a2f7SHong Zhang   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
26215a3b8e2SHong Zhang   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
26315a3b8e2SHong Zhang   MPI_Request         *swaits,*rwaits;
26415a3b8e2SHong Zhang   MPI_Status          *sstatus,rstatus;
265445158ffSHong Zhang   PetscLayout         rowmap;
266445158ffSHong Zhang   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
267445158ffSHong Zhang   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
268a3bb6f32SFande Kong   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
26952f7967eSHong Zhang   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
27067a12041SHong Zhang   PetscScalar         *apv;
27178d30b94SHong Zhang   PetscTable          ta;
272b92f168fSBarry Smith   MatType             mtype;
273e83e5f86SFande Kong   const char          *prefix;
274aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
2758cb82516SHong Zhang   PetscReal           apfill;
276aa690a28SHong Zhang #endif
27767a12041SHong Zhang 
27867a12041SHong Zhang   PetscFunctionBegin;
27967a12041SHong Zhang   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
28067a12041SHong Zhang   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
28167a12041SHong Zhang   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
282ae5f0867Sstefano_zampini 
28352f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
28452f7967eSHong Zhang 
285ae5f0867Sstefano_zampini   /* create symbolic parallel matrix Cmpi */
286b92f168fSBarry Smith   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
287b92f168fSBarry Smith   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
288ae5f0867Sstefano_zampini 
2893cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
290e953e47cSHong Zhang   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
291e953e47cSHong Zhang   ptap->reuse = MAT_INITIAL_MATRIX;
292cf97cf3cSHong Zhang   ptap->algType = 0;
293e953e47cSHong Zhang 
294e953e47cSHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
29576db11f6SHong Zhang   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
296e953e47cSHong Zhang   /* get P_loc by taking all local rows of P */
29776db11f6SHong Zhang   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);
29876db11f6SHong Zhang 
29976db11f6SHong Zhang   ptap->P_loc = P_loc;
30076db11f6SHong Zhang   ptap->P_oth = P_oth;
301e953e47cSHong Zhang 
302e953e47cSHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
303e953e47cSHong Zhang   /* --------------------------------- */
304419ecdd9Sandi selinger   ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
305419ecdd9Sandi selinger   ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
306e953e47cSHong Zhang 
307e953e47cSHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
308e953e47cSHong Zhang   /* ----------------------------------------------------------------- */
30976db11f6SHong Zhang   p_loc  = (Mat_SeqAIJ*)P_loc->data;
31052f7967eSHong Zhang   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
311e953e47cSHong Zhang 
312e953e47cSHong Zhang   /* create and initialize a linked list */
313e953e47cSHong Zhang   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */
31476db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
31576db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
316e953e47cSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
317e953e47cSHong Zhang 
31876db11f6SHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
319e953e47cSHong Zhang 
320e953e47cSHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
32152f7967eSHong Zhang   if (ao) {
322e953e47cSHong Zhang     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
32352f7967eSHong Zhang   } else {
32452f7967eSHong Zhang     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
32552f7967eSHong Zhang   }
326e953e47cSHong Zhang   current_space = free_space;
327e953e47cSHong Zhang   nspacedouble  = 0;
328e953e47cSHong Zhang 
329e953e47cSHong Zhang   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
330e953e47cSHong Zhang   api[0] = 0;
331e953e47cSHong Zhang   for (i=0; i<am; i++) {
332e953e47cSHong Zhang     /* diagonal portion: Ad[i,:]*P */
333e953e47cSHong Zhang     ai = ad->i; pi = p_loc->i;
334e953e47cSHong Zhang     nzi = ai[i+1] - ai[i];
335e953e47cSHong Zhang     aj  = ad->j + ai[i];
336e953e47cSHong Zhang     for (j=0; j<nzi; j++) {
337e953e47cSHong Zhang       row  = aj[j];
338e953e47cSHong Zhang       pnz  = pi[row+1] - pi[row];
339e953e47cSHong Zhang       Jptr = p_loc->j + pi[row];
340e953e47cSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
34176db11f6SHong Zhang       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
342e953e47cSHong Zhang     }
343e953e47cSHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
34452f7967eSHong Zhang     if (ao) {
345e953e47cSHong Zhang       ai = ao->i; pi = p_oth->i;
346e953e47cSHong Zhang       nzi = ai[i+1] - ai[i];
347e953e47cSHong Zhang       aj  = ao->j + ai[i];
348e953e47cSHong Zhang       for (j=0; j<nzi; j++) {
349e953e47cSHong Zhang         row  = aj[j];
350e953e47cSHong Zhang         pnz  = pi[row+1] - pi[row];
351e953e47cSHong Zhang         Jptr = p_oth->j + pi[row];
35276db11f6SHong Zhang         ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
353e953e47cSHong Zhang       }
35452f7967eSHong Zhang     }
355e953e47cSHong Zhang     apnz     = lnk[0];
356e953e47cSHong Zhang     api[i+1] = api[i] + apnz;
357e953e47cSHong Zhang 
358e953e47cSHong Zhang     /* if free space is not available, double the total space in the list */
359e953e47cSHong Zhang     if (current_space->local_remaining<apnz) {
360e953e47cSHong Zhang       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
361e953e47cSHong Zhang       nspacedouble++;
362e953e47cSHong Zhang     }
363e953e47cSHong Zhang 
364e953e47cSHong Zhang     /* Copy data into free space, then initialize lnk */
36576db11f6SHong Zhang     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
366e953e47cSHong Zhang 
367e953e47cSHong Zhang     current_space->array           += apnz;
368e953e47cSHong Zhang     current_space->local_used      += apnz;
369e953e47cSHong Zhang     current_space->local_remaining -= apnz;
370e953e47cSHong Zhang   }
371e953e47cSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
372e953e47cSHong Zhang   /* destroy list of free space and other temporary array(s) */
373a3bb6f32SFande Kong   ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
374e953e47cSHong Zhang   ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
37576db11f6SHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
376e953e47cSHong Zhang 
377e953e47cSHong Zhang   /* Create AP_loc for reuse */
378e953e47cSHong Zhang   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
379a3bb6f32SFande Kong   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr);
380e953e47cSHong Zhang 
381e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
38252f7967eSHong Zhang   if (ao) {
383e953e47cSHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
38452f7967eSHong Zhang   } else {
38552f7967eSHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
38652f7967eSHong Zhang   }
387e953e47cSHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
388e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
389e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
390e953e47cSHong Zhang 
391e953e47cSHong Zhang   if (api[am]) {
392b11c1ec8SHong Zhang     ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
393e953e47cSHong Zhang     ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr);
394e953e47cSHong Zhang   } else {
395b11c1ec8SHong Zhang     ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
396e953e47cSHong Zhang   }
397e953e47cSHong Zhang #endif
398e953e47cSHong Zhang 
399e953e47cSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
4004222ddf1SHong Zhang   /* -------------------------------------- */
4014222ddf1SHong Zhang   ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr);
402e83e5f86SFande Kong   ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
4034222ddf1SHong Zhang   ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr);
4044222ddf1SHong Zhang   ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");CHKERRQ(ierr);
4054222ddf1SHong Zhang 
4064222ddf1SHong Zhang   ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr);
4074222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(ptap->C_oth,"sorted");CHKERRQ(ierr);
4084222ddf1SHong Zhang   ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr);
4094222ddf1SHong Zhang   ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr);
4104222ddf1SHong Zhang   ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr);
411e953e47cSHong Zhang 
412e953e47cSHong Zhang   /* (3) send coj of C_oth to other processors  */
413e953e47cSHong Zhang   /* ------------------------------------------ */
414e953e47cSHong Zhang   /* determine row ownership */
415e953e47cSHong Zhang   ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
416e953e47cSHong Zhang   rowmap->n  = pn;
417e953e47cSHong Zhang   rowmap->bs = 1;
418e953e47cSHong Zhang   ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
419e953e47cSHong Zhang   owners = rowmap->range;
420e953e47cSHong Zhang 
421e953e47cSHong Zhang   /* determine the number of messages to send, their lengths */
422e953e47cSHong Zhang   ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
423580bdb30SBarry Smith   ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
424580bdb30SBarry Smith   ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);
425e953e47cSHong Zhang 
426e953e47cSHong Zhang   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
427e953e47cSHong Zhang   coi   = c_oth->i; coj = c_oth->j;
428e953e47cSHong Zhang   con   = ptap->C_oth->rmap->n;
429e953e47cSHong Zhang   proc  = 0;
430a3bb6f32SFande Kong   ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
431e953e47cSHong Zhang   for (i=0; i<con; i++) {
432e953e47cSHong Zhang     while (prmap[i] >= owners[proc+1]) proc++;
433e953e47cSHong Zhang     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
434e953e47cSHong Zhang     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
435e953e47cSHong Zhang   }
436e953e47cSHong Zhang 
437e953e47cSHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
438e953e47cSHong Zhang   owners_co[0] = 0;
439e953e47cSHong Zhang   nsend        = 0;
440e953e47cSHong Zhang   for (proc=0; proc<size; proc++) {
441e953e47cSHong Zhang     owners_co[proc+1] = owners_co[proc] + len_si[proc];
442e953e47cSHong Zhang     if (len_s[proc]) {
443e953e47cSHong Zhang       nsend++;
444e953e47cSHong Zhang       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
445e953e47cSHong Zhang       len         += len_si[proc];
446e953e47cSHong Zhang     }
447e953e47cSHong Zhang   }
448e953e47cSHong Zhang 
449e953e47cSHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
450e953e47cSHong Zhang   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
451e953e47cSHong Zhang   ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);
452e953e47cSHong Zhang 
453e953e47cSHong Zhang   /* post the Irecv and Isend of coj */
454e953e47cSHong Zhang   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
455e953e47cSHong Zhang   ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
456e953e47cSHong Zhang   ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
457e953e47cSHong Zhang   for (proc=0, k=0; proc<size; proc++) {
458e953e47cSHong Zhang     if (!len_s[proc]) continue;
459e953e47cSHong Zhang     i    = owners_co[proc];
460e953e47cSHong Zhang     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
461e953e47cSHong Zhang     k++;
462e953e47cSHong Zhang   }
463e953e47cSHong Zhang 
464e953e47cSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
465e953e47cSHong Zhang   /* ---------------------------------------- */
4664222ddf1SHong Zhang   ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr);
4674222ddf1SHong Zhang   ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr);
4684222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr);
4694222ddf1SHong Zhang   ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr);
4704222ddf1SHong Zhang 
4714222ddf1SHong Zhang   ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr);
4724222ddf1SHong Zhang   ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");CHKERRQ(ierr);
4734222ddf1SHong Zhang 
4744222ddf1SHong Zhang   ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr);
4754222ddf1SHong Zhang   ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr);
4764222ddf1SHong Zhang 
477e953e47cSHong Zhang   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
478a3bb6f32SFande Kong   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);
479e953e47cSHong Zhang 
480e953e47cSHong Zhang   /* receives coj are complete */
481e953e47cSHong Zhang   for (i=0; i<nrecv; i++) {
482e953e47cSHong Zhang     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
483e953e47cSHong Zhang   }
484e953e47cSHong Zhang   ierr = PetscFree(rwaits);CHKERRQ(ierr);
485e953e47cSHong Zhang   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
486e953e47cSHong Zhang 
487e953e47cSHong Zhang   /* add received column indices into ta to update Crmax */
488e953e47cSHong Zhang   for (k=0; k<nrecv; k++) {/* k-th received message */
489e953e47cSHong Zhang     Jptr = buf_rj[k];
490e953e47cSHong Zhang     for (j=0; j<len_r[k]; j++) {
491e953e47cSHong Zhang       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
492e953e47cSHong Zhang     }
493e953e47cSHong Zhang   }
494e953e47cSHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
495e953e47cSHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
496e953e47cSHong Zhang 
497e953e47cSHong Zhang   /* (4) send and recv coi */
498e953e47cSHong Zhang   /*-----------------------*/
499e953e47cSHong Zhang   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
500e953e47cSHong Zhang   ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
501e953e47cSHong Zhang   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
502e953e47cSHong Zhang   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
503e953e47cSHong Zhang   for (proc=0,k=0; proc<size; proc++) {
504e953e47cSHong Zhang     if (!len_s[proc]) continue;
505e953e47cSHong Zhang     /* form outgoing message for i-structure:
506e953e47cSHong Zhang          buf_si[0]:                 nrows to be sent
507e953e47cSHong Zhang                [1:nrows]:           row index (global)
508e953e47cSHong Zhang                [nrows+1:2*nrows+1]: i-structure index
509e953e47cSHong Zhang     */
510e953e47cSHong Zhang     /*-------------------------------------------*/
511e953e47cSHong Zhang     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
512e953e47cSHong Zhang     buf_si_i    = buf_si + nrows+1;
513e953e47cSHong Zhang     buf_si[0]   = nrows;
514e953e47cSHong Zhang     buf_si_i[0] = 0;
515e953e47cSHong Zhang     nrows       = 0;
516e953e47cSHong Zhang     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
517e953e47cSHong Zhang       nzi = coi[i+1] - coi[i];
518e953e47cSHong Zhang       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
519e953e47cSHong Zhang       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
520e953e47cSHong Zhang       nrows++;
521e953e47cSHong Zhang     }
522e953e47cSHong Zhang     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
523e953e47cSHong Zhang     k++;
524e953e47cSHong Zhang     buf_si += len_si[proc];
525e953e47cSHong Zhang   }
526e953e47cSHong Zhang   for (i=0; i<nrecv; i++) {
527e953e47cSHong Zhang     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
528e953e47cSHong Zhang   }
529e953e47cSHong Zhang   ierr = PetscFree(rwaits);CHKERRQ(ierr);
530e953e47cSHong Zhang   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
531e953e47cSHong Zhang 
532e953e47cSHong Zhang   ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
533e953e47cSHong Zhang   ierr = PetscFree(len_ri);CHKERRQ(ierr);
534e953e47cSHong Zhang   ierr = PetscFree(swaits);CHKERRQ(ierr);
535e953e47cSHong Zhang   ierr = PetscFree(buf_s);CHKERRQ(ierr);
536b4e8d1b6SHong Zhang 
537e953e47cSHong Zhang   /* (5) compute the local portion of Cmpi      */
538e953e47cSHong Zhang   /* ------------------------------------------ */
539e953e47cSHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
540e953e47cSHong Zhang   ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
541e953e47cSHong Zhang   current_space = free_space;
542e953e47cSHong Zhang 
543e953e47cSHong Zhang   ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
544e953e47cSHong Zhang   for (k=0; k<nrecv; k++) {
545e953e47cSHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
546e953e47cSHong Zhang     nrows       = *buf_ri_k[k];
547e953e47cSHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
548e953e47cSHong Zhang     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
549e953e47cSHong Zhang   }
550e953e47cSHong Zhang 
551e953e47cSHong Zhang   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
552cf742fccSHong Zhang   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
553e953e47cSHong Zhang   for (i=0; i<pn; i++) {
554e953e47cSHong Zhang     /* add C_loc into Cmpi */
555e953e47cSHong Zhang     nzi  = c_loc->i[i+1] - c_loc->i[i];
556e953e47cSHong Zhang     Jptr = c_loc->j + c_loc->i[i];
557cf742fccSHong Zhang     ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
558e953e47cSHong Zhang 
559e953e47cSHong Zhang     /* add received col data into lnk */
560e953e47cSHong Zhang     for (k=0; k<nrecv; k++) { /* k-th received message */
561e953e47cSHong Zhang       if (i == *nextrow[k]) { /* i-th row */
562e953e47cSHong Zhang         nzi  = *(nextci[k]+1) - *nextci[k];
563e953e47cSHong Zhang         Jptr = buf_rj[k] + *nextci[k];
564cf742fccSHong Zhang         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
565e953e47cSHong Zhang         nextrow[k]++; nextci[k]++;
566e953e47cSHong Zhang       }
567e953e47cSHong Zhang     }
568e953e47cSHong Zhang     nzi = lnk[0];
569e953e47cSHong Zhang 
570e953e47cSHong Zhang     /* copy data into free space, then initialize lnk */
571cf742fccSHong Zhang     ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
572e953e47cSHong Zhang     ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
573e953e47cSHong Zhang   }
574e953e47cSHong Zhang   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
575cf742fccSHong Zhang   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
576e953e47cSHong Zhang   ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
577e953e47cSHong Zhang 
578e953e47cSHong Zhang   /* local sizes and preallocation */
579e953e47cSHong Zhang   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
580ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
581ac94c67aSTristan Konolige     ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
582ac94c67aSTristan Konolige     ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
583ac94c67aSTristan Konolige   }
584e953e47cSHong Zhang   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
585e953e47cSHong Zhang   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
586e953e47cSHong Zhang 
587e953e47cSHong Zhang   /* members in merge */
588e953e47cSHong Zhang   ierr = PetscFree(id_r);CHKERRQ(ierr);
589e953e47cSHong Zhang   ierr = PetscFree(len_r);CHKERRQ(ierr);
590e953e47cSHong Zhang   ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
591e953e47cSHong Zhang   ierr = PetscFree(buf_ri);CHKERRQ(ierr);
592e953e47cSHong Zhang   ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
593e953e47cSHong Zhang   ierr = PetscFree(buf_rj);CHKERRQ(ierr);
594e953e47cSHong Zhang   ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);
595e953e47cSHong Zhang 
596e953e47cSHong Zhang   /* attach the supporting struct to Cmpi for reuse */
597e953e47cSHong Zhang   c = (Mat_MPIAIJ*)Cmpi->data;
5983cdca5ebSHong Zhang   c->ap           = ptap;
599e953e47cSHong Zhang   ptap->duplicate = Cmpi->ops->duplicate;
600e953e47cSHong Zhang   ptap->destroy   = Cmpi->ops->destroy;
601cf97cf3cSHong Zhang   ptap->view      = Cmpi->ops->view;
602e953e47cSHong Zhang 
603e953e47cSHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
604e953e47cSHong Zhang   Cmpi->assembled        = PETSC_FALSE;
605a4ffb656SHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
606e953e47cSHong Zhang   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
607cf97cf3cSHong Zhang   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
6084624976aSHong Zhang   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
609a3bb6f32SFande Kong 
610a3bb6f32SFande Kong    nout = 0;
611a3bb6f32SFande Kong    ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
612a3bb6f32SFande Kong    if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
613a3bb6f32SFande Kong    ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
614a3bb6f32SFande Kong    if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);
615a3bb6f32SFande Kong 
616e953e47cSHong Zhang   PetscFunctionReturn(0);
617e953e47cSHong Zhang }
618e953e47cSHong Zhang 
619bc8e477aSFande Kong PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
6204a56b808SFande Kong {
6214a56b808SFande Kong   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
6224a56b808SFande Kong   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
6234a56b808SFande Kong   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
624bc8e477aSFande Kong   PetscInt             pcstart,pcend,column,offset;
6254a56b808SFande Kong   PetscErrorCode       ierr;
6264a56b808SFande Kong 
6274a56b808SFande Kong   PetscFunctionBegin;
6284a56b808SFande Kong   pcstart = P->cmap->rstart;
629bc8e477aSFande Kong   pcstart *= dof;
6304a56b808SFande Kong   pcend   = P->cmap->rend;
631bc8e477aSFande Kong   pcend   *= dof;
6324a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6334a56b808SFande Kong   ai = ad->i;
6344a56b808SFande Kong   nzi = ai[i+1] - ai[i];
6354a56b808SFande Kong   aj  = ad->j + ai[i];
6364a56b808SFande Kong   for (j=0; j<nzi; j++) {
6374a56b808SFande Kong     row  = aj[j];
638bc8e477aSFande Kong     offset = row%dof;
639bc8e477aSFande Kong     row   /= dof;
6404a56b808SFande Kong     nzpi = pd->i[row+1] - pd->i[row];
6414a56b808SFande Kong     pj  = pd->j + pd->i[row];
6424a56b808SFande Kong     for (k=0; k<nzpi; k++) {
643bc8e477aSFande Kong       ierr = PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);CHKERRQ(ierr);
6444a56b808SFande Kong     }
6454a56b808SFande Kong   }
646bc8e477aSFande Kong   /* off diag P */
6474a56b808SFande Kong   for (j=0; j<nzi; j++) {
6484a56b808SFande Kong     row  = aj[j];
649bc8e477aSFande Kong     offset = row%dof;
650bc8e477aSFande Kong     row   /= dof;
6514a56b808SFande Kong     nzpi = po->i[row+1] - po->i[row];
6524a56b808SFande Kong     pj  = po->j + po->i[row];
6534a56b808SFande Kong     for (k=0; k<nzpi; k++) {
654bc8e477aSFande Kong       ierr = PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);CHKERRQ(ierr);
6554a56b808SFande Kong     }
6564a56b808SFande Kong   }
6574a56b808SFande Kong 
6584a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6594a56b808SFande Kong   if (ao) {
6604a56b808SFande Kong     ai = ao->i;
6614a56b808SFande Kong     pi = p_oth->i;
6624a56b808SFande Kong     nzi = ai[i+1] - ai[i];
6634a56b808SFande Kong     aj  = ao->j + ai[i];
6644a56b808SFande Kong     for (j=0; j<nzi; j++) {
6654a56b808SFande Kong       row  = aj[j];
666bc8e477aSFande Kong       offset = a->garray[row]%dof;
667bc8e477aSFande Kong       row  = map[row];
6684a56b808SFande Kong       pnz  = pi[row+1] - pi[row];
6694a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6704a56b808SFande Kong       for (col=0; col<pnz; col++) {
671bc8e477aSFande Kong         column = p_othcols[col] * dof + offset;
6724a56b808SFande Kong         if (column>=pcstart && column<pcend) {
6734a56b808SFande Kong           ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
6744a56b808SFande Kong         } else {
6754a56b808SFande Kong           ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
6764a56b808SFande Kong         }
6774a56b808SFande Kong       }
6784a56b808SFande Kong     }
6794a56b808SFande Kong   } /* end if (ao) */
6804a56b808SFande Kong   PetscFunctionReturn(0);
6814a56b808SFande Kong }
6824a56b808SFande Kong 
683bc8e477aSFande Kong PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
6844a56b808SFande Kong {
6854a56b808SFande Kong   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
6864a56b808SFande Kong   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
687bc8e477aSFande Kong   PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
6884a56b808SFande Kong   PetscScalar          ra,*aa,*pa;
6894a56b808SFande Kong   PetscErrorCode       ierr;
6904a56b808SFande Kong 
6914a56b808SFande Kong   PetscFunctionBegin;
6924a56b808SFande Kong   pcstart = P->cmap->rstart;
693bc8e477aSFande Kong   pcstart *= dof;
694bc8e477aSFande Kong 
6954a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6964a56b808SFande Kong   ai  = ad->i;
6974a56b808SFande Kong   nzi = ai[i+1] - ai[i];
6984a56b808SFande Kong   aj  = ad->j + ai[i];
6994a56b808SFande Kong   aa  = ad->a + ai[i];
7004a56b808SFande Kong   for (j=0; j<nzi; j++) {
7014a56b808SFande Kong     ra   = aa[j];
7024a56b808SFande Kong     row  = aj[j];
703bc8e477aSFande Kong     offset = row%dof;
704bc8e477aSFande Kong     row    /= dof;
7054a56b808SFande Kong     nzpi = pd->i[row+1] - pd->i[row];
7064a56b808SFande Kong     pj = pd->j + pd->i[row];
7074a56b808SFande Kong     pa = pd->a + pd->i[row];
7084a56b808SFande Kong     for (k=0; k<nzpi; k++) {
709bc8e477aSFande Kong       ierr = PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);CHKERRQ(ierr);
7104a56b808SFande Kong     }
71149c8f2b8SFande Kong     ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
7124a56b808SFande Kong   }
7134a56b808SFande Kong   for (j=0; j<nzi; j++) {
7144a56b808SFande Kong     ra   = aa[j];
7154a56b808SFande Kong     row  = aj[j];
716bc8e477aSFande Kong     offset = row%dof;
717bc8e477aSFande Kong     row   /= dof;
7184a56b808SFande Kong     nzpi = po->i[row+1] - po->i[row];
7194a56b808SFande Kong     pj = po->j + po->i[row];
7204a56b808SFande Kong     pa = po->a + po->i[row];
7214a56b808SFande Kong     for (k=0; k<nzpi; k++) {
722bc8e477aSFande Kong       ierr = PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);CHKERRQ(ierr);
7234a56b808SFande Kong     }
72449c8f2b8SFande Kong     ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
7254a56b808SFande Kong   }
7264a56b808SFande Kong 
7274a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
7284a56b808SFande Kong   if (ao) {
7294a56b808SFande Kong     ai = ao->i;
7304a56b808SFande Kong     pi = p_oth->i;
7314a56b808SFande Kong     nzi = ai[i+1] - ai[i];
7324a56b808SFande Kong     aj  = ao->j + ai[i];
7334a56b808SFande Kong     aa  = ao->a + ai[i];
7344a56b808SFande Kong     for (j=0; j<nzi; j++) {
7354a56b808SFande Kong       row  = aj[j];
736bc8e477aSFande Kong       offset = a->garray[row]%dof;
737bc8e477aSFande Kong       row    = map[row];
7384a56b808SFande Kong       ra   = aa[j];
7394a56b808SFande Kong       pnz  = pi[row+1] - pi[row];
7404a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
7414a56b808SFande Kong       pa   = p_oth->a + pi[row];
7424a56b808SFande Kong       for (col=0; col<pnz; col++) {
743bc8e477aSFande Kong         ierr = PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);CHKERRQ(ierr);
7444a56b808SFande Kong       }
74549c8f2b8SFande Kong       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
7464a56b808SFande Kong     }
7474a56b808SFande Kong   } /* end if (ao) */
748bb5ddf68SFande Kong 
7494a56b808SFande Kong   PetscFunctionReturn(0);
7504a56b808SFande Kong }
7514a56b808SFande Kong 
752bc8e477aSFande Kong PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
7535c65b9ecSFande Kong 
754bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
7554a56b808SFande Kong {
7564a56b808SFande Kong   PetscErrorCode    ierr;
7574a56b808SFande Kong   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
758bc8e477aSFande Kong   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
7594a56b808SFande Kong   Mat_APMPI         *ptap = c->ap;
7604a56b808SFande Kong   PetscHMapIV       hmap;
7614a56b808SFande Kong   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
7624a56b808SFande Kong   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
763bc8e477aSFande Kong   PetscInt          offset,ii,pocol;
764bc8e477aSFande Kong   const PetscInt    *mappingindices;
765bc8e477aSFande Kong   IS                map;
7664a56b808SFande Kong   MPI_Comm          comm;
7674a56b808SFande Kong 
7684a56b808SFande Kong   PetscFunctionBegin;
7694a56b808SFande Kong   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
770bb9bda99SHong Zhang   if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
7714a56b808SFande Kong 
7724a56b808SFande Kong   ierr = MatZeroEntries(C);CHKERRQ(ierr);
7734a56b808SFande Kong 
7744a56b808SFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
7754a56b808SFande Kong   /*-----------------------------------------------------*/
7764a56b808SFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
7774a56b808SFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
778bc8e477aSFande Kong     ierr =  MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
7794a56b808SFande Kong   }
780bc8e477aSFande Kong   ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
7814a56b808SFande Kong 
7824a56b808SFande Kong   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
783bc8e477aSFande Kong   pon *= dof;
7844a56b808SFande Kong   ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
7854a56b808SFande Kong   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
7864a56b808SFande Kong   cmaxr = 0;
7874a56b808SFande Kong   for (i=0; i<pon; i++) {
7884a56b808SFande Kong     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
7894a56b808SFande Kong   }
7904a56b808SFande Kong   ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
7914a56b808SFande Kong   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
7924a56b808SFande Kong   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
793bc8e477aSFande Kong   ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
7944a56b808SFande Kong   for (i=0; i<am && pon; i++) {
7954a56b808SFande Kong     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
796bc8e477aSFande Kong     offset = i%dof;
797bc8e477aSFande Kong     ii     = i/dof;
798bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
7994a56b808SFande Kong     if (!nzi) continue;
800bc8e477aSFande Kong     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
8014a56b808SFande Kong     voff = 0;
8024a56b808SFande Kong     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
8034a56b808SFande Kong     if (!voff) continue;
8044a56b808SFande Kong 
8054a56b808SFande Kong     /* Form C(ii, :) */
806bc8e477aSFande Kong     poj = po->j + po->i[ii];
807bc8e477aSFande Kong     poa = po->a + po->i[ii];
8084a56b808SFande Kong     for (j=0; j<nzi; j++) {
809bc8e477aSFande Kong       pocol = poj[j]*dof+offset;
810bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
811bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
8124a56b808SFande Kong       for (jj=0; jj<voff; jj++) {
8134a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj]*poa[j];
8144a56b808SFande Kong         /*If the row is empty */
815bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
8164a56b808SFande Kong           c_rmtjj[jj] = apindices[jj];
8174a56b808SFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
818bc8e477aSFande Kong           c_rmtc[pocol]++;
8194a56b808SFande Kong         } else {
820bc8e477aSFande Kong           ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
8214a56b808SFande Kong           if (loc>=0){ /* hit */
8224a56b808SFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
82349c8f2b8SFande Kong             ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
8244a56b808SFande Kong           } else { /* new element */
8254a56b808SFande Kong             loc = -(loc+1);
8264a56b808SFande Kong             /* Move data backward */
827bc8e477aSFande Kong             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
8284a56b808SFande Kong               c_rmtjj[kk] = c_rmtjj[kk-1];
8294a56b808SFande Kong               c_rmtaa[kk] = c_rmtaa[kk-1];
8304a56b808SFande Kong             }/* End kk */
8314a56b808SFande Kong             c_rmtjj[loc] = apindices[jj];
8324a56b808SFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
833bc8e477aSFande Kong             c_rmtc[pocol]++;
8344a56b808SFande Kong           }
8354a56b808SFande Kong         }
83649c8f2b8SFande Kong         ierr = PetscLogFlops(voff);CHKERRQ(ierr);
8374a56b808SFande Kong       } /* End jj */
8384a56b808SFande Kong     } /* End j */
8394a56b808SFande Kong   } /* End i */
8404a56b808SFande Kong 
8414a56b808SFande Kong   ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
842d4e5d74dSLawrence Mitchell   ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
8434a56b808SFande Kong 
8444a56b808SFande Kong   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
845bc8e477aSFande Kong   pn *= dof;
8464a56b808SFande Kong   ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);
8474a56b808SFande Kong 
8484a56b808SFande Kong   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
8494a56b808SFande Kong   ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
8504a56b808SFande Kong   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
851bc8e477aSFande Kong   pcstart = pcstart*dof;
852bc8e477aSFande Kong   pcend   = pcend*dof;
8534a56b808SFande Kong   cd = (Mat_SeqAIJ*)(c->A)->data;
8544a56b808SFande Kong   co = (Mat_SeqAIJ*)(c->B)->data;
8554a56b808SFande Kong 
8564a56b808SFande Kong   cmaxr = 0;
8574a56b808SFande Kong   for (i=0; i<pn; i++) {
8584a56b808SFande Kong     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
8594a56b808SFande Kong   }
8604a56b808SFande Kong   ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
8614a56b808SFande Kong   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
8624a56b808SFande Kong   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
8634a56b808SFande Kong   for (i=0; i<am && pn; i++) {
8644a56b808SFande Kong     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
865bc8e477aSFande Kong     offset = i%dof;
866bc8e477aSFande Kong     ii     = i/dof;
867bc8e477aSFande Kong     nzi = pd->i[ii+1] - pd->i[ii];
8684a56b808SFande Kong     if (!nzi) continue;
869bc8e477aSFande Kong     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
8704a56b808SFande Kong     voff = 0;
8714a56b808SFande Kong     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
8724a56b808SFande Kong     if (!voff) continue;
8734a56b808SFande Kong     /* Form C(ii, :) */
874bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
875bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
8764a56b808SFande Kong     for (j=0; j<nzi; j++) {
877bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
8784a56b808SFande Kong       for (jj=0; jj<voff; jj++) {
8794a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj]*pda[j];
8804a56b808SFande Kong       }
88149c8f2b8SFande Kong       ierr = PetscLogFlops(voff);CHKERRQ(ierr);
8824a56b808SFande Kong       ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
8834a56b808SFande Kong     }
8844a56b808SFande Kong   }
885bc8e477aSFande Kong   ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
8864a56b808SFande Kong   ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
8874a56b808SFande Kong   ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
8884a56b808SFande Kong   ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
889bc8e477aSFande Kong   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
890bc8e477aSFande Kong   ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
891bc8e477aSFande Kong   ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);
892bc8e477aSFande Kong 
893bc8e477aSFande Kong   /* Add contributions from remote */
894bc8e477aSFande Kong   for (i = 0; i < pn; i++) {
895bc8e477aSFande Kong     row = i + pcstart;
896bc8e477aSFande Kong     ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
897bc8e477aSFande Kong   }
898bc8e477aSFande Kong   ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);
899bc8e477aSFande Kong 
900bc8e477aSFande Kong   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
901bc8e477aSFande Kong   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
902bc8e477aSFande Kong 
903bc8e477aSFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
904bc8e477aSFande Kong 
905bc8e477aSFande Kong   /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */
906bc8e477aSFande Kong   if (ptap->freestruct) {
907bc8e477aSFande Kong     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
908bc8e477aSFande Kong   }
909bc8e477aSFande Kong   PetscFunctionReturn(0);
910bc8e477aSFande Kong }
911bc8e477aSFande Kong 
912bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
913bc8e477aSFande Kong {
914bc8e477aSFande Kong   PetscErrorCode      ierr;
915bc8e477aSFande Kong 
916bc8e477aSFande Kong   PetscFunctionBegin;
91734bcad68SFande Kong 
918bc8e477aSFande Kong   ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);CHKERRQ(ierr);
919bc8e477aSFande Kong   PetscFunctionReturn(0);
920bc8e477aSFande Kong }
921bc8e477aSFande Kong 
922bc8e477aSFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
923bc8e477aSFande Kong {
924bc8e477aSFande Kong   PetscErrorCode    ierr;
925bc8e477aSFande Kong   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
926bc8e477aSFande Kong   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
927bc8e477aSFande Kong   Mat_APMPI         *ptap = c->ap;
928bc8e477aSFande Kong   PetscHMapIV       hmap;
929bc8e477aSFande Kong   PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
930bc8e477aSFande Kong   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
931bc8e477aSFande Kong   PetscInt          offset,ii,pocol;
932bc8e477aSFande Kong   const PetscInt    *mappingindices;
933bc8e477aSFande Kong   IS                map;
934bc8e477aSFande Kong   MPI_Comm          comm;
935bc8e477aSFande Kong 
936bc8e477aSFande Kong   PetscFunctionBegin;
937bc8e477aSFande Kong   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
938bc8e477aSFande Kong   if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
939bc8e477aSFande Kong 
940bc8e477aSFande Kong   ierr = MatZeroEntries(C);CHKERRQ(ierr);
941bc8e477aSFande Kong 
942bc8e477aSFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
943bc8e477aSFande Kong   /*-----------------------------------------------------*/
944bc8e477aSFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
945bc8e477aSFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
946bc8e477aSFande Kong     ierr =  MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
947bc8e477aSFande Kong   }
948bc8e477aSFande Kong   ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
949bc8e477aSFande Kong   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
950bc8e477aSFande Kong   pon *= dof;
951bc8e477aSFande Kong   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
952bc8e477aSFande Kong   pn  *= dof;
953bc8e477aSFande Kong 
954bc8e477aSFande Kong   ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
955bc8e477aSFande Kong   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
956bc8e477aSFande Kong   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
957bc8e477aSFande Kong   pcstart *= dof;
958bc8e477aSFande Kong   pcend   *= dof;
959bc8e477aSFande Kong   cmaxr = 0;
960bc8e477aSFande Kong   for (i=0; i<pon; i++) {
961bc8e477aSFande Kong     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
962bc8e477aSFande Kong   }
963bc8e477aSFande Kong   cd = (Mat_SeqAIJ*)(c->A)->data;
964bc8e477aSFande Kong   co = (Mat_SeqAIJ*)(c->B)->data;
965bc8e477aSFande Kong   for (i=0; i<pn; i++) {
966bc8e477aSFande Kong     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
967bc8e477aSFande Kong   }
968bc8e477aSFande Kong   ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
969bc8e477aSFande Kong   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
970bc8e477aSFande Kong   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
971bc8e477aSFande Kong   ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
972bc8e477aSFande Kong   for (i=0; i<am && (pon || pn); i++) {
973bc8e477aSFande Kong     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
974bc8e477aSFande Kong     offset = i%dof;
975bc8e477aSFande Kong     ii     = i/dof;
976bc8e477aSFande Kong     nzi  = po->i[ii+1] - po->i[ii];
977bc8e477aSFande Kong     dnzi = pd->i[ii+1] - pd->i[ii];
978bc8e477aSFande Kong     if (!nzi && !dnzi) continue;
979bc8e477aSFande Kong     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
980bc8e477aSFande Kong     voff = 0;
981bc8e477aSFande Kong     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
982bc8e477aSFande Kong     if (!voff) continue;
983bc8e477aSFande Kong 
984bc8e477aSFande Kong     /* Form remote C(ii, :) */
985bc8e477aSFande Kong     poj = po->j + po->i[ii];
986bc8e477aSFande Kong     poa = po->a + po->i[ii];
987bc8e477aSFande Kong     for (j=0; j<nzi; j++) {
988bc8e477aSFande Kong       pocol = poj[j]*dof+offset;
989bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
990bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
991bc8e477aSFande Kong       for (jj=0; jj<voff; jj++) {
992bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj]*poa[j];
993bc8e477aSFande Kong         /*If the row is empty */
994bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
995bc8e477aSFande Kong           c_rmtjj[jj] = apindices[jj];
996bc8e477aSFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
997bc8e477aSFande Kong           c_rmtc[pocol]++;
998bc8e477aSFande Kong         } else {
999bc8e477aSFande Kong           ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
1000bc8e477aSFande Kong           if (loc>=0){ /* hit */
1001bc8e477aSFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
1002bc8e477aSFande Kong             ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
1003bc8e477aSFande Kong           } else { /* new element */
1004bc8e477aSFande Kong             loc = -(loc+1);
1005bc8e477aSFande Kong             /* Move data backward */
1006bc8e477aSFande Kong             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
1007bc8e477aSFande Kong               c_rmtjj[kk] = c_rmtjj[kk-1];
1008bc8e477aSFande Kong               c_rmtaa[kk] = c_rmtaa[kk-1];
1009bc8e477aSFande Kong             }/* End kk */
1010bc8e477aSFande Kong             c_rmtjj[loc] = apindices[jj];
1011bc8e477aSFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
1012bc8e477aSFande Kong             c_rmtc[pocol]++;
1013bc8e477aSFande Kong           }
1014bc8e477aSFande Kong         }
1015bc8e477aSFande Kong       } /* End jj */
1016bc8e477aSFande Kong       ierr = PetscLogFlops(voff);CHKERRQ(ierr);
1017bc8e477aSFande Kong     } /* End j */
1018bc8e477aSFande Kong 
1019bc8e477aSFande Kong     /* Form local C(ii, :) */
1020bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
1021bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
1022bc8e477aSFande Kong     for (j=0; j<dnzi; j++) {
1023bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
1024bc8e477aSFande Kong       for (jj=0; jj<voff; jj++) {
1025bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj]*pda[j];
1026bc8e477aSFande Kong       }/* End kk */
1027bc8e477aSFande Kong       ierr = PetscLogFlops(voff);CHKERRQ(ierr);
1028bc8e477aSFande Kong       ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
1029bc8e477aSFande Kong     }/* End j */
1030bc8e477aSFande Kong   } /* End i */
1031bc8e477aSFande Kong 
1032bc8e477aSFande Kong   ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
1033bc8e477aSFande Kong   ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
1034bc8e477aSFande Kong   ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
1035bc8e477aSFande Kong   ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);
1036bc8e477aSFande Kong 
1037bc8e477aSFande Kong   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
1038bc8e477aSFande Kong   ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
1039bc8e477aSFande Kong   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
10404a56b808SFande Kong   ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
10414a56b808SFande Kong   ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);
10424a56b808SFande Kong 
10434a56b808SFande Kong   /* Add contributions from remote */
10444a56b808SFande Kong   for (i = 0; i < pn; i++) {
10454a56b808SFande Kong     row = i + pcstart;
10464a56b808SFande Kong     ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
10474a56b808SFande Kong   }
10484a56b808SFande Kong   ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);
10494a56b808SFande Kong 
10504a56b808SFande Kong   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
10514a56b808SFande Kong   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
10524a56b808SFande Kong 
10534a56b808SFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
10544a56b808SFande Kong 
10554a56b808SFande Kong   /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */
10564a56b808SFande Kong   if (ptap->freestruct) {
10574a56b808SFande Kong     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
10584a56b808SFande Kong   }
10594a56b808SFande Kong   PetscFunctionReturn(0);
10604a56b808SFande Kong }
10614a56b808SFande Kong 
10624a56b808SFande Kong PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
10634a56b808SFande Kong {
10644a56b808SFande Kong   PetscErrorCode      ierr;
10654a56b808SFande Kong 
10664a56b808SFande Kong   PetscFunctionBegin;
106734bcad68SFande Kong 
1068bc8e477aSFande Kong   ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);CHKERRQ(ierr);
10694a56b808SFande Kong   PetscFunctionReturn(0);
10704a56b808SFande Kong }
10714a56b808SFande Kong 
10724222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
10734a56b808SFande Kong {
10744a56b808SFande Kong   Mat_APMPI           *ptap;
10754a56b808SFande Kong   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
10764a56b808SFande Kong   MPI_Comm            comm;
1077bc8e477aSFande Kong   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
10784a56b808SFande Kong   MatType             mtype;
10794a56b808SFande Kong   PetscSF             sf;
10804a56b808SFande Kong   PetscSFNode         *iremote;
10814a56b808SFande Kong   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
10824a56b808SFande Kong   const PetscInt      *rootdegrees;
10834a56b808SFande Kong   PetscHSetI          ht,oht,*hta,*hto;
10844a56b808SFande Kong   PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1085131c27b5Sprj-   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1086bc8e477aSFande Kong   PetscInt            nalg=2,alg=0,offset,ii;
1087131c27b5Sprj-   PetscMPIInt         owner;
1088bc8e477aSFande Kong   const PetscInt      *mappingindices;
10894a56b808SFande Kong   PetscBool           flg;
10904a56b808SFande Kong   const char          *algTypes[2] = {"overlapping","merged"};
1091bc8e477aSFande Kong   IS                  map;
10924a56b808SFande Kong   PetscErrorCode      ierr;
10934a56b808SFande Kong 
10944a56b808SFande Kong   PetscFunctionBegin;
10954a56b808SFande Kong   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
109634bcad68SFande Kong 
10974a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
10984a56b808SFande Kong   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1099bc8e477aSFande Kong   pn *= dof;
11004a56b808SFande Kong   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
11014a56b808SFande Kong   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
11024a56b808SFande Kong   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
11034a56b808SFande Kong 
11044a56b808SFande Kong   ierr = PetscNew(&ptap);CHKERRQ(ierr);
11054a56b808SFande Kong   ptap->reuse = MAT_INITIAL_MATRIX;
11064a56b808SFande Kong   ptap->algType = 2;
11074a56b808SFande Kong 
11084a56b808SFande Kong   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1109bc8e477aSFande Kong   ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
1110bc8e477aSFande Kong   ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
11114a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
11124a56b808SFande Kong   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1113bc8e477aSFande Kong   pon *= dof;
11144a56b808SFande Kong   /* offsets */
11154a56b808SFande Kong   ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
11164a56b808SFande Kong   /* The number of columns we will send to remote ranks */
11174a56b808SFande Kong   ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
11184a56b808SFande Kong   ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
11194a56b808SFande Kong   for (i=0; i<pon; i++) {
11204a56b808SFande Kong     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
11214a56b808SFande Kong   }
11224a56b808SFande Kong   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
11234a56b808SFande Kong   ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
11244a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
11254a56b808SFande Kong   ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
11264a56b808SFande Kong 
1127bc8e477aSFande Kong   ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
11284a56b808SFande Kong   ptap->c_rmti[0] = 0;
11294a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
11304a56b808SFande Kong   for (i=0; i<am && pon; i++) {
11314a56b808SFande Kong     /* Form one row of AP */
11324a56b808SFande Kong     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1133bc8e477aSFande Kong     offset = i%dof;
1134bc8e477aSFande Kong     ii     = i/dof;
11354a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1136bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
11374a56b808SFande Kong     if (!nzi) continue;
11384a56b808SFande Kong 
1139bc8e477aSFande Kong     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);CHKERRQ(ierr);
11404a56b808SFande Kong     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
11414a56b808SFande Kong     /* If AP is empty, just continue */
11424a56b808SFande Kong     if (!htsize) continue;
11434a56b808SFande Kong     /* Form C(ii, :) */
1144bc8e477aSFande Kong     poj = po->j + po->i[ii];
11454a56b808SFande Kong     for (j=0; j<nzi; j++) {
1146bc8e477aSFande Kong       ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
11474a56b808SFande Kong     }
11484a56b808SFande Kong   }
11494a56b808SFande Kong 
11504a56b808SFande Kong   for (i=0; i<pon; i++) {
11514a56b808SFande Kong     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
11524a56b808SFande Kong     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
11534a56b808SFande Kong     c_rmtc[i] = htsize;
11544a56b808SFande Kong   }
11554a56b808SFande Kong 
11564a56b808SFande Kong   ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);
11574a56b808SFande Kong 
11584a56b808SFande Kong   for (i=0; i<pon; i++) {
11594a56b808SFande Kong     off = 0;
11604a56b808SFande Kong     ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
11614a56b808SFande Kong     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
11624a56b808SFande Kong   }
11634a56b808SFande Kong   ierr = PetscFree(hta);CHKERRQ(ierr);
11644a56b808SFande Kong 
1165071fcb05SBarry Smith   ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
11664a56b808SFande Kong   for (i=0; i<pon; i++) {
1167ef7a94f2SFande Kong     owner = 0; lidx = 0;
1168bc8e477aSFande Kong     offset = i%dof;
1169bc8e477aSFande Kong     ii     = i/dof;
1170bc8e477aSFande Kong     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
1171bc8e477aSFande Kong     iremote[i].index = lidx*dof + offset;
11724a56b808SFande Kong     iremote[i].rank  = owner;
11734a56b808SFande Kong   }
11744a56b808SFande Kong 
11754a56b808SFande Kong   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
11764a56b808SFande Kong   ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
11774a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
11784a56b808SFande Kong   ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
11794a56b808SFande Kong   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
11804a56b808SFande Kong   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
11814a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
11824a56b808SFande Kong   ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
11834a56b808SFande Kong   ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
11844a56b808SFande Kong   rootspacesize = 0;
11854a56b808SFande Kong   for (i = 0; i < pn; i++) {
11864a56b808SFande Kong     rootspacesize += rootdegrees[i];
11874a56b808SFande Kong   }
1188071fcb05SBarry Smith   ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
1189071fcb05SBarry Smith   ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
11904a56b808SFande Kong   /* Get information from leaves
11914a56b808SFande Kong    * Number of columns other people contribute to my rows
11924a56b808SFande Kong    * */
11934a56b808SFande Kong   ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
11944a56b808SFande Kong   ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
11954a56b808SFande Kong   ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
11964a56b808SFande Kong   ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
11974a56b808SFande Kong   /* The number of columns is received for each row */
11984a56b808SFande Kong   ptap->c_othi[0] = 0;
11994a56b808SFande Kong   rootspacesize = 0;
12004a56b808SFande Kong   rootspaceoffsets[0] = 0;
12014a56b808SFande Kong   for (i = 0; i < pn; i++) {
12024a56b808SFande Kong     rcvncols = 0;
12034a56b808SFande Kong     for (j = 0; j<rootdegrees[i]; j++) {
12044a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
12054a56b808SFande Kong       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
12064a56b808SFande Kong       rootspacesize++;
12074a56b808SFande Kong     }
12084a56b808SFande Kong     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
12094a56b808SFande Kong   }
12104a56b808SFande Kong   ierr = PetscFree(rootspace);CHKERRQ(ierr);
12114a56b808SFande Kong 
1212071fcb05SBarry Smith   ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
12134a56b808SFande Kong   ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
12144a56b808SFande Kong   ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
12154a56b808SFande Kong   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
12164a56b808SFande Kong   ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);
12174a56b808SFande Kong 
12184a56b808SFande Kong   ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
12194a56b808SFande Kong   nleaves = 0;
12204a56b808SFande Kong   for (i = 0; i<pon; i++) {
1221bc8e477aSFande Kong     owner = 0;
1222bc8e477aSFande Kong     ii = i/dof;
1223bc8e477aSFande Kong     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
12244a56b808SFande Kong     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
12254a56b808SFande Kong     for (j=0; j<sendncols; j++) {
12264a56b808SFande Kong       iremote[nleaves].rank = owner;
12274a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
12284a56b808SFande Kong     }
12294a56b808SFande Kong   }
12304a56b808SFande Kong   ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
12314a56b808SFande Kong   ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);
12324a56b808SFande Kong 
12334a56b808SFande Kong   ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
12344a56b808SFande Kong   ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
12354a56b808SFande Kong   ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
12364a56b808SFande Kong   /* One to one map */
12374a56b808SFande Kong   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
12384a56b808SFande Kong 
1239071fcb05SBarry Smith   ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
12404a56b808SFande Kong   ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
12414a56b808SFande Kong   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1242bc8e477aSFande Kong   pcstart *= dof;
1243bc8e477aSFande Kong   pcend   *= dof;
12444a56b808SFande Kong   ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
12454a56b808SFande Kong   for (i=0; i<pn; i++) {
12464a56b808SFande Kong     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
12474a56b808SFande Kong     ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
12484a56b808SFande Kong   }
12494a56b808SFande Kong   /* Work on local part */
12504a56b808SFande Kong   /* 4) Pass 1: Estimate memory for C_loc */
12514a56b808SFande Kong   for (i=0; i<am && pn; i++) {
12524a56b808SFande Kong     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
12534a56b808SFande Kong     ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
1254bc8e477aSFande Kong     offset = i%dof;
1255bc8e477aSFande Kong     ii     = i/dof;
1256bc8e477aSFande Kong     nzi = pd->i[ii+1] - pd->i[ii];
12574a56b808SFande Kong     if (!nzi) continue;
12584a56b808SFande Kong 
1259bc8e477aSFande Kong     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
12604a56b808SFande Kong     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
12614a56b808SFande Kong     ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
12624a56b808SFande Kong     if (!(htsize+htosize)) continue;
12634a56b808SFande Kong     /* Form C(ii, :) */
1264bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
12654a56b808SFande Kong     for (j=0; j<nzi; j++) {
1266bc8e477aSFande Kong       ierr = PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
1267bc8e477aSFande Kong       ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
12684a56b808SFande Kong     }
12694a56b808SFande Kong   }
12704a56b808SFande Kong 
1271bc8e477aSFande Kong   ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
1272bc8e477aSFande Kong 
12734a56b808SFande Kong   ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
12744a56b808SFande Kong   ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);
12754a56b808SFande Kong 
12764a56b808SFande Kong   /* Get remote data */
12774a56b808SFande Kong   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
12784a56b808SFande Kong   ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
12794a56b808SFande Kong 
12804a56b808SFande Kong   for (i = 0; i < pn; i++) {
12814a56b808SFande Kong     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
12824a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
12834a56b808SFande Kong     for (j = 0; j < nzi; j++) {
12844a56b808SFande Kong       col =  rdj[j];
12854a56b808SFande Kong       /* diag part */
12864a56b808SFande Kong       if (col>=pcstart && col<pcend) {
12874a56b808SFande Kong         ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
12884a56b808SFande Kong       } else { /* off diag */
12894a56b808SFande Kong         ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
12904a56b808SFande Kong       }
12914a56b808SFande Kong     }
12924a56b808SFande Kong     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
12934a56b808SFande Kong     dnz[i] = htsize;
12944a56b808SFande Kong     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
12954a56b808SFande Kong     ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
12964a56b808SFande Kong     onz[i] = htsize;
12974a56b808SFande Kong     ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
12984a56b808SFande Kong   }
12994a56b808SFande Kong 
13004a56b808SFande Kong   ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
13014a56b808SFande Kong   ierr = PetscFree(c_othj);CHKERRQ(ierr);
13024a56b808SFande Kong 
13034a56b808SFande Kong   /* local sizes and preallocation */
13044a56b808SFande Kong   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
130534bcad68SFande Kong   ierr = MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
13064a56b808SFande Kong   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1307bc8e477aSFande Kong   ierr = MatSetUp(Cmpi);CHKERRQ(ierr);
13084a56b808SFande Kong   ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);
13094a56b808SFande Kong 
13104a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
13114a56b808SFande Kong   c = (Mat_MPIAIJ*)Cmpi->data;
13124a56b808SFande Kong   c->ap           = ptap;
13134a56b808SFande Kong   ptap->duplicate = Cmpi->ops->duplicate;
13144a56b808SFande Kong   ptap->destroy   = Cmpi->ops->destroy;
13154a56b808SFande Kong   ptap->view      = Cmpi->ops->view;
13164a56b808SFande Kong 
13174a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
13184a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
13194a56b808SFande Kong   /* pick an algorithm */
13204a56b808SFande Kong   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
13214a56b808SFande Kong   alg = 0;
13224a56b808SFande Kong   ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
13234a56b808SFande Kong   ierr = PetscOptionsEnd();CHKERRQ(ierr);
13244a56b808SFande Kong   switch (alg) {
13254a56b808SFande Kong     case 0:
13264a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
13274a56b808SFande Kong       break;
13284a56b808SFande Kong     case 1:
13294a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
13304a56b808SFande Kong       break;
13314a56b808SFande Kong     default:
13324a56b808SFande Kong       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
13334a56b808SFande Kong   }
13344a56b808SFande Kong   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
13354a56b808SFande Kong   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
13364a56b808SFande Kong   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
13374a56b808SFande Kong   PetscFunctionReturn(0);
13384a56b808SFande Kong }
13394a56b808SFande Kong 
13404222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1341bc8e477aSFande Kong {
1342bc8e477aSFande Kong   PetscErrorCode      ierr;
1343bc8e477aSFande Kong 
1344bc8e477aSFande Kong   PetscFunctionBegin;
134534bcad68SFande Kong 
1346bc8e477aSFande Kong   ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);CHKERRQ(ierr);
1347bc8e477aSFande Kong   PetscFunctionReturn(0);
1348bc8e477aSFande Kong }
1349bc8e477aSFande Kong 
13504222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
13514a56b808SFande Kong {
13524a56b808SFande Kong   Mat_APMPI           *ptap;
13534a56b808SFande Kong   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
13544a56b808SFande Kong   MPI_Comm            comm;
1355bc8e477aSFande Kong   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
13564a56b808SFande Kong   MatType             mtype;
13574a56b808SFande Kong   PetscSF             sf;
13584a56b808SFande Kong   PetscSFNode         *iremote;
13594a56b808SFande Kong   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
13604a56b808SFande Kong   const PetscInt      *rootdegrees;
13614a56b808SFande Kong   PetscHSetI          ht,oht,*hta,*hto,*htd;
13624a56b808SFande Kong   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1363131c27b5Sprj-   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1364bc8e477aSFande Kong   PetscInt            nalg=2,alg=0,offset,ii;
1365131c27b5Sprj-   PetscMPIInt         owner;
13664a56b808SFande Kong   PetscBool           flg;
13674a56b808SFande Kong   const char          *algTypes[2] = {"merged","overlapping"};
1368bc8e477aSFande Kong   const PetscInt      *mappingindices;
1369bc8e477aSFande Kong   IS                  map;
13704a56b808SFande Kong   PetscErrorCode      ierr;
13714a56b808SFande Kong 
13724a56b808SFande Kong   PetscFunctionBegin;
13734a56b808SFande Kong   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
137434bcad68SFande Kong 
13754a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
13764a56b808SFande Kong   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1377bc8e477aSFande Kong   pn *= dof;
13784a56b808SFande Kong   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
13794a56b808SFande Kong   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
13804a56b808SFande Kong   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
13814a56b808SFande Kong 
13824a56b808SFande Kong   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
13834a56b808SFande Kong   ptap->reuse = MAT_INITIAL_MATRIX;
13844a56b808SFande Kong   ptap->algType = 3;
13854a56b808SFande Kong 
13864a56b808SFande Kong   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1387bc8e477aSFande Kong   ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
1388bc8e477aSFande Kong   ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
13894a56b808SFande Kong 
13904a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
13914a56b808SFande Kong   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1392bc8e477aSFande Kong   pon *= dof;
13934a56b808SFande Kong   /* offsets */
13944a56b808SFande Kong   ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
13954a56b808SFande Kong   /* The number of columns we will send to remote ranks */
13964a56b808SFande Kong   ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
13974a56b808SFande Kong   ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
13984a56b808SFande Kong   for (i=0; i<pon; i++) {
13994a56b808SFande Kong     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
14004a56b808SFande Kong   }
14014a56b808SFande Kong   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
14024a56b808SFande Kong   ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
14034a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
14044a56b808SFande Kong   ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
14054a56b808SFande Kong   ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
14064a56b808SFande Kong   ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
14074a56b808SFande Kong   for (i=0; i<pn; i++) {
14084a56b808SFande Kong     ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
14094a56b808SFande Kong     ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
14104a56b808SFande Kong   }
1411bc8e477aSFande Kong 
1412bc8e477aSFande Kong   ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
14134a56b808SFande Kong   ptap->c_rmti[0] = 0;
14144a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
14154a56b808SFande Kong   for (i=0; i<am && (pon || pn); i++) {
14164a56b808SFande Kong     /* Form one row of AP */
14174a56b808SFande Kong     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
14184a56b808SFande Kong     ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
1419bc8e477aSFande Kong     offset = i%dof;
1420bc8e477aSFande Kong     ii     = i/dof;
14214a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1422bc8e477aSFande Kong     nzi = po->i[ii+1] - po->i[ii];
1423bc8e477aSFande Kong     dnzi = pd->i[ii+1] - pd->i[ii];
14244a56b808SFande Kong     if (!nzi && !dnzi) continue;
14254a56b808SFande Kong 
1426bc8e477aSFande Kong     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
14274a56b808SFande Kong     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
14284a56b808SFande Kong     ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
14294a56b808SFande Kong     /* If AP is empty, just continue */
14304a56b808SFande Kong     if (!(htsize+htosize)) continue;
14314a56b808SFande Kong 
14324a56b808SFande Kong     /* Form remote C(ii, :) */
1433bc8e477aSFande Kong     poj = po->j + po->i[ii];
14344a56b808SFande Kong     for (j=0; j<nzi; j++) {
1435bc8e477aSFande Kong       ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
1436bc8e477aSFande Kong       ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);CHKERRQ(ierr);
14374a56b808SFande Kong     }
14384a56b808SFande Kong 
14394a56b808SFande Kong     /* Form local C(ii, :) */
1440bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
14414a56b808SFande Kong     for (j=0; j<dnzi; j++) {
1442bc8e477aSFande Kong       ierr = PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
1443bc8e477aSFande Kong       ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
14444a56b808SFande Kong     }
14454a56b808SFande Kong   }
14464a56b808SFande Kong 
1447bc8e477aSFande Kong   ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
1448bc8e477aSFande Kong 
14494a56b808SFande Kong   ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
14504a56b808SFande Kong   ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);
14514a56b808SFande Kong 
14524a56b808SFande Kong   for (i=0; i<pon; i++) {
14534a56b808SFande Kong     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
14544a56b808SFande Kong     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
14554a56b808SFande Kong     c_rmtc[i] = htsize;
14564a56b808SFande Kong   }
14574a56b808SFande Kong 
14584a56b808SFande Kong   ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);
14594a56b808SFande Kong 
14604a56b808SFande Kong   for (i=0; i<pon; i++) {
14614a56b808SFande Kong     off = 0;
14624a56b808SFande Kong     ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
14634a56b808SFande Kong     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
14644a56b808SFande Kong   }
14654a56b808SFande Kong   ierr = PetscFree(hta);CHKERRQ(ierr);
14664a56b808SFande Kong 
1467071fcb05SBarry Smith   ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
14684a56b808SFande Kong   for (i=0; i<pon; i++) {
1469ef7a94f2SFande Kong     owner = 0; lidx = 0;
1470bc8e477aSFande Kong     offset = i%dof;
1471bc8e477aSFande Kong     ii     = i/dof;
1472bc8e477aSFande Kong     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
1473bc8e477aSFande Kong     iremote[i].index = lidx*dof+offset;
14744a56b808SFande Kong     iremote[i].rank  = owner;
14754a56b808SFande Kong   }
14764a56b808SFande Kong 
14774a56b808SFande Kong   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
14784a56b808SFande Kong   ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
14794a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
14804a56b808SFande Kong   ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
14814a56b808SFande Kong   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
14824a56b808SFande Kong   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
14834a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
14844a56b808SFande Kong   ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
14854a56b808SFande Kong   ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
14864a56b808SFande Kong   rootspacesize = 0;
14874a56b808SFande Kong   for (i = 0; i < pn; i++) {
14884a56b808SFande Kong     rootspacesize += rootdegrees[i];
14894a56b808SFande Kong   }
1490071fcb05SBarry Smith   ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
1491071fcb05SBarry Smith   ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
14924a56b808SFande Kong   /* Get information from leaves
14934a56b808SFande Kong    * Number of columns other people contribute to my rows
14944a56b808SFande Kong    * */
14954a56b808SFande Kong   ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
14964a56b808SFande Kong   ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
14974a56b808SFande Kong   ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
1498071fcb05SBarry Smith   ierr = PetscMalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
14994a56b808SFande Kong   /* The number of columns is received for each row */
15004a56b808SFande Kong   ptap->c_othi[0]     = 0;
15014a56b808SFande Kong   rootspacesize       = 0;
15024a56b808SFande Kong   rootspaceoffsets[0] = 0;
15034a56b808SFande Kong   for (i = 0; i < pn; i++) {
15044a56b808SFande Kong     rcvncols = 0;
15054a56b808SFande Kong     for (j = 0; j<rootdegrees[i]; j++) {
15064a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
15074a56b808SFande Kong       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
15084a56b808SFande Kong       rootspacesize++;
15094a56b808SFande Kong     }
15104a56b808SFande Kong     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
15114a56b808SFande Kong   }
15124a56b808SFande Kong   ierr = PetscFree(rootspace);CHKERRQ(ierr);
15134a56b808SFande Kong 
1514071fcb05SBarry Smith   ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
15154a56b808SFande Kong   ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
15164a56b808SFande Kong   ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
15174a56b808SFande Kong   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
15184a56b808SFande Kong   ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);
15194a56b808SFande Kong 
15204a56b808SFande Kong   ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
15214a56b808SFande Kong   nleaves = 0;
15224a56b808SFande Kong   for (i = 0; i<pon; i++) {
1523071fcb05SBarry Smith     owner = 0;
1524bc8e477aSFande Kong     ii    = i/dof;
1525bc8e477aSFande Kong     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
15264a56b808SFande Kong     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
15274a56b808SFande Kong     for (j=0; j<sendncols; j++) {
15284a56b808SFande Kong       iremote[nleaves].rank    = owner;
15294a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
15304a56b808SFande Kong     }
15314a56b808SFande Kong   }
15324a56b808SFande Kong   ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
15334a56b808SFande Kong   ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);
15344a56b808SFande Kong 
15354a56b808SFande Kong   ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
15364a56b808SFande Kong   ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
15374a56b808SFande Kong   ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
15384a56b808SFande Kong   /* One to one map */
15394a56b808SFande Kong   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
15404a56b808SFande Kong   /* Get remote data */
15414a56b808SFande Kong   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
15424a56b808SFande Kong   ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
1543071fcb05SBarry Smith   ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
15444a56b808SFande Kong   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1545bc8e477aSFande Kong   pcstart *= dof;
1546bc8e477aSFande Kong   pcend   *= dof;
15474a56b808SFande Kong   for (i = 0; i < pn; i++) {
15484a56b808SFande Kong     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
15494a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
15504a56b808SFande Kong     for (j = 0; j < nzi; j++) {
15514a56b808SFande Kong       col =  rdj[j];
15524a56b808SFande Kong       /* diag part */
15534a56b808SFande Kong       if (col>=pcstart && col<pcend) {
15544a56b808SFande Kong         ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
15554a56b808SFande Kong       } else { /* off diag */
15564a56b808SFande Kong         ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
15574a56b808SFande Kong       }
15584a56b808SFande Kong     }
15594a56b808SFande Kong     ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
15604a56b808SFande Kong     dnz[i] = htsize;
15614a56b808SFande Kong     ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
15624a56b808SFande Kong     ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
15634a56b808SFande Kong     onz[i] = htsize;
15644a56b808SFande Kong     ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
15654a56b808SFande Kong   }
15664a56b808SFande Kong 
15674a56b808SFande Kong   ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
15684a56b808SFande Kong   ierr = PetscFree(c_othj);CHKERRQ(ierr);
15694a56b808SFande Kong 
15704a56b808SFande Kong   /* local sizes and preallocation */
15714a56b808SFande Kong   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
157234bcad68SFande Kong   ierr = MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
15734a56b808SFande Kong   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
15744a56b808SFande Kong   ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);
15754a56b808SFande Kong 
15764a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
15774a56b808SFande Kong   c = (Mat_MPIAIJ*)Cmpi->data;
15784a56b808SFande Kong   c->ap           = ptap;
15794a56b808SFande Kong   ptap->duplicate = Cmpi->ops->duplicate;
15804a56b808SFande Kong   ptap->destroy   = Cmpi->ops->destroy;
15814a56b808SFande Kong   ptap->view      = Cmpi->ops->view;
15824a56b808SFande Kong 
15834a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
15844a56b808SFande Kong   Cmpi->assembled        = PETSC_FALSE;
15854a56b808SFande Kong   /* pick an algorithm */
15864a56b808SFande Kong   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
15874a56b808SFande Kong   alg = 0;
15884a56b808SFande Kong   ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
15894a56b808SFande Kong   ierr = PetscOptionsEnd();CHKERRQ(ierr);
15904a56b808SFande Kong   switch (alg) {
15914a56b808SFande Kong     case 0:
15924a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
15934a56b808SFande Kong       break;
15944a56b808SFande Kong     case 1:
15954a56b808SFande Kong       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
15964a56b808SFande Kong       break;
15974a56b808SFande Kong     default:
15984a56b808SFande Kong       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
15994a56b808SFande Kong   }
16004a56b808SFande Kong   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
16014a56b808SFande Kong   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
16024a56b808SFande Kong   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
16034a56b808SFande Kong   PetscFunctionReturn(0);
16044a56b808SFande Kong }
16054a56b808SFande Kong 
16064222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1607bc8e477aSFande Kong {
1608bc8e477aSFande Kong   PetscErrorCode      ierr;
1609bc8e477aSFande Kong 
1610bc8e477aSFande Kong   PetscFunctionBegin;
161134bcad68SFande Kong 
1612bc8e477aSFande Kong   ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);CHKERRQ(ierr);
1613bc8e477aSFande Kong   PetscFunctionReturn(0);
1614bc8e477aSFande Kong }
1615bc8e477aSFande Kong 
16164222ddf1SHong Zhang PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1617e953e47cSHong Zhang {
1618e953e47cSHong Zhang   PetscErrorCode      ierr;
16193cdca5ebSHong Zhang   Mat_APMPI           *ptap;
1620e953e47cSHong Zhang   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1621e953e47cSHong Zhang   MPI_Comm            comm;
1622e953e47cSHong Zhang   PetscMPIInt         size,rank;
1623e953e47cSHong Zhang   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1624e953e47cSHong Zhang   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1625e953e47cSHong Zhang   PetscInt            *lnk,i,k,pnz,row,nsend;
1626e953e47cSHong Zhang   PetscBT             lnkbt;
1627e953e47cSHong Zhang   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1628e953e47cSHong Zhang   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1629e953e47cSHong Zhang   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1630e953e47cSHong Zhang   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1631e953e47cSHong Zhang   MPI_Request         *swaits,*rwaits;
1632e953e47cSHong Zhang   MPI_Status          *sstatus,rstatus;
1633e953e47cSHong Zhang   PetscLayout         rowmap;
1634e953e47cSHong Zhang   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1635e953e47cSHong Zhang   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
1636e953e47cSHong Zhang   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1637ec07b8f8SHong Zhang   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1638e953e47cSHong Zhang   PetscScalar         *apv;
1639e953e47cSHong Zhang   PetscTable          ta;
1640b92f168fSBarry Smith   MatType             mtype;
1641e83e5f86SFande Kong   const char          *prefix;
1642e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
1643e953e47cSHong Zhang   PetscReal           apfill;
1644e953e47cSHong Zhang #endif
1645e953e47cSHong Zhang 
1646e953e47cSHong Zhang   PetscFunctionBegin;
1647b4e8d1b6SHong Zhang   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1648e953e47cSHong Zhang   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1649e953e47cSHong Zhang   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1650e953e47cSHong Zhang 
165152f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1652ec07b8f8SHong Zhang 
1653e953e47cSHong Zhang   /* create symbolic parallel matrix Cmpi */
1654b92f168fSBarry Smith   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1655b92f168fSBarry Smith   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1656e953e47cSHong Zhang 
1657e953e47cSHong Zhang   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1658e953e47cSHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1659e953e47cSHong Zhang 
16603cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
166115a3b8e2SHong Zhang   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
166215a3b8e2SHong Zhang   ptap->reuse = MAT_INITIAL_MATRIX;
1663a4ffb656SHong Zhang   ptap->algType = 1;
166415a3b8e2SHong Zhang 
166515a3b8e2SHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
166615a3b8e2SHong Zhang   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
166715a3b8e2SHong Zhang   /* get P_loc by taking all local rows of P */
166815a3b8e2SHong Zhang   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
166915a3b8e2SHong Zhang 
167067a12041SHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
167167a12041SHong Zhang   /* --------------------------------- */
1672419ecdd9Sandi selinger   ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
1673419ecdd9Sandi selinger   ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
167415a3b8e2SHong Zhang 
167567a12041SHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
167667a12041SHong Zhang   /* ----------------------------------------------------------------- */
167767a12041SHong Zhang   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
167852f7967eSHong Zhang   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1679445158ffSHong Zhang 
16809a6dcab7SHong Zhang   /* create and initialize a linked list */
168145d00d1dSHong Zhang   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */
16824b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
16834b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
168478d30b94SHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
1685d0e9a2f7SHong Zhang   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
168678d30b94SHong Zhang 
168778d30b94SHong Zhang   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
1688445158ffSHong Zhang 
16898cb82516SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1690ec07b8f8SHong Zhang   if (ao) {
1691f91af8c7SBarry Smith     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
1692ec07b8f8SHong Zhang   } else {
1693ec07b8f8SHong Zhang     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
1694ec07b8f8SHong Zhang   }
1695445158ffSHong Zhang   current_space = free_space;
169667a12041SHong Zhang   nspacedouble  = 0;
169767a12041SHong Zhang 
169867a12041SHong Zhang   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
169967a12041SHong Zhang   api[0] = 0;
1700445158ffSHong Zhang   for (i=0; i<am; i++) {
170167a12041SHong Zhang     /* diagonal portion: Ad[i,:]*P */
170267a12041SHong Zhang     ai = ad->i; pi = p_loc->i;
170367a12041SHong Zhang     nzi = ai[i+1] - ai[i];
170467a12041SHong Zhang     aj  = ad->j + ai[i];
1705445158ffSHong Zhang     for (j=0; j<nzi; j++) {
1706445158ffSHong Zhang       row  = aj[j];
170767a12041SHong Zhang       pnz  = pi[row+1] - pi[row];
170867a12041SHong Zhang       Jptr = p_loc->j + pi[row];
1709445158ffSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
1710445158ffSHong Zhang       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1711445158ffSHong Zhang     }
171267a12041SHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
1713ec07b8f8SHong Zhang     if (ao) {
171467a12041SHong Zhang       ai = ao->i; pi = p_oth->i;
171567a12041SHong Zhang       nzi = ai[i+1] - ai[i];
171667a12041SHong Zhang       aj  = ao->j + ai[i];
1717445158ffSHong Zhang       for (j=0; j<nzi; j++) {
1718445158ffSHong Zhang         row  = aj[j];
171967a12041SHong Zhang         pnz  = pi[row+1] - pi[row];
172067a12041SHong Zhang         Jptr = p_oth->j + pi[row];
1721445158ffSHong Zhang         ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1722445158ffSHong Zhang       }
1723ec07b8f8SHong Zhang     }
1724445158ffSHong Zhang     apnz     = lnk[0];
1725445158ffSHong Zhang     api[i+1] = api[i] + apnz;
1726445158ffSHong Zhang     if (ap_rmax < apnz) ap_rmax = apnz;
1727445158ffSHong Zhang 
1728445158ffSHong Zhang     /* if free space is not available, double the total space in the list */
1729445158ffSHong Zhang     if (current_space->local_remaining<apnz) {
1730f91af8c7SBarry Smith       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1731445158ffSHong Zhang       nspacedouble++;
1732445158ffSHong Zhang     }
1733445158ffSHong Zhang 
1734445158ffSHong Zhang     /* Copy data into free space, then initialize lnk */
1735445158ffSHong Zhang     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1736445158ffSHong Zhang 
1737445158ffSHong Zhang     current_space->array           += apnz;
1738445158ffSHong Zhang     current_space->local_used      += apnz;
1739445158ffSHong Zhang     current_space->local_remaining -= apnz;
1740445158ffSHong Zhang   }
1741681d504bSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
1742445158ffSHong Zhang   /* destroy list of free space and other temporary array(s) */
1743445158ffSHong Zhang   ierr   = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
1744445158ffSHong Zhang   ierr   = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
17459a6dcab7SHong Zhang   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
17469a6dcab7SHong Zhang 
1747aa690a28SHong Zhang   /* Create AP_loc for reuse */
1748445158ffSHong Zhang   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
1749aa690a28SHong Zhang 
1750aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
1751ec07b8f8SHong Zhang   if (ao) {
1752aa690a28SHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1753ec07b8f8SHong Zhang   } else {
1754ec07b8f8SHong Zhang     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1755ec07b8f8SHong Zhang   }
1756aa690a28SHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
1757aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
1758aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
1759aa690a28SHong Zhang 
1760aa690a28SHong Zhang   if (api[am]) {
1761b11c1ec8SHong Zhang     ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
1762aa690a28SHong Zhang     ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr);
1763aa690a28SHong Zhang   } else {
1764b11c1ec8SHong Zhang     ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
1765aa690a28SHong Zhang   }
1766aa690a28SHong Zhang #endif
1767aa690a28SHong Zhang 
1768681d504bSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1769681d504bSHong Zhang   /* ------------------------------------ */
1770e83e5f86SFande Kong   ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
1771e83e5f86SFande Kong   ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
1772e83e5f86SFande Kong   ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
17734222ddf1SHong Zhang 
17744222ddf1SHong Zhang   ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr);
17754222ddf1SHong Zhang   ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr);
17764222ddf1SHong Zhang   ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr);
17774222ddf1SHong Zhang   ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");CHKERRQ(ierr);
17784222ddf1SHong Zhang 
17794222ddf1SHong Zhang   ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr);
17804222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(ptap->C_oth,"default");CHKERRQ(ierr);
17814222ddf1SHong Zhang   ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr);
17824222ddf1SHong Zhang   ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr);
17834222ddf1SHong Zhang   ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr);
178415a3b8e2SHong Zhang 
178567a12041SHong Zhang   /* (3) send coj of C_oth to other processors  */
178667a12041SHong Zhang   /* ------------------------------------------ */
178715a3b8e2SHong Zhang   /* determine row ownership */
1788445158ffSHong Zhang   ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
1789445158ffSHong Zhang   rowmap->n  = pn;
1790445158ffSHong Zhang   rowmap->bs = 1;
1791445158ffSHong Zhang   ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
1792445158ffSHong Zhang   owners = rowmap->range;
179315a3b8e2SHong Zhang 
179415a3b8e2SHong Zhang   /* determine the number of messages to send, their lengths */
17958cb82516SHong Zhang   ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
1796580bdb30SBarry Smith   ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
1797580bdb30SBarry Smith   ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);
179815a3b8e2SHong Zhang 
179967a12041SHong Zhang   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
180067a12041SHong Zhang   coi   = c_oth->i; coj = c_oth->j;
180167a12041SHong Zhang   con   = ptap->C_oth->rmap->n;
180215a3b8e2SHong Zhang   proc  = 0;
180367a12041SHong Zhang   for (i=0; i<con; i++) {
180415a3b8e2SHong Zhang     while (prmap[i] >= owners[proc+1]) proc++;
180515a3b8e2SHong Zhang     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
180615a3b8e2SHong Zhang     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
180715a3b8e2SHong Zhang   }
180815a3b8e2SHong Zhang 
180915a3b8e2SHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
181015a3b8e2SHong Zhang   owners_co[0] = 0;
181167a12041SHong Zhang   nsend        = 0;
181215a3b8e2SHong Zhang   for (proc=0; proc<size; proc++) {
181315a3b8e2SHong Zhang     owners_co[proc+1] = owners_co[proc] + len_si[proc];
181415a3b8e2SHong Zhang     if (len_s[proc]) {
1815445158ffSHong Zhang       nsend++;
181615a3b8e2SHong Zhang       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
181715a3b8e2SHong Zhang       len         += len_si[proc];
181815a3b8e2SHong Zhang     }
181915a3b8e2SHong Zhang   }
182015a3b8e2SHong Zhang 
182115a3b8e2SHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
1822445158ffSHong Zhang   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
1823445158ffSHong Zhang   ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);
182415a3b8e2SHong Zhang 
182515a3b8e2SHong Zhang   /* post the Irecv and Isend of coj */
182615a3b8e2SHong Zhang   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1827445158ffSHong Zhang   ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1828445158ffSHong Zhang   ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
182915a3b8e2SHong Zhang   for (proc=0, k=0; proc<size; proc++) {
183015a3b8e2SHong Zhang     if (!len_s[proc]) continue;
183115a3b8e2SHong Zhang     i    = owners_co[proc];
183215a3b8e2SHong Zhang     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
183315a3b8e2SHong Zhang     k++;
183415a3b8e2SHong Zhang   }
183515a3b8e2SHong Zhang 
1836681d504bSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1837681d504bSHong Zhang   /* ---------------------------------------- */
1838e83e5f86SFande Kong   ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
1839e83e5f86SFande Kong   ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
18404222ddf1SHong Zhang 
18414222ddf1SHong Zhang   ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr);
18424222ddf1SHong Zhang   ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr);
18434222ddf1SHong Zhang   ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr);
18444222ddf1SHong Zhang   ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");CHKERRQ(ierr);
18454222ddf1SHong Zhang 
18464222ddf1SHong Zhang   ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr);
18474222ddf1SHong Zhang   ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr);
18484222ddf1SHong Zhang   ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr);
18494222ddf1SHong Zhang   ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr);
18504222ddf1SHong Zhang   ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr);
18514222ddf1SHong Zhang 
1852681d504bSHong Zhang   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1853681d504bSHong Zhang 
1854681d504bSHong Zhang   /* receives coj are complete */
1855445158ffSHong Zhang   for (i=0; i<nrecv; i++) {
1856445158ffSHong Zhang     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
185715a3b8e2SHong Zhang   }
185815a3b8e2SHong Zhang   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1859445158ffSHong Zhang   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
186015a3b8e2SHong Zhang 
186178d30b94SHong Zhang   /* add received column indices into ta to update Crmax */
186278d30b94SHong Zhang   for (k=0; k<nrecv; k++) {/* k-th received message */
186378d30b94SHong Zhang     Jptr = buf_rj[k];
186478d30b94SHong Zhang     for (j=0; j<len_r[k]; j++) {
186578d30b94SHong Zhang       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
186678d30b94SHong Zhang     }
186778d30b94SHong Zhang   }
186878d30b94SHong Zhang   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
186978d30b94SHong Zhang   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
18709a6dcab7SHong Zhang 
187115a3b8e2SHong Zhang   /* (4) send and recv coi */
187215a3b8e2SHong Zhang   /*-----------------------*/
187315a3b8e2SHong Zhang   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1874445158ffSHong Zhang   ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
187515a3b8e2SHong Zhang   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
187615a3b8e2SHong Zhang   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
187715a3b8e2SHong Zhang   for (proc=0,k=0; proc<size; proc++) {
187815a3b8e2SHong Zhang     if (!len_s[proc]) continue;
187915a3b8e2SHong Zhang     /* form outgoing message for i-structure:
188015a3b8e2SHong Zhang          buf_si[0]:                 nrows to be sent
188115a3b8e2SHong Zhang                [1:nrows]:           row index (global)
188215a3b8e2SHong Zhang                [nrows+1:2*nrows+1]: i-structure index
188315a3b8e2SHong Zhang     */
188415a3b8e2SHong Zhang     /*-------------------------------------------*/
188515a3b8e2SHong Zhang     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
188615a3b8e2SHong Zhang     buf_si_i    = buf_si + nrows+1;
188715a3b8e2SHong Zhang     buf_si[0]   = nrows;
188815a3b8e2SHong Zhang     buf_si_i[0] = 0;
188915a3b8e2SHong Zhang     nrows       = 0;
189015a3b8e2SHong Zhang     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
189115a3b8e2SHong Zhang       nzi = coi[i+1] - coi[i];
189215a3b8e2SHong Zhang       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
189315a3b8e2SHong Zhang       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
189415a3b8e2SHong Zhang       nrows++;
189515a3b8e2SHong Zhang     }
189615a3b8e2SHong Zhang     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
189715a3b8e2SHong Zhang     k++;
189815a3b8e2SHong Zhang     buf_si += len_si[proc];
189915a3b8e2SHong Zhang   }
1900681d504bSHong Zhang   for (i=0; i<nrecv; i++) {
1901445158ffSHong Zhang     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
190215a3b8e2SHong Zhang   }
190315a3b8e2SHong Zhang   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1904445158ffSHong Zhang   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
190515a3b8e2SHong Zhang 
19068cb82516SHong Zhang   ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
190715a3b8e2SHong Zhang   ierr = PetscFree(len_ri);CHKERRQ(ierr);
190815a3b8e2SHong Zhang   ierr = PetscFree(swaits);CHKERRQ(ierr);
190915a3b8e2SHong Zhang   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1910a4ffb656SHong Zhang 
191167a12041SHong Zhang   /* (5) compute the local portion of Cmpi      */
191267a12041SHong Zhang   /* ------------------------------------------ */
191378d30b94SHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
191478d30b94SHong Zhang   ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
191515a3b8e2SHong Zhang   current_space = free_space;
191615a3b8e2SHong Zhang 
1917445158ffSHong Zhang   ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
1918445158ffSHong Zhang   for (k=0; k<nrecv; k++) {
191915a3b8e2SHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
192015a3b8e2SHong Zhang     nrows       = *buf_ri_k[k];
192115a3b8e2SHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
192215a3b8e2SHong Zhang     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
192315a3b8e2SHong Zhang   }
1924445158ffSHong Zhang 
192578d30b94SHong Zhang   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
192678d30b94SHong Zhang   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
192715a3b8e2SHong Zhang   for (i=0; i<pn; i++) {
192867a12041SHong Zhang     /* add C_loc into Cmpi */
192967a12041SHong Zhang     nzi  = c_loc->i[i+1] - c_loc->i[i];
193067a12041SHong Zhang     Jptr = c_loc->j + c_loc->i[i];
193167a12041SHong Zhang     ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
193215a3b8e2SHong Zhang 
193315a3b8e2SHong Zhang     /* add received col data into lnk */
1934445158ffSHong Zhang     for (k=0; k<nrecv; k++) { /* k-th received message */
193515a3b8e2SHong Zhang       if (i == *nextrow[k]) { /* i-th row */
193615a3b8e2SHong Zhang         nzi  = *(nextci[k]+1) - *nextci[k];
193715a3b8e2SHong Zhang         Jptr = buf_rj[k] + *nextci[k];
193815a3b8e2SHong Zhang         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
193915a3b8e2SHong Zhang         nextrow[k]++; nextci[k]++;
194015a3b8e2SHong Zhang       }
194115a3b8e2SHong Zhang     }
1942d0e9a2f7SHong Zhang     nzi = lnk[0];
19438cb82516SHong Zhang 
194415a3b8e2SHong Zhang     /* copy data into free space, then initialize lnk */
1945d0e9a2f7SHong Zhang     ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1946d0e9a2f7SHong Zhang     ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
194715a3b8e2SHong Zhang   }
194815a3b8e2SHong Zhang   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
194915a3b8e2SHong Zhang   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1950445158ffSHong Zhang   ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
195180bb4639SHong Zhang 
1952ae5f0867Sstefano_zampini   /* local sizes and preallocation */
195315a3b8e2SHong Zhang   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1954ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
1955ac94c67aSTristan Konolige     ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
1956ac94c67aSTristan Konolige     ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
1957ac94c67aSTristan Konolige   }
195815a3b8e2SHong Zhang   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
195915a3b8e2SHong Zhang   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
196015a3b8e2SHong Zhang 
1961445158ffSHong Zhang   /* members in merge */
1962445158ffSHong Zhang   ierr = PetscFree(id_r);CHKERRQ(ierr);
1963445158ffSHong Zhang   ierr = PetscFree(len_r);CHKERRQ(ierr);
1964445158ffSHong Zhang   ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
1965445158ffSHong Zhang   ierr = PetscFree(buf_ri);CHKERRQ(ierr);
1966445158ffSHong Zhang   ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
1967445158ffSHong Zhang   ierr = PetscFree(buf_rj);CHKERRQ(ierr);
1968445158ffSHong Zhang   ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);
196915a3b8e2SHong Zhang 
197015a3b8e2SHong Zhang   /* attach the supporting struct to Cmpi for reuse */
197115a3b8e2SHong Zhang   c = (Mat_MPIAIJ*)Cmpi->data;
19723cdca5ebSHong Zhang   c->ap           = ptap;
19731a47ec54SHong Zhang   ptap->duplicate = Cmpi->ops->duplicate;
19741a47ec54SHong Zhang   ptap->destroy   = Cmpi->ops->destroy;
1975cf97cf3cSHong Zhang   ptap->view      = Cmpi->ops->view;
19768cb82516SHong Zhang   ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
19772259aa2eSHong Zhang 
19781a47ec54SHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
19791a47ec54SHong Zhang   Cmpi->assembled        = PETSC_FALSE;
19801a47ec54SHong Zhang   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1981cf97cf3cSHong Zhang   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
19824624976aSHong Zhang   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
19830d3441aeSHong Zhang   PetscFunctionReturn(0);
19840d3441aeSHong Zhang }
19850d3441aeSHong Zhang 
1986aa690a28SHong Zhang PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
19870d3441aeSHong Zhang {
19880d3441aeSHong Zhang   PetscErrorCode    ierr;
19890d3441aeSHong Zhang   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
19900d3441aeSHong Zhang   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
19912dd9e643SHong Zhang   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
19923cdca5ebSHong Zhang   Mat_APMPI         *ptap = c->ap;
19939ce11a7cSHong Zhang   Mat               AP_loc,C_loc,C_oth;
19940d3441aeSHong Zhang   PetscInt          i,rstart,rend,cm,ncols,row;
19958cb82516SHong Zhang   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
19968cb82516SHong Zhang   PetscScalar       *apa;
19970d3441aeSHong Zhang   const PetscInt    *cols;
19980d3441aeSHong Zhang   const PetscScalar *vals;
19990d3441aeSHong Zhang 
20000d3441aeSHong Zhang   PetscFunctionBegin;
2001bb9bda99SHong Zhang   if (!ptap->AP_loc) {
2002a9899c97SHong Zhang     MPI_Comm comm;
2003a9899c97SHong Zhang     ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2004dd4011a9SHong Zhang     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
2005a9899c97SHong Zhang   }
2006a9899c97SHong Zhang 
20070d3441aeSHong Zhang   ierr = MatZeroEntries(C);CHKERRQ(ierr);
2008e497d3c8SHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
2009748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
2010419ecdd9Sandi selinger     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
2011419ecdd9Sandi selinger     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
2012748c7196SHong Zhang   }
20130d3441aeSHong Zhang 
2014e497d3c8SHong Zhang   /* 2) get AP_loc */
20150d3441aeSHong Zhang   AP_loc = ptap->AP_loc;
20168cb82516SHong Zhang   ap = (Mat_SeqAIJ*)AP_loc->data;
20170d3441aeSHong Zhang 
2018e497d3c8SHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
20190d3441aeSHong Zhang   /*-----------------------------------------------------*/
2020748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
2021748c7196SHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
20220d3441aeSHong Zhang     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
20230d3441aeSHong Zhang     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
2024e497d3c8SHong Zhang   }
20250d3441aeSHong Zhang 
20268cb82516SHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
20278cb82516SHong Zhang   /* ---------------------------------------------- */
20280d3441aeSHong Zhang   /* get data from symbolic products */
20298cb82516SHong Zhang   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
20302dd9e643SHong Zhang   if (ptap->P_oth) {
20318cb82516SHong Zhang     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
20322dd9e643SHong Zhang   }
20338cb82516SHong Zhang   apa   = ptap->apa;
2034681d504bSHong Zhang   api   = ap->i;
2035681d504bSHong Zhang   apj   = ap->j;
2036e497d3c8SHong Zhang   for (i=0; i<am; i++) {
2037445158ffSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
2038e497d3c8SHong Zhang     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2039e497d3c8SHong Zhang     apnz = api[i+1] - api[i];
2040e497d3c8SHong Zhang     for (j=0; j<apnz; j++) {
2041e497d3c8SHong Zhang       col = apj[j+api[i]];
2042e497d3c8SHong Zhang       ap->a[j+ap->i[i]] = apa[col];
2043e497d3c8SHong Zhang       apa[col] = 0.0;
20440d3441aeSHong Zhang     }
20450d3441aeSHong Zhang   }
2046*976452c9SRichard Tran Mills   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
2047*976452c9SRichard Tran Mills   ierr = PetscObjectStateIncrease((PetscObject)AP_loc);CHKERRQ(ierr);
20480d3441aeSHong Zhang 
20498cb82516SHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2050445158ffSHong Zhang   ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
2051445158ffSHong Zhang   ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
20529ce11a7cSHong Zhang   C_loc = ptap->C_loc;
20539ce11a7cSHong Zhang   C_oth = ptap->C_oth;
20540d3441aeSHong Zhang 
20550d3441aeSHong Zhang   /* add C_loc and Co to to C */
20560d3441aeSHong Zhang   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
20570d3441aeSHong Zhang 
20580d3441aeSHong Zhang   /* C_loc -> C */
20590d3441aeSHong Zhang   cm    = C_loc->rmap->N;
20609ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ*)C_loc->data;
20618cb82516SHong Zhang   cols = c_seq->j;
20628cb82516SHong Zhang   vals = c_seq->a;
2063904d1e70Sandi selinger 
2064904d1e70Sandi selinger 
2065e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
2066904d1e70Sandi selinger   /* when there are no off-processor parts.  */
20671de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
20681de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
20691de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
2070904d1e70Sandi selinger   if (C->assembled) {
2071904d1e70Sandi selinger     C->was_assembled = PETSC_TRUE;
2072904d1e70Sandi selinger     C->assembled     = PETSC_FALSE;
2073904d1e70Sandi selinger   }
2074904d1e70Sandi selinger   if (C->was_assembled) {
20750d3441aeSHong Zhang     for (i=0; i<cm; i++) {
20769ce11a7cSHong Zhang       ncols = c_seq->i[i+1] - c_seq->i[i];
20770d3441aeSHong Zhang       row = rstart + i;
2078904d1e70Sandi selinger       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
20798cb82516SHong Zhang       cols += ncols; vals += ncols;
20800d3441aeSHong Zhang     }
2081904d1e70Sandi selinger   } else {
2082e9ede7d0Sandi selinger     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
2083904d1e70Sandi selinger   }
20840d3441aeSHong Zhang 
20850d3441aeSHong Zhang   /* Co -> C, off-processor part */
20869ce11a7cSHong Zhang   cm = C_oth->rmap->N;
20879ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ*)C_oth->data;
20888cb82516SHong Zhang   cols = c_seq->j;
20898cb82516SHong Zhang   vals = c_seq->a;
20909ce11a7cSHong Zhang   for (i=0; i<cm; i++) {
20919ce11a7cSHong Zhang     ncols = c_seq->i[i+1] - c_seq->i[i];
20920d3441aeSHong Zhang     row = p->garray[i];
20930d3441aeSHong Zhang     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
20948cb82516SHong Zhang     cols += ncols; vals += ncols;
20950d3441aeSHong Zhang   }
2096904d1e70Sandi selinger 
20970d3441aeSHong Zhang   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
20980d3441aeSHong Zhang   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
20990d3441aeSHong Zhang 
2100748c7196SHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
21013697aca6SHong Zhang 
2102dd4011a9SHong Zhang   /* supporting struct ptap consumes almost same amount of memory as C=PtAP, release it if C will not be updated by A and P */
21037d0a89b7SHong Zhang   if (ptap->freestruct) {
2104dd4011a9SHong Zhang     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
21053697aca6SHong Zhang   }
21060d3441aeSHong Zhang   PetscFunctionReturn(0);
21070d3441aeSHong Zhang }
21084222ddf1SHong Zhang 
21094222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
21104222ddf1SHong Zhang {
21114222ddf1SHong Zhang   PetscErrorCode      ierr;
21124222ddf1SHong Zhang   Mat_Product         *product = C->product;
21134222ddf1SHong Zhang   Mat                 A=product->A,P=product->B;
21144222ddf1SHong Zhang   MatProductAlgorithm alg=product->alg;
21154222ddf1SHong Zhang   PetscReal           fill=product->fill;
21164222ddf1SHong Zhang   PetscBool           flg;
21174222ddf1SHong Zhang 
21184222ddf1SHong Zhang   PetscFunctionBegin;
21194222ddf1SHong Zhang   /* scalable: do R=P^T locally, then C=R*A*P */
21204222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
21214222ddf1SHong Zhang   if (flg) {
21224222ddf1SHong Zhang     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);CHKERRQ(ierr);
21234222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
21244222ddf1SHong Zhang     goto next;
21254222ddf1SHong Zhang   }
21264222ddf1SHong Zhang 
21274222ddf1SHong Zhang   /* nonscalable: do R=P^T locally, then C=R*A*P */
21284222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"nonscalable",&flg);CHKERRQ(ierr);
21294222ddf1SHong Zhang   if (flg) {
21304222ddf1SHong Zhang     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
21314222ddf1SHong Zhang     goto next;
21324222ddf1SHong Zhang   }
21334222ddf1SHong Zhang 
21344222ddf1SHong Zhang   /* allatonce */
21354222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"allatonce",&flg);CHKERRQ(ierr);
21364222ddf1SHong Zhang   if (flg) {
21374222ddf1SHong Zhang     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr);
21384222ddf1SHong Zhang     goto next;
21394222ddf1SHong Zhang   }
21404222ddf1SHong Zhang 
21414222ddf1SHong Zhang   /* allatonce_merged */
21424222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"allatonce_merged",&flg);CHKERRQ(ierr);
21434222ddf1SHong Zhang   if (flg) {
21444222ddf1SHong Zhang     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr);
21454222ddf1SHong Zhang     goto next;
21464222ddf1SHong Zhang   }
21474222ddf1SHong Zhang 
21484222ddf1SHong Zhang   /* hypre */
21494222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
21504222ddf1SHong Zhang   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
21514222ddf1SHong Zhang   if (flg) {
21524222ddf1SHong Zhang     ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
21534222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
21544222ddf1SHong Zhang     PetscFunctionReturn(0);
21554222ddf1SHong Zhang   }
21564222ddf1SHong Zhang #endif
21574222ddf1SHong Zhang   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
21584222ddf1SHong Zhang 
21594222ddf1SHong Zhang next:
21604222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_PtAP;
21614222ddf1SHong Zhang   {
21624222ddf1SHong Zhang     Mat_MPIAIJ *c  = (Mat_MPIAIJ*)C->data;
21634222ddf1SHong Zhang     Mat_APMPI  *ap = c->ap;
21644222ddf1SHong Zhang     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr);
21654222ddf1SHong Zhang     ap->freestruct = PETSC_FALSE;
21664222ddf1SHong Zhang     ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);CHKERRQ(ierr);
21674222ddf1SHong Zhang     ierr = PetscOptionsEnd();CHKERRQ(ierr);
21684222ddf1SHong Zhang   }
21694222ddf1SHong Zhang   PetscFunctionReturn(0);
21704222ddf1SHong Zhang }
2171