xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
182412ba7SHong Zhang 
242c57489SHong Zhang /*
342c57489SHong Zhang   Defines projective product routines where A is a MPIAIJ matrix
442c57489SHong Zhang           C = P^T * A * P
542c57489SHong Zhang */
642c57489SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <../src/mat/impls/aij/mpi/mpiaij.h>
10c6db04a5SJed Brown #include <petscbt.h>
118563dfccSBarry Smith #include <petsctime.h>
12694f88d4SFande Kong #include <petsc/private/hashmapiv.h>
134a56b808SFande Kong #include <petsc/private/hashseti.h>
144a56b808SFande Kong #include <petscsf.h>
1542c57489SHong Zhang 
16*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
17*d71ae5a4SJacob Faibussowitsch {
18cf97cf3cSHong Zhang   PetscBool         iascii;
19cf97cf3cSHong Zhang   PetscViewerFormat format;
206718818eSStefano Zampini   Mat_APMPI        *ptap;
21cf97cf3cSHong Zhang 
22cf97cf3cSHong Zhang   PetscFunctionBegin;
236718818eSStefano Zampini   MatCheckProduct(A, 1);
246718818eSStefano Zampini   ptap = (Mat_APMPI *)A->product->data;
259566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
26cf97cf3cSHong Zhang   if (iascii) {
279566063dSJacob Faibussowitsch     PetscCall(PetscViewerGetFormat(viewer, &format));
28cf97cf3cSHong Zhang     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29cf97cf3cSHong Zhang       if (ptap->algType == 0) {
309566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
31cf97cf3cSHong Zhang       } else if (ptap->algType == 1) {
329566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
334a56b808SFande Kong       } else if (ptap->algType == 2) {
349566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
354a56b808SFande Kong       } else if (ptap->algType == 3) {
369566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
37cf97cf3cSHong Zhang       }
38cf97cf3cSHong Zhang     }
39cf97cf3cSHong Zhang   }
40cf97cf3cSHong Zhang   PetscFunctionReturn(0);
41cf97cf3cSHong Zhang }
42cf97cf3cSHong Zhang 
43*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44*d71ae5a4SJacob Faibussowitsch {
456718818eSStefano Zampini   Mat_APMPI           *ptap = (Mat_APMPI *)data;
4660552ceaSHong Zhang   Mat_Merge_SeqsToMPI *merge;
47cc6cb787SHong Zhang 
48cc6cb787SHong Zhang   PetscFunctionBegin;
499566063dSJacob Faibussowitsch   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
509566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->bufa));
519566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_loc));
529566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_oth));
539566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
549566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Rd));
559566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Ro));
568403a639SHong Zhang   if (ptap->AP_loc) { /* used by alg_rap */
57681d504bSHong Zhang     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
589566063dSJacob Faibussowitsch     PetscCall(PetscFree(ap->i));
599566063dSJacob Faibussowitsch     PetscCall(PetscFree2(ap->j, ap->a));
609566063dSJacob Faibussowitsch     PetscCall(MatDestroy(&ptap->AP_loc));
618403a639SHong Zhang   } else { /* used by alg_ptap */
629566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->api));
639566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->apj));
64681d504bSHong Zhang   }
659566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_loc));
669566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_oth));
679566063dSJacob Faibussowitsch   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68a560ef98SHong Zhang 
699566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Pt));
7060552ceaSHong Zhang 
7160552ceaSHong Zhang   merge = ptap->merge;
728403a639SHong Zhang   if (merge) { /* used by alg_ptap */
739566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->id_r));
749566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_s));
759566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_r));
769566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bi));
779566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bj));
789566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri[0]));
799566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri));
809566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj[0]));
819566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj));
829566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coi));
839566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coj));
849566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->owners_co));
859566063dSJacob Faibussowitsch     PetscCall(PetscLayoutDestroy(&merge->rowmap));
869566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->merge));
87bf0cc555SLisandro Dalcin   }
889566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
894a56b808SFande Kong 
909566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&ptap->sf));
919566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_othi));
929566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_rmti));
939566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap));
94cc6cb787SHong Zhang   PetscFunctionReturn(0);
95cc6cb787SHong Zhang }
96cc6cb787SHong Zhang 
97*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
98*d71ae5a4SJacob Faibussowitsch {
996718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
100cf742fccSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
10192ec70a1SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1026718818eSStefano Zampini   Mat_APMPI         *ptap;
103cf742fccSHong Zhang   Mat                AP_loc, C_loc, C_oth;
104a3bb6f32SFande Kong   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
105cf742fccSHong Zhang   PetscScalar       *apa;
106cf742fccSHong Zhang   const PetscInt    *cols;
107cf742fccSHong Zhang   const PetscScalar *vals;
108cf742fccSHong Zhang 
109cf742fccSHong Zhang   PetscFunctionBegin;
1106718818eSStefano Zampini   MatCheckProduct(C, 3);
1116718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
11228b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
11328b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
11480da8df7SHong Zhang 
1159566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
116cf742fccSHong Zhang 
117cf742fccSHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
118cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1199566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1209566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
121cf742fccSHong Zhang   }
122cf742fccSHong Zhang 
123cf742fccSHong Zhang   /* 2) get AP_loc */
124cf742fccSHong Zhang   AP_loc = ptap->AP_loc;
125cf742fccSHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
126cf742fccSHong Zhang 
127cf742fccSHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128cf742fccSHong Zhang   /*-----------------------------------------------------*/
129cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
130cf742fccSHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1319566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1329566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
133cf742fccSHong Zhang   }
134cf742fccSHong Zhang 
135cf742fccSHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
136cf742fccSHong Zhang   /* ---------------------------------------------- */
137cf742fccSHong Zhang   /* get data from symbolic products */
138cf742fccSHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
13952f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
14052f7967eSHong Zhang 
141cf742fccSHong Zhang   api = ap->i;
142cf742fccSHong Zhang   apj = ap->j;
1439566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
144cf742fccSHong Zhang   for (i = 0; i < am; i++) {
145cf742fccSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
146cf742fccSHong Zhang     apnz = api[i + 1] - api[i];
147b4e8d1b6SHong Zhang     apa  = ap->a + api[i];
1489566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(apa, apnz));
149b4e8d1b6SHong Zhang     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
150cf742fccSHong Zhang   }
1519566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
15208401ef6SPierre Jolivet   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
153cf742fccSHong Zhang 
154cf742fccSHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
155154d0d78SFande Kong   /* Always use scalable version since we are in the MPI scalable version */
1569566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
1579566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
158cf742fccSHong Zhang 
159cf742fccSHong Zhang   C_loc = ptap->C_loc;
160cf742fccSHong Zhang   C_oth = ptap->C_oth;
161cf742fccSHong Zhang 
162cf742fccSHong Zhang   /* add C_loc and Co to to C */
1639566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
164cf742fccSHong Zhang 
165cf742fccSHong Zhang   /* C_loc -> C */
166cf742fccSHong Zhang   cm    = C_loc->rmap->N;
167cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
168cf742fccSHong Zhang   cols  = c_seq->j;
169cf742fccSHong Zhang   vals  = c_seq->a;
1709566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
171edeac6deSandi selinger 
172e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
173edeac6deSandi selinger   /* when there are no off-processor parts.  */
1741de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1751de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1761de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
177edeac6deSandi selinger   if (C->assembled) {
178edeac6deSandi selinger     C->was_assembled = PETSC_TRUE;
179edeac6deSandi selinger     C->assembled     = PETSC_FALSE;
180edeac6deSandi selinger   }
181edeac6deSandi selinger   if (C->was_assembled) {
182cf742fccSHong Zhang     for (i = 0; i < cm; i++) {
183cf742fccSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
184cf742fccSHong Zhang       row   = rstart + i;
1859566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1869371c9d4SSatish Balay       cols += ncols;
1879371c9d4SSatish Balay       vals += ncols;
188cf742fccSHong Zhang     }
189edeac6deSandi selinger   } else {
1909566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
191edeac6deSandi selinger   }
1929566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
19308401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
194cf742fccSHong Zhang 
195cf742fccSHong Zhang   /* Co -> C, off-processor part */
196cf742fccSHong Zhang   cm    = C_oth->rmap->N;
197cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
198cf742fccSHong Zhang   cols  = c_seq->j;
199cf742fccSHong Zhang   vals  = c_seq->a;
2009566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
201cf742fccSHong Zhang   for (i = 0; i < cm; i++) {
202cf742fccSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
203cf742fccSHong Zhang     row   = p->garray[i];
2049566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
2059371c9d4SSatish Balay     cols += ncols;
2069371c9d4SSatish Balay     vals += ncols;
207cf742fccSHong Zhang   }
2089566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2099566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
210cf742fccSHong Zhang 
211cf742fccSHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
21280da8df7SHong Zhang 
2139566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
21408401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
215cf742fccSHong Zhang   PetscFunctionReturn(0);
216cf742fccSHong Zhang }
217cf742fccSHong Zhang 
218*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
219*d71ae5a4SJacob Faibussowitsch {
2203cdca5ebSHong Zhang   Mat_APMPI               *ptap;
2216718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
2222259aa2eSHong Zhang   MPI_Comm                 comm;
2232259aa2eSHong Zhang   PetscMPIInt              size, rank;
2244222ddf1SHong Zhang   Mat                      P_loc, P_oth;
22515a3b8e2SHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
226d0e9a2f7SHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
227d0e9a2f7SHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
228ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
229ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
23015a3b8e2SHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
2317c70b0e9SStefano Zampini   const PetscInt          *owners;
2327c70b0e9SStefano Zampini   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
23315a3b8e2SHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
23415a3b8e2SHong Zhang   MPI_Request             *swaits, *rwaits;
23515a3b8e2SHong Zhang   MPI_Status              *sstatus, rstatus;
236445158ffSHong Zhang   PetscLayout              rowmap;
237445158ffSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
238445158ffSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
239a3bb6f32SFande Kong   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
24052f7967eSHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
24167a12041SHong Zhang   PetscScalar             *apv;
24278d30b94SHong Zhang   PetscTable               ta;
243b92f168fSBarry Smith   MatType                  mtype;
244e83e5f86SFande Kong   const char              *prefix;
245aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
2468cb82516SHong Zhang   PetscReal apfill;
247aa690a28SHong Zhang #endif
24867a12041SHong Zhang 
24967a12041SHong Zhang   PetscFunctionBegin;
2506718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
25128b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
2529566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2539566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
2549566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
255ae5f0867Sstefano_zampini 
25652f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
25752f7967eSHong Zhang 
258ae5f0867Sstefano_zampini   /* create symbolic parallel matrix Cmpi */
2599566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
2609566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
261ae5f0867Sstefano_zampini 
2623cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
2639566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
264e953e47cSHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
265cf97cf3cSHong Zhang   ptap->algType = 0;
266e953e47cSHong Zhang 
267e953e47cSHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
2689566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
269e953e47cSHong Zhang   /* get P_loc by taking all local rows of P */
2709566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
27176db11f6SHong Zhang 
27276db11f6SHong Zhang   ptap->P_loc = P_loc;
27376db11f6SHong Zhang   ptap->P_oth = P_oth;
274e953e47cSHong Zhang 
275e953e47cSHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
276e953e47cSHong Zhang   /* --------------------------------- */
2779566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
2789566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
279e953e47cSHong Zhang 
280e953e47cSHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
281e953e47cSHong Zhang   /* ----------------------------------------------------------------- */
28276db11f6SHong Zhang   p_loc = (Mat_SeqAIJ *)P_loc->data;
28352f7967eSHong Zhang   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
284e953e47cSHong Zhang 
285e953e47cSHong Zhang   /* create and initialize a linked list */
2869566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */
28776db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
28876db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
2899566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
290e953e47cSHong Zhang 
2919566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
292e953e47cSHong Zhang 
293e953e47cSHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
29452f7967eSHong Zhang   if (ao) {
2959566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
29652f7967eSHong Zhang   } else {
2979566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
29852f7967eSHong Zhang   }
299e953e47cSHong Zhang   current_space = free_space;
300e953e47cSHong Zhang   nspacedouble  = 0;
301e953e47cSHong Zhang 
3029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
303e953e47cSHong Zhang   api[0] = 0;
304e953e47cSHong Zhang   for (i = 0; i < am; i++) {
305e953e47cSHong Zhang     /* diagonal portion: Ad[i,:]*P */
3069371c9d4SSatish Balay     ai  = ad->i;
3079371c9d4SSatish Balay     pi  = p_loc->i;
308e953e47cSHong Zhang     nzi = ai[i + 1] - ai[i];
309e953e47cSHong Zhang     aj  = ad->j + ai[i];
310e953e47cSHong Zhang     for (j = 0; j < nzi; j++) {
311e953e47cSHong Zhang       row  = aj[j];
312e953e47cSHong Zhang       pnz  = pi[row + 1] - pi[row];
313e953e47cSHong Zhang       Jptr = p_loc->j + pi[row];
314e953e47cSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
3159566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
316e953e47cSHong Zhang     }
317e953e47cSHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
31852f7967eSHong Zhang     if (ao) {
3199371c9d4SSatish Balay       ai  = ao->i;
3209371c9d4SSatish Balay       pi  = p_oth->i;
321e953e47cSHong Zhang       nzi = ai[i + 1] - ai[i];
322e953e47cSHong Zhang       aj  = ao->j + ai[i];
323e953e47cSHong Zhang       for (j = 0; j < nzi; j++) {
324e953e47cSHong Zhang         row  = aj[j];
325e953e47cSHong Zhang         pnz  = pi[row + 1] - pi[row];
326e953e47cSHong Zhang         Jptr = p_oth->j + pi[row];
3279566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
328e953e47cSHong Zhang       }
32952f7967eSHong Zhang     }
330e953e47cSHong Zhang     apnz       = lnk[0];
331e953e47cSHong Zhang     api[i + 1] = api[i] + apnz;
332e953e47cSHong Zhang 
333e953e47cSHong Zhang     /* if free space is not available, double the total space in the list */
334e953e47cSHong Zhang     if (current_space->local_remaining < apnz) {
3359566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
336e953e47cSHong Zhang       nspacedouble++;
337e953e47cSHong Zhang     }
338e953e47cSHong Zhang 
339e953e47cSHong Zhang     /* Copy data into free space, then initialize lnk */
3409566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
341e953e47cSHong Zhang 
342e953e47cSHong Zhang     current_space->array += apnz;
343e953e47cSHong Zhang     current_space->local_used += apnz;
344e953e47cSHong Zhang     current_space->local_remaining -= apnz;
345e953e47cSHong Zhang   }
346e953e47cSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
347e953e47cSHong Zhang   /* destroy list of free space and other temporary array(s) */
3489566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
3499566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
3509566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
351e953e47cSHong Zhang 
352e953e47cSHong Zhang   /* Create AP_loc for reuse */
3539566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
3549566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
355e953e47cSHong Zhang 
356e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
35752f7967eSHong Zhang   if (ao) {
358e953e47cSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
35952f7967eSHong Zhang   } else {
36052f7967eSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
36152f7967eSHong Zhang   }
362e953e47cSHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
363e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
364e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
365e953e47cSHong Zhang 
366e953e47cSHong Zhang   if (api[am]) {
3679566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
3689566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
369e953e47cSHong Zhang   } else {
3709566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
371e953e47cSHong Zhang   }
372e953e47cSHong Zhang #endif
373e953e47cSHong Zhang 
374e953e47cSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
3754222ddf1SHong Zhang   /* -------------------------------------- */
3769566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
3779566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
3789566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
3799566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
3804222ddf1SHong Zhang 
3819566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
3829566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
3839566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
3849566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
3859566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
386e953e47cSHong Zhang 
387e953e47cSHong Zhang   /* (3) send coj of C_oth to other processors  */
388e953e47cSHong Zhang   /* ------------------------------------------ */
389e953e47cSHong Zhang   /* determine row ownership */
3909566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
3919566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
3929566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
3939566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
3949566063dSJacob Faibussowitsch   PetscCall(PetscLayoutGetRanges(rowmap, &owners));
395e953e47cSHong Zhang 
396e953e47cSHong Zhang   /* determine the number of messages to send, their lengths */
3979566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
3989566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
3999566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
400e953e47cSHong Zhang 
401e953e47cSHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
4029371c9d4SSatish Balay   coi   = c_oth->i;
4039371c9d4SSatish Balay   coj   = c_oth->j;
404e953e47cSHong Zhang   con   = ptap->C_oth->rmap->n;
405e953e47cSHong Zhang   proc  = 0;
4069566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
407e953e47cSHong Zhang   for (i = 0; i < con; i++) {
408e953e47cSHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
409e953e47cSHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
410e953e47cSHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
411e953e47cSHong Zhang   }
412e953e47cSHong Zhang 
413e953e47cSHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
414e953e47cSHong Zhang   owners_co[0] = 0;
415e953e47cSHong Zhang   nsend        = 0;
416e953e47cSHong Zhang   for (proc = 0; proc < size; proc++) {
417e953e47cSHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
418e953e47cSHong Zhang     if (len_s[proc]) {
419e953e47cSHong Zhang       nsend++;
420e953e47cSHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
421e953e47cSHong Zhang       len += len_si[proc];
422e953e47cSHong Zhang     }
423e953e47cSHong Zhang   }
424e953e47cSHong Zhang 
425e953e47cSHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
4269566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
4279566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
428e953e47cSHong Zhang 
429e953e47cSHong Zhang   /* post the Irecv and Isend of coj */
4309566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
4319566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
4329566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
433e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
434e953e47cSHong Zhang     if (!len_s[proc]) continue;
435e953e47cSHong Zhang     i = owners_co[proc];
4369566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
437e953e47cSHong Zhang     k++;
438e953e47cSHong Zhang   }
439e953e47cSHong Zhang 
440e953e47cSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
441e953e47cSHong Zhang   /* ---------------------------------------- */
4429566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
4439566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
4449566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
4459566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
4464222ddf1SHong Zhang 
4479566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
4489566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
4494222ddf1SHong Zhang 
4509566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
4519566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
4524222ddf1SHong Zhang 
453e953e47cSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
4549566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
455e953e47cSHong Zhang 
456e953e47cSHong Zhang   /* receives coj are complete */
45748a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4589566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
4599566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
460e953e47cSHong Zhang 
461e953e47cSHong Zhang   /* add received column indices into ta to update Crmax */
462e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
463e953e47cSHong Zhang     Jptr = buf_rj[k];
46448a46eb9SPierre Jolivet     for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
465e953e47cSHong Zhang   }
4669566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax));
4679566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
468e953e47cSHong Zhang 
469e953e47cSHong Zhang   /* (4) send and recv coi */
470e953e47cSHong Zhang   /*-----------------------*/
4719566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
4729566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
4739566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
474e953e47cSHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
475e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
476e953e47cSHong Zhang     if (!len_s[proc]) continue;
477e953e47cSHong Zhang     /* form outgoing message for i-structure:
478e953e47cSHong Zhang          buf_si[0]:                 nrows to be sent
479e953e47cSHong Zhang                [1:nrows]:           row index (global)
480e953e47cSHong Zhang                [nrows+1:2*nrows+1]: i-structure index
481e953e47cSHong Zhang     */
482e953e47cSHong Zhang     /*-------------------------------------------*/
483e953e47cSHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
484e953e47cSHong Zhang     buf_si_i    = buf_si + nrows + 1;
485e953e47cSHong Zhang     buf_si[0]   = nrows;
486e953e47cSHong Zhang     buf_si_i[0] = 0;
487e953e47cSHong Zhang     nrows       = 0;
488e953e47cSHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
489e953e47cSHong Zhang       nzi                 = coi[i + 1] - coi[i];
490e953e47cSHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
491e953e47cSHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
492e953e47cSHong Zhang       nrows++;
493e953e47cSHong Zhang     }
4949566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
495e953e47cSHong Zhang     k++;
496e953e47cSHong Zhang     buf_si += len_si[proc];
497e953e47cSHong Zhang   }
49848a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4999566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
5009566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
501e953e47cSHong Zhang 
5029566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
5039566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
5049566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
5059566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
506b4e8d1b6SHong Zhang 
507e953e47cSHong Zhang   /* (5) compute the local portion of Cmpi      */
508e953e47cSHong Zhang   /* ------------------------------------------ */
509e953e47cSHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
5109566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
511e953e47cSHong Zhang   current_space = free_space;
512e953e47cSHong Zhang 
5139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
514e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) {
515e953e47cSHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
516e953e47cSHong Zhang     nrows       = *buf_ri_k[k];
517e953e47cSHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
518a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
519e953e47cSHong Zhang   }
520e953e47cSHong Zhang 
521d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
5229566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
523e953e47cSHong Zhang   for (i = 0; i < pn; i++) {
524e953e47cSHong Zhang     /* add C_loc into Cmpi */
525e953e47cSHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
526e953e47cSHong Zhang     Jptr = c_loc->j + c_loc->i[i];
5279566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
528e953e47cSHong Zhang 
529e953e47cSHong Zhang     /* add received col data into lnk */
530e953e47cSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
531e953e47cSHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
532e953e47cSHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
533e953e47cSHong Zhang         Jptr = buf_rj[k] + *nextci[k];
5349566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
5359371c9d4SSatish Balay         nextrow[k]++;
5369371c9d4SSatish Balay         nextci[k]++;
537e953e47cSHong Zhang       }
538e953e47cSHong Zhang     }
539e953e47cSHong Zhang     nzi = lnk[0];
540e953e47cSHong Zhang 
541e953e47cSHong Zhang     /* copy data into free space, then initialize lnk */
5429566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
5439566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
544e953e47cSHong Zhang   }
5459566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
5469566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
5479566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
548e953e47cSHong Zhang 
549e953e47cSHong Zhang   /* local sizes and preallocation */
5509566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
551ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
5529566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
5539566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
554ac94c67aSTristan Konolige   }
5559566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
556d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
557e953e47cSHong Zhang 
558e953e47cSHong Zhang   /* members in merge */
5599566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
5609566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
5619566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
5629566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
5639566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
5649566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
5659566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
566e953e47cSHong Zhang 
567a3bb6f32SFande Kong   nout = 0;
5689566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
56908401ef6SPierre Jolivet   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
5709566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
57108401ef6SPierre Jolivet   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
572a3bb6f32SFande Kong 
5736718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
5746718818eSStefano Zampini   Cmpi->product->data    = ptap;
5756718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
5766718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
5776718818eSStefano Zampini 
5786718818eSStefano Zampini   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
5796718818eSStefano Zampini   Cmpi->assembled        = PETSC_FALSE;
5806718818eSStefano Zampini   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
581e953e47cSHong Zhang   PetscFunctionReturn(0);
582e953e47cSHong Zhang }
583e953e47cSHong Zhang 
584*d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
585*d71ae5a4SJacob Faibussowitsch {
5864a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5874a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
5884a56b808SFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
589bc8e477aSFande Kong   PetscInt    pcstart, pcend, column, offset;
5904a56b808SFande Kong 
5914a56b808SFande Kong   PetscFunctionBegin;
5924a56b808SFande Kong   pcstart = P->cmap->rstart;
593bc8e477aSFande Kong   pcstart *= dof;
5944a56b808SFande Kong   pcend = P->cmap->rend;
595bc8e477aSFande Kong   pcend *= dof;
5964a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
5974a56b808SFande Kong   ai  = ad->i;
5984a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
5994a56b808SFande Kong   aj  = ad->j + ai[i];
6004a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6014a56b808SFande Kong     row    = aj[j];
602bc8e477aSFande Kong     offset = row % dof;
603bc8e477aSFande Kong     row /= dof;
6044a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
6054a56b808SFande Kong     pj   = pd->j + pd->i[row];
60648a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
6074a56b808SFande Kong   }
608bc8e477aSFande Kong   /* off diag P */
6094a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6104a56b808SFande Kong     row    = aj[j];
611bc8e477aSFande Kong     offset = row % dof;
612bc8e477aSFande Kong     row /= dof;
6134a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6144a56b808SFande Kong     pj   = po->j + po->i[row];
61548a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
6164a56b808SFande Kong   }
6174a56b808SFande Kong 
6184a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6194a56b808SFande Kong   if (ao) {
6204a56b808SFande Kong     ai  = ao->i;
6214a56b808SFande Kong     pi  = p_oth->i;
6224a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6234a56b808SFande Kong     aj  = ao->j + ai[i];
6244a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6254a56b808SFande Kong       row       = aj[j];
626bc8e477aSFande Kong       offset    = a->garray[row] % dof;
627bc8e477aSFande Kong       row       = map[row];
6284a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6294a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6304a56b808SFande Kong       for (col = 0; col < pnz; col++) {
631bc8e477aSFande Kong         column = p_othcols[col] * dof + offset;
6324a56b808SFande Kong         if (column >= pcstart && column < pcend) {
6339566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(dht, column));
6344a56b808SFande Kong         } else {
6359566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(oht, column));
6364a56b808SFande Kong         }
6374a56b808SFande Kong       }
6384a56b808SFande Kong     }
6394a56b808SFande Kong   } /* end if (ao) */
6404a56b808SFande Kong   PetscFunctionReturn(0);
6414a56b808SFande Kong }
6424a56b808SFande Kong 
643*d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
644*d71ae5a4SJacob Faibussowitsch {
6454a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
6464a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
647bc8e477aSFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
6484a56b808SFande Kong   PetscScalar ra, *aa, *pa;
6494a56b808SFande Kong 
6504a56b808SFande Kong   PetscFunctionBegin;
6514a56b808SFande Kong   pcstart = P->cmap->rstart;
652bc8e477aSFande Kong   pcstart *= dof;
653bc8e477aSFande Kong 
6544a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6554a56b808SFande Kong   ai  = ad->i;
6564a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
6574a56b808SFande Kong   aj  = ad->j + ai[i];
6584a56b808SFande Kong   aa  = ad->a + ai[i];
6594a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6604a56b808SFande Kong     ra     = aa[j];
6614a56b808SFande Kong     row    = aj[j];
662bc8e477aSFande Kong     offset = row % dof;
663bc8e477aSFande Kong     row /= dof;
6644a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
6654a56b808SFande Kong     pj   = pd->j + pd->i[row];
6664a56b808SFande Kong     pa   = pd->a + pd->i[row];
66748a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
6689566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6694a56b808SFande Kong   }
6704a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6714a56b808SFande Kong     ra     = aa[j];
6724a56b808SFande Kong     row    = aj[j];
673bc8e477aSFande Kong     offset = row % dof;
674bc8e477aSFande Kong     row /= dof;
6754a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6764a56b808SFande Kong     pj   = po->j + po->i[row];
6774a56b808SFande Kong     pa   = po->a + po->i[row];
67848a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
6799566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6804a56b808SFande Kong   }
6814a56b808SFande Kong 
6824a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6834a56b808SFande Kong   if (ao) {
6844a56b808SFande Kong     ai  = ao->i;
6854a56b808SFande Kong     pi  = p_oth->i;
6864a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6874a56b808SFande Kong     aj  = ao->j + ai[i];
6884a56b808SFande Kong     aa  = ao->a + ai[i];
6894a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6904a56b808SFande Kong       row       = aj[j];
691bc8e477aSFande Kong       offset    = a->garray[row] % dof;
692bc8e477aSFande Kong       row       = map[row];
6934a56b808SFande Kong       ra        = aa[j];
6944a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6954a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6964a56b808SFande Kong       pa        = p_oth->a + pi[row];
69748a46eb9SPierre Jolivet       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
6989566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * pnz));
6994a56b808SFande Kong     }
7004a56b808SFande Kong   } /* end if (ao) */
701bb5ddf68SFande Kong 
7024a56b808SFande Kong   PetscFunctionReturn(0);
7034a56b808SFande Kong }
7044a56b808SFande Kong 
705bc8e477aSFande Kong PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
7065c65b9ecSFande Kong 
707*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
708*d71ae5a4SJacob Faibussowitsch {
7094a56b808SFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
710bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
7116718818eSStefano Zampini   Mat_APMPI      *ptap;
7124a56b808SFande Kong   PetscHMapIV     hmap;
7134a56b808SFande Kong   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
7144a56b808SFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
715bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
716bc8e477aSFande Kong   const PetscInt *mappingindices;
717bc8e477aSFande Kong   IS              map;
7184a56b808SFande Kong 
7194a56b808SFande Kong   PetscFunctionBegin;
7206718818eSStefano Zampini   MatCheckProduct(C, 4);
7216718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
72228b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
72328b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
7244a56b808SFande Kong 
7259566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
7264a56b808SFande Kong 
7274a56b808SFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
7284a56b808SFande Kong   /*-----------------------------------------------------*/
7294a56b808SFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
7304a56b808SFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
7319566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
7324a56b808SFande Kong   }
7339566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
7344a56b808SFande Kong 
7359566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
736bc8e477aSFande Kong   pon *= dof;
7379566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
7389566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
7394a56b808SFande Kong   cmaxr = 0;
740ad540459SPierre Jolivet   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
7419566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
7429566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
7439566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap, cmaxr));
7449566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
7454a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
7469566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
747bc8e477aSFande Kong     offset = i % dof;
748bc8e477aSFande Kong     ii     = i / dof;
749bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
7504a56b808SFande Kong     if (!nzi) continue;
7519566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
7524a56b808SFande Kong     voff = 0;
7539566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
7544a56b808SFande Kong     if (!voff) continue;
7554a56b808SFande Kong 
7564a56b808SFande Kong     /* Form C(ii, :) */
757bc8e477aSFande Kong     poj = po->j + po->i[ii];
758bc8e477aSFande Kong     poa = po->a + po->i[ii];
7594a56b808SFande Kong     for (j = 0; j < nzi; j++) {
760bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
761bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
762bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
7634a56b808SFande Kong       for (jj = 0; jj < voff; jj++) {
7644a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
7654a56b808SFande Kong         /* If the row is empty */
766bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
7674a56b808SFande Kong           c_rmtjj[jj] = apindices[jj];
7684a56b808SFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
769bc8e477aSFande Kong           c_rmtc[pocol]++;
7704a56b808SFande Kong         } else {
7719566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
7724a56b808SFande Kong           if (loc >= 0) { /* hit */
7734a56b808SFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
7749566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
7754a56b808SFande Kong           } else { /* new element */
7764a56b808SFande Kong             loc = -(loc + 1);
7774a56b808SFande Kong             /* Move data backward */
778bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
7794a56b808SFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
7804a56b808SFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
7814a56b808SFande Kong             } /* End kk */
7824a56b808SFande Kong             c_rmtjj[loc] = apindices[jj];
7834a56b808SFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
784bc8e477aSFande Kong             c_rmtc[pocol]++;
7854a56b808SFande Kong           }
7864a56b808SFande Kong         }
7879566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(voff));
7884a56b808SFande Kong       } /* End jj */
7894a56b808SFande Kong     }   /* End j */
7904a56b808SFande Kong   }     /* End i */
7914a56b808SFande Kong 
7929566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
7939566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
7944a56b808SFande Kong 
7959566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
796bc8e477aSFande Kong   pn *= dof;
7979566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
7984a56b808SFande Kong 
7999566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
8009566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
8019566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
802bc8e477aSFande Kong   pcstart = pcstart * dof;
803bc8e477aSFande Kong   pcend   = pcend * dof;
8044a56b808SFande Kong   cd      = (Mat_SeqAIJ *)(c->A)->data;
8054a56b808SFande Kong   co      = (Mat_SeqAIJ *)(c->B)->data;
8064a56b808SFande Kong 
8074a56b808SFande Kong   cmaxr = 0;
808ad540459SPierre Jolivet   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
8099566063dSJacob Faibussowitsch   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
8109566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
8119566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap, cmaxr));
8124a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
8139566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
814bc8e477aSFande Kong     offset = i % dof;
815bc8e477aSFande Kong     ii     = i / dof;
816bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
8174a56b808SFande Kong     if (!nzi) continue;
8189566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
8194a56b808SFande Kong     voff = 0;
8209566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
8214a56b808SFande Kong     if (!voff) continue;
8224a56b808SFande Kong     /* Form C(ii, :) */
823bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
824bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
8254a56b808SFande Kong     for (j = 0; j < nzi; j++) {
826bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
827ad540459SPierre Jolivet       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
8289566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
8299566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
8304a56b808SFande Kong     }
8314a56b808SFande Kong   }
8329566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
8339566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
8349566063dSJacob Faibussowitsch   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
8359566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
8369566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
8379566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
8389566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
839bc8e477aSFande Kong 
840bc8e477aSFande Kong   /* Add contributions from remote */
841bc8e477aSFande Kong   for (i = 0; i < pn; i++) {
842bc8e477aSFande Kong     row = i + pcstart;
8439566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
844bc8e477aSFande Kong   }
8459566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
846bc8e477aSFande Kong 
8479566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
8489566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
849bc8e477aSFande Kong 
850bc8e477aSFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
851bc8e477aSFande Kong   PetscFunctionReturn(0);
852bc8e477aSFande Kong }
853bc8e477aSFande Kong 
854*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
855*d71ae5a4SJacob Faibussowitsch {
856bc8e477aSFande Kong   PetscFunctionBegin;
85734bcad68SFande Kong 
8589566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
859bc8e477aSFande Kong   PetscFunctionReturn(0);
860bc8e477aSFande Kong }
861bc8e477aSFande Kong 
862*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
863*d71ae5a4SJacob Faibussowitsch {
864bc8e477aSFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
865bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
8666718818eSStefano Zampini   Mat_APMPI      *ptap;
867bc8e477aSFande Kong   PetscHMapIV     hmap;
868bc8e477aSFande Kong   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
869bc8e477aSFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
870bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
871bc8e477aSFande Kong   const PetscInt *mappingindices;
872bc8e477aSFande Kong   IS              map;
873bc8e477aSFande Kong 
874bc8e477aSFande Kong   PetscFunctionBegin;
8756718818eSStefano Zampini   MatCheckProduct(C, 4);
8766718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
87728b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
87828b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
879bc8e477aSFande Kong 
8809566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
881bc8e477aSFande Kong 
882bc8e477aSFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
883bc8e477aSFande Kong   /*-----------------------------------------------------*/
884bc8e477aSFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
885bc8e477aSFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
8869566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
887bc8e477aSFande Kong   }
8889566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
8899566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
890bc8e477aSFande Kong   pon *= dof;
8919566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
892bc8e477aSFande Kong   pn *= dof;
893bc8e477aSFande Kong 
8949566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
8959566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
8969566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
897bc8e477aSFande Kong   pcstart *= dof;
898bc8e477aSFande Kong   pcend *= dof;
899bc8e477aSFande Kong   cmaxr = 0;
900ad540459SPierre Jolivet   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
901bc8e477aSFande Kong   cd = (Mat_SeqAIJ *)(c->A)->data;
902bc8e477aSFande Kong   co = (Mat_SeqAIJ *)(c->B)->data;
903ad540459SPierre Jolivet   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
9049566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
9059566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
9069566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap, cmaxr));
9079566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
908bc8e477aSFande Kong   for (i = 0; i < am && (pon || pn); i++) {
9099566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
910bc8e477aSFande Kong     offset = i % dof;
911bc8e477aSFande Kong     ii     = i / dof;
912bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
913bc8e477aSFande Kong     dnzi   = pd->i[ii + 1] - pd->i[ii];
914bc8e477aSFande Kong     if (!nzi && !dnzi) continue;
9159566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
916bc8e477aSFande Kong     voff = 0;
9179566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
918bc8e477aSFande Kong     if (!voff) continue;
919bc8e477aSFande Kong 
920bc8e477aSFande Kong     /* Form remote C(ii, :) */
921bc8e477aSFande Kong     poj = po->j + po->i[ii];
922bc8e477aSFande Kong     poa = po->a + po->i[ii];
923bc8e477aSFande Kong     for (j = 0; j < nzi; j++) {
924bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
925bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
926bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
927bc8e477aSFande Kong       for (jj = 0; jj < voff; jj++) {
928bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
929bc8e477aSFande Kong         /* If the row is empty */
930bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
931bc8e477aSFande Kong           c_rmtjj[jj] = apindices[jj];
932bc8e477aSFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
933bc8e477aSFande Kong           c_rmtc[pocol]++;
934bc8e477aSFande Kong         } else {
9359566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
936bc8e477aSFande Kong           if (loc >= 0) { /* hit */
937bc8e477aSFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
9389566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
939bc8e477aSFande Kong           } else { /* new element */
940bc8e477aSFande Kong             loc = -(loc + 1);
941bc8e477aSFande Kong             /* Move data backward */
942bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
943bc8e477aSFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
944bc8e477aSFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
945bc8e477aSFande Kong             } /* End kk */
946bc8e477aSFande Kong             c_rmtjj[loc] = apindices[jj];
947bc8e477aSFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
948bc8e477aSFande Kong             c_rmtc[pocol]++;
949bc8e477aSFande Kong           }
950bc8e477aSFande Kong         }
951bc8e477aSFande Kong       } /* End jj */
9529566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
953bc8e477aSFande Kong     } /* End j */
954bc8e477aSFande Kong 
955bc8e477aSFande Kong     /* Form local C(ii, :) */
956bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
957bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
958bc8e477aSFande Kong     for (j = 0; j < dnzi; j++) {
959bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
960ad540459SPierre Jolivet       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End kk */
9619566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
9629566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
963bc8e477aSFande Kong     } /* End j */
964bc8e477aSFande Kong   }   /* End i */
965bc8e477aSFande Kong 
9669566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
9679566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
9689566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
9699566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
970bc8e477aSFande Kong 
9719566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9729566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9739566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9749566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9759566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
9764a56b808SFande Kong 
9774a56b808SFande Kong   /* Add contributions from remote */
9784a56b808SFande Kong   for (i = 0; i < pn; i++) {
9794a56b808SFande Kong     row = i + pcstart;
9809566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
9814a56b808SFande Kong   }
9829566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
9834a56b808SFande Kong 
9849566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
9859566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
9864a56b808SFande Kong 
9874a56b808SFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
9884a56b808SFande Kong   PetscFunctionReturn(0);
9894a56b808SFande Kong }
9904a56b808SFande Kong 
991*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
992*d71ae5a4SJacob Faibussowitsch {
9934a56b808SFande Kong   PetscFunctionBegin;
99434bcad68SFande Kong 
9959566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
9964a56b808SFande Kong   PetscFunctionReturn(0);
9974a56b808SFande Kong }
9984a56b808SFande Kong 
9996718818eSStefano Zampini /* TODO: move algorithm selection to MatProductSetFromOptions */
1000*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1001*d71ae5a4SJacob Faibussowitsch {
10024a56b808SFande Kong   Mat_APMPI      *ptap;
10036718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
10044a56b808SFande Kong   MPI_Comm        comm;
1005bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
10064a56b808SFande Kong   MatType         mtype;
10074a56b808SFande Kong   PetscSF         sf;
10084a56b808SFande Kong   PetscSFNode    *iremote;
10094a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
10104a56b808SFande Kong   const PetscInt *rootdegrees;
10114a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto;
10124a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1013131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1014bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1015131c27b5Sprj-   PetscMPIInt     owner;
1016bc8e477aSFande Kong   const PetscInt *mappingindices;
10174a56b808SFande Kong   PetscBool       flg;
10184a56b808SFande Kong   const char     *algTypes[2] = {"overlapping", "merged"};
1019bc8e477aSFande Kong   IS              map;
10204a56b808SFande Kong 
10214a56b808SFande Kong   PetscFunctionBegin;
10226718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
102328b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
10249566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
102534bcad68SFande Kong 
10264a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
10279566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1028bc8e477aSFande Kong   pn *= dof;
10299566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
10309566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
10319566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
10324a56b808SFande Kong 
10339566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
10344a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
10354a56b808SFande Kong   ptap->algType = 2;
10364a56b808SFande Kong 
10374a56b808SFande Kong   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
10389566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
10399566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
10404a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
10419566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1042bc8e477aSFande Kong   pon *= dof;
10434a56b808SFande Kong   /* offsets */
10449566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
10454a56b808SFande Kong   /* The number of columns we will send to remote ranks */
10469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
10479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
104848a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
10499566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
10509566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
10514a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
10529566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
10534a56b808SFande Kong 
10549566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
10554a56b808SFande Kong   ptap->c_rmti[0] = 0;
10564a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
10574a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
10584a56b808SFande Kong     /* Form one row of AP */
10599566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
1060bc8e477aSFande Kong     offset = i % dof;
1061bc8e477aSFande Kong     ii     = i / dof;
10624a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1063bc8e477aSFande Kong     nzi = po->i[ii + 1] - po->i[ii];
10644a56b808SFande Kong     if (!nzi) continue;
10654a56b808SFande Kong 
10669566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
10679566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
10684a56b808SFande Kong     /* If AP is empty, just continue */
10694a56b808SFande Kong     if (!htsize) continue;
10704a56b808SFande Kong     /* Form C(ii, :) */
1071bc8e477aSFande Kong     poj = po->j + po->i[ii];
107248a46eb9SPierre Jolivet     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
10734a56b808SFande Kong   }
10744a56b808SFande Kong 
10754a56b808SFande Kong   for (i = 0; i < pon; i++) {
10769566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
10774a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
10784a56b808SFande Kong     c_rmtc[i]           = htsize;
10794a56b808SFande Kong   }
10804a56b808SFande Kong 
10819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
10824a56b808SFande Kong 
10834a56b808SFande Kong   for (i = 0; i < pon; i++) {
10844a56b808SFande Kong     off = 0;
10859566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
10869566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
10874a56b808SFande Kong   }
10889566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
10894a56b808SFande Kong 
10909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
10914a56b808SFande Kong   for (i = 0; i < pon; i++) {
10929371c9d4SSatish Balay     owner  = 0;
10939371c9d4SSatish Balay     lidx   = 0;
1094bc8e477aSFande Kong     offset = i % dof;
1095bc8e477aSFande Kong     ii     = i / dof;
10969566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1097bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
10984a56b808SFande Kong     iremote[i].rank  = owner;
10994a56b808SFande Kong   }
11004a56b808SFande Kong 
11019566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
11029566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
11034a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
11049566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
11059566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
11069566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
11074a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
11089566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
11099566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
11104a56b808SFande Kong   rootspacesize = 0;
1111ad540459SPierre Jolivet   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
11129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
11139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
11144a56b808SFande Kong   /* Get information from leaves
11154a56b808SFande Kong    * Number of columns other people contribute to my rows
11164a56b808SFande Kong    * */
11179566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
11189566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
11199566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
11209566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
11214a56b808SFande Kong   /* The number of columns is received for each row */
11224a56b808SFande Kong   ptap->c_othi[0]     = 0;
11234a56b808SFande Kong   rootspacesize       = 0;
11244a56b808SFande Kong   rootspaceoffsets[0] = 0;
11254a56b808SFande Kong   for (i = 0; i < pn; i++) {
11264a56b808SFande Kong     rcvncols = 0;
11274a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
11284a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
11294a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
11304a56b808SFande Kong       rootspacesize++;
11314a56b808SFande Kong     }
11324a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
11334a56b808SFande Kong   }
11349566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
11354a56b808SFande Kong 
11369566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
11379566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11389566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11399566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
11409566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
11414a56b808SFande Kong 
11429566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
11434a56b808SFande Kong   nleaves = 0;
11444a56b808SFande Kong   for (i = 0; i < pon; i++) {
1145bc8e477aSFande Kong     owner = 0;
1146bc8e477aSFande Kong     ii    = i / dof;
11479566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
11484a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
11494a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
11504a56b808SFande Kong       iremote[nleaves].rank    = owner;
11514a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
11524a56b808SFande Kong     }
11534a56b808SFande Kong   }
11549566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
11559566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
11564a56b808SFande Kong 
11579566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
11589566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
11599566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
11604a56b808SFande Kong   /* One to one map */
11619566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
11624a56b808SFande Kong 
11639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
11649566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
11659566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1166bc8e477aSFande Kong   pcstart *= dof;
1167bc8e477aSFande Kong   pcend *= dof;
11689566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
11694a56b808SFande Kong   for (i = 0; i < pn; i++) {
11709566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hta[i]));
11719566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
11724a56b808SFande Kong   }
11734a56b808SFande Kong   /* Work on local part */
11744a56b808SFande Kong   /* 4) Pass 1: Estimate memory for C_loc */
11754a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
11769566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
11779566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1178bc8e477aSFande Kong     offset = i % dof;
1179bc8e477aSFande Kong     ii     = i / dof;
1180bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
11814a56b808SFande Kong     if (!nzi) continue;
11824a56b808SFande Kong 
11839566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
11849566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
11859566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
11864a56b808SFande Kong     if (!(htsize + htosize)) continue;
11874a56b808SFande Kong     /* Form C(ii, :) */
1188bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
11894a56b808SFande Kong     for (j = 0; j < nzi; j++) {
11909566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
11919566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
11924a56b808SFande Kong     }
11934a56b808SFande Kong   }
11944a56b808SFande Kong 
11959566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1196bc8e477aSFande Kong 
11979566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
11989566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
11994a56b808SFande Kong 
12004a56b808SFande Kong   /* Get remote data */
12019566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
12029566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
12034a56b808SFande Kong 
12044a56b808SFande Kong   for (i = 0; i < pn; i++) {
12054a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
12064a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
12074a56b808SFande Kong     for (j = 0; j < nzi; j++) {
12084a56b808SFande Kong       col = rdj[j];
12094a56b808SFande Kong       /* diag part */
12104a56b808SFande Kong       if (col >= pcstart && col < pcend) {
12119566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hta[i], col));
12124a56b808SFande Kong       } else { /* off diag */
12139566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
12144a56b808SFande Kong       }
12154a56b808SFande Kong     }
12169566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
12174a56b808SFande Kong     dnz[i] = htsize;
12189566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
12199566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
12204a56b808SFande Kong     onz[i] = htsize;
12219566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
12224a56b808SFande Kong   }
12234a56b808SFande Kong 
12249566063dSJacob Faibussowitsch   PetscCall(PetscFree2(hta, hto));
12259566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
12264a56b808SFande Kong 
12274a56b808SFande Kong   /* local sizes and preallocation */
12289566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12299566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
12309566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
12319566063dSJacob Faibussowitsch   PetscCall(MatSetUp(Cmpi));
12329566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
12334a56b808SFande Kong 
12344a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
12356718818eSStefano Zampini   Cmpi->product->data    = ptap;
12366718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
12376718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
12384a56b808SFande Kong 
12394a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
12404a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
12414a56b808SFande Kong   /* pick an algorithm */
1242d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
12434a56b808SFande Kong   alg = 0;
12449566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1245d0609cedSBarry Smith   PetscOptionsEnd();
12464a56b808SFande Kong   switch (alg) {
1247*d71ae5a4SJacob Faibussowitsch   case 0:
1248*d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1249*d71ae5a4SJacob Faibussowitsch     break;
1250*d71ae5a4SJacob Faibussowitsch   case 1:
1251*d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1252*d71ae5a4SJacob Faibussowitsch     break;
1253*d71ae5a4SJacob Faibussowitsch   default:
1254*d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
12554a56b808SFande Kong   }
12564a56b808SFande Kong   PetscFunctionReturn(0);
12574a56b808SFande Kong }
12584a56b808SFande Kong 
1259*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1260*d71ae5a4SJacob Faibussowitsch {
1261bc8e477aSFande Kong   PetscFunctionBegin;
12629566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1263bc8e477aSFande Kong   PetscFunctionReturn(0);
1264bc8e477aSFande Kong }
1265bc8e477aSFande Kong 
1266*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1267*d71ae5a4SJacob Faibussowitsch {
12684a56b808SFande Kong   Mat_APMPI      *ptap;
12696718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
12704a56b808SFande Kong   MPI_Comm        comm;
1271bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
12724a56b808SFande Kong   MatType         mtype;
12734a56b808SFande Kong   PetscSF         sf;
12744a56b808SFande Kong   PetscSFNode    *iremote;
12754a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
12764a56b808SFande Kong   const PetscInt *rootdegrees;
12774a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto, *htd;
12784a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1279131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1280bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1281131c27b5Sprj-   PetscMPIInt     owner;
12824a56b808SFande Kong   PetscBool       flg;
12834a56b808SFande Kong   const char     *algTypes[2] = {"merged", "overlapping"};
1284bc8e477aSFande Kong   const PetscInt *mappingindices;
1285bc8e477aSFande Kong   IS              map;
12864a56b808SFande Kong 
12874a56b808SFande Kong   PetscFunctionBegin;
12886718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
128928b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
12909566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
129134bcad68SFande Kong 
12924a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
12939566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1294bc8e477aSFande Kong   pn *= dof;
12959566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
12969566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
12979566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12984a56b808SFande Kong 
12999566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
13004a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
13014a56b808SFande Kong   ptap->algType = 3;
13024a56b808SFande Kong 
13034a56b808SFande Kong   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
13049566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
13059566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
13064a56b808SFande Kong 
13074a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
13089566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1309bc8e477aSFande Kong   pon *= dof;
13104a56b808SFande Kong   /* offsets */
13119566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
13124a56b808SFande Kong   /* The number of columns we will send to remote ranks */
13139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
13149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
131548a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
13169566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
13179566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
13184a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
13199566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
13209566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
13219566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
13224a56b808SFande Kong   for (i = 0; i < pn; i++) {
13239566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&htd[i]));
13249566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
13254a56b808SFande Kong   }
1326bc8e477aSFande Kong 
13279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
13284a56b808SFande Kong   ptap->c_rmti[0] = 0;
13294a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
13304a56b808SFande Kong   for (i = 0; i < am && (pon || pn); i++) {
13314a56b808SFande Kong     /* Form one row of AP */
13329566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
13339566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1334bc8e477aSFande Kong     offset = i % dof;
1335bc8e477aSFande Kong     ii     = i / dof;
13364a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1337bc8e477aSFande Kong     nzi  = po->i[ii + 1] - po->i[ii];
1338bc8e477aSFande Kong     dnzi = pd->i[ii + 1] - pd->i[ii];
13394a56b808SFande Kong     if (!nzi && !dnzi) continue;
13404a56b808SFande Kong 
13419566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
13429566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
13439566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
13444a56b808SFande Kong     /* If AP is empty, just continue */
13454a56b808SFande Kong     if (!(htsize + htosize)) continue;
13464a56b808SFande Kong 
13474a56b808SFande Kong     /* Form remote C(ii, :) */
1348bc8e477aSFande Kong     poj = po->j + po->i[ii];
13494a56b808SFande Kong     for (j = 0; j < nzi; j++) {
13509566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
13519566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
13524a56b808SFande Kong     }
13534a56b808SFande Kong 
13544a56b808SFande Kong     /* Form local C(ii, :) */
1355bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
13564a56b808SFande Kong     for (j = 0; j < dnzi; j++) {
13579566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
13589566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
13594a56b808SFande Kong     }
13604a56b808SFande Kong   }
13614a56b808SFande Kong 
13629566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1363bc8e477aSFande Kong 
13649566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
13659566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
13664a56b808SFande Kong 
13674a56b808SFande Kong   for (i = 0; i < pon; i++) {
13689566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
13694a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
13704a56b808SFande Kong     c_rmtc[i]           = htsize;
13714a56b808SFande Kong   }
13724a56b808SFande Kong 
13739566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
13744a56b808SFande Kong 
13754a56b808SFande Kong   for (i = 0; i < pon; i++) {
13764a56b808SFande Kong     off = 0;
13779566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
13789566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
13794a56b808SFande Kong   }
13809566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
13814a56b808SFande Kong 
13829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
13834a56b808SFande Kong   for (i = 0; i < pon; i++) {
13849371c9d4SSatish Balay     owner  = 0;
13859371c9d4SSatish Balay     lidx   = 0;
1386bc8e477aSFande Kong     offset = i % dof;
1387bc8e477aSFande Kong     ii     = i / dof;
13889566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1389bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
13904a56b808SFande Kong     iremote[i].rank  = owner;
13914a56b808SFande Kong   }
13924a56b808SFande Kong 
13939566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
13949566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
13954a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
13969566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
13979566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
13989566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
13994a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
14009566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
14019566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
14024a56b808SFande Kong   rootspacesize = 0;
1403ad540459SPierre Jolivet   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
14049566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
14059566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
14064a56b808SFande Kong   /* Get information from leaves
14074a56b808SFande Kong    * Number of columns other people contribute to my rows
14084a56b808SFande Kong    * */
14099566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
14109566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
14119566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
14129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
14134a56b808SFande Kong   /* The number of columns is received for each row */
14144a56b808SFande Kong   ptap->c_othi[0]     = 0;
14154a56b808SFande Kong   rootspacesize       = 0;
14164a56b808SFande Kong   rootspaceoffsets[0] = 0;
14174a56b808SFande Kong   for (i = 0; i < pn; i++) {
14184a56b808SFande Kong     rcvncols = 0;
14194a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
14204a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
14214a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
14224a56b808SFande Kong       rootspacesize++;
14234a56b808SFande Kong     }
14244a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
14254a56b808SFande Kong   }
14269566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
14274a56b808SFande Kong 
14289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
14299566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14309566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14319566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
14329566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
14334a56b808SFande Kong 
14349566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
14354a56b808SFande Kong   nleaves = 0;
14364a56b808SFande Kong   for (i = 0; i < pon; i++) {
1437071fcb05SBarry Smith     owner = 0;
1438bc8e477aSFande Kong     ii    = i / dof;
14399566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
14404a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
14414a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
14424a56b808SFande Kong       iremote[nleaves].rank    = owner;
14434a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
14444a56b808SFande Kong     }
14454a56b808SFande Kong   }
14469566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
14479566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
14484a56b808SFande Kong 
14499566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
14509566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
14519566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
14524a56b808SFande Kong   /* One to one map */
14539566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14544a56b808SFande Kong   /* Get remote data */
14559566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14569566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
14579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
14589566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1459bc8e477aSFande Kong   pcstart *= dof;
1460bc8e477aSFande Kong   pcend *= dof;
14614a56b808SFande Kong   for (i = 0; i < pn; i++) {
14624a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
14634a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
14644a56b808SFande Kong     for (j = 0; j < nzi; j++) {
14654a56b808SFande Kong       col = rdj[j];
14664a56b808SFande Kong       /* diag part */
14674a56b808SFande Kong       if (col >= pcstart && col < pcend) {
14689566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(htd[i], col));
14694a56b808SFande Kong       } else { /* off diag */
14709566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
14714a56b808SFande Kong       }
14724a56b808SFande Kong     }
14739566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
14744a56b808SFande Kong     dnz[i] = htsize;
14759566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&htd[i]));
14769566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
14774a56b808SFande Kong     onz[i] = htsize;
14789566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
14794a56b808SFande Kong   }
14804a56b808SFande Kong 
14819566063dSJacob Faibussowitsch   PetscCall(PetscFree2(htd, hto));
14829566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
14834a56b808SFande Kong 
14844a56b808SFande Kong   /* local sizes and preallocation */
14859566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
14869566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
14879566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
14889566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
14894a56b808SFande Kong 
14904a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
14916718818eSStefano Zampini   Cmpi->product->data    = ptap;
14926718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
14936718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
14944a56b808SFande Kong 
14954a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
14964a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
14974a56b808SFande Kong   /* pick an algorithm */
1498d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
14994a56b808SFande Kong   alg = 0;
15009566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1501d0609cedSBarry Smith   PetscOptionsEnd();
15024a56b808SFande Kong   switch (alg) {
1503*d71ae5a4SJacob Faibussowitsch   case 0:
1504*d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1505*d71ae5a4SJacob Faibussowitsch     break;
1506*d71ae5a4SJacob Faibussowitsch   case 1:
1507*d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1508*d71ae5a4SJacob Faibussowitsch     break;
1509*d71ae5a4SJacob Faibussowitsch   default:
1510*d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
15114a56b808SFande Kong   }
15124a56b808SFande Kong   PetscFunctionReturn(0);
15134a56b808SFande Kong }
15144a56b808SFande Kong 
1515*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1516*d71ae5a4SJacob Faibussowitsch {
1517bc8e477aSFande Kong   PetscFunctionBegin;
15189566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1519bc8e477aSFande Kong   PetscFunctionReturn(0);
1520bc8e477aSFande Kong }
1521bc8e477aSFande Kong 
1522*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1523*d71ae5a4SJacob Faibussowitsch {
15243cdca5ebSHong Zhang   Mat_APMPI               *ptap;
15256718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1526e953e47cSHong Zhang   MPI_Comm                 comm;
1527e953e47cSHong Zhang   PetscMPIInt              size, rank;
1528e953e47cSHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1529e953e47cSHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1530e953e47cSHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
1531e953e47cSHong Zhang   PetscBT                  lnkbt;
1532ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1533ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
1534e953e47cSHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1535e953e47cSHong Zhang   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1536e953e47cSHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1537e953e47cSHong Zhang   MPI_Request             *swaits, *rwaits;
1538e953e47cSHong Zhang   MPI_Status              *sstatus, rstatus;
1539e953e47cSHong Zhang   PetscLayout              rowmap;
1540e953e47cSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1541e953e47cSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
1542e953e47cSHong Zhang   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1543ec07b8f8SHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1544e953e47cSHong Zhang   PetscScalar             *apv;
1545e953e47cSHong Zhang   PetscTable               ta;
1546b92f168fSBarry Smith   MatType                  mtype;
1547e83e5f86SFande Kong   const char              *prefix;
1548e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
1549e953e47cSHong Zhang   PetscReal apfill;
1550e953e47cSHong Zhang #endif
1551e953e47cSHong Zhang 
1552e953e47cSHong Zhang   PetscFunctionBegin;
15536718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
155428b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
15559566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
15569566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
15579566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1558e953e47cSHong Zhang 
155952f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
1560ec07b8f8SHong Zhang 
1561e953e47cSHong Zhang   /* create symbolic parallel matrix Cmpi */
15629566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
15639566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
1564e953e47cSHong Zhang 
1565e953e47cSHong Zhang   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1566e953e47cSHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1567e953e47cSHong Zhang 
15683cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
15699566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
157015a3b8e2SHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
1571a4ffb656SHong Zhang   ptap->algType = 1;
157215a3b8e2SHong Zhang 
157315a3b8e2SHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
15749566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
157515a3b8e2SHong Zhang   /* get P_loc by taking all local rows of P */
15769566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
157715a3b8e2SHong Zhang 
157867a12041SHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
157967a12041SHong Zhang   /* --------------------------------- */
15809566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
15819566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
158215a3b8e2SHong Zhang 
158367a12041SHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
158467a12041SHong Zhang   /* ----------------------------------------------------------------- */
158567a12041SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
158652f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1587445158ffSHong Zhang 
15889a6dcab7SHong Zhang   /* create and initialize a linked list */
15899566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */
15904b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
15914b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
15929566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1593d0e9a2f7SHong Zhang   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
159478d30b94SHong Zhang 
15959566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1596445158ffSHong Zhang 
15978cb82516SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1598ec07b8f8SHong Zhang   if (ao) {
15999566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1600ec07b8f8SHong Zhang   } else {
16019566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1602ec07b8f8SHong Zhang   }
1603445158ffSHong Zhang   current_space = free_space;
160467a12041SHong Zhang   nspacedouble  = 0;
160567a12041SHong Zhang 
16069566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
160767a12041SHong Zhang   api[0] = 0;
1608445158ffSHong Zhang   for (i = 0; i < am; i++) {
160967a12041SHong Zhang     /* diagonal portion: Ad[i,:]*P */
16109371c9d4SSatish Balay     ai  = ad->i;
16119371c9d4SSatish Balay     pi  = p_loc->i;
161267a12041SHong Zhang     nzi = ai[i + 1] - ai[i];
161367a12041SHong Zhang     aj  = ad->j + ai[i];
1614445158ffSHong Zhang     for (j = 0; j < nzi; j++) {
1615445158ffSHong Zhang       row  = aj[j];
161667a12041SHong Zhang       pnz  = pi[row + 1] - pi[row];
161767a12041SHong Zhang       Jptr = p_loc->j + pi[row];
1618445158ffSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
16199566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1620445158ffSHong Zhang     }
162167a12041SHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
1622ec07b8f8SHong Zhang     if (ao) {
16239371c9d4SSatish Balay       ai  = ao->i;
16249371c9d4SSatish Balay       pi  = p_oth->i;
162567a12041SHong Zhang       nzi = ai[i + 1] - ai[i];
162667a12041SHong Zhang       aj  = ao->j + ai[i];
1627445158ffSHong Zhang       for (j = 0; j < nzi; j++) {
1628445158ffSHong Zhang         row  = aj[j];
162967a12041SHong Zhang         pnz  = pi[row + 1] - pi[row];
163067a12041SHong Zhang         Jptr = p_oth->j + pi[row];
16319566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1632445158ffSHong Zhang       }
1633ec07b8f8SHong Zhang     }
1634445158ffSHong Zhang     apnz       = lnk[0];
1635445158ffSHong Zhang     api[i + 1] = api[i] + apnz;
1636445158ffSHong Zhang     if (ap_rmax < apnz) ap_rmax = apnz;
1637445158ffSHong Zhang 
1638445158ffSHong Zhang     /* if free space is not available, double the total space in the list */
1639445158ffSHong Zhang     if (current_space->local_remaining < apnz) {
16409566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1641445158ffSHong Zhang       nspacedouble++;
1642445158ffSHong Zhang     }
1643445158ffSHong Zhang 
1644445158ffSHong Zhang     /* Copy data into free space, then initialize lnk */
16459566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1646445158ffSHong Zhang 
1647445158ffSHong Zhang     current_space->array += apnz;
1648445158ffSHong Zhang     current_space->local_used += apnz;
1649445158ffSHong Zhang     current_space->local_remaining -= apnz;
1650445158ffSHong Zhang   }
1651681d504bSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
1652445158ffSHong Zhang   /* destroy list of free space and other temporary array(s) */
16539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
16549566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
16559566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
16569a6dcab7SHong Zhang 
1657aa690a28SHong Zhang   /* Create AP_loc for reuse */
16589566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
16599566063dSJacob Faibussowitsch   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1660aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
1661ec07b8f8SHong Zhang   if (ao) {
1662aa690a28SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1663ec07b8f8SHong Zhang   } else {
1664ec07b8f8SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1665ec07b8f8SHong Zhang   }
1666aa690a28SHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
1667aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
1668aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
1669aa690a28SHong Zhang 
1670aa690a28SHong Zhang   if (api[am]) {
16719566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
16729566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
1673aa690a28SHong Zhang   } else {
16749566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
1675aa690a28SHong Zhang   }
1676aa690a28SHong Zhang #endif
1677aa690a28SHong Zhang 
1678681d504bSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1679681d504bSHong Zhang   /* ------------------------------------ */
16809566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
16819566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
16829566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
16839566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
16849566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
16859566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
16869566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
16879566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
16889566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
16899566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
16909566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
16919566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
169215a3b8e2SHong Zhang 
169367a12041SHong Zhang   /* (3) send coj of C_oth to other processors  */
169467a12041SHong Zhang   /* ------------------------------------------ */
169515a3b8e2SHong Zhang   /* determine row ownership */
16969566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
1697445158ffSHong Zhang   rowmap->n  = pn;
1698445158ffSHong Zhang   rowmap->bs = 1;
16999566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
1700445158ffSHong Zhang   owners = rowmap->range;
170115a3b8e2SHong Zhang 
170215a3b8e2SHong Zhang   /* determine the number of messages to send, their lengths */
17039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
17049566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
17059566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
170615a3b8e2SHong Zhang 
170767a12041SHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
17089371c9d4SSatish Balay   coi   = c_oth->i;
17099371c9d4SSatish Balay   coj   = c_oth->j;
171067a12041SHong Zhang   con   = ptap->C_oth->rmap->n;
171115a3b8e2SHong Zhang   proc  = 0;
171267a12041SHong Zhang   for (i = 0; i < con; i++) {
171315a3b8e2SHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
171415a3b8e2SHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
171515a3b8e2SHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
171615a3b8e2SHong Zhang   }
171715a3b8e2SHong Zhang 
171815a3b8e2SHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
171915a3b8e2SHong Zhang   owners_co[0] = 0;
172067a12041SHong Zhang   nsend        = 0;
172115a3b8e2SHong Zhang   for (proc = 0; proc < size; proc++) {
172215a3b8e2SHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
172315a3b8e2SHong Zhang     if (len_s[proc]) {
1724445158ffSHong Zhang       nsend++;
172515a3b8e2SHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
172615a3b8e2SHong Zhang       len += len_si[proc];
172715a3b8e2SHong Zhang     }
172815a3b8e2SHong Zhang   }
172915a3b8e2SHong Zhang 
173015a3b8e2SHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
17319566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
17329566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
173315a3b8e2SHong Zhang 
173415a3b8e2SHong Zhang   /* post the Irecv and Isend of coj */
17359566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
17369566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
17379566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
173815a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
173915a3b8e2SHong Zhang     if (!len_s[proc]) continue;
174015a3b8e2SHong Zhang     i = owners_co[proc];
17419566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
174215a3b8e2SHong Zhang     k++;
174315a3b8e2SHong Zhang   }
174415a3b8e2SHong Zhang 
1745681d504bSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1746681d504bSHong Zhang   /* ---------------------------------------- */
17479566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
17489566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
17499566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
17509566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
17519566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
17529566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
17539566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
17549566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
17559566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
17569566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
17579566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
17584222ddf1SHong Zhang 
1759681d504bSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1760681d504bSHong Zhang 
1761681d504bSHong Zhang   /* receives coj are complete */
176248a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
17639566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
17649566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
176515a3b8e2SHong Zhang 
176678d30b94SHong Zhang   /* add received column indices into ta to update Crmax */
176778d30b94SHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
176878d30b94SHong Zhang     Jptr = buf_rj[k];
176948a46eb9SPierre Jolivet     for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
177078d30b94SHong Zhang   }
17719566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax));
17729566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
17739a6dcab7SHong Zhang 
177415a3b8e2SHong Zhang   /* (4) send and recv coi */
177515a3b8e2SHong Zhang   /*-----------------------*/
17769566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
17779566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
17789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
177915a3b8e2SHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
178015a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
178115a3b8e2SHong Zhang     if (!len_s[proc]) continue;
178215a3b8e2SHong Zhang     /* form outgoing message for i-structure:
178315a3b8e2SHong Zhang          buf_si[0]:                 nrows to be sent
178415a3b8e2SHong Zhang                [1:nrows]:           row index (global)
178515a3b8e2SHong Zhang                [nrows+1:2*nrows+1]: i-structure index
178615a3b8e2SHong Zhang     */
178715a3b8e2SHong Zhang     /*-------------------------------------------*/
178815a3b8e2SHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
178915a3b8e2SHong Zhang     buf_si_i    = buf_si + nrows + 1;
179015a3b8e2SHong Zhang     buf_si[0]   = nrows;
179115a3b8e2SHong Zhang     buf_si_i[0] = 0;
179215a3b8e2SHong Zhang     nrows       = 0;
179315a3b8e2SHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
179415a3b8e2SHong Zhang       nzi                 = coi[i + 1] - coi[i];
179515a3b8e2SHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
179615a3b8e2SHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
179715a3b8e2SHong Zhang       nrows++;
179815a3b8e2SHong Zhang     }
17999566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
180015a3b8e2SHong Zhang     k++;
180115a3b8e2SHong Zhang     buf_si += len_si[proc];
180215a3b8e2SHong Zhang   }
180348a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
18049566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
18059566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
180615a3b8e2SHong Zhang 
18079566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
18089566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
18099566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
18109566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
1811a4ffb656SHong Zhang 
181267a12041SHong Zhang   /* (5) compute the local portion of Cmpi      */
181367a12041SHong Zhang   /* ------------------------------------------ */
181478d30b94SHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
18159566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
181615a3b8e2SHong Zhang   current_space = free_space;
181715a3b8e2SHong Zhang 
18189566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1819445158ffSHong Zhang   for (k = 0; k < nrecv; k++) {
182015a3b8e2SHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
182115a3b8e2SHong Zhang     nrows       = *buf_ri_k[k];
182215a3b8e2SHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
1823a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
182415a3b8e2SHong Zhang   }
1825445158ffSHong Zhang 
1826d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
18279566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
182815a3b8e2SHong Zhang   for (i = 0; i < pn; i++) {
182967a12041SHong Zhang     /* add C_loc into Cmpi */
183067a12041SHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
183167a12041SHong Zhang     Jptr = c_loc->j + c_loc->i[i];
18329566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
183315a3b8e2SHong Zhang 
183415a3b8e2SHong Zhang     /* add received col data into lnk */
1835445158ffSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
183615a3b8e2SHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
183715a3b8e2SHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
183815a3b8e2SHong Zhang         Jptr = buf_rj[k] + *nextci[k];
18399566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
18409371c9d4SSatish Balay         nextrow[k]++;
18419371c9d4SSatish Balay         nextci[k]++;
184215a3b8e2SHong Zhang       }
184315a3b8e2SHong Zhang     }
1844d0e9a2f7SHong Zhang     nzi = lnk[0];
18458cb82516SHong Zhang 
184615a3b8e2SHong Zhang     /* copy data into free space, then initialize lnk */
18479566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
18489566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
184915a3b8e2SHong Zhang   }
18509566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
18519566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
18529566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
185380bb4639SHong Zhang 
1854ae5f0867Sstefano_zampini   /* local sizes and preallocation */
18559566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1856ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
18579566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
18589566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1859ac94c67aSTristan Konolige   }
18609566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1861d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
186215a3b8e2SHong Zhang 
1863445158ffSHong Zhang   /* members in merge */
18649566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
18659566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
18669566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
18679566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
18689566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
18699566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
18709566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
187115a3b8e2SHong Zhang 
18729566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pN, &ptap->apa));
18732259aa2eSHong Zhang 
18746718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
18756718818eSStefano Zampini   Cmpi->product->data    = ptap;
18766718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
18776718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
18786718818eSStefano Zampini 
18791a47ec54SHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
18801a47ec54SHong Zhang   Cmpi->assembled = PETSC_FALSE;
18810d3441aeSHong Zhang   PetscFunctionReturn(0);
18820d3441aeSHong Zhang }
18830d3441aeSHong Zhang 
1884*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1885*d71ae5a4SJacob Faibussowitsch {
18866718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
18870d3441aeSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
18882dd9e643SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
18896718818eSStefano Zampini   Mat_APMPI         *ptap;
18909ce11a7cSHong Zhang   Mat                AP_loc, C_loc, C_oth;
18910d3441aeSHong Zhang   PetscInt           i, rstart, rend, cm, ncols, row;
18928cb82516SHong Zhang   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
18938cb82516SHong Zhang   PetscScalar       *apa;
18940d3441aeSHong Zhang   const PetscInt    *cols;
18950d3441aeSHong Zhang   const PetscScalar *vals;
18960d3441aeSHong Zhang 
18970d3441aeSHong Zhang   PetscFunctionBegin;
18986718818eSStefano Zampini   MatCheckProduct(C, 3);
18996718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
190028b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
190128b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1902a9899c97SHong Zhang 
19039566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
1904e497d3c8SHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
1905748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
19069566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
19079566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1908748c7196SHong Zhang   }
19090d3441aeSHong Zhang 
1910e497d3c8SHong Zhang   /* 2) get AP_loc */
19110d3441aeSHong Zhang   AP_loc = ptap->AP_loc;
19128cb82516SHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
19130d3441aeSHong Zhang 
1914e497d3c8SHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
19150d3441aeSHong Zhang   /*-----------------------------------------------------*/
1916748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1917748c7196SHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
19189566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
19199566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1920e497d3c8SHong Zhang   }
19210d3441aeSHong Zhang 
19228cb82516SHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
19238cb82516SHong Zhang   /* ---------------------------------------------- */
19240d3441aeSHong Zhang   /* get data from symbolic products */
19258cb82516SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1926ad540459SPierre Jolivet   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
19278cb82516SHong Zhang   apa = ptap->apa;
1928681d504bSHong Zhang   api = ap->i;
1929681d504bSHong Zhang   apj = ap->j;
1930e497d3c8SHong Zhang   for (i = 0; i < am; i++) {
1931445158ffSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1932e497d3c8SHong Zhang     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1933e497d3c8SHong Zhang     apnz = api[i + 1] - api[i];
1934e497d3c8SHong Zhang     for (j = 0; j < apnz; j++) {
1935e497d3c8SHong Zhang       col                 = apj[j + api[i]];
1936e497d3c8SHong Zhang       ap->a[j + ap->i[i]] = apa[col];
1937e497d3c8SHong Zhang       apa[col]            = 0.0;
19380d3441aeSHong Zhang     }
19390d3441aeSHong Zhang   }
1940976452c9SRichard Tran Mills   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
19419566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
19420d3441aeSHong Zhang 
19438cb82516SHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
19449566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_loc));
19459566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_oth));
19469ce11a7cSHong Zhang   C_loc = ptap->C_loc;
19479ce11a7cSHong Zhang   C_oth = ptap->C_oth;
19480d3441aeSHong Zhang 
19490d3441aeSHong Zhang   /* add C_loc and Co to to C */
19509566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
19510d3441aeSHong Zhang 
19520d3441aeSHong Zhang   /* C_loc -> C */
19530d3441aeSHong Zhang   cm    = C_loc->rmap->N;
19549ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
19558cb82516SHong Zhang   cols  = c_seq->j;
19568cb82516SHong Zhang   vals  = c_seq->a;
1957904d1e70Sandi selinger 
1958e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1959904d1e70Sandi selinger   /* when there are no off-processor parts.  */
19601de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
19611de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
19621de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
1963904d1e70Sandi selinger   if (C->assembled) {
1964904d1e70Sandi selinger     C->was_assembled = PETSC_TRUE;
1965904d1e70Sandi selinger     C->assembled     = PETSC_FALSE;
1966904d1e70Sandi selinger   }
1967904d1e70Sandi selinger   if (C->was_assembled) {
19680d3441aeSHong Zhang     for (i = 0; i < cm; i++) {
19699ce11a7cSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
19700d3441aeSHong Zhang       row   = rstart + i;
19719566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19729371c9d4SSatish Balay       cols += ncols;
19739371c9d4SSatish Balay       vals += ncols;
19740d3441aeSHong Zhang     }
1975904d1e70Sandi selinger   } else {
19769566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1977904d1e70Sandi selinger   }
19780d3441aeSHong Zhang 
19790d3441aeSHong Zhang   /* Co -> C, off-processor part */
19809ce11a7cSHong Zhang   cm    = C_oth->rmap->N;
19819ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
19828cb82516SHong Zhang   cols  = c_seq->j;
19838cb82516SHong Zhang   vals  = c_seq->a;
19849ce11a7cSHong Zhang   for (i = 0; i < cm; i++) {
19859ce11a7cSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
19860d3441aeSHong Zhang     row   = p->garray[i];
19879566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19889371c9d4SSatish Balay     cols += ncols;
19899371c9d4SSatish Balay     vals += ncols;
19900d3441aeSHong Zhang   }
1991904d1e70Sandi selinger 
19929566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
19939566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
19940d3441aeSHong Zhang 
1995748c7196SHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
19960d3441aeSHong Zhang   PetscFunctionReturn(0);
19970d3441aeSHong Zhang }
19984222ddf1SHong Zhang 
1999*d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2000*d71ae5a4SJacob Faibussowitsch {
20014222ddf1SHong Zhang   Mat_Product        *product = C->product;
20024222ddf1SHong Zhang   Mat                 A = product->A, P = product->B;
20034222ddf1SHong Zhang   MatProductAlgorithm alg  = product->alg;
20044222ddf1SHong Zhang   PetscReal           fill = product->fill;
20054222ddf1SHong Zhang   PetscBool           flg;
20064222ddf1SHong Zhang 
20074222ddf1SHong Zhang   PetscFunctionBegin;
20084222ddf1SHong Zhang   /* scalable: do R=P^T locally, then C=R*A*P */
20099566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
20104222ddf1SHong Zhang   if (flg) {
20119566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
20124222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20134222ddf1SHong Zhang     goto next;
20144222ddf1SHong Zhang   }
20154222ddf1SHong Zhang 
20164222ddf1SHong Zhang   /* nonscalable: do R=P^T locally, then C=R*A*P */
20179566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
20184222ddf1SHong Zhang   if (flg) {
20199566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
20204222ddf1SHong Zhang     goto next;
20214222ddf1SHong Zhang   }
20224222ddf1SHong Zhang 
20234222ddf1SHong Zhang   /* allatonce */
20249566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
20254222ddf1SHong Zhang   if (flg) {
20269566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
20274222ddf1SHong Zhang     goto next;
20284222ddf1SHong Zhang   }
20294222ddf1SHong Zhang 
20304222ddf1SHong Zhang   /* allatonce_merged */
20319566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
20324222ddf1SHong Zhang   if (flg) {
20339566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
20344222ddf1SHong Zhang     goto next;
20354222ddf1SHong Zhang   }
20364222ddf1SHong Zhang 
20374e84afc0SStefano Zampini   /* backend general code */
20389566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "backend", &flg));
20394e84afc0SStefano Zampini   if (flg) {
20409566063dSJacob Faibussowitsch     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
20414e84afc0SStefano Zampini     PetscFunctionReturn(0);
20424e84afc0SStefano Zampini   }
20434e84afc0SStefano Zampini 
20444222ddf1SHong Zhang   /* hypre */
20454222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
20469566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
20474222ddf1SHong Zhang   if (flg) {
20489566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
20494222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20504222ddf1SHong Zhang     PetscFunctionReturn(0);
20514222ddf1SHong Zhang   }
20524222ddf1SHong Zhang #endif
20534222ddf1SHong Zhang   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
20544222ddf1SHong Zhang 
20554222ddf1SHong Zhang next:
20564222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_PtAP;
20574222ddf1SHong Zhang   PetscFunctionReturn(0);
20584222ddf1SHong Zhang }
2059