xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
182412ba7SHong Zhang 
242c57489SHong Zhang /*
342c57489SHong Zhang   Defines projective product routines where A is a MPIAIJ matrix
442c57489SHong Zhang           C = P^T * A * P
542c57489SHong Zhang */
642c57489SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <../src/mat/impls/aij/mpi/mpiaij.h>
10c6db04a5SJed Brown #include <petscbt.h>
118563dfccSBarry Smith #include <petsctime.h>
12694f88d4SFande Kong #include <petsc/private/hashmapiv.h>
134a56b808SFande Kong #include <petsc/private/hashseti.h>
144a56b808SFande Kong #include <petscsf.h>
1542c57489SHong Zhang 
169371c9d4SSatish Balay PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer) {
17cf97cf3cSHong Zhang   PetscBool         iascii;
18cf97cf3cSHong Zhang   PetscViewerFormat format;
196718818eSStefano Zampini   Mat_APMPI        *ptap;
20cf97cf3cSHong Zhang 
21cf97cf3cSHong Zhang   PetscFunctionBegin;
226718818eSStefano Zampini   MatCheckProduct(A, 1);
236718818eSStefano Zampini   ptap = (Mat_APMPI *)A->product->data;
249566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
25cf97cf3cSHong Zhang   if (iascii) {
269566063dSJacob Faibussowitsch     PetscCall(PetscViewerGetFormat(viewer, &format));
27cf97cf3cSHong Zhang     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28cf97cf3cSHong Zhang       if (ptap->algType == 0) {
299566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
30cf97cf3cSHong Zhang       } else if (ptap->algType == 1) {
319566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
324a56b808SFande Kong       } else if (ptap->algType == 2) {
339566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
344a56b808SFande Kong       } else if (ptap->algType == 3) {
359566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
36cf97cf3cSHong Zhang       }
37cf97cf3cSHong Zhang     }
38cf97cf3cSHong Zhang   }
39cf97cf3cSHong Zhang   PetscFunctionReturn(0);
40cf97cf3cSHong Zhang }
41cf97cf3cSHong Zhang 
429371c9d4SSatish Balay PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data) {
436718818eSStefano Zampini   Mat_APMPI           *ptap = (Mat_APMPI *)data;
4460552ceaSHong Zhang   Mat_Merge_SeqsToMPI *merge;
45cc6cb787SHong Zhang 
46cc6cb787SHong Zhang   PetscFunctionBegin;
479566063dSJacob Faibussowitsch   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
489566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->bufa));
499566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_loc));
509566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_oth));
519566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
529566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Rd));
539566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Ro));
548403a639SHong Zhang   if (ptap->AP_loc) { /* used by alg_rap */
55681d504bSHong Zhang     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
569566063dSJacob Faibussowitsch     PetscCall(PetscFree(ap->i));
579566063dSJacob Faibussowitsch     PetscCall(PetscFree2(ap->j, ap->a));
589566063dSJacob Faibussowitsch     PetscCall(MatDestroy(&ptap->AP_loc));
598403a639SHong Zhang   } else { /* used by alg_ptap */
609566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->api));
619566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->apj));
62681d504bSHong Zhang   }
639566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_loc));
649566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_oth));
659566063dSJacob Faibussowitsch   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
66a560ef98SHong Zhang 
679566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Pt));
6860552ceaSHong Zhang 
6960552ceaSHong Zhang   merge = ptap->merge;
708403a639SHong Zhang   if (merge) { /* used by alg_ptap */
719566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->id_r));
729566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_s));
739566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_r));
749566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bi));
759566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bj));
769566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri[0]));
779566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri));
789566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj[0]));
799566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj));
809566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coi));
819566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coj));
829566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->owners_co));
839566063dSJacob Faibussowitsch     PetscCall(PetscLayoutDestroy(&merge->rowmap));
849566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->merge));
85bf0cc555SLisandro Dalcin   }
869566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
874a56b808SFande Kong 
889566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&ptap->sf));
899566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_othi));
909566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_rmti));
919566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap));
92cc6cb787SHong Zhang   PetscFunctionReturn(0);
93cc6cb787SHong Zhang }
94cc6cb787SHong Zhang 
959371c9d4SSatish Balay PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C) {
966718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
97cf742fccSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
9892ec70a1SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
996718818eSStefano Zampini   Mat_APMPI         *ptap;
100cf742fccSHong Zhang   Mat                AP_loc, C_loc, C_oth;
101a3bb6f32SFande Kong   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
102cf742fccSHong Zhang   PetscScalar       *apa;
103cf742fccSHong Zhang   const PetscInt    *cols;
104cf742fccSHong Zhang   const PetscScalar *vals;
105cf742fccSHong Zhang 
106cf742fccSHong Zhang   PetscFunctionBegin;
1076718818eSStefano Zampini   MatCheckProduct(C, 3);
1086718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
10928b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
11028b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
11180da8df7SHong Zhang 
1129566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
113cf742fccSHong Zhang 
114cf742fccSHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
115cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1169566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1179566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
118cf742fccSHong Zhang   }
119cf742fccSHong Zhang 
120cf742fccSHong Zhang   /* 2) get AP_loc */
121cf742fccSHong Zhang   AP_loc = ptap->AP_loc;
122cf742fccSHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
123cf742fccSHong Zhang 
124cf742fccSHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
125cf742fccSHong Zhang   /*-----------------------------------------------------*/
126cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
127cf742fccSHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1289566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1299566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
130cf742fccSHong Zhang   }
131cf742fccSHong Zhang 
132cf742fccSHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
133cf742fccSHong Zhang   /* ---------------------------------------------- */
134cf742fccSHong Zhang   /* get data from symbolic products */
135cf742fccSHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
13652f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
13752f7967eSHong Zhang 
138cf742fccSHong Zhang   api = ap->i;
139cf742fccSHong Zhang   apj = ap->j;
1409566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
141cf742fccSHong Zhang   for (i = 0; i < am; i++) {
142cf742fccSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
143cf742fccSHong Zhang     apnz = api[i + 1] - api[i];
144b4e8d1b6SHong Zhang     apa  = ap->a + api[i];
1459566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(apa, apnz));
146b4e8d1b6SHong Zhang     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
147cf742fccSHong Zhang   }
1489566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
14908401ef6SPierre Jolivet   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
150cf742fccSHong Zhang 
151cf742fccSHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152154d0d78SFande Kong   /* Always use scalable version since we are in the MPI scalable version */
1539566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
1549566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
155cf742fccSHong Zhang 
156cf742fccSHong Zhang   C_loc = ptap->C_loc;
157cf742fccSHong Zhang   C_oth = ptap->C_oth;
158cf742fccSHong Zhang 
159cf742fccSHong Zhang   /* add C_loc and Co to to C */
1609566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
161cf742fccSHong Zhang 
162cf742fccSHong Zhang   /* C_loc -> C */
163cf742fccSHong Zhang   cm    = C_loc->rmap->N;
164cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
165cf742fccSHong Zhang   cols  = c_seq->j;
166cf742fccSHong Zhang   vals  = c_seq->a;
1679566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
168edeac6deSandi selinger 
169e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170edeac6deSandi selinger   /* when there are no off-processor parts.  */
1711de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1721de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1731de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
174edeac6deSandi selinger   if (C->assembled) {
175edeac6deSandi selinger     C->was_assembled = PETSC_TRUE;
176edeac6deSandi selinger     C->assembled     = PETSC_FALSE;
177edeac6deSandi selinger   }
178edeac6deSandi selinger   if (C->was_assembled) {
179cf742fccSHong Zhang     for (i = 0; i < cm; i++) {
180cf742fccSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
181cf742fccSHong Zhang       row   = rstart + i;
1829566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1839371c9d4SSatish Balay       cols += ncols;
1849371c9d4SSatish Balay       vals += ncols;
185cf742fccSHong Zhang     }
186edeac6deSandi selinger   } else {
1879566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
188edeac6deSandi selinger   }
1899566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
19008401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
191cf742fccSHong Zhang 
192cf742fccSHong Zhang   /* Co -> C, off-processor part */
193cf742fccSHong Zhang   cm    = C_oth->rmap->N;
194cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
195cf742fccSHong Zhang   cols  = c_seq->j;
196cf742fccSHong Zhang   vals  = c_seq->a;
1979566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
198cf742fccSHong Zhang   for (i = 0; i < cm; i++) {
199cf742fccSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
200cf742fccSHong Zhang     row   = p->garray[i];
2019566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
2029371c9d4SSatish Balay     cols += ncols;
2039371c9d4SSatish Balay     vals += ncols;
204cf742fccSHong Zhang   }
2059566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2069566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
207cf742fccSHong Zhang 
208cf742fccSHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
20980da8df7SHong Zhang 
2109566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
21108401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
212cf742fccSHong Zhang   PetscFunctionReturn(0);
213cf742fccSHong Zhang }
214cf742fccSHong Zhang 
2159371c9d4SSatish Balay PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi) {
2163cdca5ebSHong Zhang   Mat_APMPI               *ptap;
2176718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
2182259aa2eSHong Zhang   MPI_Comm                 comm;
2192259aa2eSHong Zhang   PetscMPIInt              size, rank;
2204222ddf1SHong Zhang   Mat                      P_loc, P_oth;
22115a3b8e2SHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
222d0e9a2f7SHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
223d0e9a2f7SHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
224ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
225ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
22615a3b8e2SHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
2277c70b0e9SStefano Zampini   const PetscInt          *owners;
2287c70b0e9SStefano Zampini   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
22915a3b8e2SHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
23015a3b8e2SHong Zhang   MPI_Request             *swaits, *rwaits;
23115a3b8e2SHong Zhang   MPI_Status              *sstatus, rstatus;
232445158ffSHong Zhang   PetscLayout              rowmap;
233445158ffSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
234445158ffSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
235a3bb6f32SFande Kong   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
23652f7967eSHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
23767a12041SHong Zhang   PetscScalar             *apv;
23878d30b94SHong Zhang   PetscTable               ta;
239b92f168fSBarry Smith   MatType                  mtype;
240e83e5f86SFande Kong   const char              *prefix;
241aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
2428cb82516SHong Zhang   PetscReal apfill;
243aa690a28SHong Zhang #endif
24467a12041SHong Zhang 
24567a12041SHong Zhang   PetscFunctionBegin;
2466718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
24728b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
2489566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2499566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
2509566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
251ae5f0867Sstefano_zampini 
25252f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
25352f7967eSHong Zhang 
254ae5f0867Sstefano_zampini   /* create symbolic parallel matrix Cmpi */
2559566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
2569566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
257ae5f0867Sstefano_zampini 
2583cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
2599566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
260e953e47cSHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
261cf97cf3cSHong Zhang   ptap->algType = 0;
262e953e47cSHong Zhang 
263e953e47cSHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
2649566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
265e953e47cSHong Zhang   /* get P_loc by taking all local rows of P */
2669566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
26776db11f6SHong Zhang 
26876db11f6SHong Zhang   ptap->P_loc = P_loc;
26976db11f6SHong Zhang   ptap->P_oth = P_oth;
270e953e47cSHong Zhang 
271e953e47cSHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
272e953e47cSHong Zhang   /* --------------------------------- */
2739566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
2749566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
275e953e47cSHong Zhang 
276e953e47cSHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
277e953e47cSHong Zhang   /* ----------------------------------------------------------------- */
27876db11f6SHong Zhang   p_loc = (Mat_SeqAIJ *)P_loc->data;
27952f7967eSHong Zhang   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
280e953e47cSHong Zhang 
281e953e47cSHong Zhang   /* create and initialize a linked list */
2829566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */
28376db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
28476db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
2859566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
286e953e47cSHong Zhang 
2879566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
288e953e47cSHong Zhang 
289e953e47cSHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
29052f7967eSHong Zhang   if (ao) {
2919566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
29252f7967eSHong Zhang   } else {
2939566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
29452f7967eSHong Zhang   }
295e953e47cSHong Zhang   current_space = free_space;
296e953e47cSHong Zhang   nspacedouble  = 0;
297e953e47cSHong Zhang 
2989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
299e953e47cSHong Zhang   api[0] = 0;
300e953e47cSHong Zhang   for (i = 0; i < am; i++) {
301e953e47cSHong Zhang     /* diagonal portion: Ad[i,:]*P */
3029371c9d4SSatish Balay     ai  = ad->i;
3039371c9d4SSatish Balay     pi  = p_loc->i;
304e953e47cSHong Zhang     nzi = ai[i + 1] - ai[i];
305e953e47cSHong Zhang     aj  = ad->j + ai[i];
306e953e47cSHong Zhang     for (j = 0; j < nzi; j++) {
307e953e47cSHong Zhang       row  = aj[j];
308e953e47cSHong Zhang       pnz  = pi[row + 1] - pi[row];
309e953e47cSHong Zhang       Jptr = p_loc->j + pi[row];
310e953e47cSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
3119566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
312e953e47cSHong Zhang     }
313e953e47cSHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
31452f7967eSHong Zhang     if (ao) {
3159371c9d4SSatish Balay       ai  = ao->i;
3169371c9d4SSatish Balay       pi  = p_oth->i;
317e953e47cSHong Zhang       nzi = ai[i + 1] - ai[i];
318e953e47cSHong Zhang       aj  = ao->j + ai[i];
319e953e47cSHong Zhang       for (j = 0; j < nzi; j++) {
320e953e47cSHong Zhang         row  = aj[j];
321e953e47cSHong Zhang         pnz  = pi[row + 1] - pi[row];
322e953e47cSHong Zhang         Jptr = p_oth->j + pi[row];
3239566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
324e953e47cSHong Zhang       }
32552f7967eSHong Zhang     }
326e953e47cSHong Zhang     apnz       = lnk[0];
327e953e47cSHong Zhang     api[i + 1] = api[i] + apnz;
328e953e47cSHong Zhang 
329e953e47cSHong Zhang     /* if free space is not available, double the total space in the list */
330e953e47cSHong Zhang     if (current_space->local_remaining < apnz) {
3319566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
332e953e47cSHong Zhang       nspacedouble++;
333e953e47cSHong Zhang     }
334e953e47cSHong Zhang 
335e953e47cSHong Zhang     /* Copy data into free space, then initialize lnk */
3369566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
337e953e47cSHong Zhang 
338e953e47cSHong Zhang     current_space->array += apnz;
339e953e47cSHong Zhang     current_space->local_used += apnz;
340e953e47cSHong Zhang     current_space->local_remaining -= apnz;
341e953e47cSHong Zhang   }
342e953e47cSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
343e953e47cSHong Zhang   /* destroy list of free space and other temporary array(s) */
3449566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
3459566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
3469566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
347e953e47cSHong Zhang 
348e953e47cSHong Zhang   /* Create AP_loc for reuse */
3499566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
3509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
351e953e47cSHong Zhang 
352e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
35352f7967eSHong Zhang   if (ao) {
354e953e47cSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
35552f7967eSHong Zhang   } else {
35652f7967eSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
35752f7967eSHong Zhang   }
358e953e47cSHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
359e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
360e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
361e953e47cSHong Zhang 
362e953e47cSHong Zhang   if (api[am]) {
3639566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
3649566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
365e953e47cSHong Zhang   } else {
3669566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
367e953e47cSHong Zhang   }
368e953e47cSHong Zhang #endif
369e953e47cSHong Zhang 
370e953e47cSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
3714222ddf1SHong Zhang   /* -------------------------------------- */
3729566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
3739566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
3749566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
3759566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
3764222ddf1SHong Zhang 
3779566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
3789566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
3799566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
3809566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
3819566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
382e953e47cSHong Zhang 
383e953e47cSHong Zhang   /* (3) send coj of C_oth to other processors  */
384e953e47cSHong Zhang   /* ------------------------------------------ */
385e953e47cSHong Zhang   /* determine row ownership */
3869566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
3879566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
3889566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
3899566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
3909566063dSJacob Faibussowitsch   PetscCall(PetscLayoutGetRanges(rowmap, &owners));
391e953e47cSHong Zhang 
392e953e47cSHong Zhang   /* determine the number of messages to send, their lengths */
3939566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
3949566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
3959566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
396e953e47cSHong Zhang 
397e953e47cSHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
3989371c9d4SSatish Balay   coi   = c_oth->i;
3999371c9d4SSatish Balay   coj   = c_oth->j;
400e953e47cSHong Zhang   con   = ptap->C_oth->rmap->n;
401e953e47cSHong Zhang   proc  = 0;
4029566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
403e953e47cSHong Zhang   for (i = 0; i < con; i++) {
404e953e47cSHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
405e953e47cSHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
406e953e47cSHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
407e953e47cSHong Zhang   }
408e953e47cSHong Zhang 
409e953e47cSHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
410e953e47cSHong Zhang   owners_co[0] = 0;
411e953e47cSHong Zhang   nsend        = 0;
412e953e47cSHong Zhang   for (proc = 0; proc < size; proc++) {
413e953e47cSHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
414e953e47cSHong Zhang     if (len_s[proc]) {
415e953e47cSHong Zhang       nsend++;
416e953e47cSHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
417e953e47cSHong Zhang       len += len_si[proc];
418e953e47cSHong Zhang     }
419e953e47cSHong Zhang   }
420e953e47cSHong Zhang 
421e953e47cSHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
4229566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
4239566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
424e953e47cSHong Zhang 
425e953e47cSHong Zhang   /* post the Irecv and Isend of coj */
4269566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
4279566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
4289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
429e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
430e953e47cSHong Zhang     if (!len_s[proc]) continue;
431e953e47cSHong Zhang     i = owners_co[proc];
4329566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
433e953e47cSHong Zhang     k++;
434e953e47cSHong Zhang   }
435e953e47cSHong Zhang 
436e953e47cSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
437e953e47cSHong Zhang   /* ---------------------------------------- */
4389566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
4399566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
4409566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
4419566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
4424222ddf1SHong Zhang 
4439566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
4449566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
4454222ddf1SHong Zhang 
4469566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
4479566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
4484222ddf1SHong Zhang 
449e953e47cSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
4509566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
451e953e47cSHong Zhang 
452e953e47cSHong Zhang   /* receives coj are complete */
453*48a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4549566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
4559566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
456e953e47cSHong Zhang 
457e953e47cSHong Zhang   /* add received column indices into ta to update Crmax */
458e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
459e953e47cSHong Zhang     Jptr = buf_rj[k];
460*48a46eb9SPierre Jolivet     for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
461e953e47cSHong Zhang   }
4629566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax));
4639566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
464e953e47cSHong Zhang 
465e953e47cSHong Zhang   /* (4) send and recv coi */
466e953e47cSHong Zhang   /*-----------------------*/
4679566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
4689566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
4699566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
470e953e47cSHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
471e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
472e953e47cSHong Zhang     if (!len_s[proc]) continue;
473e953e47cSHong Zhang     /* form outgoing message for i-structure:
474e953e47cSHong Zhang          buf_si[0]:                 nrows to be sent
475e953e47cSHong Zhang                [1:nrows]:           row index (global)
476e953e47cSHong Zhang                [nrows+1:2*nrows+1]: i-structure index
477e953e47cSHong Zhang     */
478e953e47cSHong Zhang     /*-------------------------------------------*/
479e953e47cSHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
480e953e47cSHong Zhang     buf_si_i    = buf_si + nrows + 1;
481e953e47cSHong Zhang     buf_si[0]   = nrows;
482e953e47cSHong Zhang     buf_si_i[0] = 0;
483e953e47cSHong Zhang     nrows       = 0;
484e953e47cSHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
485e953e47cSHong Zhang       nzi                 = coi[i + 1] - coi[i];
486e953e47cSHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
487e953e47cSHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
488e953e47cSHong Zhang       nrows++;
489e953e47cSHong Zhang     }
4909566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
491e953e47cSHong Zhang     k++;
492e953e47cSHong Zhang     buf_si += len_si[proc];
493e953e47cSHong Zhang   }
494*48a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4959566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
4969566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
497e953e47cSHong Zhang 
4989566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
4999566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
5009566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
5019566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
502b4e8d1b6SHong Zhang 
503e953e47cSHong Zhang   /* (5) compute the local portion of Cmpi      */
504e953e47cSHong Zhang   /* ------------------------------------------ */
505e953e47cSHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
5069566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
507e953e47cSHong Zhang   current_space = free_space;
508e953e47cSHong Zhang 
5099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
510e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) {
511e953e47cSHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
512e953e47cSHong Zhang     nrows       = *buf_ri_k[k];
513e953e47cSHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
514a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
515e953e47cSHong Zhang   }
516e953e47cSHong Zhang 
517d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
5189566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
519e953e47cSHong Zhang   for (i = 0; i < pn; i++) {
520e953e47cSHong Zhang     /* add C_loc into Cmpi */
521e953e47cSHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
522e953e47cSHong Zhang     Jptr = c_loc->j + c_loc->i[i];
5239566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
524e953e47cSHong Zhang 
525e953e47cSHong Zhang     /* add received col data into lnk */
526e953e47cSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
527e953e47cSHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
528e953e47cSHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
529e953e47cSHong Zhang         Jptr = buf_rj[k] + *nextci[k];
5309566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
5319371c9d4SSatish Balay         nextrow[k]++;
5329371c9d4SSatish Balay         nextci[k]++;
533e953e47cSHong Zhang       }
534e953e47cSHong Zhang     }
535e953e47cSHong Zhang     nzi = lnk[0];
536e953e47cSHong Zhang 
537e953e47cSHong Zhang     /* copy data into free space, then initialize lnk */
5389566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
5399566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
540e953e47cSHong Zhang   }
5419566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
5429566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
5439566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
544e953e47cSHong Zhang 
545e953e47cSHong Zhang   /* local sizes and preallocation */
5469566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
547ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
5489566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
5499566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
550ac94c67aSTristan Konolige   }
5519566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
552d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
553e953e47cSHong Zhang 
554e953e47cSHong Zhang   /* members in merge */
5559566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
5569566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
5579566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
5589566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
5599566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
5609566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
5619566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
562e953e47cSHong Zhang 
563a3bb6f32SFande Kong   nout = 0;
5649566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
56508401ef6SPierre Jolivet   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
5669566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
56708401ef6SPierre Jolivet   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
568a3bb6f32SFande Kong 
5696718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
5706718818eSStefano Zampini   Cmpi->product->data    = ptap;
5716718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
5726718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
5736718818eSStefano Zampini 
5746718818eSStefano Zampini   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
5756718818eSStefano Zampini   Cmpi->assembled        = PETSC_FALSE;
5766718818eSStefano Zampini   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
577e953e47cSHong Zhang   PetscFunctionReturn(0);
578e953e47cSHong Zhang }
579e953e47cSHong Zhang 
5809371c9d4SSatish Balay static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht) {
5814a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5824a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
5834a56b808SFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
584bc8e477aSFande Kong   PetscInt    pcstart, pcend, column, offset;
5854a56b808SFande Kong 
5864a56b808SFande Kong   PetscFunctionBegin;
5874a56b808SFande Kong   pcstart = P->cmap->rstart;
588bc8e477aSFande Kong   pcstart *= dof;
5894a56b808SFande Kong   pcend = P->cmap->rend;
590bc8e477aSFande Kong   pcend *= dof;
5914a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
5924a56b808SFande Kong   ai  = ad->i;
5934a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
5944a56b808SFande Kong   aj  = ad->j + ai[i];
5954a56b808SFande Kong   for (j = 0; j < nzi; j++) {
5964a56b808SFande Kong     row    = aj[j];
597bc8e477aSFande Kong     offset = row % dof;
598bc8e477aSFande Kong     row /= dof;
5994a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
6004a56b808SFande Kong     pj   = pd->j + pd->i[row];
601*48a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
6024a56b808SFande Kong   }
603bc8e477aSFande Kong   /* off diag P */
6044a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6054a56b808SFande Kong     row    = aj[j];
606bc8e477aSFande Kong     offset = row % dof;
607bc8e477aSFande Kong     row /= dof;
6084a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6094a56b808SFande Kong     pj   = po->j + po->i[row];
610*48a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
6114a56b808SFande Kong   }
6124a56b808SFande Kong 
6134a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6144a56b808SFande Kong   if (ao) {
6154a56b808SFande Kong     ai  = ao->i;
6164a56b808SFande Kong     pi  = p_oth->i;
6174a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6184a56b808SFande Kong     aj  = ao->j + ai[i];
6194a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6204a56b808SFande Kong       row       = aj[j];
621bc8e477aSFande Kong       offset    = a->garray[row] % dof;
622bc8e477aSFande Kong       row       = map[row];
6234a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6244a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6254a56b808SFande Kong       for (col = 0; col < pnz; col++) {
626bc8e477aSFande Kong         column = p_othcols[col] * dof + offset;
6274a56b808SFande Kong         if (column >= pcstart && column < pcend) {
6289566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(dht, column));
6294a56b808SFande Kong         } else {
6309566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(oht, column));
6314a56b808SFande Kong         }
6324a56b808SFande Kong       }
6334a56b808SFande Kong     }
6344a56b808SFande Kong   } /* end if (ao) */
6354a56b808SFande Kong   PetscFunctionReturn(0);
6364a56b808SFande Kong }
6374a56b808SFande Kong 
6389371c9d4SSatish Balay static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap) {
6394a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
6404a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
641bc8e477aSFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
6424a56b808SFande Kong   PetscScalar ra, *aa, *pa;
6434a56b808SFande Kong 
6444a56b808SFande Kong   PetscFunctionBegin;
6454a56b808SFande Kong   pcstart = P->cmap->rstart;
646bc8e477aSFande Kong   pcstart *= dof;
647bc8e477aSFande Kong 
6484a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6494a56b808SFande Kong   ai  = ad->i;
6504a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
6514a56b808SFande Kong   aj  = ad->j + ai[i];
6524a56b808SFande Kong   aa  = ad->a + ai[i];
6534a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6544a56b808SFande Kong     ra     = aa[j];
6554a56b808SFande Kong     row    = aj[j];
656bc8e477aSFande Kong     offset = row % dof;
657bc8e477aSFande Kong     row /= dof;
6584a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
6594a56b808SFande Kong     pj   = pd->j + pd->i[row];
6604a56b808SFande Kong     pa   = pd->a + pd->i[row];
661*48a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
6629566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6634a56b808SFande Kong   }
6644a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6654a56b808SFande Kong     ra     = aa[j];
6664a56b808SFande Kong     row    = aj[j];
667bc8e477aSFande Kong     offset = row % dof;
668bc8e477aSFande Kong     row /= dof;
6694a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6704a56b808SFande Kong     pj   = po->j + po->i[row];
6714a56b808SFande Kong     pa   = po->a + po->i[row];
672*48a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
6739566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6744a56b808SFande Kong   }
6754a56b808SFande Kong 
6764a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6774a56b808SFande Kong   if (ao) {
6784a56b808SFande Kong     ai  = ao->i;
6794a56b808SFande Kong     pi  = p_oth->i;
6804a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6814a56b808SFande Kong     aj  = ao->j + ai[i];
6824a56b808SFande Kong     aa  = ao->a + ai[i];
6834a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6844a56b808SFande Kong       row       = aj[j];
685bc8e477aSFande Kong       offset    = a->garray[row] % dof;
686bc8e477aSFande Kong       row       = map[row];
6874a56b808SFande Kong       ra        = aa[j];
6884a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6894a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6904a56b808SFande Kong       pa        = p_oth->a + pi[row];
691*48a46eb9SPierre Jolivet       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
6929566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * pnz));
6934a56b808SFande Kong     }
6944a56b808SFande Kong   } /* end if (ao) */
695bb5ddf68SFande Kong 
6964a56b808SFande Kong   PetscFunctionReturn(0);
6974a56b808SFande Kong }
6984a56b808SFande Kong 
699bc8e477aSFande Kong PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
7005c65b9ecSFande Kong 
7019371c9d4SSatish Balay PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C) {
7024a56b808SFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
703bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
7046718818eSStefano Zampini   Mat_APMPI      *ptap;
7054a56b808SFande Kong   PetscHMapIV     hmap;
7064a56b808SFande Kong   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
7074a56b808SFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
708bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
709bc8e477aSFande Kong   const PetscInt *mappingindices;
710bc8e477aSFande Kong   IS              map;
7114a56b808SFande Kong 
7124a56b808SFande Kong   PetscFunctionBegin;
7136718818eSStefano Zampini   MatCheckProduct(C, 4);
7146718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
71528b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
71628b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
7174a56b808SFande Kong 
7189566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
7194a56b808SFande Kong 
7204a56b808SFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
7214a56b808SFande Kong   /*-----------------------------------------------------*/
7224a56b808SFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
7234a56b808SFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
7249566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
7254a56b808SFande Kong   }
7269566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
7274a56b808SFande Kong 
7289566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
729bc8e477aSFande Kong   pon *= dof;
7309566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
7319566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
7324a56b808SFande Kong   cmaxr = 0;
7339371c9d4SSatish Balay   for (i = 0; i < pon; i++) { cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); }
7349566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
7359566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
7369566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap, cmaxr));
7379566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
7384a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
7399566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
740bc8e477aSFande Kong     offset = i % dof;
741bc8e477aSFande Kong     ii     = i / dof;
742bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
7434a56b808SFande Kong     if (!nzi) continue;
7449566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
7454a56b808SFande Kong     voff = 0;
7469566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
7474a56b808SFande Kong     if (!voff) continue;
7484a56b808SFande Kong 
7494a56b808SFande Kong     /* Form C(ii, :) */
750bc8e477aSFande Kong     poj = po->j + po->i[ii];
751bc8e477aSFande Kong     poa = po->a + po->i[ii];
7524a56b808SFande Kong     for (j = 0; j < nzi; j++) {
753bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
754bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
755bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
7564a56b808SFande Kong       for (jj = 0; jj < voff; jj++) {
7574a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
7584a56b808SFande Kong         /*If the row is empty */
759bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
7604a56b808SFande Kong           c_rmtjj[jj] = apindices[jj];
7614a56b808SFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
762bc8e477aSFande Kong           c_rmtc[pocol]++;
7634a56b808SFande Kong         } else {
7649566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
7654a56b808SFande Kong           if (loc >= 0) { /* hit */
7664a56b808SFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
7679566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
7684a56b808SFande Kong           } else { /* new element */
7694a56b808SFande Kong             loc = -(loc + 1);
7704a56b808SFande Kong             /* Move data backward */
771bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
7724a56b808SFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
7734a56b808SFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
7744a56b808SFande Kong             } /* End kk */
7754a56b808SFande Kong             c_rmtjj[loc] = apindices[jj];
7764a56b808SFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
777bc8e477aSFande Kong             c_rmtc[pocol]++;
7784a56b808SFande Kong           }
7794a56b808SFande Kong         }
7809566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(voff));
7814a56b808SFande Kong       } /* End jj */
7824a56b808SFande Kong     }   /* End j */
7834a56b808SFande Kong   }     /* End i */
7844a56b808SFande Kong 
7859566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
7869566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
7874a56b808SFande Kong 
7889566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
789bc8e477aSFande Kong   pn *= dof;
7909566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
7914a56b808SFande Kong 
7929566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
7939566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
7949566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
795bc8e477aSFande Kong   pcstart = pcstart * dof;
796bc8e477aSFande Kong   pcend   = pcend * dof;
7974a56b808SFande Kong   cd      = (Mat_SeqAIJ *)(c->A)->data;
7984a56b808SFande Kong   co      = (Mat_SeqAIJ *)(c->B)->data;
7994a56b808SFande Kong 
8004a56b808SFande Kong   cmaxr = 0;
8019371c9d4SSatish Balay   for (i = 0; i < pn; i++) { cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); }
8029566063dSJacob Faibussowitsch   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
8039566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
8049566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap, cmaxr));
8054a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
8069566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
807bc8e477aSFande Kong     offset = i % dof;
808bc8e477aSFande Kong     ii     = i / dof;
809bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
8104a56b808SFande Kong     if (!nzi) continue;
8119566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
8124a56b808SFande Kong     voff = 0;
8139566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
8144a56b808SFande Kong     if (!voff) continue;
8154a56b808SFande Kong     /* Form C(ii, :) */
816bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
817bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
8184a56b808SFande Kong     for (j = 0; j < nzi; j++) {
819bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
8209371c9d4SSatish Balay       for (jj = 0; jj < voff; jj++) { apvaluestmp[jj] = apvalues[jj] * pda[j]; }
8219566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
8229566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
8234a56b808SFande Kong     }
8244a56b808SFande Kong   }
8259566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
8269566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
8279566063dSJacob Faibussowitsch   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
8289566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
8299566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
8309566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
8319566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
832bc8e477aSFande Kong 
833bc8e477aSFande Kong   /* Add contributions from remote */
834bc8e477aSFande Kong   for (i = 0; i < pn; i++) {
835bc8e477aSFande Kong     row = i + pcstart;
8369566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
837bc8e477aSFande Kong   }
8389566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
839bc8e477aSFande Kong 
8409566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
8419566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
842bc8e477aSFande Kong 
843bc8e477aSFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
844bc8e477aSFande Kong   PetscFunctionReturn(0);
845bc8e477aSFande Kong }
846bc8e477aSFande Kong 
8479371c9d4SSatish Balay PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C) {
848bc8e477aSFande Kong   PetscFunctionBegin;
84934bcad68SFande Kong 
8509566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
851bc8e477aSFande Kong   PetscFunctionReturn(0);
852bc8e477aSFande Kong }
853bc8e477aSFande Kong 
8549371c9d4SSatish Balay PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C) {
855bc8e477aSFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
856bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
8576718818eSStefano Zampini   Mat_APMPI      *ptap;
858bc8e477aSFande Kong   PetscHMapIV     hmap;
859bc8e477aSFande Kong   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
860bc8e477aSFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
861bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
862bc8e477aSFande Kong   const PetscInt *mappingindices;
863bc8e477aSFande Kong   IS              map;
864bc8e477aSFande Kong 
865bc8e477aSFande Kong   PetscFunctionBegin;
8666718818eSStefano Zampini   MatCheckProduct(C, 4);
8676718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
86828b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
86928b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
870bc8e477aSFande Kong 
8719566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
872bc8e477aSFande Kong 
873bc8e477aSFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
874bc8e477aSFande Kong   /*-----------------------------------------------------*/
875bc8e477aSFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
876bc8e477aSFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
8779566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
878bc8e477aSFande Kong   }
8799566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
8809566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
881bc8e477aSFande Kong   pon *= dof;
8829566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
883bc8e477aSFande Kong   pn *= dof;
884bc8e477aSFande Kong 
8859566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
8869566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
8879566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
888bc8e477aSFande Kong   pcstart *= dof;
889bc8e477aSFande Kong   pcend *= dof;
890bc8e477aSFande Kong   cmaxr = 0;
8919371c9d4SSatish Balay   for (i = 0; i < pon; i++) { cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); }
892bc8e477aSFande Kong   cd = (Mat_SeqAIJ *)(c->A)->data;
893bc8e477aSFande Kong   co = (Mat_SeqAIJ *)(c->B)->data;
8949371c9d4SSatish Balay   for (i = 0; i < pn; i++) { cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); }
8959566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
8969566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVCreate(&hmap));
8979566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVResize(hmap, cmaxr));
8989566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
899bc8e477aSFande Kong   for (i = 0; i < am && (pon || pn); i++) {
9009566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
901bc8e477aSFande Kong     offset = i % dof;
902bc8e477aSFande Kong     ii     = i / dof;
903bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
904bc8e477aSFande Kong     dnzi   = pd->i[ii + 1] - pd->i[ii];
905bc8e477aSFande Kong     if (!nzi && !dnzi) continue;
9069566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
907bc8e477aSFande Kong     voff = 0;
9089566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
909bc8e477aSFande Kong     if (!voff) continue;
910bc8e477aSFande Kong 
911bc8e477aSFande Kong     /* Form remote C(ii, :) */
912bc8e477aSFande Kong     poj = po->j + po->i[ii];
913bc8e477aSFande Kong     poa = po->a + po->i[ii];
914bc8e477aSFande Kong     for (j = 0; j < nzi; j++) {
915bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
916bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
917bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
918bc8e477aSFande Kong       for (jj = 0; jj < voff; jj++) {
919bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
920bc8e477aSFande Kong         /*If the row is empty */
921bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
922bc8e477aSFande Kong           c_rmtjj[jj] = apindices[jj];
923bc8e477aSFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
924bc8e477aSFande Kong           c_rmtc[pocol]++;
925bc8e477aSFande Kong         } else {
9269566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
927bc8e477aSFande Kong           if (loc >= 0) { /* hit */
928bc8e477aSFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
9299566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
930bc8e477aSFande Kong           } else { /* new element */
931bc8e477aSFande Kong             loc = -(loc + 1);
932bc8e477aSFande Kong             /* Move data backward */
933bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
934bc8e477aSFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
935bc8e477aSFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
936bc8e477aSFande Kong             } /* End kk */
937bc8e477aSFande Kong             c_rmtjj[loc] = apindices[jj];
938bc8e477aSFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
939bc8e477aSFande Kong             c_rmtc[pocol]++;
940bc8e477aSFande Kong           }
941bc8e477aSFande Kong         }
942bc8e477aSFande Kong       } /* End jj */
9439566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
944bc8e477aSFande Kong     } /* End j */
945bc8e477aSFande Kong 
946bc8e477aSFande Kong     /* Form local C(ii, :) */
947bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
948bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
949bc8e477aSFande Kong     for (j = 0; j < dnzi; j++) {
950bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
9519371c9d4SSatish Balay       for (jj = 0; jj < voff; jj++) { apvaluestmp[jj] = apvalues[jj] * pda[j]; } /* End kk */
9529566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
9539566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
954bc8e477aSFande Kong     } /* End j */
955bc8e477aSFande Kong   }   /* End i */
956bc8e477aSFande Kong 
9579566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
9589566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
9599566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
9609566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
961bc8e477aSFande Kong 
9629566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9639566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9649566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9659566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9669566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
9674a56b808SFande Kong 
9684a56b808SFande Kong   /* Add contributions from remote */
9694a56b808SFande Kong   for (i = 0; i < pn; i++) {
9704a56b808SFande Kong     row = i + pcstart;
9719566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
9724a56b808SFande Kong   }
9739566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
9744a56b808SFande Kong 
9759566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
9769566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
9774a56b808SFande Kong 
9784a56b808SFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
9794a56b808SFande Kong   PetscFunctionReturn(0);
9804a56b808SFande Kong }
9814a56b808SFande Kong 
9829371c9d4SSatish Balay PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C) {
9834a56b808SFande Kong   PetscFunctionBegin;
98434bcad68SFande Kong 
9859566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
9864a56b808SFande Kong   PetscFunctionReturn(0);
9874a56b808SFande Kong }
9884a56b808SFande Kong 
9896718818eSStefano Zampini /* TODO: move algorithm selection to MatProductSetFromOptions */
9909371c9d4SSatish Balay PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) {
9914a56b808SFande Kong   Mat_APMPI      *ptap;
9926718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
9934a56b808SFande Kong   MPI_Comm        comm;
994bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
9954a56b808SFande Kong   MatType         mtype;
9964a56b808SFande Kong   PetscSF         sf;
9974a56b808SFande Kong   PetscSFNode    *iremote;
9984a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
9994a56b808SFande Kong   const PetscInt *rootdegrees;
10004a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto;
10014a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1002131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1003bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1004131c27b5Sprj-   PetscMPIInt     owner;
1005bc8e477aSFande Kong   const PetscInt *mappingindices;
10064a56b808SFande Kong   PetscBool       flg;
10074a56b808SFande Kong   const char     *algTypes[2] = {"overlapping", "merged"};
1008bc8e477aSFande Kong   IS              map;
10094a56b808SFande Kong 
10104a56b808SFande Kong   PetscFunctionBegin;
10116718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
101228b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
10139566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
101434bcad68SFande Kong 
10154a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
10169566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1017bc8e477aSFande Kong   pn *= dof;
10189566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
10199566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
10209566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
10214a56b808SFande Kong 
10229566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
10234a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
10244a56b808SFande Kong   ptap->algType = 2;
10254a56b808SFande Kong 
10264a56b808SFande Kong   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
10279566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
10289566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
10294a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
10309566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1031bc8e477aSFande Kong   pon *= dof;
10324a56b808SFande Kong   /* offsets */
10339566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
10344a56b808SFande Kong   /* The number of columns we will send to remote ranks */
10359566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
10369566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
1037*48a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
10389566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
10399566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
10404a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
10419566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
10424a56b808SFande Kong 
10439566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
10444a56b808SFande Kong   ptap->c_rmti[0] = 0;
10454a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
10464a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
10474a56b808SFande Kong     /* Form one row of AP */
10489566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
1049bc8e477aSFande Kong     offset = i % dof;
1050bc8e477aSFande Kong     ii     = i / dof;
10514a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1052bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
10534a56b808SFande Kong     if (!nzi) continue;
10544a56b808SFande Kong 
10559566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
10569566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
10574a56b808SFande Kong     /* If AP is empty, just continue */
10584a56b808SFande Kong     if (!htsize) continue;
10594a56b808SFande Kong     /* Form C(ii, :) */
1060bc8e477aSFande Kong     poj = po->j + po->i[ii];
1061*48a46eb9SPierre Jolivet     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
10624a56b808SFande Kong   }
10634a56b808SFande Kong 
10644a56b808SFande Kong   for (i = 0; i < pon; i++) {
10659566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
10664a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
10674a56b808SFande Kong     c_rmtc[i]           = htsize;
10684a56b808SFande Kong   }
10694a56b808SFande Kong 
10709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
10714a56b808SFande Kong 
10724a56b808SFande Kong   for (i = 0; i < pon; i++) {
10734a56b808SFande Kong     off = 0;
10749566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
10759566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
10764a56b808SFande Kong   }
10779566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
10784a56b808SFande Kong 
10799566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
10804a56b808SFande Kong   for (i = 0; i < pon; i++) {
10819371c9d4SSatish Balay     owner  = 0;
10829371c9d4SSatish Balay     lidx   = 0;
1083bc8e477aSFande Kong     offset = i % dof;
1084bc8e477aSFande Kong     ii     = i / dof;
10859566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1086bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
10874a56b808SFande Kong     iremote[i].rank  = owner;
10884a56b808SFande Kong   }
10894a56b808SFande Kong 
10909566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
10919566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
10924a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
10939566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
10949566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
10959566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
10964a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
10979566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
10989566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
10994a56b808SFande Kong   rootspacesize = 0;
11009371c9d4SSatish Balay   for (i = 0; i < pn; i++) { rootspacesize += rootdegrees[i]; }
11019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
11029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
11034a56b808SFande Kong   /* Get information from leaves
11044a56b808SFande Kong    * Number of columns other people contribute to my rows
11054a56b808SFande Kong    * */
11069566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
11079566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
11089566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
11099566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
11104a56b808SFande Kong   /* The number of columns is received for each row */
11114a56b808SFande Kong   ptap->c_othi[0]     = 0;
11124a56b808SFande Kong   rootspacesize       = 0;
11134a56b808SFande Kong   rootspaceoffsets[0] = 0;
11144a56b808SFande Kong   for (i = 0; i < pn; i++) {
11154a56b808SFande Kong     rcvncols = 0;
11164a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
11174a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
11184a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
11194a56b808SFande Kong       rootspacesize++;
11204a56b808SFande Kong     }
11214a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
11224a56b808SFande Kong   }
11239566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
11244a56b808SFande Kong 
11259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
11269566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11279566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11289566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
11299566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
11304a56b808SFande Kong 
11319566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
11324a56b808SFande Kong   nleaves = 0;
11334a56b808SFande Kong   for (i = 0; i < pon; i++) {
1134bc8e477aSFande Kong     owner = 0;
1135bc8e477aSFande Kong     ii    = i / dof;
11369566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
11374a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
11384a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
11394a56b808SFande Kong       iremote[nleaves].rank    = owner;
11404a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
11414a56b808SFande Kong     }
11424a56b808SFande Kong   }
11439566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
11449566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
11454a56b808SFande Kong 
11469566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
11479566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
11489566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
11494a56b808SFande Kong   /* One to one map */
11509566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
11514a56b808SFande Kong 
11529566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
11539566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
11549566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1155bc8e477aSFande Kong   pcstart *= dof;
1156bc8e477aSFande Kong   pcend *= dof;
11579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
11584a56b808SFande Kong   for (i = 0; i < pn; i++) {
11599566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hta[i]));
11609566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
11614a56b808SFande Kong   }
11624a56b808SFande Kong   /* Work on local part */
11634a56b808SFande Kong   /* 4) Pass 1: Estimate memory for C_loc */
11644a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
11659566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
11669566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1167bc8e477aSFande Kong     offset = i % dof;
1168bc8e477aSFande Kong     ii     = i / dof;
1169bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
11704a56b808SFande Kong     if (!nzi) continue;
11714a56b808SFande Kong 
11729566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
11739566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
11749566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
11754a56b808SFande Kong     if (!(htsize + htosize)) continue;
11764a56b808SFande Kong     /* Form C(ii, :) */
1177bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
11784a56b808SFande Kong     for (j = 0; j < nzi; j++) {
11799566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
11809566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
11814a56b808SFande Kong     }
11824a56b808SFande Kong   }
11834a56b808SFande Kong 
11849566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1185bc8e477aSFande Kong 
11869566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
11879566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
11884a56b808SFande Kong 
11894a56b808SFande Kong   /* Get remote data */
11909566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
11919566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
11924a56b808SFande Kong 
11934a56b808SFande Kong   for (i = 0; i < pn; i++) {
11944a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
11954a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
11964a56b808SFande Kong     for (j = 0; j < nzi; j++) {
11974a56b808SFande Kong       col = rdj[j];
11984a56b808SFande Kong       /* diag part */
11994a56b808SFande Kong       if (col >= pcstart && col < pcend) {
12009566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hta[i], col));
12014a56b808SFande Kong       } else { /* off diag */
12029566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
12034a56b808SFande Kong       }
12044a56b808SFande Kong     }
12059566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
12064a56b808SFande Kong     dnz[i] = htsize;
12079566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
12089566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
12094a56b808SFande Kong     onz[i] = htsize;
12109566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
12114a56b808SFande Kong   }
12124a56b808SFande Kong 
12139566063dSJacob Faibussowitsch   PetscCall(PetscFree2(hta, hto));
12149566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
12154a56b808SFande Kong 
12164a56b808SFande Kong   /* local sizes and preallocation */
12179566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12189566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
12199566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
12209566063dSJacob Faibussowitsch   PetscCall(MatSetUp(Cmpi));
12219566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
12224a56b808SFande Kong 
12234a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
12246718818eSStefano Zampini   Cmpi->product->data    = ptap;
12256718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
12266718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
12274a56b808SFande Kong 
12284a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
12294a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
12304a56b808SFande Kong   /* pick an algorithm */
1231d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
12324a56b808SFande Kong   alg = 0;
12339566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1234d0609cedSBarry Smith   PetscOptionsEnd();
12354a56b808SFande Kong   switch (alg) {
12369371c9d4SSatish Balay   case 0: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; break;
12379371c9d4SSatish Balay   case 1: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; break;
12389371c9d4SSatish Balay   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
12394a56b808SFande Kong   }
12404a56b808SFande Kong   PetscFunctionReturn(0);
12414a56b808SFande Kong }
12424a56b808SFande Kong 
12439371c9d4SSatish Balay PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C) {
1244bc8e477aSFande Kong   PetscFunctionBegin;
12459566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1246bc8e477aSFande Kong   PetscFunctionReturn(0);
1247bc8e477aSFande Kong }
1248bc8e477aSFande Kong 
12499371c9d4SSatish Balay PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) {
12504a56b808SFande Kong   Mat_APMPI      *ptap;
12516718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
12524a56b808SFande Kong   MPI_Comm        comm;
1253bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
12544a56b808SFande Kong   MatType         mtype;
12554a56b808SFande Kong   PetscSF         sf;
12564a56b808SFande Kong   PetscSFNode    *iremote;
12574a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
12584a56b808SFande Kong   const PetscInt *rootdegrees;
12594a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto, *htd;
12604a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1261131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1262bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1263131c27b5Sprj-   PetscMPIInt     owner;
12644a56b808SFande Kong   PetscBool       flg;
12654a56b808SFande Kong   const char     *algTypes[2] = {"merged", "overlapping"};
1266bc8e477aSFande Kong   const PetscInt *mappingindices;
1267bc8e477aSFande Kong   IS              map;
12684a56b808SFande Kong 
12694a56b808SFande Kong   PetscFunctionBegin;
12706718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
127128b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
12729566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
127334bcad68SFande Kong 
12744a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
12759566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1276bc8e477aSFande Kong   pn *= dof;
12779566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
12789566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
12799566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12804a56b808SFande Kong 
12819566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
12824a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
12834a56b808SFande Kong   ptap->algType = 3;
12844a56b808SFande Kong 
12854a56b808SFande Kong   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
12869566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
12879566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
12884a56b808SFande Kong 
12894a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
12909566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1291bc8e477aSFande Kong   pon *= dof;
12924a56b808SFande Kong   /* offsets */
12939566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
12944a56b808SFande Kong   /* The number of columns we will send to remote ranks */
12959566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
12969566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
1297*48a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
12989566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
12999566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
13004a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
13019566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
13029566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
13039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
13044a56b808SFande Kong   for (i = 0; i < pn; i++) {
13059566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&htd[i]));
13069566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
13074a56b808SFande Kong   }
1308bc8e477aSFande Kong 
13099566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
13104a56b808SFande Kong   ptap->c_rmti[0] = 0;
13114a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
13124a56b808SFande Kong   for (i = 0; i < am && (pon || pn); i++) {
13134a56b808SFande Kong     /* Form one row of AP */
13149566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
13159566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1316bc8e477aSFande Kong     offset = i % dof;
1317bc8e477aSFande Kong     ii     = i / dof;
13184a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1319bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
1320bc8e477aSFande Kong     dnzi   = pd->i[ii + 1] - pd->i[ii];
13214a56b808SFande Kong     if (!nzi && !dnzi) continue;
13224a56b808SFande Kong 
13239566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
13249566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
13259566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
13264a56b808SFande Kong     /* If AP is empty, just continue */
13274a56b808SFande Kong     if (!(htsize + htosize)) continue;
13284a56b808SFande Kong 
13294a56b808SFande Kong     /* Form remote C(ii, :) */
1330bc8e477aSFande Kong     poj = po->j + po->i[ii];
13314a56b808SFande Kong     for (j = 0; j < nzi; j++) {
13329566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
13339566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
13344a56b808SFande Kong     }
13354a56b808SFande Kong 
13364a56b808SFande Kong     /* Form local C(ii, :) */
1337bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
13384a56b808SFande Kong     for (j = 0; j < dnzi; j++) {
13399566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
13409566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
13414a56b808SFande Kong     }
13424a56b808SFande Kong   }
13434a56b808SFande Kong 
13449566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1345bc8e477aSFande Kong 
13469566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
13479566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
13484a56b808SFande Kong 
13494a56b808SFande Kong   for (i = 0; i < pon; i++) {
13509566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
13514a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
13524a56b808SFande Kong     c_rmtc[i]           = htsize;
13534a56b808SFande Kong   }
13544a56b808SFande Kong 
13559566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
13564a56b808SFande Kong 
13574a56b808SFande Kong   for (i = 0; i < pon; i++) {
13584a56b808SFande Kong     off = 0;
13599566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
13609566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
13614a56b808SFande Kong   }
13629566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
13634a56b808SFande Kong 
13649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
13654a56b808SFande Kong   for (i = 0; i < pon; i++) {
13669371c9d4SSatish Balay     owner  = 0;
13679371c9d4SSatish Balay     lidx   = 0;
1368bc8e477aSFande Kong     offset = i % dof;
1369bc8e477aSFande Kong     ii     = i / dof;
13709566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1371bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
13724a56b808SFande Kong     iremote[i].rank  = owner;
13734a56b808SFande Kong   }
13744a56b808SFande Kong 
13759566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
13769566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
13774a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
13789566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
13799566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
13809566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
13814a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
13829566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
13839566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
13844a56b808SFande Kong   rootspacesize = 0;
13859371c9d4SSatish Balay   for (i = 0; i < pn; i++) { rootspacesize += rootdegrees[i]; }
13869566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
13879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
13884a56b808SFande Kong   /* Get information from leaves
13894a56b808SFande Kong    * Number of columns other people contribute to my rows
13904a56b808SFande Kong    * */
13919566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
13929566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
13939566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
13949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
13954a56b808SFande Kong   /* The number of columns is received for each row */
13964a56b808SFande Kong   ptap->c_othi[0]     = 0;
13974a56b808SFande Kong   rootspacesize       = 0;
13984a56b808SFande Kong   rootspaceoffsets[0] = 0;
13994a56b808SFande Kong   for (i = 0; i < pn; i++) {
14004a56b808SFande Kong     rcvncols = 0;
14014a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
14024a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
14034a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
14044a56b808SFande Kong       rootspacesize++;
14054a56b808SFande Kong     }
14064a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
14074a56b808SFande Kong   }
14089566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
14094a56b808SFande Kong 
14109566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
14119566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14129566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14139566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
14149566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
14154a56b808SFande Kong 
14169566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
14174a56b808SFande Kong   nleaves = 0;
14184a56b808SFande Kong   for (i = 0; i < pon; i++) {
1419071fcb05SBarry Smith     owner = 0;
1420bc8e477aSFande Kong     ii    = i / dof;
14219566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
14224a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
14234a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
14244a56b808SFande Kong       iremote[nleaves].rank    = owner;
14254a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
14264a56b808SFande Kong     }
14274a56b808SFande Kong   }
14289566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
14299566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
14304a56b808SFande Kong 
14319566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
14329566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
14339566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
14344a56b808SFande Kong   /* One to one map */
14359566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14364a56b808SFande Kong   /* Get remote data */
14379566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14389566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
14399566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
14409566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1441bc8e477aSFande Kong   pcstart *= dof;
1442bc8e477aSFande Kong   pcend *= dof;
14434a56b808SFande Kong   for (i = 0; i < pn; i++) {
14444a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
14454a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
14464a56b808SFande Kong     for (j = 0; j < nzi; j++) {
14474a56b808SFande Kong       col = rdj[j];
14484a56b808SFande Kong       /* diag part */
14494a56b808SFande Kong       if (col >= pcstart && col < pcend) {
14509566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(htd[i], col));
14514a56b808SFande Kong       } else { /* off diag */
14529566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
14534a56b808SFande Kong       }
14544a56b808SFande Kong     }
14559566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
14564a56b808SFande Kong     dnz[i] = htsize;
14579566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&htd[i]));
14589566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
14594a56b808SFande Kong     onz[i] = htsize;
14609566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
14614a56b808SFande Kong   }
14624a56b808SFande Kong 
14639566063dSJacob Faibussowitsch   PetscCall(PetscFree2(htd, hto));
14649566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
14654a56b808SFande Kong 
14664a56b808SFande Kong   /* local sizes and preallocation */
14679566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
14689566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
14699566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
14709566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
14714a56b808SFande Kong 
14724a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
14736718818eSStefano Zampini   Cmpi->product->data    = ptap;
14746718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
14756718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
14764a56b808SFande Kong 
14774a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
14784a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
14794a56b808SFande Kong   /* pick an algorithm */
1480d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
14814a56b808SFande Kong   alg = 0;
14829566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1483d0609cedSBarry Smith   PetscOptionsEnd();
14844a56b808SFande Kong   switch (alg) {
14859371c9d4SSatish Balay   case 0: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; break;
14869371c9d4SSatish Balay   case 1: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; break;
14879371c9d4SSatish Balay   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
14884a56b808SFande Kong   }
14894a56b808SFande Kong   PetscFunctionReturn(0);
14904a56b808SFande Kong }
14914a56b808SFande Kong 
14929371c9d4SSatish Balay PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C) {
1493bc8e477aSFande Kong   PetscFunctionBegin;
14949566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1495bc8e477aSFande Kong   PetscFunctionReturn(0);
1496bc8e477aSFande Kong }
1497bc8e477aSFande Kong 
14989371c9d4SSatish Balay PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi) {
14993cdca5ebSHong Zhang   Mat_APMPI               *ptap;
15006718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1501e953e47cSHong Zhang   MPI_Comm                 comm;
1502e953e47cSHong Zhang   PetscMPIInt              size, rank;
1503e953e47cSHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1504e953e47cSHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1505e953e47cSHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
1506e953e47cSHong Zhang   PetscBT                  lnkbt;
1507ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1508ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
1509e953e47cSHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1510e953e47cSHong Zhang   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1511e953e47cSHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1512e953e47cSHong Zhang   MPI_Request             *swaits, *rwaits;
1513e953e47cSHong Zhang   MPI_Status              *sstatus, rstatus;
1514e953e47cSHong Zhang   PetscLayout              rowmap;
1515e953e47cSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1516e953e47cSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
1517e953e47cSHong Zhang   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1518ec07b8f8SHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1519e953e47cSHong Zhang   PetscScalar             *apv;
1520e953e47cSHong Zhang   PetscTable               ta;
1521b92f168fSBarry Smith   MatType                  mtype;
1522e83e5f86SFande Kong   const char              *prefix;
1523e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
1524e953e47cSHong Zhang   PetscReal apfill;
1525e953e47cSHong Zhang #endif
1526e953e47cSHong Zhang 
1527e953e47cSHong Zhang   PetscFunctionBegin;
15286718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
152928b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
15309566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
15319566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
15329566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1533e953e47cSHong Zhang 
153452f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
1535ec07b8f8SHong Zhang 
1536e953e47cSHong Zhang   /* create symbolic parallel matrix Cmpi */
15379566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
15389566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
1539e953e47cSHong Zhang 
1540e953e47cSHong Zhang   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1541e953e47cSHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1542e953e47cSHong Zhang 
15433cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
15449566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
154515a3b8e2SHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
1546a4ffb656SHong Zhang   ptap->algType = 1;
154715a3b8e2SHong Zhang 
154815a3b8e2SHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
15499566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
155015a3b8e2SHong Zhang   /* get P_loc by taking all local rows of P */
15519566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
155215a3b8e2SHong Zhang 
155367a12041SHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
155467a12041SHong Zhang   /* --------------------------------- */
15559566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
15569566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
155715a3b8e2SHong Zhang 
155867a12041SHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
155967a12041SHong Zhang   /* ----------------------------------------------------------------- */
156067a12041SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
156152f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1562445158ffSHong Zhang 
15639a6dcab7SHong Zhang   /* create and initialize a linked list */
15649566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */
15654b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
15664b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
15679566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1568d0e9a2f7SHong Zhang   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
156978d30b94SHong Zhang 
15709566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1571445158ffSHong Zhang 
15728cb82516SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1573ec07b8f8SHong Zhang   if (ao) {
15749566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1575ec07b8f8SHong Zhang   } else {
15769566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1577ec07b8f8SHong Zhang   }
1578445158ffSHong Zhang   current_space = free_space;
157967a12041SHong Zhang   nspacedouble  = 0;
158067a12041SHong Zhang 
15819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
158267a12041SHong Zhang   api[0] = 0;
1583445158ffSHong Zhang   for (i = 0; i < am; i++) {
158467a12041SHong Zhang     /* diagonal portion: Ad[i,:]*P */
15859371c9d4SSatish Balay     ai  = ad->i;
15869371c9d4SSatish Balay     pi  = p_loc->i;
158767a12041SHong Zhang     nzi = ai[i + 1] - ai[i];
158867a12041SHong Zhang     aj  = ad->j + ai[i];
1589445158ffSHong Zhang     for (j = 0; j < nzi; j++) {
1590445158ffSHong Zhang       row  = aj[j];
159167a12041SHong Zhang       pnz  = pi[row + 1] - pi[row];
159267a12041SHong Zhang       Jptr = p_loc->j + pi[row];
1593445158ffSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
15949566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1595445158ffSHong Zhang     }
159667a12041SHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
1597ec07b8f8SHong Zhang     if (ao) {
15989371c9d4SSatish Balay       ai  = ao->i;
15999371c9d4SSatish Balay       pi  = p_oth->i;
160067a12041SHong Zhang       nzi = ai[i + 1] - ai[i];
160167a12041SHong Zhang       aj  = ao->j + ai[i];
1602445158ffSHong Zhang       for (j = 0; j < nzi; j++) {
1603445158ffSHong Zhang         row  = aj[j];
160467a12041SHong Zhang         pnz  = pi[row + 1] - pi[row];
160567a12041SHong Zhang         Jptr = p_oth->j + pi[row];
16069566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1607445158ffSHong Zhang       }
1608ec07b8f8SHong Zhang     }
1609445158ffSHong Zhang     apnz       = lnk[0];
1610445158ffSHong Zhang     api[i + 1] = api[i] + apnz;
1611445158ffSHong Zhang     if (ap_rmax < apnz) ap_rmax = apnz;
1612445158ffSHong Zhang 
1613445158ffSHong Zhang     /* if free space is not available, double the total space in the list */
1614445158ffSHong Zhang     if (current_space->local_remaining < apnz) {
16159566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1616445158ffSHong Zhang       nspacedouble++;
1617445158ffSHong Zhang     }
1618445158ffSHong Zhang 
1619445158ffSHong Zhang     /* Copy data into free space, then initialize lnk */
16209566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1621445158ffSHong Zhang 
1622445158ffSHong Zhang     current_space->array += apnz;
1623445158ffSHong Zhang     current_space->local_used += apnz;
1624445158ffSHong Zhang     current_space->local_remaining -= apnz;
1625445158ffSHong Zhang   }
1626681d504bSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
1627445158ffSHong Zhang   /* destroy list of free space and other temporary array(s) */
16289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
16299566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
16309566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
16319a6dcab7SHong Zhang 
1632aa690a28SHong Zhang   /* Create AP_loc for reuse */
16339566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
16349566063dSJacob Faibussowitsch   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1635aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
1636ec07b8f8SHong Zhang   if (ao) {
1637aa690a28SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1638ec07b8f8SHong Zhang   } else {
1639ec07b8f8SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1640ec07b8f8SHong Zhang   }
1641aa690a28SHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
1642aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
1643aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
1644aa690a28SHong Zhang 
1645aa690a28SHong Zhang   if (api[am]) {
16469566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
16479566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
1648aa690a28SHong Zhang   } else {
16499566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
1650aa690a28SHong Zhang   }
1651aa690a28SHong Zhang #endif
1652aa690a28SHong Zhang 
1653681d504bSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1654681d504bSHong Zhang   /* ------------------------------------ */
16559566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
16569566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
16579566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
16589566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
16599566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
16609566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
16619566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
16629566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
16639566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
16649566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
16659566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
16669566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
166715a3b8e2SHong Zhang 
166867a12041SHong Zhang   /* (3) send coj of C_oth to other processors  */
166967a12041SHong Zhang   /* ------------------------------------------ */
167015a3b8e2SHong Zhang   /* determine row ownership */
16719566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
1672445158ffSHong Zhang   rowmap->n  = pn;
1673445158ffSHong Zhang   rowmap->bs = 1;
16749566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
1675445158ffSHong Zhang   owners = rowmap->range;
167615a3b8e2SHong Zhang 
167715a3b8e2SHong Zhang   /* determine the number of messages to send, their lengths */
16789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
16799566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
16809566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
168115a3b8e2SHong Zhang 
168267a12041SHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
16839371c9d4SSatish Balay   coi   = c_oth->i;
16849371c9d4SSatish Balay   coj   = c_oth->j;
168567a12041SHong Zhang   con   = ptap->C_oth->rmap->n;
168615a3b8e2SHong Zhang   proc  = 0;
168767a12041SHong Zhang   for (i = 0; i < con; i++) {
168815a3b8e2SHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
168915a3b8e2SHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
169015a3b8e2SHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
169115a3b8e2SHong Zhang   }
169215a3b8e2SHong Zhang 
169315a3b8e2SHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
169415a3b8e2SHong Zhang   owners_co[0] = 0;
169567a12041SHong Zhang   nsend        = 0;
169615a3b8e2SHong Zhang   for (proc = 0; proc < size; proc++) {
169715a3b8e2SHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
169815a3b8e2SHong Zhang     if (len_s[proc]) {
1699445158ffSHong Zhang       nsend++;
170015a3b8e2SHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
170115a3b8e2SHong Zhang       len += len_si[proc];
170215a3b8e2SHong Zhang     }
170315a3b8e2SHong Zhang   }
170415a3b8e2SHong Zhang 
170515a3b8e2SHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
17069566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
17079566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
170815a3b8e2SHong Zhang 
170915a3b8e2SHong Zhang   /* post the Irecv and Isend of coj */
17109566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
17119566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
17129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
171315a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
171415a3b8e2SHong Zhang     if (!len_s[proc]) continue;
171515a3b8e2SHong Zhang     i = owners_co[proc];
17169566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
171715a3b8e2SHong Zhang     k++;
171815a3b8e2SHong Zhang   }
171915a3b8e2SHong Zhang 
1720681d504bSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1721681d504bSHong Zhang   /* ---------------------------------------- */
17229566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
17239566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
17249566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
17259566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
17269566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
17279566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
17289566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
17299566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
17309566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
17319566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
17329566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
17334222ddf1SHong Zhang 
1734681d504bSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1735681d504bSHong Zhang 
1736681d504bSHong Zhang   /* receives coj are complete */
1737*48a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
17389566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
17399566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
174015a3b8e2SHong Zhang 
174178d30b94SHong Zhang   /* add received column indices into ta to update Crmax */
174278d30b94SHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
174378d30b94SHong Zhang     Jptr = buf_rj[k];
1744*48a46eb9SPierre Jolivet     for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
174578d30b94SHong Zhang   }
17469566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta, &Crmax));
17479566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
17489a6dcab7SHong Zhang 
174915a3b8e2SHong Zhang   /* (4) send and recv coi */
175015a3b8e2SHong Zhang   /*-----------------------*/
17519566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
17529566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
17539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
175415a3b8e2SHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
175515a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
175615a3b8e2SHong Zhang     if (!len_s[proc]) continue;
175715a3b8e2SHong Zhang     /* form outgoing message for i-structure:
175815a3b8e2SHong Zhang          buf_si[0]:                 nrows to be sent
175915a3b8e2SHong Zhang                [1:nrows]:           row index (global)
176015a3b8e2SHong Zhang                [nrows+1:2*nrows+1]: i-structure index
176115a3b8e2SHong Zhang     */
176215a3b8e2SHong Zhang     /*-------------------------------------------*/
176315a3b8e2SHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
176415a3b8e2SHong Zhang     buf_si_i    = buf_si + nrows + 1;
176515a3b8e2SHong Zhang     buf_si[0]   = nrows;
176615a3b8e2SHong Zhang     buf_si_i[0] = 0;
176715a3b8e2SHong Zhang     nrows       = 0;
176815a3b8e2SHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
176915a3b8e2SHong Zhang       nzi                 = coi[i + 1] - coi[i];
177015a3b8e2SHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
177115a3b8e2SHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
177215a3b8e2SHong Zhang       nrows++;
177315a3b8e2SHong Zhang     }
17749566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
177515a3b8e2SHong Zhang     k++;
177615a3b8e2SHong Zhang     buf_si += len_si[proc];
177715a3b8e2SHong Zhang   }
1778*48a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
17799566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
17809566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
178115a3b8e2SHong Zhang 
17829566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
17839566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
17849566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
17859566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
1786a4ffb656SHong Zhang 
178767a12041SHong Zhang   /* (5) compute the local portion of Cmpi      */
178867a12041SHong Zhang   /* ------------------------------------------ */
178978d30b94SHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
17909566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
179115a3b8e2SHong Zhang   current_space = free_space;
179215a3b8e2SHong Zhang 
17939566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1794445158ffSHong Zhang   for (k = 0; k < nrecv; k++) {
179515a3b8e2SHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
179615a3b8e2SHong Zhang     nrows       = *buf_ri_k[k];
179715a3b8e2SHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
1798a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
179915a3b8e2SHong Zhang   }
1800445158ffSHong Zhang 
1801d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
18029566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
180315a3b8e2SHong Zhang   for (i = 0; i < pn; i++) {
180467a12041SHong Zhang     /* add C_loc into Cmpi */
180567a12041SHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
180667a12041SHong Zhang     Jptr = c_loc->j + c_loc->i[i];
18079566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
180815a3b8e2SHong Zhang 
180915a3b8e2SHong Zhang     /* add received col data into lnk */
1810445158ffSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
181115a3b8e2SHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
181215a3b8e2SHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
181315a3b8e2SHong Zhang         Jptr = buf_rj[k] + *nextci[k];
18149566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
18159371c9d4SSatish Balay         nextrow[k]++;
18169371c9d4SSatish Balay         nextci[k]++;
181715a3b8e2SHong Zhang       }
181815a3b8e2SHong Zhang     }
1819d0e9a2f7SHong Zhang     nzi = lnk[0];
18208cb82516SHong Zhang 
182115a3b8e2SHong Zhang     /* copy data into free space, then initialize lnk */
18229566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
18239566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
182415a3b8e2SHong Zhang   }
18259566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
18269566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
18279566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
182880bb4639SHong Zhang 
1829ae5f0867Sstefano_zampini   /* local sizes and preallocation */
18309566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1831ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
18329566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
18339566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1834ac94c67aSTristan Konolige   }
18359566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1836d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
183715a3b8e2SHong Zhang 
1838445158ffSHong Zhang   /* members in merge */
18399566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
18409566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
18419566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
18429566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
18439566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
18449566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
18459566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
184615a3b8e2SHong Zhang 
18479566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pN, &ptap->apa));
18482259aa2eSHong Zhang 
18496718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
18506718818eSStefano Zampini   Cmpi->product->data    = ptap;
18516718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
18526718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
18536718818eSStefano Zampini 
18541a47ec54SHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
18551a47ec54SHong Zhang   Cmpi->assembled = PETSC_FALSE;
18560d3441aeSHong Zhang   PetscFunctionReturn(0);
18570d3441aeSHong Zhang }
18580d3441aeSHong Zhang 
18599371c9d4SSatish Balay PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C) {
18606718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
18610d3441aeSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
18622dd9e643SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
18636718818eSStefano Zampini   Mat_APMPI         *ptap;
18649ce11a7cSHong Zhang   Mat                AP_loc, C_loc, C_oth;
18650d3441aeSHong Zhang   PetscInt           i, rstart, rend, cm, ncols, row;
18668cb82516SHong Zhang   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
18678cb82516SHong Zhang   PetscScalar       *apa;
18680d3441aeSHong Zhang   const PetscInt    *cols;
18690d3441aeSHong Zhang   const PetscScalar *vals;
18700d3441aeSHong Zhang 
18710d3441aeSHong Zhang   PetscFunctionBegin;
18726718818eSStefano Zampini   MatCheckProduct(C, 3);
18736718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
187428b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
187528b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1876a9899c97SHong Zhang 
18779566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
1878e497d3c8SHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
1879748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
18809566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
18819566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1882748c7196SHong Zhang   }
18830d3441aeSHong Zhang 
1884e497d3c8SHong Zhang   /* 2) get AP_loc */
18850d3441aeSHong Zhang   AP_loc = ptap->AP_loc;
18868cb82516SHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
18870d3441aeSHong Zhang 
1888e497d3c8SHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
18890d3441aeSHong Zhang   /*-----------------------------------------------------*/
1890748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1891748c7196SHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
18929566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
18939566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1894e497d3c8SHong Zhang   }
18950d3441aeSHong Zhang 
18968cb82516SHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
18978cb82516SHong Zhang   /* ---------------------------------------------- */
18980d3441aeSHong Zhang   /* get data from symbolic products */
18998cb82516SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
19009371c9d4SSatish Balay   if (ptap->P_oth) { p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; }
19018cb82516SHong Zhang   apa = ptap->apa;
1902681d504bSHong Zhang   api = ap->i;
1903681d504bSHong Zhang   apj = ap->j;
1904e497d3c8SHong Zhang   for (i = 0; i < am; i++) {
1905445158ffSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1906e497d3c8SHong Zhang     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1907e497d3c8SHong Zhang     apnz = api[i + 1] - api[i];
1908e497d3c8SHong Zhang     for (j = 0; j < apnz; j++) {
1909e497d3c8SHong Zhang       col                 = apj[j + api[i]];
1910e497d3c8SHong Zhang       ap->a[j + ap->i[i]] = apa[col];
1911e497d3c8SHong Zhang       apa[col]            = 0.0;
19120d3441aeSHong Zhang     }
19130d3441aeSHong Zhang   }
1914976452c9SRichard Tran Mills   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
19159566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
19160d3441aeSHong Zhang 
19178cb82516SHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
19189566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_loc));
19199566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_oth));
19209ce11a7cSHong Zhang   C_loc = ptap->C_loc;
19219ce11a7cSHong Zhang   C_oth = ptap->C_oth;
19220d3441aeSHong Zhang 
19230d3441aeSHong Zhang   /* add C_loc and Co to to C */
19249566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
19250d3441aeSHong Zhang 
19260d3441aeSHong Zhang   /* C_loc -> C */
19270d3441aeSHong Zhang   cm    = C_loc->rmap->N;
19289ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
19298cb82516SHong Zhang   cols  = c_seq->j;
19308cb82516SHong Zhang   vals  = c_seq->a;
1931904d1e70Sandi selinger 
1932e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1933904d1e70Sandi selinger   /* when there are no off-processor parts.  */
19341de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
19351de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
19361de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
1937904d1e70Sandi selinger   if (C->assembled) {
1938904d1e70Sandi selinger     C->was_assembled = PETSC_TRUE;
1939904d1e70Sandi selinger     C->assembled     = PETSC_FALSE;
1940904d1e70Sandi selinger   }
1941904d1e70Sandi selinger   if (C->was_assembled) {
19420d3441aeSHong Zhang     for (i = 0; i < cm; i++) {
19439ce11a7cSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
19440d3441aeSHong Zhang       row   = rstart + i;
19459566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19469371c9d4SSatish Balay       cols += ncols;
19479371c9d4SSatish Balay       vals += ncols;
19480d3441aeSHong Zhang     }
1949904d1e70Sandi selinger   } else {
19509566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1951904d1e70Sandi selinger   }
19520d3441aeSHong Zhang 
19530d3441aeSHong Zhang   /* Co -> C, off-processor part */
19549ce11a7cSHong Zhang   cm    = C_oth->rmap->N;
19559ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
19568cb82516SHong Zhang   cols  = c_seq->j;
19578cb82516SHong Zhang   vals  = c_seq->a;
19589ce11a7cSHong Zhang   for (i = 0; i < cm; i++) {
19599ce11a7cSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
19600d3441aeSHong Zhang     row   = p->garray[i];
19619566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19629371c9d4SSatish Balay     cols += ncols;
19639371c9d4SSatish Balay     vals += ncols;
19640d3441aeSHong Zhang   }
1965904d1e70Sandi selinger 
19669566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
19679566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
19680d3441aeSHong Zhang 
1969748c7196SHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
19700d3441aeSHong Zhang   PetscFunctionReturn(0);
19710d3441aeSHong Zhang }
19724222ddf1SHong Zhang 
19739371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C) {
19744222ddf1SHong Zhang   Mat_Product        *product = C->product;
19754222ddf1SHong Zhang   Mat                 A = product->A, P = product->B;
19764222ddf1SHong Zhang   MatProductAlgorithm alg  = product->alg;
19774222ddf1SHong Zhang   PetscReal           fill = product->fill;
19784222ddf1SHong Zhang   PetscBool           flg;
19794222ddf1SHong Zhang 
19804222ddf1SHong Zhang   PetscFunctionBegin;
19814222ddf1SHong Zhang   /* scalable: do R=P^T locally, then C=R*A*P */
19829566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
19834222ddf1SHong Zhang   if (flg) {
19849566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
19854222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
19864222ddf1SHong Zhang     goto next;
19874222ddf1SHong Zhang   }
19884222ddf1SHong Zhang 
19894222ddf1SHong Zhang   /* nonscalable: do R=P^T locally, then C=R*A*P */
19909566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
19914222ddf1SHong Zhang   if (flg) {
19929566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
19934222ddf1SHong Zhang     goto next;
19944222ddf1SHong Zhang   }
19954222ddf1SHong Zhang 
19964222ddf1SHong Zhang   /* allatonce */
19979566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
19984222ddf1SHong Zhang   if (flg) {
19999566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
20004222ddf1SHong Zhang     goto next;
20014222ddf1SHong Zhang   }
20024222ddf1SHong Zhang 
20034222ddf1SHong Zhang   /* allatonce_merged */
20049566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
20054222ddf1SHong Zhang   if (flg) {
20069566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
20074222ddf1SHong Zhang     goto next;
20084222ddf1SHong Zhang   }
20094222ddf1SHong Zhang 
20104e84afc0SStefano Zampini   /* backend general code */
20119566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "backend", &flg));
20124e84afc0SStefano Zampini   if (flg) {
20139566063dSJacob Faibussowitsch     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
20144e84afc0SStefano Zampini     PetscFunctionReturn(0);
20154e84afc0SStefano Zampini   }
20164e84afc0SStefano Zampini 
20174222ddf1SHong Zhang   /* hypre */
20184222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
20199566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
20204222ddf1SHong Zhang   if (flg) {
20219566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
20224222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20234222ddf1SHong Zhang     PetscFunctionReturn(0);
20244222ddf1SHong Zhang   }
20254222ddf1SHong Zhang #endif
20264222ddf1SHong Zhang   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
20274222ddf1SHong Zhang 
20284222ddf1SHong Zhang next:
20294222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_PtAP;
20304222ddf1SHong Zhang   PetscFunctionReturn(0);
20314222ddf1SHong Zhang }
2032