xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision eec179cf895b6fbcd6a0b58694319392b06e361b)
182412ba7SHong Zhang 
242c57489SHong Zhang /*
342c57489SHong Zhang   Defines projective product routines where A is a MPIAIJ matrix
442c57489SHong Zhang           C = P^T * A * P
542c57489SHong Zhang */
642c57489SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <../src/mat/impls/aij/mpi/mpiaij.h>
10c6db04a5SJed Brown #include <petscbt.h>
118563dfccSBarry Smith #include <petsctime.h>
12694f88d4SFande Kong #include <petsc/private/hashmapiv.h>
134a56b808SFande Kong #include <petsc/private/hashseti.h>
144a56b808SFande Kong #include <petscsf.h>
1542c57489SHong Zhang 
16d71ae5a4SJacob Faibussowitsch PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
17d71ae5a4SJacob Faibussowitsch {
18cf97cf3cSHong Zhang   PetscBool         iascii;
19cf97cf3cSHong Zhang   PetscViewerFormat format;
206718818eSStefano Zampini   Mat_APMPI        *ptap;
21cf97cf3cSHong Zhang 
22cf97cf3cSHong Zhang   PetscFunctionBegin;
236718818eSStefano Zampini   MatCheckProduct(A, 1);
246718818eSStefano Zampini   ptap = (Mat_APMPI *)A->product->data;
259566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
26cf97cf3cSHong Zhang   if (iascii) {
279566063dSJacob Faibussowitsch     PetscCall(PetscViewerGetFormat(viewer, &format));
28cf97cf3cSHong Zhang     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29cf97cf3cSHong Zhang       if (ptap->algType == 0) {
309566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
31cf97cf3cSHong Zhang       } else if (ptap->algType == 1) {
329566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
334a56b808SFande Kong       } else if (ptap->algType == 2) {
349566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
354a56b808SFande Kong       } else if (ptap->algType == 3) {
369566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
37cf97cf3cSHong Zhang       }
38cf97cf3cSHong Zhang     }
39cf97cf3cSHong Zhang   }
40cf97cf3cSHong Zhang   PetscFunctionReturn(0);
41cf97cf3cSHong Zhang }
42cf97cf3cSHong Zhang 
43d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44d71ae5a4SJacob Faibussowitsch {
456718818eSStefano Zampini   Mat_APMPI           *ptap = (Mat_APMPI *)data;
4660552ceaSHong Zhang   Mat_Merge_SeqsToMPI *merge;
47cc6cb787SHong Zhang 
48cc6cb787SHong Zhang   PetscFunctionBegin;
499566063dSJacob Faibussowitsch   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
509566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->bufa));
519566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_loc));
529566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_oth));
539566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
549566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Rd));
559566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Ro));
568403a639SHong Zhang   if (ptap->AP_loc) { /* used by alg_rap */
57681d504bSHong Zhang     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
589566063dSJacob Faibussowitsch     PetscCall(PetscFree(ap->i));
599566063dSJacob Faibussowitsch     PetscCall(PetscFree2(ap->j, ap->a));
609566063dSJacob Faibussowitsch     PetscCall(MatDestroy(&ptap->AP_loc));
618403a639SHong Zhang   } else { /* used by alg_ptap */
629566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->api));
639566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->apj));
64681d504bSHong Zhang   }
659566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_loc));
669566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_oth));
679566063dSJacob Faibussowitsch   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68a560ef98SHong Zhang 
699566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Pt));
7060552ceaSHong Zhang 
7160552ceaSHong Zhang   merge = ptap->merge;
728403a639SHong Zhang   if (merge) { /* used by alg_ptap */
739566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->id_r));
749566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_s));
759566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_r));
769566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bi));
779566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bj));
789566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri[0]));
799566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri));
809566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj[0]));
819566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj));
829566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coi));
839566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coj));
849566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->owners_co));
859566063dSJacob Faibussowitsch     PetscCall(PetscLayoutDestroy(&merge->rowmap));
869566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->merge));
87bf0cc555SLisandro Dalcin   }
889566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
894a56b808SFande Kong 
909566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&ptap->sf));
919566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_othi));
929566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_rmti));
939566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap));
94cc6cb787SHong Zhang   PetscFunctionReturn(0);
95cc6cb787SHong Zhang }
96cc6cb787SHong Zhang 
97d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
98d71ae5a4SJacob Faibussowitsch {
996718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
100cf742fccSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
10192ec70a1SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1026718818eSStefano Zampini   Mat_APMPI         *ptap;
103cf742fccSHong Zhang   Mat                AP_loc, C_loc, C_oth;
104a3bb6f32SFande Kong   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
105cf742fccSHong Zhang   PetscScalar       *apa;
106cf742fccSHong Zhang   const PetscInt    *cols;
107cf742fccSHong Zhang   const PetscScalar *vals;
108cf742fccSHong Zhang 
109cf742fccSHong Zhang   PetscFunctionBegin;
1106718818eSStefano Zampini   MatCheckProduct(C, 3);
1116718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
11228b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
11328b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
11480da8df7SHong Zhang 
1159566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
116cf742fccSHong Zhang 
117cf742fccSHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
118cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1199566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1209566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
121cf742fccSHong Zhang   }
122cf742fccSHong Zhang 
123cf742fccSHong Zhang   /* 2) get AP_loc */
124cf742fccSHong Zhang   AP_loc = ptap->AP_loc;
125cf742fccSHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
126cf742fccSHong Zhang 
127cf742fccSHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128cf742fccSHong Zhang   /*-----------------------------------------------------*/
129cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
130cf742fccSHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1319566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1329566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
133cf742fccSHong Zhang   }
134cf742fccSHong Zhang 
135cf742fccSHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
136cf742fccSHong Zhang   /* ---------------------------------------------- */
137cf742fccSHong Zhang   /* get data from symbolic products */
138cf742fccSHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
13952f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
14052f7967eSHong Zhang 
141cf742fccSHong Zhang   api = ap->i;
142cf742fccSHong Zhang   apj = ap->j;
1439566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
144cf742fccSHong Zhang   for (i = 0; i < am; i++) {
145cf742fccSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
146cf742fccSHong Zhang     apnz = api[i + 1] - api[i];
147b4e8d1b6SHong Zhang     apa  = ap->a + api[i];
1489566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(apa, apnz));
149b4e8d1b6SHong Zhang     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
150cf742fccSHong Zhang   }
1519566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
15208401ef6SPierre Jolivet   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
153cf742fccSHong Zhang 
154cf742fccSHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
155154d0d78SFande Kong   /* Always use scalable version since we are in the MPI scalable version */
1569566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
1579566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
158cf742fccSHong Zhang 
159cf742fccSHong Zhang   C_loc = ptap->C_loc;
160cf742fccSHong Zhang   C_oth = ptap->C_oth;
161cf742fccSHong Zhang 
162cf742fccSHong Zhang   /* add C_loc and Co to to C */
1639566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
164cf742fccSHong Zhang 
165cf742fccSHong Zhang   /* C_loc -> C */
166cf742fccSHong Zhang   cm    = C_loc->rmap->N;
167cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
168cf742fccSHong Zhang   cols  = c_seq->j;
169cf742fccSHong Zhang   vals  = c_seq->a;
1709566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
171edeac6deSandi selinger 
172e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
173edeac6deSandi selinger   /* when there are no off-processor parts.  */
1741de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1751de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1761de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
177edeac6deSandi selinger   if (C->assembled) {
178edeac6deSandi selinger     C->was_assembled = PETSC_TRUE;
179edeac6deSandi selinger     C->assembled     = PETSC_FALSE;
180edeac6deSandi selinger   }
181edeac6deSandi selinger   if (C->was_assembled) {
182cf742fccSHong Zhang     for (i = 0; i < cm; i++) {
183cf742fccSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
184cf742fccSHong Zhang       row   = rstart + i;
1859566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1869371c9d4SSatish Balay       cols += ncols;
1879371c9d4SSatish Balay       vals += ncols;
188cf742fccSHong Zhang     }
189edeac6deSandi selinger   } else {
1909566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
191edeac6deSandi selinger   }
1929566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
19308401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
194cf742fccSHong Zhang 
195cf742fccSHong Zhang   /* Co -> C, off-processor part */
196cf742fccSHong Zhang   cm    = C_oth->rmap->N;
197cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
198cf742fccSHong Zhang   cols  = c_seq->j;
199cf742fccSHong Zhang   vals  = c_seq->a;
2009566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
201cf742fccSHong Zhang   for (i = 0; i < cm; i++) {
202cf742fccSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
203cf742fccSHong Zhang     row   = p->garray[i];
2049566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
2059371c9d4SSatish Balay     cols += ncols;
2069371c9d4SSatish Balay     vals += ncols;
207cf742fccSHong Zhang   }
2089566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2099566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
210cf742fccSHong Zhang 
211cf742fccSHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
21280da8df7SHong Zhang 
2139566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
21408401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
215cf742fccSHong Zhang   PetscFunctionReturn(0);
216cf742fccSHong Zhang }
217cf742fccSHong Zhang 
218d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
219d71ae5a4SJacob Faibussowitsch {
2203cdca5ebSHong Zhang   Mat_APMPI               *ptap;
2216718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
2222259aa2eSHong Zhang   MPI_Comm                 comm;
2232259aa2eSHong Zhang   PetscMPIInt              size, rank;
2244222ddf1SHong Zhang   Mat                      P_loc, P_oth;
22515a3b8e2SHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
226d0e9a2f7SHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
227d0e9a2f7SHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
228ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
229ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
23015a3b8e2SHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
2317c70b0e9SStefano Zampini   const PetscInt          *owners;
2327c70b0e9SStefano Zampini   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
23315a3b8e2SHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
23415a3b8e2SHong Zhang   MPI_Request             *swaits, *rwaits;
23515a3b8e2SHong Zhang   MPI_Status              *sstatus, rstatus;
236445158ffSHong Zhang   PetscLayout              rowmap;
237445158ffSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
238445158ffSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
239a3bb6f32SFande Kong   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
24052f7967eSHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
24167a12041SHong Zhang   PetscScalar             *apv;
242*eec179cfSJacob Faibussowitsch   PetscHMapI               ta;
243b92f168fSBarry Smith   MatType                  mtype;
244e83e5f86SFande Kong   const char              *prefix;
245aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
2468cb82516SHong Zhang   PetscReal apfill;
247aa690a28SHong Zhang #endif
24867a12041SHong Zhang 
24967a12041SHong Zhang   PetscFunctionBegin;
2506718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
25128b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
2529566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2539566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
2549566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
255ae5f0867Sstefano_zampini 
25652f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
25752f7967eSHong Zhang 
258ae5f0867Sstefano_zampini   /* create symbolic parallel matrix Cmpi */
2599566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
2609566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
261ae5f0867Sstefano_zampini 
2623cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
2639566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
264e953e47cSHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
265cf97cf3cSHong Zhang   ptap->algType = 0;
266e953e47cSHong Zhang 
267e953e47cSHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
2689566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
269e953e47cSHong Zhang   /* get P_loc by taking all local rows of P */
2709566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
27176db11f6SHong Zhang 
27276db11f6SHong Zhang   ptap->P_loc = P_loc;
27376db11f6SHong Zhang   ptap->P_oth = P_oth;
274e953e47cSHong Zhang 
275e953e47cSHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
276e953e47cSHong Zhang   /* --------------------------------- */
2779566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
2789566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
279e953e47cSHong Zhang 
280e953e47cSHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
281e953e47cSHong Zhang   /* ----------------------------------------------------------------- */
28276db11f6SHong Zhang   p_loc = (Mat_SeqAIJ *)P_loc->data;
28352f7967eSHong Zhang   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
284e953e47cSHong Zhang 
285e953e47cSHong Zhang   /* create and initialize a linked list */
286*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
28776db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
28876db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
289*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
290e953e47cSHong Zhang 
2919566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
292e953e47cSHong Zhang 
293e953e47cSHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
29452f7967eSHong Zhang   if (ao) {
2959566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
29652f7967eSHong Zhang   } else {
2979566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
29852f7967eSHong Zhang   }
299e953e47cSHong Zhang   current_space = free_space;
300e953e47cSHong Zhang   nspacedouble  = 0;
301e953e47cSHong Zhang 
3029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
303e953e47cSHong Zhang   api[0] = 0;
304e953e47cSHong Zhang   for (i = 0; i < am; i++) {
305e953e47cSHong Zhang     /* diagonal portion: Ad[i,:]*P */
3069371c9d4SSatish Balay     ai  = ad->i;
3079371c9d4SSatish Balay     pi  = p_loc->i;
308e953e47cSHong Zhang     nzi = ai[i + 1] - ai[i];
309e953e47cSHong Zhang     aj  = ad->j + ai[i];
310e953e47cSHong Zhang     for (j = 0; j < nzi; j++) {
311e953e47cSHong Zhang       row  = aj[j];
312e953e47cSHong Zhang       pnz  = pi[row + 1] - pi[row];
313e953e47cSHong Zhang       Jptr = p_loc->j + pi[row];
314e953e47cSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
3159566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
316e953e47cSHong Zhang     }
317e953e47cSHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
31852f7967eSHong Zhang     if (ao) {
3199371c9d4SSatish Balay       ai  = ao->i;
3209371c9d4SSatish Balay       pi  = p_oth->i;
321e953e47cSHong Zhang       nzi = ai[i + 1] - ai[i];
322e953e47cSHong Zhang       aj  = ao->j + ai[i];
323e953e47cSHong Zhang       for (j = 0; j < nzi; j++) {
324e953e47cSHong Zhang         row  = aj[j];
325e953e47cSHong Zhang         pnz  = pi[row + 1] - pi[row];
326e953e47cSHong Zhang         Jptr = p_oth->j + pi[row];
3279566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
328e953e47cSHong Zhang       }
32952f7967eSHong Zhang     }
330e953e47cSHong Zhang     apnz       = lnk[0];
331e953e47cSHong Zhang     api[i + 1] = api[i] + apnz;
332e953e47cSHong Zhang 
333e953e47cSHong Zhang     /* if free space is not available, double the total space in the list */
334e953e47cSHong Zhang     if (current_space->local_remaining < apnz) {
3359566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
336e953e47cSHong Zhang       nspacedouble++;
337e953e47cSHong Zhang     }
338e953e47cSHong Zhang 
339e953e47cSHong Zhang     /* Copy data into free space, then initialize lnk */
3409566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
341e953e47cSHong Zhang 
342e953e47cSHong Zhang     current_space->array += apnz;
343e953e47cSHong Zhang     current_space->local_used += apnz;
344e953e47cSHong Zhang     current_space->local_remaining -= apnz;
345e953e47cSHong Zhang   }
346e953e47cSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
347e953e47cSHong Zhang   /* destroy list of free space and other temporary array(s) */
3489566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
3499566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
3509566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
351e953e47cSHong Zhang 
352e953e47cSHong Zhang   /* Create AP_loc for reuse */
3539566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
3549566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
355e953e47cSHong Zhang 
356e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
35752f7967eSHong Zhang   if (ao) {
358e953e47cSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
35952f7967eSHong Zhang   } else {
36052f7967eSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
36152f7967eSHong Zhang   }
362e953e47cSHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
363e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
364e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
365e953e47cSHong Zhang 
366e953e47cSHong Zhang   if (api[am]) {
3679566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
3689566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
369e953e47cSHong Zhang   } else {
3709566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
371e953e47cSHong Zhang   }
372e953e47cSHong Zhang #endif
373e953e47cSHong Zhang 
374e953e47cSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
3754222ddf1SHong Zhang   /* -------------------------------------- */
3769566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
3779566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
3789566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
3799566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
3804222ddf1SHong Zhang 
3819566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
3829566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
3839566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
3849566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
3859566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
386e953e47cSHong Zhang 
387e953e47cSHong Zhang   /* (3) send coj of C_oth to other processors  */
388e953e47cSHong Zhang   /* ------------------------------------------ */
389e953e47cSHong Zhang   /* determine row ownership */
3909566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
3919566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
3929566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
3939566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
3949566063dSJacob Faibussowitsch   PetscCall(PetscLayoutGetRanges(rowmap, &owners));
395e953e47cSHong Zhang 
396e953e47cSHong Zhang   /* determine the number of messages to send, their lengths */
3979566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
3989566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
3999566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
400e953e47cSHong Zhang 
401e953e47cSHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
4029371c9d4SSatish Balay   coi   = c_oth->i;
4039371c9d4SSatish Balay   coj   = c_oth->j;
404e953e47cSHong Zhang   con   = ptap->C_oth->rmap->n;
405e953e47cSHong Zhang   proc  = 0;
4069566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
407e953e47cSHong Zhang   for (i = 0; i < con; i++) {
408e953e47cSHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
409e953e47cSHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
410e953e47cSHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
411e953e47cSHong Zhang   }
412e953e47cSHong Zhang 
413e953e47cSHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
414e953e47cSHong Zhang   owners_co[0] = 0;
415e953e47cSHong Zhang   nsend        = 0;
416e953e47cSHong Zhang   for (proc = 0; proc < size; proc++) {
417e953e47cSHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
418e953e47cSHong Zhang     if (len_s[proc]) {
419e953e47cSHong Zhang       nsend++;
420e953e47cSHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
421e953e47cSHong Zhang       len += len_si[proc];
422e953e47cSHong Zhang     }
423e953e47cSHong Zhang   }
424e953e47cSHong Zhang 
425e953e47cSHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
4269566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
4279566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
428e953e47cSHong Zhang 
429e953e47cSHong Zhang   /* post the Irecv and Isend of coj */
4309566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
4319566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
4329566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
433e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
434e953e47cSHong Zhang     if (!len_s[proc]) continue;
435e953e47cSHong Zhang     i = owners_co[proc];
4369566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
437e953e47cSHong Zhang     k++;
438e953e47cSHong Zhang   }
439e953e47cSHong Zhang 
440e953e47cSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
441e953e47cSHong Zhang   /* ---------------------------------------- */
4429566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
4439566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
4449566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
4459566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
4464222ddf1SHong Zhang 
4479566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
4489566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
4494222ddf1SHong Zhang 
4509566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
4519566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
4524222ddf1SHong Zhang 
453e953e47cSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
4549566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
455e953e47cSHong Zhang 
456e953e47cSHong Zhang   /* receives coj are complete */
45748a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4589566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
4599566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
460e953e47cSHong Zhang 
461e953e47cSHong Zhang   /* add received column indices into ta to update Crmax */
462e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
463e953e47cSHong Zhang     Jptr = buf_rj[k];
464*eec179cfSJacob Faibussowitsch     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISetWithMode(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
465e953e47cSHong Zhang   }
466*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
467*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
468e953e47cSHong Zhang 
469e953e47cSHong Zhang   /* (4) send and recv coi */
470e953e47cSHong Zhang   /*-----------------------*/
4719566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
4729566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
4739566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
474e953e47cSHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
475e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
476e953e47cSHong Zhang     if (!len_s[proc]) continue;
477e953e47cSHong Zhang     /* form outgoing message for i-structure:
478e953e47cSHong Zhang          buf_si[0]:                 nrows to be sent
479e953e47cSHong Zhang                [1:nrows]:           row index (global)
480e953e47cSHong Zhang                [nrows+1:2*nrows+1]: i-structure index
481e953e47cSHong Zhang     */
482e953e47cSHong Zhang     /*-------------------------------------------*/
483e953e47cSHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
484e953e47cSHong Zhang     buf_si_i    = buf_si + nrows + 1;
485e953e47cSHong Zhang     buf_si[0]   = nrows;
486e953e47cSHong Zhang     buf_si_i[0] = 0;
487e953e47cSHong Zhang     nrows       = 0;
488e953e47cSHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
489e953e47cSHong Zhang       nzi                 = coi[i + 1] - coi[i];
490e953e47cSHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
491e953e47cSHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
492e953e47cSHong Zhang       nrows++;
493e953e47cSHong Zhang     }
4949566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
495e953e47cSHong Zhang     k++;
496e953e47cSHong Zhang     buf_si += len_si[proc];
497e953e47cSHong Zhang   }
49848a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4999566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
5009566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
501e953e47cSHong Zhang 
5029566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
5039566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
5049566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
5059566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
506b4e8d1b6SHong Zhang 
507e953e47cSHong Zhang   /* (5) compute the local portion of Cmpi      */
508e953e47cSHong Zhang   /* ------------------------------------------ */
509e953e47cSHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
5109566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
511e953e47cSHong Zhang   current_space = free_space;
512e953e47cSHong Zhang 
5139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
514e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) {
515e953e47cSHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
516e953e47cSHong Zhang     nrows       = *buf_ri_k[k];
517e953e47cSHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
518a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
519e953e47cSHong Zhang   }
520e953e47cSHong Zhang 
521d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
5229566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
523e953e47cSHong Zhang   for (i = 0; i < pn; i++) {
524e953e47cSHong Zhang     /* add C_loc into Cmpi */
525e953e47cSHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
526e953e47cSHong Zhang     Jptr = c_loc->j + c_loc->i[i];
5279566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
528e953e47cSHong Zhang 
529e953e47cSHong Zhang     /* add received col data into lnk */
530e953e47cSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
531e953e47cSHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
532e953e47cSHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
533e953e47cSHong Zhang         Jptr = buf_rj[k] + *nextci[k];
5349566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
5359371c9d4SSatish Balay         nextrow[k]++;
5369371c9d4SSatish Balay         nextci[k]++;
537e953e47cSHong Zhang       }
538e953e47cSHong Zhang     }
539e953e47cSHong Zhang     nzi = lnk[0];
540e953e47cSHong Zhang 
541e953e47cSHong Zhang     /* copy data into free space, then initialize lnk */
5429566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
5439566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
544e953e47cSHong Zhang   }
5459566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
5469566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
5479566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
548e953e47cSHong Zhang 
549e953e47cSHong Zhang   /* local sizes and preallocation */
5509566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
551ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
5529566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
5539566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
554ac94c67aSTristan Konolige   }
5559566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
556d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
557e953e47cSHong Zhang 
558e953e47cSHong Zhang   /* members in merge */
5599566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
5609566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
5619566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
5629566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
5639566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
5649566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
5659566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
566e953e47cSHong Zhang 
567a3bb6f32SFande Kong   nout = 0;
5689566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
56908401ef6SPierre Jolivet   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
5709566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
57108401ef6SPierre Jolivet   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
572a3bb6f32SFande Kong 
5736718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
5746718818eSStefano Zampini   Cmpi->product->data    = ptap;
5756718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
5766718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
5776718818eSStefano Zampini 
5786718818eSStefano Zampini   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
5796718818eSStefano Zampini   Cmpi->assembled        = PETSC_FALSE;
5806718818eSStefano Zampini   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
581e953e47cSHong Zhang   PetscFunctionReturn(0);
582e953e47cSHong Zhang }
583e953e47cSHong Zhang 
584d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
585d71ae5a4SJacob Faibussowitsch {
5864a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5874a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
5884a56b808SFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
589bc8e477aSFande Kong   PetscInt    pcstart, pcend, column, offset;
5904a56b808SFande Kong 
5914a56b808SFande Kong   PetscFunctionBegin;
5924a56b808SFande Kong   pcstart = P->cmap->rstart;
593bc8e477aSFande Kong   pcstart *= dof;
5944a56b808SFande Kong   pcend = P->cmap->rend;
595bc8e477aSFande Kong   pcend *= dof;
5964a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
5974a56b808SFande Kong   ai  = ad->i;
5984a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
5994a56b808SFande Kong   aj  = ad->j + ai[i];
6004a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6014a56b808SFande Kong     row    = aj[j];
602bc8e477aSFande Kong     offset = row % dof;
603bc8e477aSFande Kong     row /= dof;
6044a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
6054a56b808SFande Kong     pj   = pd->j + pd->i[row];
60648a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
6074a56b808SFande Kong   }
608bc8e477aSFande Kong   /* off diag P */
6094a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6104a56b808SFande Kong     row    = aj[j];
611bc8e477aSFande Kong     offset = row % dof;
612bc8e477aSFande Kong     row /= dof;
6134a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6144a56b808SFande Kong     pj   = po->j + po->i[row];
61548a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
6164a56b808SFande Kong   }
6174a56b808SFande Kong 
6184a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6194a56b808SFande Kong   if (ao) {
6204a56b808SFande Kong     ai  = ao->i;
6214a56b808SFande Kong     pi  = p_oth->i;
6224a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6234a56b808SFande Kong     aj  = ao->j + ai[i];
6244a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6254a56b808SFande Kong       row       = aj[j];
626bc8e477aSFande Kong       offset    = a->garray[row] % dof;
627bc8e477aSFande Kong       row       = map[row];
6284a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6294a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6304a56b808SFande Kong       for (col = 0; col < pnz; col++) {
631bc8e477aSFande Kong         column = p_othcols[col] * dof + offset;
6324a56b808SFande Kong         if (column >= pcstart && column < pcend) {
6339566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(dht, column));
6344a56b808SFande Kong         } else {
6359566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(oht, column));
6364a56b808SFande Kong         }
6374a56b808SFande Kong       }
6384a56b808SFande Kong     }
6394a56b808SFande Kong   } /* end if (ao) */
6404a56b808SFande Kong   PetscFunctionReturn(0);
6414a56b808SFande Kong }
6424a56b808SFande Kong 
643d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
644d71ae5a4SJacob Faibussowitsch {
6454a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
6464a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
647bc8e477aSFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
6484a56b808SFande Kong   PetscScalar ra, *aa, *pa;
6494a56b808SFande Kong 
6504a56b808SFande Kong   PetscFunctionBegin;
6514a56b808SFande Kong   pcstart = P->cmap->rstart;
652bc8e477aSFande Kong   pcstart *= dof;
653bc8e477aSFande Kong 
6544a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6554a56b808SFande Kong   ai  = ad->i;
6564a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
6574a56b808SFande Kong   aj  = ad->j + ai[i];
6584a56b808SFande Kong   aa  = ad->a + ai[i];
6594a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6604a56b808SFande Kong     ra     = aa[j];
6614a56b808SFande Kong     row    = aj[j];
662bc8e477aSFande Kong     offset = row % dof;
663bc8e477aSFande Kong     row /= dof;
6644a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
6654a56b808SFande Kong     pj   = pd->j + pd->i[row];
6664a56b808SFande Kong     pa   = pd->a + pd->i[row];
66748a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
6689566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6694a56b808SFande Kong   }
6704a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6714a56b808SFande Kong     ra     = aa[j];
6724a56b808SFande Kong     row    = aj[j];
673bc8e477aSFande Kong     offset = row % dof;
674bc8e477aSFande Kong     row /= dof;
6754a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6764a56b808SFande Kong     pj   = po->j + po->i[row];
6774a56b808SFande Kong     pa   = po->a + po->i[row];
67848a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
6799566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6804a56b808SFande Kong   }
6814a56b808SFande Kong 
6824a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6834a56b808SFande Kong   if (ao) {
6844a56b808SFande Kong     ai  = ao->i;
6854a56b808SFande Kong     pi  = p_oth->i;
6864a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6874a56b808SFande Kong     aj  = ao->j + ai[i];
6884a56b808SFande Kong     aa  = ao->a + ai[i];
6894a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6904a56b808SFande Kong       row       = aj[j];
691bc8e477aSFande Kong       offset    = a->garray[row] % dof;
692bc8e477aSFande Kong       row       = map[row];
6934a56b808SFande Kong       ra        = aa[j];
6944a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6954a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6964a56b808SFande Kong       pa        = p_oth->a + pi[row];
69748a46eb9SPierre Jolivet       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
6989566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * pnz));
6994a56b808SFande Kong     }
7004a56b808SFande Kong   } /* end if (ao) */
701bb5ddf68SFande Kong 
7024a56b808SFande Kong   PetscFunctionReturn(0);
7034a56b808SFande Kong }
7044a56b808SFande Kong 
705bc8e477aSFande Kong PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
7065c65b9ecSFande Kong 
707d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
708d71ae5a4SJacob Faibussowitsch {
7094a56b808SFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
710bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
7116718818eSStefano Zampini   Mat_APMPI      *ptap;
7124a56b808SFande Kong   PetscHMapIV     hmap;
7134a56b808SFande Kong   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
7144a56b808SFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
715bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
716bc8e477aSFande Kong   const PetscInt *mappingindices;
717bc8e477aSFande Kong   IS              map;
7184a56b808SFande Kong 
7194a56b808SFande Kong   PetscFunctionBegin;
7206718818eSStefano Zampini   MatCheckProduct(C, 4);
7216718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
72228b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
72328b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
7244a56b808SFande Kong 
7259566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
7264a56b808SFande Kong 
7274a56b808SFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
7284a56b808SFande Kong   /*-----------------------------------------------------*/
7294a56b808SFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
7304a56b808SFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
7319566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
7324a56b808SFande Kong   }
7339566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
7344a56b808SFande Kong 
7359566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
736bc8e477aSFande Kong   pon *= dof;
7379566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
7389566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
7394a56b808SFande Kong   cmaxr = 0;
740ad540459SPierre Jolivet   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
7419566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
742*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
7439566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
7444a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
7459566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
746bc8e477aSFande Kong     offset = i % dof;
747bc8e477aSFande Kong     ii     = i / dof;
748bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
7494a56b808SFande Kong     if (!nzi) continue;
7509566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
7514a56b808SFande Kong     voff = 0;
7529566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
7534a56b808SFande Kong     if (!voff) continue;
7544a56b808SFande Kong 
7554a56b808SFande Kong     /* Form C(ii, :) */
756bc8e477aSFande Kong     poj = po->j + po->i[ii];
757bc8e477aSFande Kong     poa = po->a + po->i[ii];
7584a56b808SFande Kong     for (j = 0; j < nzi; j++) {
759bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
760bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
761bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
7624a56b808SFande Kong       for (jj = 0; jj < voff; jj++) {
7634a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
7644a56b808SFande Kong         /* If the row is empty */
765bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
7664a56b808SFande Kong           c_rmtjj[jj] = apindices[jj];
7674a56b808SFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
768bc8e477aSFande Kong           c_rmtc[pocol]++;
7694a56b808SFande Kong         } else {
7709566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
7714a56b808SFande Kong           if (loc >= 0) { /* hit */
7724a56b808SFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
7739566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
7744a56b808SFande Kong           } else { /* new element */
7754a56b808SFande Kong             loc = -(loc + 1);
7764a56b808SFande Kong             /* Move data backward */
777bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
7784a56b808SFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
7794a56b808SFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
7804a56b808SFande Kong             } /* End kk */
7814a56b808SFande Kong             c_rmtjj[loc] = apindices[jj];
7824a56b808SFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
783bc8e477aSFande Kong             c_rmtc[pocol]++;
7844a56b808SFande Kong           }
7854a56b808SFande Kong         }
7869566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(voff));
7874a56b808SFande Kong       } /* End jj */
7884a56b808SFande Kong     }   /* End j */
7894a56b808SFande Kong   }     /* End i */
7904a56b808SFande Kong 
7919566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
7929566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
7934a56b808SFande Kong 
7949566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
795bc8e477aSFande Kong   pn *= dof;
7969566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
7974a56b808SFande Kong 
7989566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
7999566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
8009566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
801bc8e477aSFande Kong   pcstart = pcstart * dof;
802bc8e477aSFande Kong   pcend   = pcend * dof;
8034a56b808SFande Kong   cd      = (Mat_SeqAIJ *)(c->A)->data;
8044a56b808SFande Kong   co      = (Mat_SeqAIJ *)(c->B)->data;
8054a56b808SFande Kong 
8064a56b808SFande Kong   cmaxr = 0;
807ad540459SPierre Jolivet   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
8089566063dSJacob Faibussowitsch   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
809*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
8104a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
8119566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
812bc8e477aSFande Kong     offset = i % dof;
813bc8e477aSFande Kong     ii     = i / dof;
814bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
8154a56b808SFande Kong     if (!nzi) continue;
8169566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
8174a56b808SFande Kong     voff = 0;
8189566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
8194a56b808SFande Kong     if (!voff) continue;
8204a56b808SFande Kong     /* Form C(ii, :) */
821bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
822bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
8234a56b808SFande Kong     for (j = 0; j < nzi; j++) {
824bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
825ad540459SPierre Jolivet       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
8269566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
8279566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
8284a56b808SFande Kong     }
8294a56b808SFande Kong   }
8309566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
8319566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
8329566063dSJacob Faibussowitsch   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
8339566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
8349566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
8359566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
8369566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
837bc8e477aSFande Kong 
838bc8e477aSFande Kong   /* Add contributions from remote */
839bc8e477aSFande Kong   for (i = 0; i < pn; i++) {
840bc8e477aSFande Kong     row = i + pcstart;
8419566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
842bc8e477aSFande Kong   }
8439566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
844bc8e477aSFande Kong 
8459566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
8469566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
847bc8e477aSFande Kong 
848bc8e477aSFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
849bc8e477aSFande Kong   PetscFunctionReturn(0);
850bc8e477aSFande Kong }
851bc8e477aSFande Kong 
852d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
853d71ae5a4SJacob Faibussowitsch {
854bc8e477aSFande Kong   PetscFunctionBegin;
85534bcad68SFande Kong 
8569566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
857bc8e477aSFande Kong   PetscFunctionReturn(0);
858bc8e477aSFande Kong }
859bc8e477aSFande Kong 
860d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
861d71ae5a4SJacob Faibussowitsch {
862bc8e477aSFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
863bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
8646718818eSStefano Zampini   Mat_APMPI      *ptap;
865bc8e477aSFande Kong   PetscHMapIV     hmap;
866bc8e477aSFande Kong   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
867bc8e477aSFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
868bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
869bc8e477aSFande Kong   const PetscInt *mappingindices;
870bc8e477aSFande Kong   IS              map;
871bc8e477aSFande Kong 
872bc8e477aSFande Kong   PetscFunctionBegin;
8736718818eSStefano Zampini   MatCheckProduct(C, 4);
8746718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
87528b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
87628b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
877bc8e477aSFande Kong 
8789566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
879bc8e477aSFande Kong 
880bc8e477aSFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
881bc8e477aSFande Kong   /*-----------------------------------------------------*/
882bc8e477aSFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
883bc8e477aSFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
8849566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
885bc8e477aSFande Kong   }
8869566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
8879566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
888bc8e477aSFande Kong   pon *= dof;
8899566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
890bc8e477aSFande Kong   pn *= dof;
891bc8e477aSFande Kong 
8929566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
8939566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
8949566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
895bc8e477aSFande Kong   pcstart *= dof;
896bc8e477aSFande Kong   pcend *= dof;
897bc8e477aSFande Kong   cmaxr = 0;
898ad540459SPierre Jolivet   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
899bc8e477aSFande Kong   cd = (Mat_SeqAIJ *)(c->A)->data;
900bc8e477aSFande Kong   co = (Mat_SeqAIJ *)(c->B)->data;
901ad540459SPierre Jolivet   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
9029566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
903*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
9049566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
905bc8e477aSFande Kong   for (i = 0; i < am && (pon || pn); i++) {
9069566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
907bc8e477aSFande Kong     offset = i % dof;
908bc8e477aSFande Kong     ii     = i / dof;
909bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
910bc8e477aSFande Kong     dnzi   = pd->i[ii + 1] - pd->i[ii];
911bc8e477aSFande Kong     if (!nzi && !dnzi) continue;
9129566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
913bc8e477aSFande Kong     voff = 0;
9149566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
915bc8e477aSFande Kong     if (!voff) continue;
916bc8e477aSFande Kong 
917bc8e477aSFande Kong     /* Form remote C(ii, :) */
918bc8e477aSFande Kong     poj = po->j + po->i[ii];
919bc8e477aSFande Kong     poa = po->a + po->i[ii];
920bc8e477aSFande Kong     for (j = 0; j < nzi; j++) {
921bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
922bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
923bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
924bc8e477aSFande Kong       for (jj = 0; jj < voff; jj++) {
925bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
926bc8e477aSFande Kong         /* If the row is empty */
927bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
928bc8e477aSFande Kong           c_rmtjj[jj] = apindices[jj];
929bc8e477aSFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
930bc8e477aSFande Kong           c_rmtc[pocol]++;
931bc8e477aSFande Kong         } else {
9329566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
933bc8e477aSFande Kong           if (loc >= 0) { /* hit */
934bc8e477aSFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
9359566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
936bc8e477aSFande Kong           } else { /* new element */
937bc8e477aSFande Kong             loc = -(loc + 1);
938bc8e477aSFande Kong             /* Move data backward */
939bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
940bc8e477aSFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
941bc8e477aSFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
942bc8e477aSFande Kong             } /* End kk */
943bc8e477aSFande Kong             c_rmtjj[loc] = apindices[jj];
944bc8e477aSFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
945bc8e477aSFande Kong             c_rmtc[pocol]++;
946bc8e477aSFande Kong           }
947bc8e477aSFande Kong         }
948bc8e477aSFande Kong       } /* End jj */
9499566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
950bc8e477aSFande Kong     } /* End j */
951bc8e477aSFande Kong 
952bc8e477aSFande Kong     /* Form local C(ii, :) */
953bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
954bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
955bc8e477aSFande Kong     for (j = 0; j < dnzi; j++) {
956bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
957ad540459SPierre Jolivet       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End kk */
9589566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
9599566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
960bc8e477aSFande Kong     } /* End j */
961bc8e477aSFande Kong   }   /* End i */
962bc8e477aSFande Kong 
9639566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
9649566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
9659566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
9669566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
967bc8e477aSFande Kong 
9689566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9699566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9709566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9719566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9729566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
9734a56b808SFande Kong 
9744a56b808SFande Kong   /* Add contributions from remote */
9754a56b808SFande Kong   for (i = 0; i < pn; i++) {
9764a56b808SFande Kong     row = i + pcstart;
9779566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
9784a56b808SFande Kong   }
9799566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
9804a56b808SFande Kong 
9819566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
9829566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
9834a56b808SFande Kong 
9844a56b808SFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
9854a56b808SFande Kong   PetscFunctionReturn(0);
9864a56b808SFande Kong }
9874a56b808SFande Kong 
988d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
989d71ae5a4SJacob Faibussowitsch {
9904a56b808SFande Kong   PetscFunctionBegin;
99134bcad68SFande Kong 
9929566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
9934a56b808SFande Kong   PetscFunctionReturn(0);
9944a56b808SFande Kong }
9954a56b808SFande Kong 
9966718818eSStefano Zampini /* TODO: move algorithm selection to MatProductSetFromOptions */
997d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
998d71ae5a4SJacob Faibussowitsch {
9994a56b808SFande Kong   Mat_APMPI      *ptap;
10006718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
10014a56b808SFande Kong   MPI_Comm        comm;
1002bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
10034a56b808SFande Kong   MatType         mtype;
10044a56b808SFande Kong   PetscSF         sf;
10054a56b808SFande Kong   PetscSFNode    *iremote;
10064a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
10074a56b808SFande Kong   const PetscInt *rootdegrees;
10084a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto;
10094a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1010131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1011bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1012131c27b5Sprj-   PetscMPIInt     owner;
1013bc8e477aSFande Kong   const PetscInt *mappingindices;
10144a56b808SFande Kong   PetscBool       flg;
10154a56b808SFande Kong   const char     *algTypes[2] = {"overlapping", "merged"};
1016bc8e477aSFande Kong   IS              map;
10174a56b808SFande Kong 
10184a56b808SFande Kong   PetscFunctionBegin;
10196718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
102028b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
10219566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
102234bcad68SFande Kong 
10234a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
10249566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1025bc8e477aSFande Kong   pn *= dof;
10269566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
10279566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
10289566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
10294a56b808SFande Kong 
10309566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
10314a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
10324a56b808SFande Kong   ptap->algType = 2;
10334a56b808SFande Kong 
10344a56b808SFande Kong   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
10359566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
10369566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
10374a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
10389566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1039bc8e477aSFande Kong   pon *= dof;
10404a56b808SFande Kong   /* offsets */
10419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
10424a56b808SFande Kong   /* The number of columns we will send to remote ranks */
10439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
10449566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
104548a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
10469566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
10479566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
10484a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
10499566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
10504a56b808SFande Kong 
10519566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
10524a56b808SFande Kong   ptap->c_rmti[0] = 0;
10534a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
10544a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
10554a56b808SFande Kong     /* Form one row of AP */
10569566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
1057bc8e477aSFande Kong     offset = i % dof;
1058bc8e477aSFande Kong     ii     = i / dof;
10594a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1060bc8e477aSFande Kong     nzi = po->i[ii + 1] - po->i[ii];
10614a56b808SFande Kong     if (!nzi) continue;
10624a56b808SFande Kong 
10639566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
10649566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
10654a56b808SFande Kong     /* If AP is empty, just continue */
10664a56b808SFande Kong     if (!htsize) continue;
10674a56b808SFande Kong     /* Form C(ii, :) */
1068bc8e477aSFande Kong     poj = po->j + po->i[ii];
106948a46eb9SPierre Jolivet     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
10704a56b808SFande Kong   }
10714a56b808SFande Kong 
10724a56b808SFande Kong   for (i = 0; i < pon; i++) {
10739566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
10744a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
10754a56b808SFande Kong     c_rmtc[i]           = htsize;
10764a56b808SFande Kong   }
10774a56b808SFande Kong 
10789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
10794a56b808SFande Kong 
10804a56b808SFande Kong   for (i = 0; i < pon; i++) {
10814a56b808SFande Kong     off = 0;
10829566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
10839566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
10844a56b808SFande Kong   }
10859566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
10864a56b808SFande Kong 
10879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
10884a56b808SFande Kong   for (i = 0; i < pon; i++) {
10899371c9d4SSatish Balay     owner  = 0;
10909371c9d4SSatish Balay     lidx   = 0;
1091bc8e477aSFande Kong     offset = i % dof;
1092bc8e477aSFande Kong     ii     = i / dof;
10939566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1094bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
10954a56b808SFande Kong     iremote[i].rank  = owner;
10964a56b808SFande Kong   }
10974a56b808SFande Kong 
10989566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
10999566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
11004a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
11019566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
11029566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
11039566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
11044a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
11059566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
11069566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
11074a56b808SFande Kong   rootspacesize = 0;
1108ad540459SPierre Jolivet   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
11099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
11109566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
11114a56b808SFande Kong   /* Get information from leaves
11124a56b808SFande Kong    * Number of columns other people contribute to my rows
11134a56b808SFande Kong    * */
11149566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
11159566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
11169566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
11179566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
11184a56b808SFande Kong   /* The number of columns is received for each row */
11194a56b808SFande Kong   ptap->c_othi[0]     = 0;
11204a56b808SFande Kong   rootspacesize       = 0;
11214a56b808SFande Kong   rootspaceoffsets[0] = 0;
11224a56b808SFande Kong   for (i = 0; i < pn; i++) {
11234a56b808SFande Kong     rcvncols = 0;
11244a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
11254a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
11264a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
11274a56b808SFande Kong       rootspacesize++;
11284a56b808SFande Kong     }
11294a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
11304a56b808SFande Kong   }
11319566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
11324a56b808SFande Kong 
11339566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
11349566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11359566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11369566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
11379566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
11384a56b808SFande Kong 
11399566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
11404a56b808SFande Kong   nleaves = 0;
11414a56b808SFande Kong   for (i = 0; i < pon; i++) {
1142bc8e477aSFande Kong     owner = 0;
1143bc8e477aSFande Kong     ii    = i / dof;
11449566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
11454a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
11464a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
11474a56b808SFande Kong       iremote[nleaves].rank    = owner;
11484a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
11494a56b808SFande Kong     }
11504a56b808SFande Kong   }
11519566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
11529566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
11534a56b808SFande Kong 
11549566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
11559566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
11569566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
11574a56b808SFande Kong   /* One to one map */
11589566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
11594a56b808SFande Kong 
11609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
11619566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
11629566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1163bc8e477aSFande Kong   pcstart *= dof;
1164bc8e477aSFande Kong   pcend *= dof;
11659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
11664a56b808SFande Kong   for (i = 0; i < pn; i++) {
11679566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hta[i]));
11689566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
11694a56b808SFande Kong   }
11704a56b808SFande Kong   /* Work on local part */
11714a56b808SFande Kong   /* 4) Pass 1: Estimate memory for C_loc */
11724a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
11739566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
11749566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1175bc8e477aSFande Kong     offset = i % dof;
1176bc8e477aSFande Kong     ii     = i / dof;
1177bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
11784a56b808SFande Kong     if (!nzi) continue;
11794a56b808SFande Kong 
11809566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
11819566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
11829566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
11834a56b808SFande Kong     if (!(htsize + htosize)) continue;
11844a56b808SFande Kong     /* Form C(ii, :) */
1185bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
11864a56b808SFande Kong     for (j = 0; j < nzi; j++) {
11879566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
11889566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
11894a56b808SFande Kong     }
11904a56b808SFande Kong   }
11914a56b808SFande Kong 
11929566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1193bc8e477aSFande Kong 
11949566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
11959566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
11964a56b808SFande Kong 
11974a56b808SFande Kong   /* Get remote data */
11989566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
11999566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
12004a56b808SFande Kong 
12014a56b808SFande Kong   for (i = 0; i < pn; i++) {
12024a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
12034a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
12044a56b808SFande Kong     for (j = 0; j < nzi; j++) {
12054a56b808SFande Kong       col = rdj[j];
12064a56b808SFande Kong       /* diag part */
12074a56b808SFande Kong       if (col >= pcstart && col < pcend) {
12089566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hta[i], col));
12094a56b808SFande Kong       } else { /* off diag */
12109566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
12114a56b808SFande Kong       }
12124a56b808SFande Kong     }
12139566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
12144a56b808SFande Kong     dnz[i] = htsize;
12159566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
12169566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
12174a56b808SFande Kong     onz[i] = htsize;
12189566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
12194a56b808SFande Kong   }
12204a56b808SFande Kong 
12219566063dSJacob Faibussowitsch   PetscCall(PetscFree2(hta, hto));
12229566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
12234a56b808SFande Kong 
12244a56b808SFande Kong   /* local sizes and preallocation */
12259566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12269566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
12279566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
12289566063dSJacob Faibussowitsch   PetscCall(MatSetUp(Cmpi));
12299566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
12304a56b808SFande Kong 
12314a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
12326718818eSStefano Zampini   Cmpi->product->data    = ptap;
12336718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
12346718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
12354a56b808SFande Kong 
12364a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
12374a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
12384a56b808SFande Kong   /* pick an algorithm */
1239d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
12404a56b808SFande Kong   alg = 0;
12419566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1242d0609cedSBarry Smith   PetscOptionsEnd();
12434a56b808SFande Kong   switch (alg) {
1244d71ae5a4SJacob Faibussowitsch   case 0:
1245d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1246d71ae5a4SJacob Faibussowitsch     break;
1247d71ae5a4SJacob Faibussowitsch   case 1:
1248d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1249d71ae5a4SJacob Faibussowitsch     break;
1250d71ae5a4SJacob Faibussowitsch   default:
1251d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
12524a56b808SFande Kong   }
12534a56b808SFande Kong   PetscFunctionReturn(0);
12544a56b808SFande Kong }
12554a56b808SFande Kong 
1256d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1257d71ae5a4SJacob Faibussowitsch {
1258bc8e477aSFande Kong   PetscFunctionBegin;
12599566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1260bc8e477aSFande Kong   PetscFunctionReturn(0);
1261bc8e477aSFande Kong }
1262bc8e477aSFande Kong 
1263d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1264d71ae5a4SJacob Faibussowitsch {
12654a56b808SFande Kong   Mat_APMPI      *ptap;
12666718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
12674a56b808SFande Kong   MPI_Comm        comm;
1268bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
12694a56b808SFande Kong   MatType         mtype;
12704a56b808SFande Kong   PetscSF         sf;
12714a56b808SFande Kong   PetscSFNode    *iremote;
12724a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
12734a56b808SFande Kong   const PetscInt *rootdegrees;
12744a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto, *htd;
12754a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1276131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1277bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1278131c27b5Sprj-   PetscMPIInt     owner;
12794a56b808SFande Kong   PetscBool       flg;
12804a56b808SFande Kong   const char     *algTypes[2] = {"merged", "overlapping"};
1281bc8e477aSFande Kong   const PetscInt *mappingindices;
1282bc8e477aSFande Kong   IS              map;
12834a56b808SFande Kong 
12844a56b808SFande Kong   PetscFunctionBegin;
12856718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
128628b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
12879566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
128834bcad68SFande Kong 
12894a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
12909566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1291bc8e477aSFande Kong   pn *= dof;
12929566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
12939566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
12949566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12954a56b808SFande Kong 
12969566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
12974a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
12984a56b808SFande Kong   ptap->algType = 3;
12994a56b808SFande Kong 
13004a56b808SFande Kong   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
13019566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
13029566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
13034a56b808SFande Kong 
13044a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
13059566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1306bc8e477aSFande Kong   pon *= dof;
13074a56b808SFande Kong   /* offsets */
13089566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
13094a56b808SFande Kong   /* The number of columns we will send to remote ranks */
13109566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
13119566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
131248a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
13139566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
13149566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
13154a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
13169566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
13179566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
13189566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
13194a56b808SFande Kong   for (i = 0; i < pn; i++) {
13209566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&htd[i]));
13219566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
13224a56b808SFande Kong   }
1323bc8e477aSFande Kong 
13249566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
13254a56b808SFande Kong   ptap->c_rmti[0] = 0;
13264a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
13274a56b808SFande Kong   for (i = 0; i < am && (pon || pn); i++) {
13284a56b808SFande Kong     /* Form one row of AP */
13299566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
13309566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1331bc8e477aSFande Kong     offset = i % dof;
1332bc8e477aSFande Kong     ii     = i / dof;
13334a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1334bc8e477aSFande Kong     nzi  = po->i[ii + 1] - po->i[ii];
1335bc8e477aSFande Kong     dnzi = pd->i[ii + 1] - pd->i[ii];
13364a56b808SFande Kong     if (!nzi && !dnzi) continue;
13374a56b808SFande Kong 
13389566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
13399566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
13409566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
13414a56b808SFande Kong     /* If AP is empty, just continue */
13424a56b808SFande Kong     if (!(htsize + htosize)) continue;
13434a56b808SFande Kong 
13444a56b808SFande Kong     /* Form remote C(ii, :) */
1345bc8e477aSFande Kong     poj = po->j + po->i[ii];
13464a56b808SFande Kong     for (j = 0; j < nzi; j++) {
13479566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
13489566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
13494a56b808SFande Kong     }
13504a56b808SFande Kong 
13514a56b808SFande Kong     /* Form local C(ii, :) */
1352bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
13534a56b808SFande Kong     for (j = 0; j < dnzi; j++) {
13549566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
13559566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
13564a56b808SFande Kong     }
13574a56b808SFande Kong   }
13584a56b808SFande Kong 
13599566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1360bc8e477aSFande Kong 
13619566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
13629566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
13634a56b808SFande Kong 
13644a56b808SFande Kong   for (i = 0; i < pon; i++) {
13659566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
13664a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
13674a56b808SFande Kong     c_rmtc[i]           = htsize;
13684a56b808SFande Kong   }
13694a56b808SFande Kong 
13709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
13714a56b808SFande Kong 
13724a56b808SFande Kong   for (i = 0; i < pon; i++) {
13734a56b808SFande Kong     off = 0;
13749566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
13759566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
13764a56b808SFande Kong   }
13779566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
13784a56b808SFande Kong 
13799566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
13804a56b808SFande Kong   for (i = 0; i < pon; i++) {
13819371c9d4SSatish Balay     owner  = 0;
13829371c9d4SSatish Balay     lidx   = 0;
1383bc8e477aSFande Kong     offset = i % dof;
1384bc8e477aSFande Kong     ii     = i / dof;
13859566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1386bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
13874a56b808SFande Kong     iremote[i].rank  = owner;
13884a56b808SFande Kong   }
13894a56b808SFande Kong 
13909566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
13919566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
13924a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
13939566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
13949566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
13959566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
13964a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
13979566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
13989566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
13994a56b808SFande Kong   rootspacesize = 0;
1400ad540459SPierre Jolivet   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
14019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
14029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
14034a56b808SFande Kong   /* Get information from leaves
14044a56b808SFande Kong    * Number of columns other people contribute to my rows
14054a56b808SFande Kong    * */
14069566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
14079566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
14089566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
14099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
14104a56b808SFande Kong   /* The number of columns is received for each row */
14114a56b808SFande Kong   ptap->c_othi[0]     = 0;
14124a56b808SFande Kong   rootspacesize       = 0;
14134a56b808SFande Kong   rootspaceoffsets[0] = 0;
14144a56b808SFande Kong   for (i = 0; i < pn; i++) {
14154a56b808SFande Kong     rcvncols = 0;
14164a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
14174a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
14184a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
14194a56b808SFande Kong       rootspacesize++;
14204a56b808SFande Kong     }
14214a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
14224a56b808SFande Kong   }
14239566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
14244a56b808SFande Kong 
14259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
14269566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14279566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14289566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
14299566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
14304a56b808SFande Kong 
14319566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
14324a56b808SFande Kong   nleaves = 0;
14334a56b808SFande Kong   for (i = 0; i < pon; i++) {
1434071fcb05SBarry Smith     owner = 0;
1435bc8e477aSFande Kong     ii    = i / dof;
14369566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
14374a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
14384a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
14394a56b808SFande Kong       iremote[nleaves].rank    = owner;
14404a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
14414a56b808SFande Kong     }
14424a56b808SFande Kong   }
14439566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
14449566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
14454a56b808SFande Kong 
14469566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
14479566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
14489566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
14494a56b808SFande Kong   /* One to one map */
14509566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14514a56b808SFande Kong   /* Get remote data */
14529566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14539566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
14549566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
14559566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1456bc8e477aSFande Kong   pcstart *= dof;
1457bc8e477aSFande Kong   pcend *= dof;
14584a56b808SFande Kong   for (i = 0; i < pn; i++) {
14594a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
14604a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
14614a56b808SFande Kong     for (j = 0; j < nzi; j++) {
14624a56b808SFande Kong       col = rdj[j];
14634a56b808SFande Kong       /* diag part */
14644a56b808SFande Kong       if (col >= pcstart && col < pcend) {
14659566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(htd[i], col));
14664a56b808SFande Kong       } else { /* off diag */
14679566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
14684a56b808SFande Kong       }
14694a56b808SFande Kong     }
14709566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
14714a56b808SFande Kong     dnz[i] = htsize;
14729566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&htd[i]));
14739566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
14744a56b808SFande Kong     onz[i] = htsize;
14759566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
14764a56b808SFande Kong   }
14774a56b808SFande Kong 
14789566063dSJacob Faibussowitsch   PetscCall(PetscFree2(htd, hto));
14799566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
14804a56b808SFande Kong 
14814a56b808SFande Kong   /* local sizes and preallocation */
14829566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
14839566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
14849566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
14859566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
14864a56b808SFande Kong 
14874a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
14886718818eSStefano Zampini   Cmpi->product->data    = ptap;
14896718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
14906718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
14914a56b808SFande Kong 
14924a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
14934a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
14944a56b808SFande Kong   /* pick an algorithm */
1495d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
14964a56b808SFande Kong   alg = 0;
14979566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1498d0609cedSBarry Smith   PetscOptionsEnd();
14994a56b808SFande Kong   switch (alg) {
1500d71ae5a4SJacob Faibussowitsch   case 0:
1501d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1502d71ae5a4SJacob Faibussowitsch     break;
1503d71ae5a4SJacob Faibussowitsch   case 1:
1504d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1505d71ae5a4SJacob Faibussowitsch     break;
1506d71ae5a4SJacob Faibussowitsch   default:
1507d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
15084a56b808SFande Kong   }
15094a56b808SFande Kong   PetscFunctionReturn(0);
15104a56b808SFande Kong }
15114a56b808SFande Kong 
1512d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1513d71ae5a4SJacob Faibussowitsch {
1514bc8e477aSFande Kong   PetscFunctionBegin;
15159566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1516bc8e477aSFande Kong   PetscFunctionReturn(0);
1517bc8e477aSFande Kong }
1518bc8e477aSFande Kong 
1519d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1520d71ae5a4SJacob Faibussowitsch {
15213cdca5ebSHong Zhang   Mat_APMPI               *ptap;
15226718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1523e953e47cSHong Zhang   MPI_Comm                 comm;
1524e953e47cSHong Zhang   PetscMPIInt              size, rank;
1525e953e47cSHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1526e953e47cSHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1527e953e47cSHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
1528e953e47cSHong Zhang   PetscBT                  lnkbt;
1529ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1530ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
1531e953e47cSHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1532e953e47cSHong Zhang   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1533e953e47cSHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1534e953e47cSHong Zhang   MPI_Request             *swaits, *rwaits;
1535e953e47cSHong Zhang   MPI_Status              *sstatus, rstatus;
1536e953e47cSHong Zhang   PetscLayout              rowmap;
1537e953e47cSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1538e953e47cSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
1539e953e47cSHong Zhang   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1540ec07b8f8SHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1541e953e47cSHong Zhang   PetscScalar             *apv;
1542*eec179cfSJacob Faibussowitsch   PetscHMapI               ta;
1543b92f168fSBarry Smith   MatType                  mtype;
1544e83e5f86SFande Kong   const char              *prefix;
1545e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
1546e953e47cSHong Zhang   PetscReal apfill;
1547e953e47cSHong Zhang #endif
1548e953e47cSHong Zhang 
1549e953e47cSHong Zhang   PetscFunctionBegin;
15506718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
155128b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
15529566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
15539566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
15549566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1555e953e47cSHong Zhang 
155652f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
1557ec07b8f8SHong Zhang 
1558e953e47cSHong Zhang   /* create symbolic parallel matrix Cmpi */
15599566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
15609566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
1561e953e47cSHong Zhang 
1562e953e47cSHong Zhang   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1563e953e47cSHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1564e953e47cSHong Zhang 
15653cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
15669566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
156715a3b8e2SHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
1568a4ffb656SHong Zhang   ptap->algType = 1;
156915a3b8e2SHong Zhang 
157015a3b8e2SHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
15719566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
157215a3b8e2SHong Zhang   /* get P_loc by taking all local rows of P */
15739566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
157415a3b8e2SHong Zhang 
157567a12041SHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
157667a12041SHong Zhang   /* --------------------------------- */
15779566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
15789566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
157915a3b8e2SHong Zhang 
158067a12041SHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
158167a12041SHong Zhang   /* ----------------------------------------------------------------- */
158267a12041SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
158352f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1584445158ffSHong Zhang 
15859a6dcab7SHong Zhang   /* create and initialize a linked list */
1586*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
15874b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
15884b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1589*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1590d0e9a2f7SHong Zhang   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
159178d30b94SHong Zhang 
15929566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1593445158ffSHong Zhang 
15948cb82516SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1595ec07b8f8SHong Zhang   if (ao) {
15969566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1597ec07b8f8SHong Zhang   } else {
15989566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1599ec07b8f8SHong Zhang   }
1600445158ffSHong Zhang   current_space = free_space;
160167a12041SHong Zhang   nspacedouble  = 0;
160267a12041SHong Zhang 
16039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
160467a12041SHong Zhang   api[0] = 0;
1605445158ffSHong Zhang   for (i = 0; i < am; i++) {
160667a12041SHong Zhang     /* diagonal portion: Ad[i,:]*P */
16079371c9d4SSatish Balay     ai  = ad->i;
16089371c9d4SSatish Balay     pi  = p_loc->i;
160967a12041SHong Zhang     nzi = ai[i + 1] - ai[i];
161067a12041SHong Zhang     aj  = ad->j + ai[i];
1611445158ffSHong Zhang     for (j = 0; j < nzi; j++) {
1612445158ffSHong Zhang       row  = aj[j];
161367a12041SHong Zhang       pnz  = pi[row + 1] - pi[row];
161467a12041SHong Zhang       Jptr = p_loc->j + pi[row];
1615445158ffSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
16169566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1617445158ffSHong Zhang     }
161867a12041SHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
1619ec07b8f8SHong Zhang     if (ao) {
16209371c9d4SSatish Balay       ai  = ao->i;
16219371c9d4SSatish Balay       pi  = p_oth->i;
162267a12041SHong Zhang       nzi = ai[i + 1] - ai[i];
162367a12041SHong Zhang       aj  = ao->j + ai[i];
1624445158ffSHong Zhang       for (j = 0; j < nzi; j++) {
1625445158ffSHong Zhang         row  = aj[j];
162667a12041SHong Zhang         pnz  = pi[row + 1] - pi[row];
162767a12041SHong Zhang         Jptr = p_oth->j + pi[row];
16289566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1629445158ffSHong Zhang       }
1630ec07b8f8SHong Zhang     }
1631445158ffSHong Zhang     apnz       = lnk[0];
1632445158ffSHong Zhang     api[i + 1] = api[i] + apnz;
1633445158ffSHong Zhang     if (ap_rmax < apnz) ap_rmax = apnz;
1634445158ffSHong Zhang 
1635445158ffSHong Zhang     /* if free space is not available, double the total space in the list */
1636445158ffSHong Zhang     if (current_space->local_remaining < apnz) {
16379566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1638445158ffSHong Zhang       nspacedouble++;
1639445158ffSHong Zhang     }
1640445158ffSHong Zhang 
1641445158ffSHong Zhang     /* Copy data into free space, then initialize lnk */
16429566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1643445158ffSHong Zhang 
1644445158ffSHong Zhang     current_space->array += apnz;
1645445158ffSHong Zhang     current_space->local_used += apnz;
1646445158ffSHong Zhang     current_space->local_remaining -= apnz;
1647445158ffSHong Zhang   }
1648681d504bSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
1649445158ffSHong Zhang   /* destroy list of free space and other temporary array(s) */
16509566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
16519566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
16529566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
16539a6dcab7SHong Zhang 
1654aa690a28SHong Zhang   /* Create AP_loc for reuse */
16559566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
16569566063dSJacob Faibussowitsch   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1657aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
1658ec07b8f8SHong Zhang   if (ao) {
1659aa690a28SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1660ec07b8f8SHong Zhang   } else {
1661ec07b8f8SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1662ec07b8f8SHong Zhang   }
1663aa690a28SHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
1664aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
1665aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
1666aa690a28SHong Zhang 
1667aa690a28SHong Zhang   if (api[am]) {
16689566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
16699566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
1670aa690a28SHong Zhang   } else {
16719566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
1672aa690a28SHong Zhang   }
1673aa690a28SHong Zhang #endif
1674aa690a28SHong Zhang 
1675681d504bSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1676681d504bSHong Zhang   /* ------------------------------------ */
16779566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
16789566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
16799566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
16809566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
16819566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
16829566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
16839566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
16849566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
16859566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
16869566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
16879566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
16889566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
168915a3b8e2SHong Zhang 
169067a12041SHong Zhang   /* (3) send coj of C_oth to other processors  */
169167a12041SHong Zhang   /* ------------------------------------------ */
169215a3b8e2SHong Zhang   /* determine row ownership */
16939566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
1694445158ffSHong Zhang   rowmap->n  = pn;
1695445158ffSHong Zhang   rowmap->bs = 1;
16969566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
1697445158ffSHong Zhang   owners = rowmap->range;
169815a3b8e2SHong Zhang 
169915a3b8e2SHong Zhang   /* determine the number of messages to send, their lengths */
17009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
17019566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
17029566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
170315a3b8e2SHong Zhang 
170467a12041SHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
17059371c9d4SSatish Balay   coi   = c_oth->i;
17069371c9d4SSatish Balay   coj   = c_oth->j;
170767a12041SHong Zhang   con   = ptap->C_oth->rmap->n;
170815a3b8e2SHong Zhang   proc  = 0;
170967a12041SHong Zhang   for (i = 0; i < con; i++) {
171015a3b8e2SHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
171115a3b8e2SHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
171215a3b8e2SHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
171315a3b8e2SHong Zhang   }
171415a3b8e2SHong Zhang 
171515a3b8e2SHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
171615a3b8e2SHong Zhang   owners_co[0] = 0;
171767a12041SHong Zhang   nsend        = 0;
171815a3b8e2SHong Zhang   for (proc = 0; proc < size; proc++) {
171915a3b8e2SHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
172015a3b8e2SHong Zhang     if (len_s[proc]) {
1721445158ffSHong Zhang       nsend++;
172215a3b8e2SHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
172315a3b8e2SHong Zhang       len += len_si[proc];
172415a3b8e2SHong Zhang     }
172515a3b8e2SHong Zhang   }
172615a3b8e2SHong Zhang 
172715a3b8e2SHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
17289566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
17299566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
173015a3b8e2SHong Zhang 
173115a3b8e2SHong Zhang   /* post the Irecv and Isend of coj */
17329566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
17339566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
17349566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
173515a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
173615a3b8e2SHong Zhang     if (!len_s[proc]) continue;
173715a3b8e2SHong Zhang     i = owners_co[proc];
17389566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
173915a3b8e2SHong Zhang     k++;
174015a3b8e2SHong Zhang   }
174115a3b8e2SHong Zhang 
1742681d504bSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1743681d504bSHong Zhang   /* ---------------------------------------- */
17449566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
17459566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
17469566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
17479566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
17489566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
17499566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
17509566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
17519566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
17529566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
17539566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
17549566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
17554222ddf1SHong Zhang 
1756681d504bSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1757681d504bSHong Zhang 
1758681d504bSHong Zhang   /* receives coj are complete */
175948a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
17609566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
17619566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
176215a3b8e2SHong Zhang 
176378d30b94SHong Zhang   /* add received column indices into ta to update Crmax */
176478d30b94SHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
176578d30b94SHong Zhang     Jptr = buf_rj[k];
1766*eec179cfSJacob Faibussowitsch     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISetWithMode(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
176778d30b94SHong Zhang   }
1768*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
1769*eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
17709a6dcab7SHong Zhang 
177115a3b8e2SHong Zhang   /* (4) send and recv coi */
177215a3b8e2SHong Zhang   /*-----------------------*/
17739566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
17749566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
17759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
177615a3b8e2SHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
177715a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
177815a3b8e2SHong Zhang     if (!len_s[proc]) continue;
177915a3b8e2SHong Zhang     /* form outgoing message for i-structure:
178015a3b8e2SHong Zhang          buf_si[0]:                 nrows to be sent
178115a3b8e2SHong Zhang                [1:nrows]:           row index (global)
178215a3b8e2SHong Zhang                [nrows+1:2*nrows+1]: i-structure index
178315a3b8e2SHong Zhang     */
178415a3b8e2SHong Zhang     /*-------------------------------------------*/
178515a3b8e2SHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
178615a3b8e2SHong Zhang     buf_si_i    = buf_si + nrows + 1;
178715a3b8e2SHong Zhang     buf_si[0]   = nrows;
178815a3b8e2SHong Zhang     buf_si_i[0] = 0;
178915a3b8e2SHong Zhang     nrows       = 0;
179015a3b8e2SHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
179115a3b8e2SHong Zhang       nzi                 = coi[i + 1] - coi[i];
179215a3b8e2SHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
179315a3b8e2SHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
179415a3b8e2SHong Zhang       nrows++;
179515a3b8e2SHong Zhang     }
17969566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
179715a3b8e2SHong Zhang     k++;
179815a3b8e2SHong Zhang     buf_si += len_si[proc];
179915a3b8e2SHong Zhang   }
180048a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
18019566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
18029566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
180315a3b8e2SHong Zhang 
18049566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
18059566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
18069566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
18079566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
1808a4ffb656SHong Zhang 
180967a12041SHong Zhang   /* (5) compute the local portion of Cmpi      */
181067a12041SHong Zhang   /* ------------------------------------------ */
181178d30b94SHong Zhang   /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
18129566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
181315a3b8e2SHong Zhang   current_space = free_space;
181415a3b8e2SHong Zhang 
18159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1816445158ffSHong Zhang   for (k = 0; k < nrecv; k++) {
181715a3b8e2SHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
181815a3b8e2SHong Zhang     nrows       = *buf_ri_k[k];
181915a3b8e2SHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
1820a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
182115a3b8e2SHong Zhang   }
1822445158ffSHong Zhang 
1823d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
18249566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
182515a3b8e2SHong Zhang   for (i = 0; i < pn; i++) {
182667a12041SHong Zhang     /* add C_loc into Cmpi */
182767a12041SHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
182867a12041SHong Zhang     Jptr = c_loc->j + c_loc->i[i];
18299566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
183015a3b8e2SHong Zhang 
183115a3b8e2SHong Zhang     /* add received col data into lnk */
1832445158ffSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
183315a3b8e2SHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
183415a3b8e2SHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
183515a3b8e2SHong Zhang         Jptr = buf_rj[k] + *nextci[k];
18369566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
18379371c9d4SSatish Balay         nextrow[k]++;
18389371c9d4SSatish Balay         nextci[k]++;
183915a3b8e2SHong Zhang       }
184015a3b8e2SHong Zhang     }
1841d0e9a2f7SHong Zhang     nzi = lnk[0];
18428cb82516SHong Zhang 
184315a3b8e2SHong Zhang     /* copy data into free space, then initialize lnk */
18449566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
18459566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
184615a3b8e2SHong Zhang   }
18479566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
18489566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
18499566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
185080bb4639SHong Zhang 
1851ae5f0867Sstefano_zampini   /* local sizes and preallocation */
18529566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1853ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
18549566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
18559566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1856ac94c67aSTristan Konolige   }
18579566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1858d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
185915a3b8e2SHong Zhang 
1860445158ffSHong Zhang   /* members in merge */
18619566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
18629566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
18639566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
18649566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
18659566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
18669566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
18679566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
186815a3b8e2SHong Zhang 
18699566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pN, &ptap->apa));
18702259aa2eSHong Zhang 
18716718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
18726718818eSStefano Zampini   Cmpi->product->data    = ptap;
18736718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
18746718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
18756718818eSStefano Zampini 
18761a47ec54SHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
18771a47ec54SHong Zhang   Cmpi->assembled = PETSC_FALSE;
18780d3441aeSHong Zhang   PetscFunctionReturn(0);
18790d3441aeSHong Zhang }
18800d3441aeSHong Zhang 
1881d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1882d71ae5a4SJacob Faibussowitsch {
18836718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
18840d3441aeSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
18852dd9e643SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
18866718818eSStefano Zampini   Mat_APMPI         *ptap;
18879ce11a7cSHong Zhang   Mat                AP_loc, C_loc, C_oth;
18880d3441aeSHong Zhang   PetscInt           i, rstart, rend, cm, ncols, row;
18898cb82516SHong Zhang   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
18908cb82516SHong Zhang   PetscScalar       *apa;
18910d3441aeSHong Zhang   const PetscInt    *cols;
18920d3441aeSHong Zhang   const PetscScalar *vals;
18930d3441aeSHong Zhang 
18940d3441aeSHong Zhang   PetscFunctionBegin;
18956718818eSStefano Zampini   MatCheckProduct(C, 3);
18966718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
189728b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
189828b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1899a9899c97SHong Zhang 
19009566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
1901e497d3c8SHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
1902748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
19039566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
19049566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1905748c7196SHong Zhang   }
19060d3441aeSHong Zhang 
1907e497d3c8SHong Zhang   /* 2) get AP_loc */
19080d3441aeSHong Zhang   AP_loc = ptap->AP_loc;
19098cb82516SHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
19100d3441aeSHong Zhang 
1911e497d3c8SHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
19120d3441aeSHong Zhang   /*-----------------------------------------------------*/
1913748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1914748c7196SHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
19159566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
19169566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1917e497d3c8SHong Zhang   }
19180d3441aeSHong Zhang 
19198cb82516SHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
19208cb82516SHong Zhang   /* ---------------------------------------------- */
19210d3441aeSHong Zhang   /* get data from symbolic products */
19228cb82516SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1923ad540459SPierre Jolivet   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
19248cb82516SHong Zhang   apa = ptap->apa;
1925681d504bSHong Zhang   api = ap->i;
1926681d504bSHong Zhang   apj = ap->j;
1927e497d3c8SHong Zhang   for (i = 0; i < am; i++) {
1928445158ffSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1929e497d3c8SHong Zhang     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1930e497d3c8SHong Zhang     apnz = api[i + 1] - api[i];
1931e497d3c8SHong Zhang     for (j = 0; j < apnz; j++) {
1932e497d3c8SHong Zhang       col                 = apj[j + api[i]];
1933e497d3c8SHong Zhang       ap->a[j + ap->i[i]] = apa[col];
1934e497d3c8SHong Zhang       apa[col]            = 0.0;
19350d3441aeSHong Zhang     }
19360d3441aeSHong Zhang   }
1937976452c9SRichard Tran Mills   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
19389566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
19390d3441aeSHong Zhang 
19408cb82516SHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
19419566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_loc));
19429566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_oth));
19439ce11a7cSHong Zhang   C_loc = ptap->C_loc;
19449ce11a7cSHong Zhang   C_oth = ptap->C_oth;
19450d3441aeSHong Zhang 
19460d3441aeSHong Zhang   /* add C_loc and Co to to C */
19479566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
19480d3441aeSHong Zhang 
19490d3441aeSHong Zhang   /* C_loc -> C */
19500d3441aeSHong Zhang   cm    = C_loc->rmap->N;
19519ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
19528cb82516SHong Zhang   cols  = c_seq->j;
19538cb82516SHong Zhang   vals  = c_seq->a;
1954904d1e70Sandi selinger 
1955e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1956904d1e70Sandi selinger   /* when there are no off-processor parts.  */
19571de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
19581de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
19591de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
1960904d1e70Sandi selinger   if (C->assembled) {
1961904d1e70Sandi selinger     C->was_assembled = PETSC_TRUE;
1962904d1e70Sandi selinger     C->assembled     = PETSC_FALSE;
1963904d1e70Sandi selinger   }
1964904d1e70Sandi selinger   if (C->was_assembled) {
19650d3441aeSHong Zhang     for (i = 0; i < cm; i++) {
19669ce11a7cSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
19670d3441aeSHong Zhang       row   = rstart + i;
19689566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19699371c9d4SSatish Balay       cols += ncols;
19709371c9d4SSatish Balay       vals += ncols;
19710d3441aeSHong Zhang     }
1972904d1e70Sandi selinger   } else {
19739566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1974904d1e70Sandi selinger   }
19750d3441aeSHong Zhang 
19760d3441aeSHong Zhang   /* Co -> C, off-processor part */
19779ce11a7cSHong Zhang   cm    = C_oth->rmap->N;
19789ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
19798cb82516SHong Zhang   cols  = c_seq->j;
19808cb82516SHong Zhang   vals  = c_seq->a;
19819ce11a7cSHong Zhang   for (i = 0; i < cm; i++) {
19829ce11a7cSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
19830d3441aeSHong Zhang     row   = p->garray[i];
19849566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19859371c9d4SSatish Balay     cols += ncols;
19869371c9d4SSatish Balay     vals += ncols;
19870d3441aeSHong Zhang   }
1988904d1e70Sandi selinger 
19899566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
19909566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
19910d3441aeSHong Zhang 
1992748c7196SHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
19930d3441aeSHong Zhang   PetscFunctionReturn(0);
19940d3441aeSHong Zhang }
19954222ddf1SHong Zhang 
1996d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1997d71ae5a4SJacob Faibussowitsch {
19984222ddf1SHong Zhang   Mat_Product        *product = C->product;
19994222ddf1SHong Zhang   Mat                 A = product->A, P = product->B;
20004222ddf1SHong Zhang   MatProductAlgorithm alg  = product->alg;
20014222ddf1SHong Zhang   PetscReal           fill = product->fill;
20024222ddf1SHong Zhang   PetscBool           flg;
20034222ddf1SHong Zhang 
20044222ddf1SHong Zhang   PetscFunctionBegin;
20054222ddf1SHong Zhang   /* scalable: do R=P^T locally, then C=R*A*P */
20069566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
20074222ddf1SHong Zhang   if (flg) {
20089566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
20094222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20104222ddf1SHong Zhang     goto next;
20114222ddf1SHong Zhang   }
20124222ddf1SHong Zhang 
20134222ddf1SHong Zhang   /* nonscalable: do R=P^T locally, then C=R*A*P */
20149566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
20154222ddf1SHong Zhang   if (flg) {
20169566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
20174222ddf1SHong Zhang     goto next;
20184222ddf1SHong Zhang   }
20194222ddf1SHong Zhang 
20204222ddf1SHong Zhang   /* allatonce */
20219566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
20224222ddf1SHong Zhang   if (flg) {
20239566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
20244222ddf1SHong Zhang     goto next;
20254222ddf1SHong Zhang   }
20264222ddf1SHong Zhang 
20274222ddf1SHong Zhang   /* allatonce_merged */
20289566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
20294222ddf1SHong Zhang   if (flg) {
20309566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
20314222ddf1SHong Zhang     goto next;
20324222ddf1SHong Zhang   }
20334222ddf1SHong Zhang 
20344e84afc0SStefano Zampini   /* backend general code */
20359566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "backend", &flg));
20364e84afc0SStefano Zampini   if (flg) {
20379566063dSJacob Faibussowitsch     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
20384e84afc0SStefano Zampini     PetscFunctionReturn(0);
20394e84afc0SStefano Zampini   }
20404e84afc0SStefano Zampini 
20414222ddf1SHong Zhang   /* hypre */
20424222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
20439566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
20444222ddf1SHong Zhang   if (flg) {
20459566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
20464222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20474222ddf1SHong Zhang     PetscFunctionReturn(0);
20484222ddf1SHong Zhang   }
20494222ddf1SHong Zhang #endif
20504222ddf1SHong Zhang   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
20514222ddf1SHong Zhang 
20524222ddf1SHong Zhang next:
20534222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_PtAP;
20544222ddf1SHong Zhang   PetscFunctionReturn(0);
20554222ddf1SHong Zhang }
2056