xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision bbea24aa56d87d3065f0603396f6e854e33a88f9)
182412ba7SHong Zhang 
242c57489SHong Zhang /*
342c57489SHong Zhang   Defines projective product routines where A is a MPIAIJ matrix
442c57489SHong Zhang           C = P^T * A * P
542c57489SHong Zhang */
642c57489SHong Zhang 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <../src/mat/impls/aij/mpi/mpiaij.h>
10c6db04a5SJed Brown #include <petscbt.h>
118563dfccSBarry Smith #include <petsctime.h>
12694f88d4SFande Kong #include <petsc/private/hashmapiv.h>
134a56b808SFande Kong #include <petsc/private/hashseti.h>
144a56b808SFande Kong #include <petscsf.h>
1542c57489SHong Zhang 
16d71ae5a4SJacob Faibussowitsch PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
17d71ae5a4SJacob Faibussowitsch {
18cf97cf3cSHong Zhang   PetscBool         iascii;
19cf97cf3cSHong Zhang   PetscViewerFormat format;
206718818eSStefano Zampini   Mat_APMPI        *ptap;
21cf97cf3cSHong Zhang 
22cf97cf3cSHong Zhang   PetscFunctionBegin;
236718818eSStefano Zampini   MatCheckProduct(A, 1);
246718818eSStefano Zampini   ptap = (Mat_APMPI *)A->product->data;
259566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
26cf97cf3cSHong Zhang   if (iascii) {
279566063dSJacob Faibussowitsch     PetscCall(PetscViewerGetFormat(viewer, &format));
28cf97cf3cSHong Zhang     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29cf97cf3cSHong Zhang       if (ptap->algType == 0) {
309566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
31cf97cf3cSHong Zhang       } else if (ptap->algType == 1) {
329566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
334a56b808SFande Kong       } else if (ptap->algType == 2) {
349566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
354a56b808SFande Kong       } else if (ptap->algType == 3) {
369566063dSJacob Faibussowitsch         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
37cf97cf3cSHong Zhang       }
38cf97cf3cSHong Zhang     }
39cf97cf3cSHong Zhang   }
403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41cf97cf3cSHong Zhang }
42cf97cf3cSHong Zhang 
43d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44d71ae5a4SJacob Faibussowitsch {
456718818eSStefano Zampini   Mat_APMPI           *ptap = (Mat_APMPI *)data;
4660552ceaSHong Zhang   Mat_Merge_SeqsToMPI *merge;
47cc6cb787SHong Zhang 
48cc6cb787SHong Zhang   PetscFunctionBegin;
499566063dSJacob Faibussowitsch   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
509566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->bufa));
519566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_loc));
529566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->P_oth));
539566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
549566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Rd));
559566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Ro));
568403a639SHong Zhang   if (ptap->AP_loc) { /* used by alg_rap */
57681d504bSHong Zhang     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
589566063dSJacob Faibussowitsch     PetscCall(PetscFree(ap->i));
599566063dSJacob Faibussowitsch     PetscCall(PetscFree2(ap->j, ap->a));
609566063dSJacob Faibussowitsch     PetscCall(MatDestroy(&ptap->AP_loc));
618403a639SHong Zhang   } else { /* used by alg_ptap */
629566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->api));
639566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->apj));
64681d504bSHong Zhang   }
659566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_loc));
669566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->C_oth));
679566063dSJacob Faibussowitsch   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68a560ef98SHong Zhang 
699566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&ptap->Pt));
7060552ceaSHong Zhang 
7160552ceaSHong Zhang   merge = ptap->merge;
728403a639SHong Zhang   if (merge) { /* used by alg_ptap */
739566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->id_r));
749566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_s));
759566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->len_r));
769566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bi));
779566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->bj));
789566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri[0]));
799566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_ri));
809566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj[0]));
819566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->buf_rj));
829566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coi));
839566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->coj));
849566063dSJacob Faibussowitsch     PetscCall(PetscFree(merge->owners_co));
859566063dSJacob Faibussowitsch     PetscCall(PetscLayoutDestroy(&merge->rowmap));
869566063dSJacob Faibussowitsch     PetscCall(PetscFree(ptap->merge));
87bf0cc555SLisandro Dalcin   }
889566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
894a56b808SFande Kong 
909566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&ptap->sf));
919566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_othi));
929566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap->c_rmti));
939566063dSJacob Faibussowitsch   PetscCall(PetscFree(ptap));
943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
95cc6cb787SHong Zhang }
96cc6cb787SHong Zhang 
97d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
98d71ae5a4SJacob Faibussowitsch {
996718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
100cf742fccSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
10192ec70a1SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1026718818eSStefano Zampini   Mat_APMPI         *ptap;
103cf742fccSHong Zhang   Mat                AP_loc, C_loc, C_oth;
104a3bb6f32SFande Kong   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
105cf742fccSHong Zhang   PetscScalar       *apa;
106cf742fccSHong Zhang   const PetscInt    *cols;
107cf742fccSHong Zhang   const PetscScalar *vals;
108cf742fccSHong Zhang 
109cf742fccSHong Zhang   PetscFunctionBegin;
1106718818eSStefano Zampini   MatCheckProduct(C, 3);
1116718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
11228b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
11328b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
11480da8df7SHong Zhang 
1159566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
116cf742fccSHong Zhang 
117cf742fccSHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
118cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1199566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1209566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
121cf742fccSHong Zhang   }
122cf742fccSHong Zhang 
123cf742fccSHong Zhang   /* 2) get AP_loc */
124cf742fccSHong Zhang   AP_loc = ptap->AP_loc;
125cf742fccSHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
126cf742fccSHong Zhang 
127cf742fccSHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128cf742fccSHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
129cf742fccSHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
1309566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1319566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
132cf742fccSHong Zhang   }
133cf742fccSHong Zhang 
134cf742fccSHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
135cf742fccSHong Zhang   /* get data from symbolic products */
136cf742fccSHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
13752f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
13852f7967eSHong Zhang 
139cf742fccSHong Zhang   api = ap->i;
140cf742fccSHong Zhang   apj = ap->j;
1419566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
142cf742fccSHong Zhang   for (i = 0; i < am; i++) {
143cf742fccSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
144cf742fccSHong Zhang     apnz = api[i + 1] - api[i];
145b4e8d1b6SHong Zhang     apa  = ap->a + api[i];
1469566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(apa, apnz));
147b4e8d1b6SHong Zhang     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
148cf742fccSHong Zhang   }
1499566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
15008401ef6SPierre Jolivet   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
151cf742fccSHong Zhang 
152cf742fccSHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
153154d0d78SFande Kong   /* Always use scalable version since we are in the MPI scalable version */
1549566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
1559566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
156cf742fccSHong Zhang 
157cf742fccSHong Zhang   C_loc = ptap->C_loc;
158cf742fccSHong Zhang   C_oth = ptap->C_oth;
159cf742fccSHong Zhang 
160cf742fccSHong Zhang   /* add C_loc and Co to to C */
1619566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
162cf742fccSHong Zhang 
163cf742fccSHong Zhang   /* C_loc -> C */
164cf742fccSHong Zhang   cm    = C_loc->rmap->N;
165cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
166cf742fccSHong Zhang   cols  = c_seq->j;
167cf742fccSHong Zhang   vals  = c_seq->a;
1689566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
169edeac6deSandi selinger 
170e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
171edeac6deSandi selinger   /* when there are no off-processor parts.  */
1721de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1731de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1741de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
175edeac6deSandi selinger   if (C->assembled) {
176edeac6deSandi selinger     C->was_assembled = PETSC_TRUE;
177edeac6deSandi selinger     C->assembled     = PETSC_FALSE;
178edeac6deSandi selinger   }
179edeac6deSandi selinger   if (C->was_assembled) {
180cf742fccSHong Zhang     for (i = 0; i < cm; i++) {
181cf742fccSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
182cf742fccSHong Zhang       row   = rstart + i;
1839566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1849371c9d4SSatish Balay       cols += ncols;
1859371c9d4SSatish Balay       vals += ncols;
186cf742fccSHong Zhang     }
187edeac6deSandi selinger   } else {
1889566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
189edeac6deSandi selinger   }
1909566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
19108401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
192cf742fccSHong Zhang 
193cf742fccSHong Zhang   /* Co -> C, off-processor part */
194cf742fccSHong Zhang   cm    = C_oth->rmap->N;
195cf742fccSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
196cf742fccSHong Zhang   cols  = c_seq->j;
197cf742fccSHong Zhang   vals  = c_seq->a;
1989566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
199cf742fccSHong Zhang   for (i = 0; i < cm; i++) {
200cf742fccSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
201cf742fccSHong Zhang     row   = p->garray[i];
2029566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
2039371c9d4SSatish Balay     cols += ncols;
2049371c9d4SSatish Balay     vals += ncols;
205cf742fccSHong Zhang   }
2069566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
2079566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
208cf742fccSHong Zhang 
209cf742fccSHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
21080da8df7SHong Zhang 
2119566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
21208401ef6SPierre Jolivet   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
2133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
214cf742fccSHong Zhang }
215cf742fccSHong Zhang 
216d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
217d71ae5a4SJacob Faibussowitsch {
2183cdca5ebSHong Zhang   Mat_APMPI               *ptap;
2196718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
2202259aa2eSHong Zhang   MPI_Comm                 comm;
2212259aa2eSHong Zhang   PetscMPIInt              size, rank;
2224222ddf1SHong Zhang   Mat                      P_loc, P_oth;
22315a3b8e2SHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
224d0e9a2f7SHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
225d0e9a2f7SHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
226ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
227ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
22815a3b8e2SHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
2297c70b0e9SStefano Zampini   const PetscInt          *owners;
2307c70b0e9SStefano Zampini   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
23115a3b8e2SHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
23215a3b8e2SHong Zhang   MPI_Request             *swaits, *rwaits;
23315a3b8e2SHong Zhang   MPI_Status              *sstatus, rstatus;
234445158ffSHong Zhang   PetscLayout              rowmap;
235445158ffSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
236445158ffSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
237a3bb6f32SFande Kong   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
23852f7967eSHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
23967a12041SHong Zhang   PetscScalar             *apv;
240eec179cfSJacob Faibussowitsch   PetscHMapI               ta;
241b92f168fSBarry Smith   MatType                  mtype;
242e83e5f86SFande Kong   const char              *prefix;
243aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
2448cb82516SHong Zhang   PetscReal apfill;
245aa690a28SHong Zhang #endif
24667a12041SHong Zhang 
24767a12041SHong Zhang   PetscFunctionBegin;
2486718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
24928b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
2509566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2519566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
2529566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
253ae5f0867Sstefano_zampini 
25452f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
25552f7967eSHong Zhang 
256ae5f0867Sstefano_zampini   /* create symbolic parallel matrix Cmpi */
2579566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
2589566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
259ae5f0867Sstefano_zampini 
2603cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
2619566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
262e953e47cSHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
263cf97cf3cSHong Zhang   ptap->algType = 0;
264e953e47cSHong Zhang 
265e953e47cSHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
2669566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
267e953e47cSHong Zhang   /* get P_loc by taking all local rows of P */
2689566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
26976db11f6SHong Zhang 
27076db11f6SHong Zhang   ptap->P_loc = P_loc;
27176db11f6SHong Zhang   ptap->P_oth = P_oth;
272e953e47cSHong Zhang 
273e953e47cSHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
2749566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
2759566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
276e953e47cSHong Zhang 
277e953e47cSHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
27876db11f6SHong Zhang   p_loc = (Mat_SeqAIJ *)P_loc->data;
27952f7967eSHong Zhang   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
280e953e47cSHong Zhang 
281e953e47cSHong Zhang   /* create and initialize a linked list */
282eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
28376db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
28476db11f6SHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
285eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
286e953e47cSHong Zhang 
2879566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
288e953e47cSHong Zhang 
289e953e47cSHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
29052f7967eSHong Zhang   if (ao) {
2919566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
29252f7967eSHong Zhang   } else {
2939566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
29452f7967eSHong Zhang   }
295e953e47cSHong Zhang   current_space = free_space;
296e953e47cSHong Zhang   nspacedouble  = 0;
297e953e47cSHong Zhang 
2989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
299e953e47cSHong Zhang   api[0] = 0;
300e953e47cSHong Zhang   for (i = 0; i < am; i++) {
301e953e47cSHong Zhang     /* diagonal portion: Ad[i,:]*P */
3029371c9d4SSatish Balay     ai  = ad->i;
3039371c9d4SSatish Balay     pi  = p_loc->i;
304e953e47cSHong Zhang     nzi = ai[i + 1] - ai[i];
305e953e47cSHong Zhang     aj  = ad->j + ai[i];
306e953e47cSHong Zhang     for (j = 0; j < nzi; j++) {
307e953e47cSHong Zhang       row  = aj[j];
308e953e47cSHong Zhang       pnz  = pi[row + 1] - pi[row];
309e953e47cSHong Zhang       Jptr = p_loc->j + pi[row];
310e953e47cSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
3119566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
312e953e47cSHong Zhang     }
313e953e47cSHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
31452f7967eSHong Zhang     if (ao) {
3159371c9d4SSatish Balay       ai  = ao->i;
3169371c9d4SSatish Balay       pi  = p_oth->i;
317e953e47cSHong Zhang       nzi = ai[i + 1] - ai[i];
318e953e47cSHong Zhang       aj  = ao->j + ai[i];
319e953e47cSHong Zhang       for (j = 0; j < nzi; j++) {
320e953e47cSHong Zhang         row  = aj[j];
321e953e47cSHong Zhang         pnz  = pi[row + 1] - pi[row];
322e953e47cSHong Zhang         Jptr = p_oth->j + pi[row];
3239566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
324e953e47cSHong Zhang       }
32552f7967eSHong Zhang     }
326e953e47cSHong Zhang     apnz       = lnk[0];
327e953e47cSHong Zhang     api[i + 1] = api[i] + apnz;
328e953e47cSHong Zhang 
329e953e47cSHong Zhang     /* if free space is not available, double the total space in the list */
330e953e47cSHong Zhang     if (current_space->local_remaining < apnz) {
3319566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
332e953e47cSHong Zhang       nspacedouble++;
333e953e47cSHong Zhang     }
334e953e47cSHong Zhang 
335e953e47cSHong Zhang     /* Copy data into free space, then initialize lnk */
3369566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
337e953e47cSHong Zhang 
338e953e47cSHong Zhang     current_space->array += apnz;
339e953e47cSHong Zhang     current_space->local_used += apnz;
340e953e47cSHong Zhang     current_space->local_remaining -= apnz;
341e953e47cSHong Zhang   }
342e953e47cSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
343e953e47cSHong Zhang   /* destroy list of free space and other temporary array(s) */
3449566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
3459566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
3469566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
347e953e47cSHong Zhang 
348e953e47cSHong Zhang   /* Create AP_loc for reuse */
3499566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
3509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
351e953e47cSHong Zhang 
352e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
35352f7967eSHong Zhang   if (ao) {
354e953e47cSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
35552f7967eSHong Zhang   } else {
35652f7967eSHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
35752f7967eSHong Zhang   }
358e953e47cSHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
359e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
360e953e47cSHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
361e953e47cSHong Zhang 
362e953e47cSHong Zhang   if (api[am]) {
3639566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
3649566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
365e953e47cSHong Zhang   } else {
3669566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
367e953e47cSHong Zhang   }
368e953e47cSHong Zhang #endif
369e953e47cSHong Zhang 
370e953e47cSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
3719566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
3729566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
3739566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
3749566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
3754222ddf1SHong Zhang 
3769566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
3779566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
3789566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
3799566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
3809566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
381e953e47cSHong Zhang 
382e953e47cSHong Zhang   /* (3) send coj of C_oth to other processors  */
383e953e47cSHong Zhang   /* determine row ownership */
3849566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
3859566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
3869566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
3879566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
3889566063dSJacob Faibussowitsch   PetscCall(PetscLayoutGetRanges(rowmap, &owners));
389e953e47cSHong Zhang 
390e953e47cSHong Zhang   /* determine the number of messages to send, their lengths */
3919566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
3929566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
3939566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
394e953e47cSHong Zhang 
395e953e47cSHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
3969371c9d4SSatish Balay   coi   = c_oth->i;
3979371c9d4SSatish Balay   coj   = c_oth->j;
398e953e47cSHong Zhang   con   = ptap->C_oth->rmap->n;
399e953e47cSHong Zhang   proc  = 0;
4009566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
401e953e47cSHong Zhang   for (i = 0; i < con; i++) {
402e953e47cSHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
403e953e47cSHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
404e953e47cSHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
405e953e47cSHong Zhang   }
406e953e47cSHong Zhang 
407e953e47cSHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
408e953e47cSHong Zhang   owners_co[0] = 0;
409e953e47cSHong Zhang   nsend        = 0;
410e953e47cSHong Zhang   for (proc = 0; proc < size; proc++) {
411e953e47cSHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
412e953e47cSHong Zhang     if (len_s[proc]) {
413e953e47cSHong Zhang       nsend++;
414e953e47cSHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
415e953e47cSHong Zhang       len += len_si[proc];
416e953e47cSHong Zhang     }
417e953e47cSHong Zhang   }
418e953e47cSHong Zhang 
419e953e47cSHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
4209566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
4219566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
422e953e47cSHong Zhang 
423e953e47cSHong Zhang   /* post the Irecv and Isend of coj */
4249566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
4259566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
4269566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
427e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
428e953e47cSHong Zhang     if (!len_s[proc]) continue;
429e953e47cSHong Zhang     i = owners_co[proc];
4309566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
431e953e47cSHong Zhang     k++;
432e953e47cSHong Zhang   }
433e953e47cSHong Zhang 
434e953e47cSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
4359566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
4369566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
4379566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
4389566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
4394222ddf1SHong Zhang 
4409566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
4419566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
4424222ddf1SHong Zhang 
4439566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
4449566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
4454222ddf1SHong Zhang 
446e953e47cSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
4479566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
448e953e47cSHong Zhang 
449e953e47cSHong Zhang   /* receives coj are complete */
45048a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4519566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
4529566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
453e953e47cSHong Zhang 
454e953e47cSHong Zhang   /* add received column indices into ta to update Crmax */
455e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
456e953e47cSHong Zhang     Jptr = buf_rj[k];
457c76ffc5fSJacob Faibussowitsch     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
458e953e47cSHong Zhang   }
459eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
460eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
461e953e47cSHong Zhang 
462e953e47cSHong Zhang   /* (4) send and recv coi */
4639566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
4649566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
4659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
466e953e47cSHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
467e953e47cSHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
468e953e47cSHong Zhang     if (!len_s[proc]) continue;
469e953e47cSHong Zhang     /* form outgoing message for i-structure:
470e953e47cSHong Zhang          buf_si[0]:                 nrows to be sent
471e953e47cSHong Zhang                [1:nrows]:           row index (global)
472e953e47cSHong Zhang                [nrows+1:2*nrows+1]: i-structure index
473e953e47cSHong Zhang     */
474e953e47cSHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
475e953e47cSHong Zhang     buf_si_i    = buf_si + nrows + 1;
476e953e47cSHong Zhang     buf_si[0]   = nrows;
477e953e47cSHong Zhang     buf_si_i[0] = 0;
478e953e47cSHong Zhang     nrows       = 0;
479e953e47cSHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
480e953e47cSHong Zhang       nzi                 = coi[i + 1] - coi[i];
481e953e47cSHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
482e953e47cSHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
483e953e47cSHong Zhang       nrows++;
484e953e47cSHong Zhang     }
4859566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
486e953e47cSHong Zhang     k++;
487e953e47cSHong Zhang     buf_si += len_si[proc];
488e953e47cSHong Zhang   }
48948a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
4909566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
4919566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
492e953e47cSHong Zhang 
4939566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
4949566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
4959566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
4969566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
497b4e8d1b6SHong Zhang 
498e953e47cSHong Zhang   /* (5) compute the local portion of Cmpi      */
499*bbea24aaSStefano Zampini   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
5009566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
501e953e47cSHong Zhang   current_space = free_space;
502e953e47cSHong Zhang 
5039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
504e953e47cSHong Zhang   for (k = 0; k < nrecv; k++) {
505e953e47cSHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
506e953e47cSHong Zhang     nrows       = *buf_ri_k[k];
507e953e47cSHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
508a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
509e953e47cSHong Zhang   }
510e953e47cSHong Zhang 
511d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
5129566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
513e953e47cSHong Zhang   for (i = 0; i < pn; i++) {
514e953e47cSHong Zhang     /* add C_loc into Cmpi */
515e953e47cSHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
516e953e47cSHong Zhang     Jptr = c_loc->j + c_loc->i[i];
5179566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
518e953e47cSHong Zhang 
519e953e47cSHong Zhang     /* add received col data into lnk */
520e953e47cSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
521e953e47cSHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
522e953e47cSHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
523e953e47cSHong Zhang         Jptr = buf_rj[k] + *nextci[k];
5249566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
5259371c9d4SSatish Balay         nextrow[k]++;
5269371c9d4SSatish Balay         nextci[k]++;
527e953e47cSHong Zhang       }
528e953e47cSHong Zhang     }
529e953e47cSHong Zhang     nzi = lnk[0];
530e953e47cSHong Zhang 
531e953e47cSHong Zhang     /* copy data into free space, then initialize lnk */
5329566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
5339566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
534e953e47cSHong Zhang   }
5359566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
5369566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
5379566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
538e953e47cSHong Zhang 
539e953e47cSHong Zhang   /* local sizes and preallocation */
5409566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
541ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
5429566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
5439566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
544ac94c67aSTristan Konolige   }
5459566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
546d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
547e953e47cSHong Zhang 
548e953e47cSHong Zhang   /* members in merge */
5499566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
5509566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
5519566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
5529566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
5539566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
5549566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
5559566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
556e953e47cSHong Zhang 
557a3bb6f32SFande Kong   nout = 0;
5589566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
55908401ef6SPierre Jolivet   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
5609566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
56108401ef6SPierre Jolivet   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
562a3bb6f32SFande Kong 
5636718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
5646718818eSStefano Zampini   Cmpi->product->data    = ptap;
5656718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
5666718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
5676718818eSStefano Zampini 
5686718818eSStefano Zampini   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
5696718818eSStefano Zampini   Cmpi->assembled        = PETSC_FALSE;
5706718818eSStefano Zampini   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
5713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
572e953e47cSHong Zhang }
573e953e47cSHong Zhang 
574d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
575d71ae5a4SJacob Faibussowitsch {
5764a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5774a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
5784a56b808SFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
579bc8e477aSFande Kong   PetscInt    pcstart, pcend, column, offset;
5804a56b808SFande Kong 
5814a56b808SFande Kong   PetscFunctionBegin;
5824a56b808SFande Kong   pcstart = P->cmap->rstart;
583bc8e477aSFande Kong   pcstart *= dof;
5844a56b808SFande Kong   pcend = P->cmap->rend;
585bc8e477aSFande Kong   pcend *= dof;
5864a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
5874a56b808SFande Kong   ai  = ad->i;
5884a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
5894a56b808SFande Kong   aj  = ad->j + ai[i];
5904a56b808SFande Kong   for (j = 0; j < nzi; j++) {
5914a56b808SFande Kong     row    = aj[j];
592bc8e477aSFande Kong     offset = row % dof;
593bc8e477aSFande Kong     row /= dof;
5944a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
5954a56b808SFande Kong     pj   = pd->j + pd->i[row];
59648a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
5974a56b808SFande Kong   }
598bc8e477aSFande Kong   /* off diag P */
5994a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6004a56b808SFande Kong     row    = aj[j];
601bc8e477aSFande Kong     offset = row % dof;
602bc8e477aSFande Kong     row /= dof;
6034a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6044a56b808SFande Kong     pj   = po->j + po->i[row];
60548a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
6064a56b808SFande Kong   }
6074a56b808SFande Kong 
6084a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6094a56b808SFande Kong   if (ao) {
6104a56b808SFande Kong     ai  = ao->i;
6114a56b808SFande Kong     pi  = p_oth->i;
6124a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6134a56b808SFande Kong     aj  = ao->j + ai[i];
6144a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6154a56b808SFande Kong       row       = aj[j];
616bc8e477aSFande Kong       offset    = a->garray[row] % dof;
617bc8e477aSFande Kong       row       = map[row];
6184a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6194a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6204a56b808SFande Kong       for (col = 0; col < pnz; col++) {
621bc8e477aSFande Kong         column = p_othcols[col] * dof + offset;
6224a56b808SFande Kong         if (column >= pcstart && column < pcend) {
6239566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(dht, column));
6244a56b808SFande Kong         } else {
6259566063dSJacob Faibussowitsch           PetscCall(PetscHSetIAdd(oht, column));
6264a56b808SFande Kong         }
6274a56b808SFande Kong       }
6284a56b808SFande Kong     }
6294a56b808SFande Kong   } /* end if (ao) */
6303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6314a56b808SFande Kong }
6324a56b808SFande Kong 
633d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
634d71ae5a4SJacob Faibussowitsch {
6354a56b808SFande Kong   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
6364a56b808SFande Kong   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
637bc8e477aSFande Kong   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
6384a56b808SFande Kong   PetscScalar ra, *aa, *pa;
6394a56b808SFande Kong 
6404a56b808SFande Kong   PetscFunctionBegin;
6414a56b808SFande Kong   pcstart = P->cmap->rstart;
642bc8e477aSFande Kong   pcstart *= dof;
643bc8e477aSFande Kong 
6444a56b808SFande Kong   /* diagonal portion: Ad[i,:]*P */
6454a56b808SFande Kong   ai  = ad->i;
6464a56b808SFande Kong   nzi = ai[i + 1] - ai[i];
6474a56b808SFande Kong   aj  = ad->j + ai[i];
6484a56b808SFande Kong   aa  = ad->a + ai[i];
6494a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6504a56b808SFande Kong     ra     = aa[j];
6514a56b808SFande Kong     row    = aj[j];
652bc8e477aSFande Kong     offset = row % dof;
653bc8e477aSFande Kong     row /= dof;
6544a56b808SFande Kong     nzpi = pd->i[row + 1] - pd->i[row];
6554a56b808SFande Kong     pj   = pd->j + pd->i[row];
6564a56b808SFande Kong     pa   = pd->a + pd->i[row];
65748a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
6589566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6594a56b808SFande Kong   }
6604a56b808SFande Kong   for (j = 0; j < nzi; j++) {
6614a56b808SFande Kong     ra     = aa[j];
6624a56b808SFande Kong     row    = aj[j];
663bc8e477aSFande Kong     offset = row % dof;
664bc8e477aSFande Kong     row /= dof;
6654a56b808SFande Kong     nzpi = po->i[row + 1] - po->i[row];
6664a56b808SFande Kong     pj   = po->j + po->i[row];
6674a56b808SFande Kong     pa   = po->a + po->i[row];
66848a46eb9SPierre Jolivet     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
6699566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(2.0 * nzpi));
6704a56b808SFande Kong   }
6714a56b808SFande Kong 
6724a56b808SFande Kong   /* off diagonal part: Ao[i, :]*P_oth */
6734a56b808SFande Kong   if (ao) {
6744a56b808SFande Kong     ai  = ao->i;
6754a56b808SFande Kong     pi  = p_oth->i;
6764a56b808SFande Kong     nzi = ai[i + 1] - ai[i];
6774a56b808SFande Kong     aj  = ao->j + ai[i];
6784a56b808SFande Kong     aa  = ao->a + ai[i];
6794a56b808SFande Kong     for (j = 0; j < nzi; j++) {
6804a56b808SFande Kong       row       = aj[j];
681bc8e477aSFande Kong       offset    = a->garray[row] % dof;
682bc8e477aSFande Kong       row       = map[row];
6834a56b808SFande Kong       ra        = aa[j];
6844a56b808SFande Kong       pnz       = pi[row + 1] - pi[row];
6854a56b808SFande Kong       p_othcols = p_oth->j + pi[row];
6864a56b808SFande Kong       pa        = p_oth->a + pi[row];
68748a46eb9SPierre Jolivet       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
6889566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * pnz));
6894a56b808SFande Kong     }
6904a56b808SFande Kong   } /* end if (ao) */
691bb5ddf68SFande Kong 
6923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6934a56b808SFande Kong }
6944a56b808SFande Kong 
695bc8e477aSFande Kong PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
6965c65b9ecSFande Kong 
697d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
698d71ae5a4SJacob Faibussowitsch {
6994a56b808SFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
700bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
7016718818eSStefano Zampini   Mat_APMPI      *ptap;
7024a56b808SFande Kong   PetscHMapIV     hmap;
7034a56b808SFande Kong   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
7044a56b808SFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
705bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
706bc8e477aSFande Kong   const PetscInt *mappingindices;
707bc8e477aSFande Kong   IS              map;
7084a56b808SFande Kong 
7094a56b808SFande Kong   PetscFunctionBegin;
7106718818eSStefano Zampini   MatCheckProduct(C, 4);
7116718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
71228b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
71328b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
7144a56b808SFande Kong 
7159566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
7164a56b808SFande Kong 
7174a56b808SFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
7184a56b808SFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
7194a56b808SFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
7209566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
7214a56b808SFande Kong   }
7229566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
7234a56b808SFande Kong 
7249566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
725bc8e477aSFande Kong   pon *= dof;
7269566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
7279566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
7284a56b808SFande Kong   cmaxr = 0;
729ad540459SPierre Jolivet   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
7309566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
731eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
7329566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
7334a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
7349566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
735bc8e477aSFande Kong     offset = i % dof;
736bc8e477aSFande Kong     ii     = i / dof;
737bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
7384a56b808SFande Kong     if (!nzi) continue;
7399566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
7404a56b808SFande Kong     voff = 0;
7419566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
7424a56b808SFande Kong     if (!voff) continue;
7434a56b808SFande Kong 
7444a56b808SFande Kong     /* Form C(ii, :) */
745bc8e477aSFande Kong     poj = po->j + po->i[ii];
746bc8e477aSFande Kong     poa = po->a + po->i[ii];
7474a56b808SFande Kong     for (j = 0; j < nzi; j++) {
748bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
749bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
750bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
7514a56b808SFande Kong       for (jj = 0; jj < voff; jj++) {
7524a56b808SFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
7534a56b808SFande Kong         /* If the row is empty */
754bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
7554a56b808SFande Kong           c_rmtjj[jj] = apindices[jj];
7564a56b808SFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
757bc8e477aSFande Kong           c_rmtc[pocol]++;
7584a56b808SFande Kong         } else {
7599566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
7604a56b808SFande Kong           if (loc >= 0) { /* hit */
7614a56b808SFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
7629566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
7634a56b808SFande Kong           } else { /* new element */
7644a56b808SFande Kong             loc = -(loc + 1);
7654a56b808SFande Kong             /* Move data backward */
766bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
7674a56b808SFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
7684a56b808SFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
7694a56b808SFande Kong             } /* End kk */
7704a56b808SFande Kong             c_rmtjj[loc] = apindices[jj];
7714a56b808SFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
772bc8e477aSFande Kong             c_rmtc[pocol]++;
7734a56b808SFande Kong           }
7744a56b808SFande Kong         }
7759566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(voff));
7764a56b808SFande Kong       } /* End jj */
7774a56b808SFande Kong     }   /* End j */
7784a56b808SFande Kong   }     /* End i */
7794a56b808SFande Kong 
7809566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
7819566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
7824a56b808SFande Kong 
7839566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
784bc8e477aSFande Kong   pn *= dof;
7859566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
7864a56b808SFande Kong 
7879566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
7889566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
7899566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
790bc8e477aSFande Kong   pcstart = pcstart * dof;
791bc8e477aSFande Kong   pcend   = pcend * dof;
7924a56b808SFande Kong   cd      = (Mat_SeqAIJ *)(c->A)->data;
7934a56b808SFande Kong   co      = (Mat_SeqAIJ *)(c->B)->data;
7944a56b808SFande Kong 
7954a56b808SFande Kong   cmaxr = 0;
796ad540459SPierre Jolivet   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
7979566063dSJacob Faibussowitsch   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
798eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
7994a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
8009566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
801bc8e477aSFande Kong     offset = i % dof;
802bc8e477aSFande Kong     ii     = i / dof;
803bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
8044a56b808SFande Kong     if (!nzi) continue;
8059566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
8064a56b808SFande Kong     voff = 0;
8079566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
8084a56b808SFande Kong     if (!voff) continue;
8094a56b808SFande Kong     /* Form C(ii, :) */
810bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
811bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
8124a56b808SFande Kong     for (j = 0; j < nzi; j++) {
813bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
814ad540459SPierre Jolivet       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
8159566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
8169566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
8174a56b808SFande Kong     }
8184a56b808SFande Kong   }
8199566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
8209566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
8219566063dSJacob Faibussowitsch   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
8229566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
8239566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
8249566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
8259566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
826bc8e477aSFande Kong 
827bc8e477aSFande Kong   /* Add contributions from remote */
828bc8e477aSFande Kong   for (i = 0; i < pn; i++) {
829bc8e477aSFande Kong     row = i + pcstart;
8309566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
831bc8e477aSFande Kong   }
8329566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
833bc8e477aSFande Kong 
8349566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
8359566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
836bc8e477aSFande Kong 
837bc8e477aSFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
8383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
839bc8e477aSFande Kong }
840bc8e477aSFande Kong 
841d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
842d71ae5a4SJacob Faibussowitsch {
843bc8e477aSFande Kong   PetscFunctionBegin;
84434bcad68SFande Kong 
8459566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
8463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
847bc8e477aSFande Kong }
848bc8e477aSFande Kong 
849d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
850d71ae5a4SJacob Faibussowitsch {
851bc8e477aSFande Kong   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
852bc8e477aSFande Kong   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
8536718818eSStefano Zampini   Mat_APMPI      *ptap;
854bc8e477aSFande Kong   PetscHMapIV     hmap;
855bc8e477aSFande Kong   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
856bc8e477aSFande Kong   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
857bc8e477aSFande Kong   PetscInt        offset, ii, pocol;
858bc8e477aSFande Kong   const PetscInt *mappingindices;
859bc8e477aSFande Kong   IS              map;
860bc8e477aSFande Kong 
861bc8e477aSFande Kong   PetscFunctionBegin;
8626718818eSStefano Zampini   MatCheckProduct(C, 4);
8636718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
86428b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
86528b400f6SJacob Faibussowitsch   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
866bc8e477aSFande Kong 
8679566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
868bc8e477aSFande Kong 
869bc8e477aSFande Kong   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
870bc8e477aSFande Kong   if (ptap->reuse == MAT_REUSE_MATRIX) {
871bc8e477aSFande Kong     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
8729566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
873bc8e477aSFande Kong   }
8749566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
8759566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
876bc8e477aSFande Kong   pon *= dof;
8779566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
878bc8e477aSFande Kong   pn *= dof;
879bc8e477aSFande Kong 
8809566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
8819566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
8829566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
883bc8e477aSFande Kong   pcstart *= dof;
884bc8e477aSFande Kong   pcend *= dof;
885bc8e477aSFande Kong   cmaxr = 0;
886ad540459SPierre Jolivet   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
887bc8e477aSFande Kong   cd = (Mat_SeqAIJ *)(c->A)->data;
888bc8e477aSFande Kong   co = (Mat_SeqAIJ *)(c->B)->data;
889ad540459SPierre Jolivet   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
8909566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
891eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
8929566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
893bc8e477aSFande Kong   for (i = 0; i < am && (pon || pn); i++) {
8949566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVClear(hmap));
895bc8e477aSFande Kong     offset = i % dof;
896bc8e477aSFande Kong     ii     = i / dof;
897bc8e477aSFande Kong     nzi    = po->i[ii + 1] - po->i[ii];
898bc8e477aSFande Kong     dnzi   = pd->i[ii + 1] - pd->i[ii];
899bc8e477aSFande Kong     if (!nzi && !dnzi) continue;
9009566063dSJacob Faibussowitsch     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
901bc8e477aSFande Kong     voff = 0;
9029566063dSJacob Faibussowitsch     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
903bc8e477aSFande Kong     if (!voff) continue;
904bc8e477aSFande Kong 
905bc8e477aSFande Kong     /* Form remote C(ii, :) */
906bc8e477aSFande Kong     poj = po->j + po->i[ii];
907bc8e477aSFande Kong     poa = po->a + po->i[ii];
908bc8e477aSFande Kong     for (j = 0; j < nzi; j++) {
909bc8e477aSFande Kong       pocol   = poj[j] * dof + offset;
910bc8e477aSFande Kong       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
911bc8e477aSFande Kong       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
912bc8e477aSFande Kong       for (jj = 0; jj < voff; jj++) {
913bc8e477aSFande Kong         apvaluestmp[jj] = apvalues[jj] * poa[j];
914bc8e477aSFande Kong         /* If the row is empty */
915bc8e477aSFande Kong         if (!c_rmtc[pocol]) {
916bc8e477aSFande Kong           c_rmtjj[jj] = apindices[jj];
917bc8e477aSFande Kong           c_rmtaa[jj] = apvaluestmp[jj];
918bc8e477aSFande Kong           c_rmtc[pocol]++;
919bc8e477aSFande Kong         } else {
9209566063dSJacob Faibussowitsch           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
921bc8e477aSFande Kong           if (loc >= 0) { /* hit */
922bc8e477aSFande Kong             c_rmtaa[loc] += apvaluestmp[jj];
9239566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1.0));
924bc8e477aSFande Kong           } else { /* new element */
925bc8e477aSFande Kong             loc = -(loc + 1);
926bc8e477aSFande Kong             /* Move data backward */
927bc8e477aSFande Kong             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
928bc8e477aSFande Kong               c_rmtjj[kk] = c_rmtjj[kk - 1];
929bc8e477aSFande Kong               c_rmtaa[kk] = c_rmtaa[kk - 1];
930bc8e477aSFande Kong             } /* End kk */
931bc8e477aSFande Kong             c_rmtjj[loc] = apindices[jj];
932bc8e477aSFande Kong             c_rmtaa[loc] = apvaluestmp[jj];
933bc8e477aSFande Kong             c_rmtc[pocol]++;
934bc8e477aSFande Kong           }
935bc8e477aSFande Kong         }
936bc8e477aSFande Kong       } /* End jj */
9379566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
938bc8e477aSFande Kong     } /* End j */
939bc8e477aSFande Kong 
940bc8e477aSFande Kong     /* Form local C(ii, :) */
941bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
942bc8e477aSFande Kong     pda = pd->a + pd->i[ii];
943bc8e477aSFande Kong     for (j = 0; j < dnzi; j++) {
944bc8e477aSFande Kong       row = pcstart + pdj[j] * dof + offset;
945ad540459SPierre Jolivet       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End kk */
9469566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(voff));
9479566063dSJacob Faibussowitsch       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
948bc8e477aSFande Kong     } /* End j */
949bc8e477aSFande Kong   }   /* End i */
950bc8e477aSFande Kong 
9519566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
9529566063dSJacob Faibussowitsch   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
9539566063dSJacob Faibussowitsch   PetscCall(PetscHMapIVDestroy(&hmap));
9549566063dSJacob Faibussowitsch   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
955bc8e477aSFande Kong 
9569566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9579566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9589566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
9599566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
9609566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_rmtj, c_rmta));
9614a56b808SFande Kong 
9624a56b808SFande Kong   /* Add contributions from remote */
9634a56b808SFande Kong   for (i = 0; i < pn; i++) {
9644a56b808SFande Kong     row = i + pcstart;
9659566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
9664a56b808SFande Kong   }
9679566063dSJacob Faibussowitsch   PetscCall(PetscFree2(c_othj, c_otha));
9684a56b808SFande Kong 
9699566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
9709566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
9714a56b808SFande Kong 
9724a56b808SFande Kong   ptap->reuse = MAT_REUSE_MATRIX;
9733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
9744a56b808SFande Kong }
9754a56b808SFande Kong 
976d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
977d71ae5a4SJacob Faibussowitsch {
9784a56b808SFande Kong   PetscFunctionBegin;
97934bcad68SFande Kong 
9809566063dSJacob Faibussowitsch   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
9813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
9824a56b808SFande Kong }
9834a56b808SFande Kong 
9846718818eSStefano Zampini /* TODO: move algorithm selection to MatProductSetFromOptions */
985d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
986d71ae5a4SJacob Faibussowitsch {
9874a56b808SFande Kong   Mat_APMPI      *ptap;
9886718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
9894a56b808SFande Kong   MPI_Comm        comm;
990bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
9914a56b808SFande Kong   MatType         mtype;
9924a56b808SFande Kong   PetscSF         sf;
9934a56b808SFande Kong   PetscSFNode    *iremote;
9944a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
9954a56b808SFande Kong   const PetscInt *rootdegrees;
9964a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto;
9974a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
998131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
999bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1000131c27b5Sprj-   PetscMPIInt     owner;
1001bc8e477aSFande Kong   const PetscInt *mappingindices;
10024a56b808SFande Kong   PetscBool       flg;
10034a56b808SFande Kong   const char     *algTypes[2] = {"overlapping", "merged"};
1004bc8e477aSFande Kong   IS              map;
10054a56b808SFande Kong 
10064a56b808SFande Kong   PetscFunctionBegin;
10076718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
100828b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
10099566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
101034bcad68SFande Kong 
10114a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
10129566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1013bc8e477aSFande Kong   pn *= dof;
10149566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
10159566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
10169566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
10174a56b808SFande Kong 
10189566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
10194a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
10204a56b808SFande Kong   ptap->algType = 2;
10214a56b808SFande Kong 
10224a56b808SFande Kong   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
10239566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
10249566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
10254a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
10269566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1027bc8e477aSFande Kong   pon *= dof;
10284a56b808SFande Kong   /* offsets */
10299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
10304a56b808SFande Kong   /* The number of columns we will send to remote ranks */
10319566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
10329566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
103348a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
10349566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
10359566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
10364a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
10379566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
10384a56b808SFande Kong 
10399566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
10404a56b808SFande Kong   ptap->c_rmti[0] = 0;
10414a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
10424a56b808SFande Kong   for (i = 0; i < am && pon; i++) {
10434a56b808SFande Kong     /* Form one row of AP */
10449566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
1045bc8e477aSFande Kong     offset = i % dof;
1046bc8e477aSFande Kong     ii     = i / dof;
10474a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1048bc8e477aSFande Kong     nzi = po->i[ii + 1] - po->i[ii];
10494a56b808SFande Kong     if (!nzi) continue;
10504a56b808SFande Kong 
10519566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
10529566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
10534a56b808SFande Kong     /* If AP is empty, just continue */
10544a56b808SFande Kong     if (!htsize) continue;
10554a56b808SFande Kong     /* Form C(ii, :) */
1056bc8e477aSFande Kong     poj = po->j + po->i[ii];
105748a46eb9SPierre Jolivet     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
10584a56b808SFande Kong   }
10594a56b808SFande Kong 
10604a56b808SFande Kong   for (i = 0; i < pon; i++) {
10619566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
10624a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
10634a56b808SFande Kong     c_rmtc[i]           = htsize;
10644a56b808SFande Kong   }
10654a56b808SFande Kong 
10669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
10674a56b808SFande Kong 
10684a56b808SFande Kong   for (i = 0; i < pon; i++) {
10694a56b808SFande Kong     off = 0;
10709566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
10719566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
10724a56b808SFande Kong   }
10739566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
10744a56b808SFande Kong 
10759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
10764a56b808SFande Kong   for (i = 0; i < pon; i++) {
10779371c9d4SSatish Balay     owner  = 0;
10789371c9d4SSatish Balay     lidx   = 0;
1079bc8e477aSFande Kong     offset = i % dof;
1080bc8e477aSFande Kong     ii     = i / dof;
10819566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1082bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
10834a56b808SFande Kong     iremote[i].rank  = owner;
10844a56b808SFande Kong   }
10854a56b808SFande Kong 
10869566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
10879566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
10884a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
10899566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
10909566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
10919566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
10924a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
10939566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
10949566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
10954a56b808SFande Kong   rootspacesize = 0;
1096ad540459SPierre Jolivet   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
10979566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
10989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
10994a56b808SFande Kong   /* Get information from leaves
11004a56b808SFande Kong    * Number of columns other people contribute to my rows
11014a56b808SFande Kong    * */
11029566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
11039566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
11049566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
11059566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
11064a56b808SFande Kong   /* The number of columns is received for each row */
11074a56b808SFande Kong   ptap->c_othi[0]     = 0;
11084a56b808SFande Kong   rootspacesize       = 0;
11094a56b808SFande Kong   rootspaceoffsets[0] = 0;
11104a56b808SFande Kong   for (i = 0; i < pn; i++) {
11114a56b808SFande Kong     rcvncols = 0;
11124a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
11134a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
11144a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
11154a56b808SFande Kong       rootspacesize++;
11164a56b808SFande Kong     }
11174a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
11184a56b808SFande Kong   }
11199566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
11204a56b808SFande Kong 
11219566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
11229566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11239566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
11249566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
11259566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
11264a56b808SFande Kong 
11279566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
11284a56b808SFande Kong   nleaves = 0;
11294a56b808SFande Kong   for (i = 0; i < pon; i++) {
1130bc8e477aSFande Kong     owner = 0;
1131bc8e477aSFande Kong     ii    = i / dof;
11329566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
11334a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
11344a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
11354a56b808SFande Kong       iremote[nleaves].rank    = owner;
11364a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
11374a56b808SFande Kong     }
11384a56b808SFande Kong   }
11399566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
11409566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
11414a56b808SFande Kong 
11429566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
11439566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
11449566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
11454a56b808SFande Kong   /* One to one map */
11469566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
11474a56b808SFande Kong 
11489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
11499566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
11509566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1151bc8e477aSFande Kong   pcstart *= dof;
1152bc8e477aSFande Kong   pcend *= dof;
11539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
11544a56b808SFande Kong   for (i = 0; i < pn; i++) {
11559566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hta[i]));
11569566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
11574a56b808SFande Kong   }
11584a56b808SFande Kong   /* Work on local part */
11594a56b808SFande Kong   /* 4) Pass 1: Estimate memory for C_loc */
11604a56b808SFande Kong   for (i = 0; i < am && pn; i++) {
11619566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
11629566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1163bc8e477aSFande Kong     offset = i % dof;
1164bc8e477aSFande Kong     ii     = i / dof;
1165bc8e477aSFande Kong     nzi    = pd->i[ii + 1] - pd->i[ii];
11664a56b808SFande Kong     if (!nzi) continue;
11674a56b808SFande Kong 
11689566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
11699566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
11709566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
11714a56b808SFande Kong     if (!(htsize + htosize)) continue;
11724a56b808SFande Kong     /* Form C(ii, :) */
1173bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
11744a56b808SFande Kong     for (j = 0; j < nzi; j++) {
11759566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
11769566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
11774a56b808SFande Kong     }
11784a56b808SFande Kong   }
11794a56b808SFande Kong 
11809566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1181bc8e477aSFande Kong 
11829566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
11839566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
11844a56b808SFande Kong 
11854a56b808SFande Kong   /* Get remote data */
11869566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
11879566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
11884a56b808SFande Kong 
11894a56b808SFande Kong   for (i = 0; i < pn; i++) {
11904a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
11914a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
11924a56b808SFande Kong     for (j = 0; j < nzi; j++) {
11934a56b808SFande Kong       col = rdj[j];
11944a56b808SFande Kong       /* diag part */
11954a56b808SFande Kong       if (col >= pcstart && col < pcend) {
11969566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hta[i], col));
11974a56b808SFande Kong       } else { /* off diag */
11989566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
11994a56b808SFande Kong       }
12004a56b808SFande Kong     }
12019566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
12024a56b808SFande Kong     dnz[i] = htsize;
12039566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
12049566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
12054a56b808SFande Kong     onz[i] = htsize;
12069566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
12074a56b808SFande Kong   }
12084a56b808SFande Kong 
12099566063dSJacob Faibussowitsch   PetscCall(PetscFree2(hta, hto));
12109566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
12114a56b808SFande Kong 
12124a56b808SFande Kong   /* local sizes and preallocation */
12139566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12149566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
12159566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
12169566063dSJacob Faibussowitsch   PetscCall(MatSetUp(Cmpi));
12179566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
12184a56b808SFande Kong 
12194a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
12206718818eSStefano Zampini   Cmpi->product->data    = ptap;
12216718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
12226718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
12234a56b808SFande Kong 
12244a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
12254a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
12264a56b808SFande Kong   /* pick an algorithm */
1227d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
12284a56b808SFande Kong   alg = 0;
12299566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1230d0609cedSBarry Smith   PetscOptionsEnd();
12314a56b808SFande Kong   switch (alg) {
1232d71ae5a4SJacob Faibussowitsch   case 0:
1233d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1234d71ae5a4SJacob Faibussowitsch     break;
1235d71ae5a4SJacob Faibussowitsch   case 1:
1236d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1237d71ae5a4SJacob Faibussowitsch     break;
1238d71ae5a4SJacob Faibussowitsch   default:
1239d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
12404a56b808SFande Kong   }
12413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12424a56b808SFande Kong }
12434a56b808SFande Kong 
1244d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1245d71ae5a4SJacob Faibussowitsch {
1246bc8e477aSFande Kong   PetscFunctionBegin;
12479566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
12483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1249bc8e477aSFande Kong }
1250bc8e477aSFande Kong 
1251d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1252d71ae5a4SJacob Faibussowitsch {
12534a56b808SFande Kong   Mat_APMPI      *ptap;
12546718818eSStefano Zampini   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
12554a56b808SFande Kong   MPI_Comm        comm;
1256bc8e477aSFande Kong   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
12574a56b808SFande Kong   MatType         mtype;
12584a56b808SFande Kong   PetscSF         sf;
12594a56b808SFande Kong   PetscSFNode    *iremote;
12604a56b808SFande Kong   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
12614a56b808SFande Kong   const PetscInt *rootdegrees;
12624a56b808SFande Kong   PetscHSetI      ht, oht, *hta, *hto, *htd;
12634a56b808SFande Kong   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1264131c27b5Sprj-   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1265bc8e477aSFande Kong   PetscInt        nalg = 2, alg = 0, offset, ii;
1266131c27b5Sprj-   PetscMPIInt     owner;
12674a56b808SFande Kong   PetscBool       flg;
12684a56b808SFande Kong   const char     *algTypes[2] = {"merged", "overlapping"};
1269bc8e477aSFande Kong   const PetscInt *mappingindices;
1270bc8e477aSFande Kong   IS              map;
12714a56b808SFande Kong 
12724a56b808SFande Kong   PetscFunctionBegin;
12736718818eSStefano Zampini   MatCheckProduct(Cmpi, 5);
127428b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
12759566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
127634bcad68SFande Kong 
12774a56b808SFande Kong   /* Create symbolic parallel matrix Cmpi */
12789566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(P, NULL, &pn));
1279bc8e477aSFande Kong   pn *= dof;
12809566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
12819566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
12829566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
12834a56b808SFande Kong 
12849566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
12854a56b808SFande Kong   ptap->reuse   = MAT_INITIAL_MATRIX;
12864a56b808SFande Kong   ptap->algType = 3;
12874a56b808SFande Kong 
12884a56b808SFande Kong   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
12899566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
12909566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
12914a56b808SFande Kong 
12924a56b808SFande Kong   /* This equals to the number of offdiag columns in P */
12939566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1294bc8e477aSFande Kong   pon *= dof;
12954a56b808SFande Kong   /* offsets */
12969566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
12974a56b808SFande Kong   /* The number of columns we will send to remote ranks */
12989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtc));
12999566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &hta));
130048a46eb9SPierre Jolivet   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
13019566063dSJacob Faibussowitsch   PetscCall(MatGetLocalSize(A, &am, NULL));
13029566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
13034a56b808SFande Kong   /* Create hash table to merge all columns for C(i, :) */
13049566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&ht));
13059566063dSJacob Faibussowitsch   PetscCall(PetscHSetICreate(&oht));
13069566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
13074a56b808SFande Kong   for (i = 0; i < pn; i++) {
13089566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&htd[i]));
13099566063dSJacob Faibussowitsch     PetscCall(PetscHSetICreate(&hto[i]));
13104a56b808SFande Kong   }
1311bc8e477aSFande Kong 
13129566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(map, &mappingindices));
13134a56b808SFande Kong   ptap->c_rmti[0] = 0;
13144a56b808SFande Kong   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
13154a56b808SFande Kong   for (i = 0; i < am && (pon || pn); i++) {
13164a56b808SFande Kong     /* Form one row of AP */
13179566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(ht));
13189566063dSJacob Faibussowitsch     PetscCall(PetscHSetIClear(oht));
1319bc8e477aSFande Kong     offset = i % dof;
1320bc8e477aSFande Kong     ii     = i / dof;
13214a56b808SFande Kong     /* If the off diag is empty, we should not do any calculation */
1322bc8e477aSFande Kong     nzi  = po->i[ii + 1] - po->i[ii];
1323bc8e477aSFande Kong     dnzi = pd->i[ii + 1] - pd->i[ii];
13244a56b808SFande Kong     if (!nzi && !dnzi) continue;
13254a56b808SFande Kong 
13269566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
13279566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(ht, &htsize));
13289566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(oht, &htosize));
13294a56b808SFande Kong     /* If AP is empty, just continue */
13304a56b808SFande Kong     if (!(htsize + htosize)) continue;
13314a56b808SFande Kong 
13324a56b808SFande Kong     /* Form remote C(ii, :) */
1333bc8e477aSFande Kong     poj = po->j + po->i[ii];
13344a56b808SFande Kong     for (j = 0; j < nzi; j++) {
13359566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
13369566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
13374a56b808SFande Kong     }
13384a56b808SFande Kong 
13394a56b808SFande Kong     /* Form local C(ii, :) */
1340bc8e477aSFande Kong     pdj = pd->j + pd->i[ii];
13414a56b808SFande Kong     for (j = 0; j < dnzi; j++) {
13429566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
13439566063dSJacob Faibussowitsch       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
13444a56b808SFande Kong     }
13454a56b808SFande Kong   }
13464a56b808SFande Kong 
13479566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(map, &mappingindices));
1348bc8e477aSFande Kong 
13499566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&ht));
13509566063dSJacob Faibussowitsch   PetscCall(PetscHSetIDestroy(&oht));
13514a56b808SFande Kong 
13524a56b808SFande Kong   for (i = 0; i < pon; i++) {
13539566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
13544a56b808SFande Kong     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
13554a56b808SFande Kong     c_rmtc[i]           = htsize;
13564a56b808SFande Kong   }
13574a56b808SFande Kong 
13589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
13594a56b808SFande Kong 
13604a56b808SFande Kong   for (i = 0; i < pon; i++) {
13614a56b808SFande Kong     off = 0;
13629566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
13639566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hta[i]));
13644a56b808SFande Kong   }
13659566063dSJacob Faibussowitsch   PetscCall(PetscFree(hta));
13664a56b808SFande Kong 
13679566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &iremote));
13684a56b808SFande Kong   for (i = 0; i < pon; i++) {
13699371c9d4SSatish Balay     owner  = 0;
13709371c9d4SSatish Balay     lidx   = 0;
1371bc8e477aSFande Kong     offset = i % dof;
1372bc8e477aSFande Kong     ii     = i / dof;
13739566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1374bc8e477aSFande Kong     iremote[i].index = lidx * dof + offset;
13754a56b808SFande Kong     iremote[i].rank  = owner;
13764a56b808SFande Kong   }
13774a56b808SFande Kong 
13789566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &sf));
13799566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
13804a56b808SFande Kong   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
13819566063dSJacob Faibussowitsch   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
13829566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(sf));
13839566063dSJacob Faibussowitsch   PetscCall(PetscSFSetUp(sf));
13844a56b808SFande Kong   /* How many neighbors have contributions to my rows? */
13859566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
13869566063dSJacob Faibussowitsch   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
13874a56b808SFande Kong   rootspacesize = 0;
1388ad540459SPierre Jolivet   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
13899566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
13909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
13914a56b808SFande Kong   /* Get information from leaves
13924a56b808SFande Kong    * Number of columns other people contribute to my rows
13934a56b808SFande Kong    * */
13949566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
13959566063dSJacob Faibussowitsch   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
13969566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtc));
13979566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
13984a56b808SFande Kong   /* The number of columns is received for each row */
13994a56b808SFande Kong   ptap->c_othi[0]     = 0;
14004a56b808SFande Kong   rootspacesize       = 0;
14014a56b808SFande Kong   rootspaceoffsets[0] = 0;
14024a56b808SFande Kong   for (i = 0; i < pn; i++) {
14034a56b808SFande Kong     rcvncols = 0;
14044a56b808SFande Kong     for (j = 0; j < rootdegrees[i]; j++) {
14054a56b808SFande Kong       rcvncols += rootspace[rootspacesize];
14064a56b808SFande Kong       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
14074a56b808SFande Kong       rootspacesize++;
14084a56b808SFande Kong     }
14094a56b808SFande Kong     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
14104a56b808SFande Kong   }
14119566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspace));
14124a56b808SFande Kong 
14139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
14149566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14159566063dSJacob Faibussowitsch   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
14169566063dSJacob Faibussowitsch   PetscCall(PetscSFDestroy(&sf));
14179566063dSJacob Faibussowitsch   PetscCall(PetscFree(rootspaceoffsets));
14184a56b808SFande Kong 
14199566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
14204a56b808SFande Kong   nleaves = 0;
14214a56b808SFande Kong   for (i = 0; i < pon; i++) {
1422071fcb05SBarry Smith     owner = 0;
1423bc8e477aSFande Kong     ii    = i / dof;
14249566063dSJacob Faibussowitsch     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
14254a56b808SFande Kong     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
14264a56b808SFande Kong     for (j = 0; j < sendncols; j++) {
14274a56b808SFande Kong       iremote[nleaves].rank    = owner;
14284a56b808SFande Kong       iremote[nleaves++].index = c_rmtoffsets[i] + j;
14294a56b808SFande Kong     }
14304a56b808SFande Kong   }
14319566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtoffsets));
14329566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
14334a56b808SFande Kong 
14349566063dSJacob Faibussowitsch   PetscCall(PetscSFCreate(comm, &ptap->sf));
14359566063dSJacob Faibussowitsch   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
14369566063dSJacob Faibussowitsch   PetscCall(PetscSFSetFromOptions(ptap->sf));
14374a56b808SFande Kong   /* One to one map */
14389566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14394a56b808SFande Kong   /* Get remote data */
14409566063dSJacob Faibussowitsch   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
14419566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_rmtj));
14429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
14439566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1444bc8e477aSFande Kong   pcstart *= dof;
1445bc8e477aSFande Kong   pcend *= dof;
14464a56b808SFande Kong   for (i = 0; i < pn; i++) {
14474a56b808SFande Kong     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
14484a56b808SFande Kong     rdj = c_othj + ptap->c_othi[i];
14494a56b808SFande Kong     for (j = 0; j < nzi; j++) {
14504a56b808SFande Kong       col = rdj[j];
14514a56b808SFande Kong       /* diag part */
14524a56b808SFande Kong       if (col >= pcstart && col < pcend) {
14539566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(htd[i], col));
14544a56b808SFande Kong       } else { /* off diag */
14559566063dSJacob Faibussowitsch         PetscCall(PetscHSetIAdd(hto[i], col));
14564a56b808SFande Kong       }
14574a56b808SFande Kong     }
14589566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
14594a56b808SFande Kong     dnz[i] = htsize;
14609566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&htd[i]));
14619566063dSJacob Faibussowitsch     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
14624a56b808SFande Kong     onz[i] = htsize;
14639566063dSJacob Faibussowitsch     PetscCall(PetscHSetIDestroy(&hto[i]));
14644a56b808SFande Kong   }
14654a56b808SFande Kong 
14669566063dSJacob Faibussowitsch   PetscCall(PetscFree2(htd, hto));
14679566063dSJacob Faibussowitsch   PetscCall(PetscFree(c_othj));
14684a56b808SFande Kong 
14694a56b808SFande Kong   /* local sizes and preallocation */
14709566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
14719566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
14729566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
14739566063dSJacob Faibussowitsch   PetscCall(PetscFree2(dnz, onz));
14744a56b808SFande Kong 
14754a56b808SFande Kong   /* attach the supporting struct to Cmpi for reuse */
14766718818eSStefano Zampini   Cmpi->product->data    = ptap;
14776718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
14786718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
14794a56b808SFande Kong 
14804a56b808SFande Kong   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
14814a56b808SFande Kong   Cmpi->assembled = PETSC_FALSE;
14824a56b808SFande Kong   /* pick an algorithm */
1483d0609cedSBarry Smith   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
14844a56b808SFande Kong   alg = 0;
14859566063dSJacob Faibussowitsch   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1486d0609cedSBarry Smith   PetscOptionsEnd();
14874a56b808SFande Kong   switch (alg) {
1488d71ae5a4SJacob Faibussowitsch   case 0:
1489d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1490d71ae5a4SJacob Faibussowitsch     break;
1491d71ae5a4SJacob Faibussowitsch   case 1:
1492d71ae5a4SJacob Faibussowitsch     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1493d71ae5a4SJacob Faibussowitsch     break;
1494d71ae5a4SJacob Faibussowitsch   default:
1495d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
14964a56b808SFande Kong   }
14973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14984a56b808SFande Kong }
14994a56b808SFande Kong 
1500d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1501d71ae5a4SJacob Faibussowitsch {
1502bc8e477aSFande Kong   PetscFunctionBegin;
15039566063dSJacob Faibussowitsch   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
15043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1505bc8e477aSFande Kong }
1506bc8e477aSFande Kong 
1507d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1508d71ae5a4SJacob Faibussowitsch {
15093cdca5ebSHong Zhang   Mat_APMPI               *ptap;
15106718818eSStefano Zampini   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1511e953e47cSHong Zhang   MPI_Comm                 comm;
1512e953e47cSHong Zhang   PetscMPIInt              size, rank;
1513e953e47cSHong Zhang   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1514e953e47cSHong Zhang   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1515e953e47cSHong Zhang   PetscInt                *lnk, i, k, pnz, row, nsend;
1516e953e47cSHong Zhang   PetscBT                  lnkbt;
1517ec4bef21SJose E. Roman   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1518ec4bef21SJose E. Roman   PETSC_UNUSED PetscMPIInt icompleted = 0;
1519e953e47cSHong Zhang   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1520e953e47cSHong Zhang   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1521e953e47cSHong Zhang   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1522e953e47cSHong Zhang   MPI_Request             *swaits, *rwaits;
1523e953e47cSHong Zhang   MPI_Status              *sstatus, rstatus;
1524e953e47cSHong Zhang   PetscLayout              rowmap;
1525e953e47cSHong Zhang   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1526e953e47cSHong Zhang   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
1527e953e47cSHong Zhang   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1528ec07b8f8SHong Zhang   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1529e953e47cSHong Zhang   PetscScalar             *apv;
1530eec179cfSJacob Faibussowitsch   PetscHMapI               ta;
1531b92f168fSBarry Smith   MatType                  mtype;
1532e83e5f86SFande Kong   const char              *prefix;
1533e953e47cSHong Zhang #if defined(PETSC_USE_INFO)
1534e953e47cSHong Zhang   PetscReal apfill;
1535e953e47cSHong Zhang #endif
1536e953e47cSHong Zhang 
1537e953e47cSHong Zhang   PetscFunctionBegin;
15386718818eSStefano Zampini   MatCheckProduct(Cmpi, 4);
153928b400f6SJacob Faibussowitsch   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
15409566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
15419566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
15429566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1543e953e47cSHong Zhang 
154452f7967eSHong Zhang   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
1545ec07b8f8SHong Zhang 
1546e953e47cSHong Zhang   /* create symbolic parallel matrix Cmpi */
15479566063dSJacob Faibussowitsch   PetscCall(MatGetType(A, &mtype));
15489566063dSJacob Faibussowitsch   PetscCall(MatSetType(Cmpi, mtype));
1549e953e47cSHong Zhang 
1550e953e47cSHong Zhang   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1551e953e47cSHong Zhang   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1552e953e47cSHong Zhang 
15533cdca5ebSHong Zhang   /* create struct Mat_APMPI and attached it to C later */
15549566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ptap));
155515a3b8e2SHong Zhang   ptap->reuse   = MAT_INITIAL_MATRIX;
1556a4ffb656SHong Zhang   ptap->algType = 1;
155715a3b8e2SHong Zhang 
155815a3b8e2SHong Zhang   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
15599566063dSJacob Faibussowitsch   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
156015a3b8e2SHong Zhang   /* get P_loc by taking all local rows of P */
15619566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
156215a3b8e2SHong Zhang 
156367a12041SHong Zhang   /* (0) compute Rd = Pd^T, Ro = Po^T  */
15649566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
15659566063dSJacob Faibussowitsch   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
156615a3b8e2SHong Zhang 
156767a12041SHong Zhang   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
156867a12041SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
156952f7967eSHong Zhang   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1570445158ffSHong Zhang 
15719a6dcab7SHong Zhang   /* create and initialize a linked list */
1572eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
15734b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
15744b38b95cSHong Zhang   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1575eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1576d0e9a2f7SHong Zhang   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
157778d30b94SHong Zhang 
15789566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1579445158ffSHong Zhang 
15808cb82516SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1581ec07b8f8SHong Zhang   if (ao) {
15829566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1583ec07b8f8SHong Zhang   } else {
15849566063dSJacob Faibussowitsch     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1585ec07b8f8SHong Zhang   }
1586445158ffSHong Zhang   current_space = free_space;
158767a12041SHong Zhang   nspacedouble  = 0;
158867a12041SHong Zhang 
15899566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &api));
159067a12041SHong Zhang   api[0] = 0;
1591445158ffSHong Zhang   for (i = 0; i < am; i++) {
159267a12041SHong Zhang     /* diagonal portion: Ad[i,:]*P */
15939371c9d4SSatish Balay     ai  = ad->i;
15949371c9d4SSatish Balay     pi  = p_loc->i;
159567a12041SHong Zhang     nzi = ai[i + 1] - ai[i];
159667a12041SHong Zhang     aj  = ad->j + ai[i];
1597445158ffSHong Zhang     for (j = 0; j < nzi; j++) {
1598445158ffSHong Zhang       row  = aj[j];
159967a12041SHong Zhang       pnz  = pi[row + 1] - pi[row];
160067a12041SHong Zhang       Jptr = p_loc->j + pi[row];
1601445158ffSHong Zhang       /* add non-zero cols of P into the sorted linked list lnk */
16029566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1603445158ffSHong Zhang     }
160467a12041SHong Zhang     /* off-diagonal portion: Ao[i,:]*P */
1605ec07b8f8SHong Zhang     if (ao) {
16069371c9d4SSatish Balay       ai  = ao->i;
16079371c9d4SSatish Balay       pi  = p_oth->i;
160867a12041SHong Zhang       nzi = ai[i + 1] - ai[i];
160967a12041SHong Zhang       aj  = ao->j + ai[i];
1610445158ffSHong Zhang       for (j = 0; j < nzi; j++) {
1611445158ffSHong Zhang         row  = aj[j];
161267a12041SHong Zhang         pnz  = pi[row + 1] - pi[row];
161367a12041SHong Zhang         Jptr = p_oth->j + pi[row];
16149566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1615445158ffSHong Zhang       }
1616ec07b8f8SHong Zhang     }
1617445158ffSHong Zhang     apnz       = lnk[0];
1618445158ffSHong Zhang     api[i + 1] = api[i] + apnz;
1619445158ffSHong Zhang     if (ap_rmax < apnz) ap_rmax = apnz;
1620445158ffSHong Zhang 
1621445158ffSHong Zhang     /* if free space is not available, double the total space in the list */
1622445158ffSHong Zhang     if (current_space->local_remaining < apnz) {
16239566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1624445158ffSHong Zhang       nspacedouble++;
1625445158ffSHong Zhang     }
1626445158ffSHong Zhang 
1627445158ffSHong Zhang     /* Copy data into free space, then initialize lnk */
16289566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1629445158ffSHong Zhang 
1630445158ffSHong Zhang     current_space->array += apnz;
1631445158ffSHong Zhang     current_space->local_used += apnz;
1632445158ffSHong Zhang     current_space->local_remaining -= apnz;
1633445158ffSHong Zhang   }
1634681d504bSHong Zhang   /* Allocate space for apj and apv, initialize apj, and */
1635445158ffSHong Zhang   /* destroy list of free space and other temporary array(s) */
16369566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
16379566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
16389566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
16399a6dcab7SHong Zhang 
1640aa690a28SHong Zhang   /* Create AP_loc for reuse */
16419566063dSJacob Faibussowitsch   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
16429566063dSJacob Faibussowitsch   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1643aa690a28SHong Zhang #if defined(PETSC_USE_INFO)
1644ec07b8f8SHong Zhang   if (ao) {
1645aa690a28SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1646ec07b8f8SHong Zhang   } else {
1647ec07b8f8SHong Zhang     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1648ec07b8f8SHong Zhang   }
1649aa690a28SHong Zhang   ptap->AP_loc->info.mallocs           = nspacedouble;
1650aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_given  = fill;
1651aa690a28SHong Zhang   ptap->AP_loc->info.fill_ratio_needed = apfill;
1652aa690a28SHong Zhang 
1653aa690a28SHong Zhang   if (api[am]) {
16549566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
16559566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill));
1656aa690a28SHong Zhang   } else {
16579566063dSJacob Faibussowitsch     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n"));
1658aa690a28SHong Zhang   }
1659aa690a28SHong Zhang #endif
1660aa690a28SHong Zhang 
1661681d504bSHong Zhang   /* (2-1) compute symbolic Co = Ro*AP_loc  */
16629566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(A, &prefix));
16639566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
16649566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
16659566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
16669566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
16679566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
16689566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
16699566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
16709566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
16719566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_oth, fill));
16729566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_oth));
16739566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_oth));
167415a3b8e2SHong Zhang 
167567a12041SHong Zhang   /* (3) send coj of C_oth to other processors  */
167615a3b8e2SHong Zhang   /* determine row ownership */
16779566063dSJacob Faibussowitsch   PetscCall(PetscLayoutCreate(comm, &rowmap));
1678445158ffSHong Zhang   rowmap->n  = pn;
1679445158ffSHong Zhang   rowmap->bs = 1;
16809566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(rowmap));
1681445158ffSHong Zhang   owners = rowmap->range;
168215a3b8e2SHong Zhang 
168315a3b8e2SHong Zhang   /* determine the number of messages to send, their lengths */
16849566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
16859566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_s, size));
16869566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(len_si, size));
168715a3b8e2SHong Zhang 
168867a12041SHong Zhang   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
16899371c9d4SSatish Balay   coi   = c_oth->i;
16909371c9d4SSatish Balay   coj   = c_oth->j;
169167a12041SHong Zhang   con   = ptap->C_oth->rmap->n;
169215a3b8e2SHong Zhang   proc  = 0;
169367a12041SHong Zhang   for (i = 0; i < con; i++) {
169415a3b8e2SHong Zhang     while (prmap[i] >= owners[proc + 1]) proc++;
169515a3b8e2SHong Zhang     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
169615a3b8e2SHong Zhang     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
169715a3b8e2SHong Zhang   }
169815a3b8e2SHong Zhang 
169915a3b8e2SHong Zhang   len          = 0; /* max length of buf_si[], see (4) */
170015a3b8e2SHong Zhang   owners_co[0] = 0;
170167a12041SHong Zhang   nsend        = 0;
170215a3b8e2SHong Zhang   for (proc = 0; proc < size; proc++) {
170315a3b8e2SHong Zhang     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
170415a3b8e2SHong Zhang     if (len_s[proc]) {
1705445158ffSHong Zhang       nsend++;
170615a3b8e2SHong Zhang       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
170715a3b8e2SHong Zhang       len += len_si[proc];
170815a3b8e2SHong Zhang     }
170915a3b8e2SHong Zhang   }
171015a3b8e2SHong Zhang 
171115a3b8e2SHong Zhang   /* determine the number and length of messages to receive for coi and coj  */
17129566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
17139566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
171415a3b8e2SHong Zhang 
171515a3b8e2SHong Zhang   /* post the Irecv and Isend of coj */
17169566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagj));
17179566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
17189566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nsend + 1, &swaits));
171915a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
172015a3b8e2SHong Zhang     if (!len_s[proc]) continue;
172115a3b8e2SHong Zhang     i = owners_co[proc];
17229566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
172315a3b8e2SHong Zhang     k++;
172415a3b8e2SHong Zhang   }
172515a3b8e2SHong Zhang 
1726681d504bSHong Zhang   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
17279566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
17289566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
17299566063dSJacob Faibussowitsch   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
17309566063dSJacob Faibussowitsch   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
17319566063dSJacob Faibussowitsch   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
17329566063dSJacob Faibussowitsch   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
17339566063dSJacob Faibussowitsch   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
17349566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
17359566063dSJacob Faibussowitsch   PetscCall(MatProductSetFill(ptap->C_loc, fill));
17369566063dSJacob Faibussowitsch   PetscCall(MatProductSetFromOptions(ptap->C_loc));
17379566063dSJacob Faibussowitsch   PetscCall(MatProductSymbolic(ptap->C_loc));
17384222ddf1SHong Zhang 
1739681d504bSHong Zhang   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1740681d504bSHong Zhang 
1741681d504bSHong Zhang   /* receives coj are complete */
174248a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
17439566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
17449566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
174515a3b8e2SHong Zhang 
174678d30b94SHong Zhang   /* add received column indices into ta to update Crmax */
174778d30b94SHong Zhang   for (k = 0; k < nrecv; k++) { /* k-th received message */
174878d30b94SHong Zhang     Jptr = buf_rj[k];
1749c76ffc5fSJacob Faibussowitsch     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
175078d30b94SHong Zhang   }
1751eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
1752eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
17539a6dcab7SHong Zhang 
175415a3b8e2SHong Zhang   /* (4) send and recv coi */
17559566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(comm, &tagi));
17569566063dSJacob Faibussowitsch   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
17579566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &buf_s));
175815a3b8e2SHong Zhang   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
175915a3b8e2SHong Zhang   for (proc = 0, k = 0; proc < size; proc++) {
176015a3b8e2SHong Zhang     if (!len_s[proc]) continue;
176115a3b8e2SHong Zhang     /* form outgoing message for i-structure:
176215a3b8e2SHong Zhang          buf_si[0]:                 nrows to be sent
176315a3b8e2SHong Zhang                [1:nrows]:           row index (global)
176415a3b8e2SHong Zhang                [nrows+1:2*nrows+1]: i-structure index
176515a3b8e2SHong Zhang     */
176615a3b8e2SHong Zhang     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
176715a3b8e2SHong Zhang     buf_si_i    = buf_si + nrows + 1;
176815a3b8e2SHong Zhang     buf_si[0]   = nrows;
176915a3b8e2SHong Zhang     buf_si_i[0] = 0;
177015a3b8e2SHong Zhang     nrows       = 0;
177115a3b8e2SHong Zhang     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
177215a3b8e2SHong Zhang       nzi                 = coi[i + 1] - coi[i];
177315a3b8e2SHong Zhang       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
177415a3b8e2SHong Zhang       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
177515a3b8e2SHong Zhang       nrows++;
177615a3b8e2SHong Zhang     }
17779566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
177815a3b8e2SHong Zhang     k++;
177915a3b8e2SHong Zhang     buf_si += len_si[proc];
178015a3b8e2SHong Zhang   }
178148a46eb9SPierre Jolivet   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
17829566063dSJacob Faibussowitsch   PetscCall(PetscFree(rwaits));
17839566063dSJacob Faibussowitsch   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
178415a3b8e2SHong Zhang 
17859566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
17869566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_ri));
17879566063dSJacob Faibussowitsch   PetscCall(PetscFree(swaits));
17889566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_s));
1789a4ffb656SHong Zhang 
179067a12041SHong Zhang   /* (5) compute the local portion of Cmpi      */
1791*bbea24aaSStefano Zampini   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
17929566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
179315a3b8e2SHong Zhang   current_space = free_space;
179415a3b8e2SHong Zhang 
17959566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1796445158ffSHong Zhang   for (k = 0; k < nrecv; k++) {
179715a3b8e2SHong Zhang     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
179815a3b8e2SHong Zhang     nrows       = *buf_ri_k[k];
179915a3b8e2SHong Zhang     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
1800a5b23f4aSJose E. Roman     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
180115a3b8e2SHong Zhang   }
1802445158ffSHong Zhang 
1803d0609cedSBarry Smith   MatPreallocateBegin(comm, pn, pn, dnz, onz);
18049566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
180515a3b8e2SHong Zhang   for (i = 0; i < pn; i++) {
180667a12041SHong Zhang     /* add C_loc into Cmpi */
180767a12041SHong Zhang     nzi  = c_loc->i[i + 1] - c_loc->i[i];
180867a12041SHong Zhang     Jptr = c_loc->j + c_loc->i[i];
18099566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
181015a3b8e2SHong Zhang 
181115a3b8e2SHong Zhang     /* add received col data into lnk */
1812445158ffSHong Zhang     for (k = 0; k < nrecv; k++) { /* k-th received message */
181315a3b8e2SHong Zhang       if (i == *nextrow[k]) {     /* i-th row */
181415a3b8e2SHong Zhang         nzi  = *(nextci[k] + 1) - *nextci[k];
181515a3b8e2SHong Zhang         Jptr = buf_rj[k] + *nextci[k];
18169566063dSJacob Faibussowitsch         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
18179371c9d4SSatish Balay         nextrow[k]++;
18189371c9d4SSatish Balay         nextci[k]++;
181915a3b8e2SHong Zhang       }
182015a3b8e2SHong Zhang     }
1821d0e9a2f7SHong Zhang     nzi = lnk[0];
18228cb82516SHong Zhang 
182315a3b8e2SHong Zhang     /* copy data into free space, then initialize lnk */
18249566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
18259566063dSJacob Faibussowitsch     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
182615a3b8e2SHong Zhang   }
18279566063dSJacob Faibussowitsch   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
18289566063dSJacob Faibussowitsch   PetscCall(PetscLLDestroy(lnk, lnkbt));
18299566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceDestroy(free_space));
183080bb4639SHong Zhang 
1831ae5f0867Sstefano_zampini   /* local sizes and preallocation */
18329566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1833ac94c67aSTristan Konolige   if (P->cmap->bs > 0) {
18349566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
18359566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1836ac94c67aSTristan Konolige   }
18379566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1838d0609cedSBarry Smith   MatPreallocateEnd(dnz, onz);
183915a3b8e2SHong Zhang 
1840445158ffSHong Zhang   /* members in merge */
18419566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r));
18429566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r));
18439566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri[0]));
18449566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_ri));
18459566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj[0]));
18469566063dSJacob Faibussowitsch   PetscCall(PetscFree(buf_rj));
18479566063dSJacob Faibussowitsch   PetscCall(PetscLayoutDestroy(&rowmap));
184815a3b8e2SHong Zhang 
18499566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(pN, &ptap->apa));
18502259aa2eSHong Zhang 
18516718818eSStefano Zampini   /* attach the supporting struct to Cmpi for reuse */
18526718818eSStefano Zampini   Cmpi->product->data    = ptap;
18536718818eSStefano Zampini   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
18546718818eSStefano Zampini   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
18556718818eSStefano Zampini 
18561a47ec54SHong Zhang   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
18571a47ec54SHong Zhang   Cmpi->assembled = PETSC_FALSE;
18583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18590d3441aeSHong Zhang }
18600d3441aeSHong Zhang 
1861d71ae5a4SJacob Faibussowitsch PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1862d71ae5a4SJacob Faibussowitsch {
18636718818eSStefano Zampini   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
18640d3441aeSHong Zhang   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
18652dd9e643SHong Zhang   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
18666718818eSStefano Zampini   Mat_APMPI         *ptap;
18679ce11a7cSHong Zhang   Mat                AP_loc, C_loc, C_oth;
18680d3441aeSHong Zhang   PetscInt           i, rstart, rend, cm, ncols, row;
18698cb82516SHong Zhang   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
18708cb82516SHong Zhang   PetscScalar       *apa;
18710d3441aeSHong Zhang   const PetscInt    *cols;
18720d3441aeSHong Zhang   const PetscScalar *vals;
18730d3441aeSHong Zhang 
18740d3441aeSHong Zhang   PetscFunctionBegin;
18756718818eSStefano Zampini   MatCheckProduct(C, 3);
18766718818eSStefano Zampini   ptap = (Mat_APMPI *)C->product->data;
187728b400f6SJacob Faibussowitsch   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
187828b400f6SJacob Faibussowitsch   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1879a9899c97SHong Zhang 
18809566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(C));
1881e497d3c8SHong Zhang   /* 1) get R = Pd^T,Ro = Po^T */
1882748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
18839566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
18849566063dSJacob Faibussowitsch     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1885748c7196SHong Zhang   }
18860d3441aeSHong Zhang 
1887e497d3c8SHong Zhang   /* 2) get AP_loc */
18880d3441aeSHong Zhang   AP_loc = ptap->AP_loc;
18898cb82516SHong Zhang   ap     = (Mat_SeqAIJ *)AP_loc->data;
18900d3441aeSHong Zhang 
1891e497d3c8SHong Zhang   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1892748c7196SHong Zhang   if (ptap->reuse == MAT_REUSE_MATRIX) {
1893748c7196SHong Zhang     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
18949566063dSJacob Faibussowitsch     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
18959566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1896e497d3c8SHong Zhang   }
18970d3441aeSHong Zhang 
18988cb82516SHong Zhang   /* 2-2) compute numeric A_loc*P - dominating part */
18990d3441aeSHong Zhang   /* get data from symbolic products */
19008cb82516SHong Zhang   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1901ad540459SPierre Jolivet   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
19028cb82516SHong Zhang   apa = ptap->apa;
1903681d504bSHong Zhang   api = ap->i;
1904681d504bSHong Zhang   apj = ap->j;
1905e497d3c8SHong Zhang   for (i = 0; i < am; i++) {
1906445158ffSHong Zhang     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
1907e497d3c8SHong Zhang     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1908e497d3c8SHong Zhang     apnz = api[i + 1] - api[i];
1909e497d3c8SHong Zhang     for (j = 0; j < apnz; j++) {
1910e497d3c8SHong Zhang       col                 = apj[j + api[i]];
1911e497d3c8SHong Zhang       ap->a[j + ap->i[i]] = apa[col];
1912e497d3c8SHong Zhang       apa[col]            = 0.0;
19130d3441aeSHong Zhang     }
19140d3441aeSHong Zhang   }
1915976452c9SRichard Tran Mills   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
19169566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
19170d3441aeSHong Zhang 
19188cb82516SHong Zhang   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
19199566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_loc));
19209566063dSJacob Faibussowitsch   PetscCall(MatProductNumeric(ptap->C_oth));
19219ce11a7cSHong Zhang   C_loc = ptap->C_loc;
19229ce11a7cSHong Zhang   C_oth = ptap->C_oth;
19230d3441aeSHong Zhang 
19240d3441aeSHong Zhang   /* add C_loc and Co to to C */
19259566063dSJacob Faibussowitsch   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
19260d3441aeSHong Zhang 
19270d3441aeSHong Zhang   /* C_loc -> C */
19280d3441aeSHong Zhang   cm    = C_loc->rmap->N;
19299ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_loc->data;
19308cb82516SHong Zhang   cols  = c_seq->j;
19318cb82516SHong Zhang   vals  = c_seq->a;
1932904d1e70Sandi selinger 
1933e9ede7d0Sandi selinger   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1934904d1e70Sandi selinger   /* when there are no off-processor parts.  */
19351de21080Sandi selinger   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
19361de21080Sandi selinger   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
19371de21080Sandi selinger   /* a table, and other, more complex stuff has to be done. */
1938904d1e70Sandi selinger   if (C->assembled) {
1939904d1e70Sandi selinger     C->was_assembled = PETSC_TRUE;
1940904d1e70Sandi selinger     C->assembled     = PETSC_FALSE;
1941904d1e70Sandi selinger   }
1942904d1e70Sandi selinger   if (C->was_assembled) {
19430d3441aeSHong Zhang     for (i = 0; i < cm; i++) {
19449ce11a7cSHong Zhang       ncols = c_seq->i[i + 1] - c_seq->i[i];
19450d3441aeSHong Zhang       row   = rstart + i;
19469566063dSJacob Faibussowitsch       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19479371c9d4SSatish Balay       cols += ncols;
19489371c9d4SSatish Balay       vals += ncols;
19490d3441aeSHong Zhang     }
1950904d1e70Sandi selinger   } else {
19519566063dSJacob Faibussowitsch     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1952904d1e70Sandi selinger   }
19530d3441aeSHong Zhang 
19540d3441aeSHong Zhang   /* Co -> C, off-processor part */
19559ce11a7cSHong Zhang   cm    = C_oth->rmap->N;
19569ce11a7cSHong Zhang   c_seq = (Mat_SeqAIJ *)C_oth->data;
19578cb82516SHong Zhang   cols  = c_seq->j;
19588cb82516SHong Zhang   vals  = c_seq->a;
19599ce11a7cSHong Zhang   for (i = 0; i < cm; i++) {
19609ce11a7cSHong Zhang     ncols = c_seq->i[i + 1] - c_seq->i[i];
19610d3441aeSHong Zhang     row   = p->garray[i];
19629566063dSJacob Faibussowitsch     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
19639371c9d4SSatish Balay     cols += ncols;
19649371c9d4SSatish Balay     vals += ncols;
19650d3441aeSHong Zhang   }
1966904d1e70Sandi selinger 
19679566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
19689566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
19690d3441aeSHong Zhang 
1970748c7196SHong Zhang   ptap->reuse = MAT_REUSE_MATRIX;
19713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
19720d3441aeSHong Zhang }
19734222ddf1SHong Zhang 
1974d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1975d71ae5a4SJacob Faibussowitsch {
19764222ddf1SHong Zhang   Mat_Product        *product = C->product;
19774222ddf1SHong Zhang   Mat                 A = product->A, P = product->B;
19784222ddf1SHong Zhang   MatProductAlgorithm alg  = product->alg;
19794222ddf1SHong Zhang   PetscReal           fill = product->fill;
19804222ddf1SHong Zhang   PetscBool           flg;
19814222ddf1SHong Zhang 
19824222ddf1SHong Zhang   PetscFunctionBegin;
19834222ddf1SHong Zhang   /* scalable: do R=P^T locally, then C=R*A*P */
19849566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
19854222ddf1SHong Zhang   if (flg) {
19869566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
19874222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
19884222ddf1SHong Zhang     goto next;
19894222ddf1SHong Zhang   }
19904222ddf1SHong Zhang 
19914222ddf1SHong Zhang   /* nonscalable: do R=P^T locally, then C=R*A*P */
19929566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
19934222ddf1SHong Zhang   if (flg) {
19949566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
19954222ddf1SHong Zhang     goto next;
19964222ddf1SHong Zhang   }
19974222ddf1SHong Zhang 
19984222ddf1SHong Zhang   /* allatonce */
19999566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
20004222ddf1SHong Zhang   if (flg) {
20019566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
20024222ddf1SHong Zhang     goto next;
20034222ddf1SHong Zhang   }
20044222ddf1SHong Zhang 
20054222ddf1SHong Zhang   /* allatonce_merged */
20069566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
20074222ddf1SHong Zhang   if (flg) {
20089566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
20094222ddf1SHong Zhang     goto next;
20104222ddf1SHong Zhang   }
20114222ddf1SHong Zhang 
20124e84afc0SStefano Zampini   /* backend general code */
20139566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "backend", &flg));
20144e84afc0SStefano Zampini   if (flg) {
20159566063dSJacob Faibussowitsch     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
20163ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
20174e84afc0SStefano Zampini   }
20184e84afc0SStefano Zampini 
20194222ddf1SHong Zhang   /* hypre */
20204222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
20219566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
20224222ddf1SHong Zhang   if (flg) {
20239566063dSJacob Faibussowitsch     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
20244222ddf1SHong Zhang     C->ops->productnumeric = MatProductNumeric_PtAP;
20253ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
20264222ddf1SHong Zhang   }
20274222ddf1SHong Zhang #endif
20284222ddf1SHong Zhang   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
20294222ddf1SHong Zhang 
20304222ddf1SHong Zhang next:
20314222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_PtAP;
20323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20334222ddf1SHong Zhang }
2034