xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 8a9f67ba4a2e1fdde06746005325cc1778268ffa)
1 
2 /*
3   Defines projective product routines where A is an MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 #include <petsc/private/hashmap.h>
13 #include <petsc/private/hashseti.h>
14 #include <petscsf.h>
15 
16 PETSC_HASH_MAP(HMapIV, PetscInt, PetscScalar, PetscHashInt, PetscHashEqual, -1)
17 
18 PETSC_STATIC_INLINE
19 PetscErrorCode PetscHMapIVAdd(PetscHMapIV ht,PetscInt key,PetscScalar val)
20 {
21   int      ret;
22   khiter_t iter;
23   PetscFunctionBeginHot;
24   PetscValidPointer(ht,1);
25   iter = kh_put(HMapIV,ht,key,&ret);
26   PetscHashAssert(ret>=0);
27   if (ret) kh_val(ht,iter) = val;
28   else  kh_val(ht,iter) += val;
29   PetscFunctionReturn(0);
30 }
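
/*
  Illustrative sketch (not used by the library): PetscHMapIVAdd() accumulates values for
  repeated keys, which is how partial contributions to one row of AP are merged in the
  all-at-once routines below.  This example routine and its name are hypothetical.
*/
PETSC_STATIC_INLINE PETSC_UNUSED
PetscErrorCode PetscHMapIVAdd_Example(void)
{
  PetscErrorCode ierr;
  PetscHMapIV    ht;
  PetscScalar    val;

  PetscFunctionBegin;
  ierr = PetscHMapIVCreate(&ht);CHKERRQ(ierr);
  ierr = PetscHMapIVAdd(ht,7,1.0);CHKERRQ(ierr); /* first insertion: ht[7] = 1.0 */
  ierr = PetscHMapIVAdd(ht,7,2.5);CHKERRQ(ierr); /* same key: ht[7] becomes 3.5 */
  ierr = PetscHMapIVGet(ht,7,&val);CHKERRQ(ierr);
  if (PetscAbsScalar(val - 3.5) > PETSC_SMALL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected accumulated value");
  ierr = PetscHMapIVDestroy(&ht);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}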
31 
32 
33 PETSC_STATIC_INLINE PETSC_UNUSED
34 PetscErrorCode PetscHMapIVAddAndScale(PetscHMapIV ht,PetscHMapIV hd,PetscScalar scale)
35 {
36   int         ret;
37   PetscInt    key;
38   PetscScalar val;
39   PetscFunctionBegin;
40   PetscValidPointer(ht,1);
41   PetscValidPointer(hd,2);
42   kh_foreach(hd,key,val,{ khiter_t i;
43       i = kh_put(HMapIV,ht,key,&ret);
44       PetscHashAssert(ret>=0);
45       if (ret)  kh_val(ht,i) = val*scale;
46       else kh_val(ht,i) += val*scale;})
47   PetscFunctionReturn(0);
48 }
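
/*
  Illustrative sketch (not used by the library): PetscHMapIVAddAndScale() merges a scaled copy
  of hd into ht, i.e. ht[key] += scale*hd[key] for every key stored in hd, inserting keys that
  are not yet present.  This example routine and its name are hypothetical.
*/
PETSC_STATIC_INLINE PETSC_UNUSED
PetscErrorCode PetscHMapIVAddAndScale_Example(void)
{
  PetscErrorCode ierr;
  PetscHMapIV    ht,hd;
  PetscScalar    val;

  PetscFunctionBegin;
  ierr = PetscHMapIVCreate(&ht);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hd);CHKERRQ(ierr);
  ierr = PetscHMapIVAdd(ht,3,1.0);CHKERRQ(ierr);          /* ht = {3: 1.0} */
  ierr = PetscHMapIVAdd(hd,3,2.0);CHKERRQ(ierr);          /* hd = {3: 2.0} */
  ierr = PetscHMapIVAddAndScale(ht,hd,0.5);CHKERRQ(ierr); /* ht[3] += 0.5*hd[3] -> 2.0 */
  ierr = PetscHMapIVGet(ht,3,&val);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hd);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}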
49 
50 /* #define PTAP_PROFILE */
51 
52 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
53 {
54   PetscErrorCode    ierr;
55   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
56   Mat_APMPI         *ptap=a->ap;
57   PetscBool         iascii;
58   PetscViewerFormat format;
59 
60   PetscFunctionBegin;
61   if (!ptap) {
62     /* hack: MatDuplicate() copies oldmat->ops->view to newmat, but newmat is a base Mat class with a NULL ptap! */
63     A->ops->view = MatView_MPIAIJ;
64     ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr);
65     PetscFunctionReturn(0);
66   }
67 
68   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
69   if (iascii) {
70     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
71     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
72       if (ptap->algType == 0) {
73         ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
74       } else if (ptap->algType == 1) {
75         ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
76       } else if (ptap->algType == 2) {
77         ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
78       } else if (ptap->algType == 3) {
79         ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
80       }
81     }
82   }
83   ierr = (ptap->view)(A,viewer);CHKERRQ(ierr);
84   PetscFunctionReturn(0);
85 }
86 
87 PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
88 {
89   PetscErrorCode      ierr;
90   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
91   Mat_APMPI           *ptap=a->ap;
92   Mat_Merge_SeqsToMPI *merge;
93 
94   PetscFunctionBegin;
95   if (!ptap) PetscFunctionReturn(0);
96 
97   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
98   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
99   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
100   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
101   ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
102   ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
103   ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
104   if (ptap->AP_loc) { /* used by alg_rap */
105     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
106     ierr = PetscFree(ap->i);CHKERRQ(ierr);
107     ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
108     ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
109   } else { /* used by alg_ptap */
110     ierr = PetscFree(ptap->api);CHKERRQ(ierr);
111     ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
112   }
113   ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
114   ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
115   if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}
116 
117   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
118 
119   merge=ptap->merge;
120   if (merge) { /* used by alg_ptap */
121     ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
122     ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
123     ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
124     ierr = PetscFree(merge->bi);CHKERRQ(ierr);
125     ierr = PetscFree(merge->bj);CHKERRQ(ierr);
126     ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
127     ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
128     ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
129     ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
130     ierr = PetscFree(merge->coi);CHKERRQ(ierr);
131     ierr = PetscFree(merge->coj);CHKERRQ(ierr);
132     ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
133     ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
134     ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
135   }
136   ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);
137 
138   ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
139   ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
140   ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
141   PetscFunctionReturn(0);
142 }
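
/*
  Illustrative sketch (hypothetical helper, not part of the library): once C = P^T*A*P will no
  longer be updated with new values of A or P, a caller can release the Mat_APMPI scratch data
  explicitly instead of relying on the -mat_freeintermediatedatastructures option.
*/
PETSC_STATIC_INLINE PETSC_UNUSED
PetscErrorCode MatPtAPFreeScratch_Example(Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); /* drops P_loc, P_oth, AP_loc, C_loc, C_oth, ... */
  PetscFunctionReturn(0);
}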
143 
144 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
145 {
146   PetscErrorCode ierr;
147   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
148   Mat_APMPI      *ptap=a->ap;
149 
150   PetscFunctionBegin;
151   ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr);
152   ierr = (*ptap->destroy)(A);CHKERRQ(ierr); /* MatDestroy_MPIAIJ(A) */
153   ierr = PetscFree(ptap);CHKERRQ(ierr);
154   PetscFunctionReturn(0);
155 }
156 
157 PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
158 {
159   PetscErrorCode ierr;
160   PetscBool      flg;
161   MPI_Comm       comm;
162 #if !defined(PETSC_HAVE_HYPRE)
163   const char          *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"};
164   PetscInt            nalg=4;
165 #else
166   const char          *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"};
167   PetscInt            nalg=5;
168 #endif
169   PetscInt            pN=P->cmap->N,alg=1; /* set default algorithm */
170 
171   PetscFunctionBegin;
172   /* check if matrix local sizes are compatible */
173   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
174   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
175   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
176 
177   if (scall == MAT_INITIAL_MATRIX) {
178     /* pick an algorithm */
179     ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
180     ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
181     ierr = PetscOptionsEnd();CHKERRQ(ierr);
182 
183     if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
184       MatInfo     Ainfo,Pinfo;
185       PetscInt    nz_local;
186       PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;
187 
188       ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr);
189       ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr);
190       nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
191 
192       if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
193       ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr);
194 
195       if (alg_scalable) {
196         alg = 0; /* the scalable algorithm can be about 50% slower than the nonscalable one */
197       }
198     }
199 
200     switch (alg) {
201     case 1:
202       /* do R=P^T locally, then C=R*A*P -- nonscalable */
203       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
204       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
205       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
206       break;
207     case 2:
208       /* compute C=P^T*A*P allatonce */
209       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
210       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr);
211       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
212       break;
213     case 3:
214       /* compute C=P^T*A*P allatonce */
215       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
216       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr);
217       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
218       break;
219 #if defined(PETSC_HAVE_HYPRE)
220     case 4:
221       /* Use boomerAMGBuildCoarseOperator */
222       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
223       ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
224       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
225       break;
226 #endif
227     default:
228       /* do R=P^T locally, then C=R*A*P */
229       ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
230       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
231       ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
232       break;
233     }
234 
235     if (alg == 0 || alg == 1 || alg == 2 || alg == 3) {
236       Mat_MPIAIJ *c  = (Mat_MPIAIJ*)(*C)->data;
237       Mat_APMPI  *ap = c->ap;
238       ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr);
239       ap->freestruct = PETSC_FALSE;
240       ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);CHKERRQ(ierr);
241       ierr = PetscOptionsEnd();CHKERRQ(ierr);
242     }
243   }
244 
245   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
246   ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr);
247   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
248   PetscFunctionReturn(0);
249 }
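
/*
  Illustrative sketch (not part of the library): how a caller typically drives the routine
  above.  The MAT_INITIAL_MATRIX call runs the symbolic phase and attaches the Mat_APMPI
  supporting struct; the MAT_REUSE_MATRIX call reruns only the numeric phase.  The algorithm
  can be chosen at run time with -matptap_via {scalable,nonscalable,allatonce,allatonce_merged}.
  The fill estimate 2.0 is an arbitrary example value.
*/
PETSC_STATIC_INLINE PETSC_UNUSED
PetscErrorCode MatPtAPUsage_Example(Mat A,Mat P)
{
  PetscErrorCode ierr;
  Mat            C;

  PetscFunctionBegin;
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr); /* symbolic + numeric product */
  /* ... change the entries of A (same nonzero pattern) ... */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);   /* numeric phase only */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}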
250 
251 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
252 {
253   PetscErrorCode    ierr;
254   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
255   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
256   Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
257   Mat_APMPI         *ptap = c->ap;
258   Mat               AP_loc,C_loc,C_oth;
259   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
260   PetscScalar       *apa;
261   const PetscInt    *cols;
262   const PetscScalar *vals;
263 
264   PetscFunctionBegin;
265   if (!ptap) {
266     MPI_Comm comm;
267     ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
268     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
269   }
270 
271   ierr = MatZeroEntries(C);CHKERRQ(ierr);
272 
273   /* 1) get R = Pd^T,Ro = Po^T */
274   if (ptap->reuse == MAT_REUSE_MATRIX) {
275     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
276     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
277   }
278 
279   /* 2) get AP_loc */
280   AP_loc = ptap->AP_loc;
281   ap = (Mat_SeqAIJ*)AP_loc->data;
282 
283   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
284   /*-----------------------------------------------------*/
285   if (ptap->reuse == MAT_REUSE_MATRIX) {
286     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
287     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
288     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
289   }
290 
291   /* 2-2) compute numeric A_loc*P - dominating part */
292   /* ---------------------------------------------- */
293   /* get data from symbolic products */
294   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
295   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
296   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");
297 
298   api   = ap->i;
299   apj   = ap->j;
300   ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
301   for (i=0; i<am; i++) {
302     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
303     apnz = api[i+1] - api[i];
304     apa = ap->a + api[i];
305     ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr);
306     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
307     ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
308   }
309   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
310   if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);
311 
312   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
313   /* Always use scalable version since we are in the MPI scalable version */
314   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
315   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
316 
317   C_loc = ptap->C_loc;
318   C_oth = ptap->C_oth;
319 
320   /* add C_loc and C_oth to C */
321   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
322 
323   /* C_loc -> C */
324   cm    = C_loc->rmap->N;
325   c_seq = (Mat_SeqAIJ*)C_loc->data;
326   cols = c_seq->j;
327   vals = c_seq->a;
328   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
329 
330   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
331   /* when there are no off-processor parts. */
332   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
333   /* no longer holds. In that case the more general MatSetValues_MPIAIJ() must be used, which looks up the column index in */
334   /* a table and performs additional bookkeeping. */
335   if (C->assembled) {
336     C->was_assembled = PETSC_TRUE;
337     C->assembled     = PETSC_FALSE;
338   }
339   if (C->was_assembled) {
340     for (i=0; i<cm; i++) {
341       ncols = c_seq->i[i+1] - c_seq->i[i];
342       row = rstart + i;
343       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
344       cols += ncols; vals += ncols;
345     }
346   } else {
347     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
348   }
349   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
350   if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);
351 
352   /* Co -> C, off-processor part */
353   cm = C_oth->rmap->N;
354   c_seq = (Mat_SeqAIJ*)C_oth->data;
355   cols = c_seq->j;
356   vals = c_seq->a;
357   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
358   for (i=0; i<cm; i++) {
359     ncols = c_seq->i[i+1] - c_seq->i[i];
360     row = p->garray[i];
361     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
362     cols += ncols; vals += ncols;
363   }
364   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
365   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
366 
367   ptap->reuse = MAT_REUSE_MATRIX;
368 
369   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
370   if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);
371 
372   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated with new A and P */
373   if (ptap->freestruct) {
374     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
375   }
376   PetscFunctionReturn(0);
377 }
378 
379 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
380 {
381   PetscErrorCode      ierr;
382   Mat_APMPI           *ptap;
383   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
384   MPI_Comm            comm;
385   PetscMPIInt         size,rank;
386   Mat                 Cmpi,P_loc,P_oth;
387   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
388   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
389   PetscInt            *lnk,i,k,pnz,row,nsend;
390   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
391   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
392   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
393   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
394   MPI_Request         *swaits,*rwaits;
395   MPI_Status          *sstatus,rstatus;
396   PetscLayout         rowmap;
397   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
398   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
399   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
400   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
401   PetscScalar         *apv;
402   PetscTable          ta;
403   MatType             mtype;
404   const char          *prefix;
405 #if defined(PETSC_USE_INFO)
406   PetscReal           apfill;
407 #endif
408 
409   PetscFunctionBegin;
410   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
411   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
412   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
413 
414   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
415 
416   /* create symbolic parallel matrix Cmpi */
417   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
418   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
419   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
420 
421   /* create struct Mat_APMPI; it is attached to C later */
422   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
423   ptap->reuse = MAT_INITIAL_MATRIX;
424   ptap->algType = 0;
425 
426   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
427   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
428   /* get P_loc by taking all local rows of P */
429   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);
430 
431   ptap->P_loc = P_loc;
432   ptap->P_oth = P_oth;
433 
434   /* (0) compute Rd = Pd^T, Ro = Po^T  */
435   /* --------------------------------- */
436   ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
437   ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
438 
439   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
440   /* ----------------------------------------------------------------- */
441   p_loc  = (Mat_SeqAIJ*)P_loc->data;
442   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
443 
444   /* create and initialize a linked list */
445   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* used to compute AP_loc and Cmpi */
446   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
447   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
448   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
449 
450   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
451 
452   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
453   if (ao) {
454     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
455   } else {
456     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
457   }
458   current_space = free_space;
459   nspacedouble  = 0;
460 
461   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
462   api[0] = 0;
463   for (i=0; i<am; i++) {
464     /* diagonal portion: Ad[i,:]*P */
465     ai = ad->i; pi = p_loc->i;
466     nzi = ai[i+1] - ai[i];
467     aj  = ad->j + ai[i];
468     for (j=0; j<nzi; j++) {
469       row  = aj[j];
470       pnz  = pi[row+1] - pi[row];
471       Jptr = p_loc->j + pi[row];
472       /* add non-zero cols of P into the sorted linked list lnk */
473       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
474     }
475     /* off-diagonal portion: Ao[i,:]*P */
476     if (ao) {
477       ai = ao->i; pi = p_oth->i;
478       nzi = ai[i+1] - ai[i];
479       aj  = ao->j + ai[i];
480       for (j=0; j<nzi; j++) {
481         row  = aj[j];
482         pnz  = pi[row+1] - pi[row];
483         Jptr = p_oth->j + pi[row];
484         ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
485       }
486     }
487     apnz     = lnk[0];
488     api[i+1] = api[i] + apnz;
489 
490     /* if free space is not available, double the total space in the list */
491     if (current_space->local_remaining<apnz) {
492       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
493       nspacedouble++;
494     }
495 
496     /* Copy data into free space, then initialize lnk */
497     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
498 
499     current_space->array           += apnz;
500     current_space->local_used      += apnz;
501     current_space->local_remaining -= apnz;
502   }
503   /* Allocate space for apj and apv, initialize apj, and */
504   /* destroy list of free space and other temporary array(s) */
505   ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
506   ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
507   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
508 
509   /* Create AP_loc for reuse */
510   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
511   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr);
512 
513 #if defined(PETSC_USE_INFO)
514   if (ao) {
515     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
516   } else {
517     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
518   }
519   ptap->AP_loc->info.mallocs           = nspacedouble;
520   ptap->AP_loc->info.fill_ratio_given  = fill;
521   ptap->AP_loc->info.fill_ratio_needed = apfill;
522 
523   if (api[am]) {
524     ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
525     ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
526   } else {
527     ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty.\n");CHKERRQ(ierr);
528   }
529 #endif
530 
531   /* (2-1) compute symbolic Co = Ro*AP_loc  */
532   /* ------------------------------------ */
533   ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
534   ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
535   ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
536   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
537 
538   /* (3) send coj of C_oth to other processors  */
539   /* ------------------------------------------ */
540   /* determine row ownership */
541   ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
542   rowmap->n  = pn;
543   rowmap->bs = 1;
544   ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
545   owners = rowmap->range;
546 
547   /* determine the number of messages to send, their lengths */
548   ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
549   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
550   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
551 
552   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
553   coi   = c_oth->i; coj = c_oth->j;
554   con   = ptap->C_oth->rmap->n;
555   proc  = 0;
556   ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
557   for (i=0; i<con; i++) {
558     while (prmap[i] >= owners[proc+1]) proc++;
559     len_si[proc]++;               /* num of rows in Co (off-process part of P^T*A*P) to be sent to [proc] */
560     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
561   }
562 
563   len          = 0; /* max length of buf_si[], see (4) */
564   owners_co[0] = 0;
565   nsend        = 0;
566   for (proc=0; proc<size; proc++) {
567     owners_co[proc+1] = owners_co[proc] + len_si[proc];
568     if (len_s[proc]) {
569       nsend++;
570       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
571       len         += len_si[proc];
572     }
573   }
574 
575   /* determine the number and length of messages to receive for coi and coj  */
576   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
577   ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);
578 
579   /* post the Irecv and Isend of coj */
580   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
581   ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
582   ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
583   for (proc=0, k=0; proc<size; proc++) {
584     if (!len_s[proc]) continue;
585     i    = owners_co[proc];
586     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
587     k++;
588   }
589 
590   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
591   /* ---------------------------------------- */
592   ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
593   ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
594   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
595   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
596   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);
597 
598   /* receives coj are complete */
599   for (i=0; i<nrecv; i++) {
600     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
601   }
602   ierr = PetscFree(rwaits);CHKERRQ(ierr);
603   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
604 
605   /* add received column indices into ta to update Crmax */
606   for (k=0; k<nrecv; k++) {/* k-th received message */
607     Jptr = buf_rj[k];
608     for (j=0; j<len_r[k]; j++) {
609       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
610     }
611   }
612   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
613   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
614 
615   /* (4) send and recv coi */
616   /*-----------------------*/
617   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
618   ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
619   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
620   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
621   for (proc=0,k=0; proc<size; proc++) {
622     if (!len_s[proc]) continue;
623     /* form outgoing message for i-structure:
624          buf_si[0]:                 nrows to be sent
625                [1:nrows]:           row index (global)
626                [nrows+1:2*nrows+1]: i-structure index
627     */
628     /*-------------------------------------------*/
629     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
630     buf_si_i    = buf_si + nrows+1;
631     buf_si[0]   = nrows;
632     buf_si_i[0] = 0;
633     nrows       = 0;
634     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
635       nzi = coi[i+1] - coi[i];
636       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
637       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
638       nrows++;
639     }
640     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
641     k++;
642     buf_si += len_si[proc];
643   }
644   for (i=0; i<nrecv; i++) {
645     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
646   }
647   ierr = PetscFree(rwaits);CHKERRQ(ierr);
648   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
649 
650   ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
651   ierr = PetscFree(len_ri);CHKERRQ(ierr);
652   ierr = PetscFree(swaits);CHKERRQ(ierr);
653   ierr = PetscFree(buf_s);CHKERRQ(ierr);
654 
655   /* (5) compute the local portion of Cmpi      */
656   /* ------------------------------------------ */
657   /* set initial free space to Crmax, sufficient for holding the nonzeros of each row of Cmpi */
658   ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
659   current_space = free_space;
660 
661   ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
662   for (k=0; k<nrecv; k++) {
663     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
664     nrows       = *buf_ri_k[k];
665     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
666     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
667   }
668 
669   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
670   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
671   for (i=0; i<pn; i++) {
672     /* add C_loc into Cmpi */
673     nzi  = c_loc->i[i+1] - c_loc->i[i];
674     Jptr = c_loc->j + c_loc->i[i];
675     ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
676 
677     /* add received col data into lnk */
678     for (k=0; k<nrecv; k++) { /* k-th received message */
679       if (i == *nextrow[k]) { /* i-th row */
680         nzi  = *(nextci[k]+1) - *nextci[k];
681         Jptr = buf_rj[k] + *nextci[k];
682         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
683         nextrow[k]++; nextci[k]++;
684       }
685     }
686     nzi = lnk[0];
687 
688     /* copy data into free space, then initialize lnk */
689     ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
690     ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
691   }
692   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
693   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
694   ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
695 
696   /* local sizes and preallocation */
697   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
698   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
699   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
700   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
701 
702   /* free communication buffers and the temporary row layout */
703   ierr = PetscFree(id_r);CHKERRQ(ierr);
704   ierr = PetscFree(len_r);CHKERRQ(ierr);
705   ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
706   ierr = PetscFree(buf_ri);CHKERRQ(ierr);
707   ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
708   ierr = PetscFree(buf_rj);CHKERRQ(ierr);
709   ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);
710 
711   /* attach the supporting struct to Cmpi for reuse */
712   c = (Mat_MPIAIJ*)Cmpi->data;
713   c->ap           = ptap;
714   ptap->duplicate = Cmpi->ops->duplicate;
715   ptap->destroy   = Cmpi->ops->destroy;
716   ptap->view      = Cmpi->ops->view;
717 
718   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
719   Cmpi->assembled        = PETSC_FALSE;
720   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
721   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
722   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
723   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
724   *C                     = Cmpi;
725 
726    nout = 0;
727    ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
728    if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
729    ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
730    if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);
731 
732   PetscFunctionReturn(0);
733 }
734 
735 PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHSetI dht,PetscHSetI oht)
736 {
737   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
738   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
739   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
740   PetscInt             pcstart,pcend,column;
741   PetscErrorCode       ierr;
742 
743   PetscFunctionBegin;
744 
745   pcstart = P->cmap->rstart;
746   pcend   = P->cmap->rend;
747   /* diagonal portion: Ad[i,:]*P */
748   ai = ad->i;
749   nzi = ai[i+1] - ai[i];
750   aj  = ad->j + ai[i];
751   for (j=0; j<nzi; j++) {
752     row  = aj[j];
753     nzpi = pd->i[row+1] - pd->i[row];
754     pj  = pd->j + pd->i[row];
755     for (k=0; k<nzpi; k++) {
756       ierr = PetscHSetIAdd(dht,pj[k]+pcstart);CHKERRQ(ierr);
757     }
758   }
759   for (j=0; j<nzi; j++) {
760     row  = aj[j];
761     nzpi = po->i[row+1] - po->i[row];
762     pj  = po->j + po->i[row];
763     for (k=0; k<nzpi; k++) {
764       ierr = PetscHSetIAdd(oht,p->garray[pj[k]]);CHKERRQ(ierr);
765     }
766   }
767 
768   /* off diagonal part: Ao[i, :]*P_oth */
769   if (ao) {
770     ai = ao->i;
771     pi = p_oth->i;
772     nzi = ai[i+1] - ai[i];
773     aj  = ao->j + ai[i];
774     for (j=0; j<nzi; j++) {
775       row  = aj[j];
776       pnz  = pi[row+1] - pi[row];
777       p_othcols = p_oth->j + pi[row];
778       for (col=0; col<pnz; col++) {
779         column = p_othcols[col];
780         if (column>=pcstart && column<pcend) {
781           ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
782         } else {
783           ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
784         }
785       }
786     }
787   } /* end if (ao) */
788   PetscFunctionReturn(0);
789 }
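
/*
  Illustrative sketch (hypothetical helper): how the diagonal/off-diagonal hash sets filled by
  MatPtAPSymbolicComputeOneRowOfAP_private() translate into per-row preallocation counts for C.
  The names MatPtAPCountOneRow_Example, dnz_i and onz_i are example names only.
*/
PETSC_STATIC_INLINE PETSC_UNUSED
PetscErrorCode MatPtAPCountOneRow_Example(PetscHSetI dht,PetscHSetI oht,PetscInt *dnz_i,PetscInt *onz_i)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscHSetIGetSize(dht,dnz_i);CHKERRQ(ierr); /* number of distinct diagonal-block columns */
  ierr = PetscHSetIGetSize(oht,onz_i);CHKERRQ(ierr); /* number of distinct off-diagonal columns */
  PetscFunctionReturn(0);
}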
790 
791 PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHMapIV hmap)
792 {
793   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
794   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
795   PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi;
796   PetscScalar          ra,*aa,*pa;
797   PetscErrorCode       ierr;
798 
799   PetscFunctionBegin;
800   pcstart = P->cmap->rstart;
801   /* diagonal portion: Ad[i,:]*P */
802   ai  = ad->i;
803   nzi = ai[i+1] - ai[i];
804   aj  = ad->j + ai[i];
805   aa  = ad->a + ai[i];
806 
807   for (j=0; j<nzi; j++) {
808     ra   = aa[j];
809     row  = aj[j];
810     nzpi = pd->i[row+1] - pd->i[row];
811     pj = pd->j + pd->i[row];
812     pa = pd->a + pd->i[row];
813     for (k=0; k<nzpi; k++) {
814       ierr = PetscHMapIVAdd(hmap,pj[k]+pcstart,ra*pa[k]);CHKERRQ(ierr);
815     }
816   }
817   for (j=0; j<nzi; j++) {
818     ra   = aa[j];
819     row  = aj[j];
820     nzpi = po->i[row+1] - po->i[row];
821     pj = po->j + po->i[row];
822     pa = po->a + po->i[row];
823     for (k=0; k<nzpi; k++) {
824       ierr = PetscHMapIVAdd(hmap,p->garray[pj[k]],ra*pa[k]);CHKERRQ(ierr);
825     }
826   }
827 
828 
829   /* off diagonal part: Ao[i, :]*P_oth */
830   if (ao) {
831     ai = ao->i;
832     pi = p_oth->i;
833     nzi = ai[i+1] - ai[i];
834     aj  = ao->j + ai[i];
835     aa  = ao->a + ai[i];
836     for (j=0; j<nzi; j++) {
837       row  = aj[j];
838       ra   = aa[j];
839       pnz  = pi[row+1] - pi[row];
840       p_othcols = p_oth->j + pi[row];
841       pa   = p_oth->a + pi[row];
842       for (col=0; col<pnz; col++) {
843         ierr = PetscHMapIVAdd(hmap,p_othcols[col],ra*pa[col]);CHKERRQ(ierr);
844       }
845     }
846   } /* end if (ao) */
847   PetscFunctionReturn(0);
848 }
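
/*
  Illustrative sketch (hypothetical helper): the hash map filled by
  MatPtAPNumericComputeOneRowOfAP_private() is unloaded with PetscHMapIVGetPairs() and the
  (column,value) pairs are added into a row of C, the same pattern the all-at-once numeric
  routines below use after scaling the values by the entries of P(i,:).  The caller must size
  buf_j/buf_v to at least the number of entries stored in hmap.
*/
PETSC_STATIC_INLINE PETSC_UNUSED
PetscErrorCode MatPtAPScatterHashedRow_Example(Mat C,PetscInt row,PetscHMapIV hmap,PetscInt *buf_j,PetscScalar *buf_v)
{
  PetscErrorCode ierr;
  PetscInt       n = 0;

  PetscFunctionBegin;
  ierr = PetscHMapIVGetPairs(hmap,&n,buf_j,buf_v);CHKERRQ(ierr);        /* unsorted (column,value) pairs */
  ierr = MatSetValues(C,1,&row,n,buf_j,buf_v,ADD_VALUES);CHKERRQ(ierr); /* accumulate into row of C */
  PetscFunctionReturn(0);
}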
849 
850 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
851 {
852   PetscErrorCode    ierr;
853   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
854   Mat_SeqAIJ        *cd,*co,*po,*pd;
855   Mat_APMPI         *ptap = c->ap;
856   PetscHMapIV       hmap;
857   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
858   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
859   MPI_Comm          comm;
860 
861   PetscFunctionBegin;
862   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
863   if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
864 
865   ierr = MatZeroEntries(C);CHKERRQ(ierr);
866 
867   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
868   /*-----------------------------------------------------*/
869   if (ptap->reuse == MAT_REUSE_MATRIX) {
870     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
871     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
872   }
873 
874   po = (Mat_SeqAIJ*) p->B->data;
875   pd = (Mat_SeqAIJ*) p->A->data;
876 
877   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
878 
879   ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
880   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
881   cmaxr = 0;
882   for (i=0; i<pon; i++) {
883     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
884   }
885   ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
886   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
887   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
888   for (i=0; i<am && pon; i++) {
889     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
890     nzi = po->i[i+1] - po->i[i];
891     if (!nzi) continue;
892     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
893     voff = 0;
894     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
895     if (!voff) continue;
896     /*ierr = PetscMemzero(c_rmtc,sizeof(PetscInt)*pon);CHKERRQ(ierr);*/
897 
898     /* Scatter row i of AP, scaled by Po(i,:), into the remote rows of C */
899     poj = po->j + po->i[i];
900     poa = po->a + po->i[i];
901     for (j=0; j<nzi; j++) {
902       c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]];
903       c_rmtaa = c_rmta + ptap->c_rmti[poj[j]];
904       for (jj=0; jj<voff; jj++) {
905         apvaluestmp[jj] = apvalues[jj]*poa[j];
906         /* If the row is still empty, insert the first entry */
907         if (!c_rmtc[poj[j]]) {
908           c_rmtjj[jj] = apindices[jj];
909           c_rmtaa[jj] = apvaluestmp[jj];
910           c_rmtc[poj[j]]++;
911         } else {
912           ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr);
913           if (loc>=0){ /* hit */
914             c_rmtaa[loc] += apvaluestmp[jj];
915           } else { /* new element */
916             loc = -(loc+1);
917             /* Move data backward */
918             for (kk=c_rmtc[poj[j]]; kk>loc; kk--) {
919               c_rmtjj[kk] = c_rmtjj[kk-1];
920               c_rmtaa[kk] = c_rmtaa[kk-1];
921             }/* End kk */
922             c_rmtjj[loc] = apindices[jj];
923             c_rmtaa[loc] = apvaluestmp[jj];
924             c_rmtc[poj[j]]++;
925           }
926         }
927       } /* End jj */
928     } /* End j */
929   } /* End i */
930 
931   ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
932 
933   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
934   ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);
935 
936   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
937   ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
938   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
939   cd = (Mat_SeqAIJ*)(c->A)->data;
940   co = (Mat_SeqAIJ*)(c->B)->data;
941 
942   cmaxr = 0;
943   for (i=0; i<pn; i++) {
944     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
945   }
946   ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
947   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
948   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
949   for (i=0; i<am && pn; i++) {
950     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
951     nzi = pd->i[i+1] - pd->i[i];
952     if (!nzi) continue;
953     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
954     voff = 0;
955     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
956     if (!voff) continue;
957     /* Scatter row i of AP, scaled by Pd(i,:), into the locally owned rows of C */
958     pdj = pd->j + pd->i[i];
959     pda = pd->a + pd->i[i];
960     for (j=0; j<nzi; j++) {
961       row = pcstart + pdj[j];
962       for (jj=0; jj<voff; jj++) {
963         apvaluestmp[jj] = apvalues[jj]*pda[j];
964       }
965       ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
966     }
967   }
968 
969   ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
970   ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
971   ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
972   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); /* op must match PetscSFReduceBegin() */
973   ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
974   ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);
975 
976   /* Add contributions from remote */
977   for (i = 0; i < pn; i++) {
978     row = i + pcstart;
979     ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
980   }
981   ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);
982 
983   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
984   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
985 
986   ptap->reuse = MAT_REUSE_MATRIX;
987 
988   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated with new A and P */
989   if (ptap->freestruct) {
990     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
991   }
992   PetscFunctionReturn(0);
993 }
994 
995 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
996 {
997   PetscErrorCode    ierr;
998   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
999   Mat_SeqAIJ        *cd,*co,*po,*pd;
1000   Mat_APMPI         *ptap = c->ap;
1001   PetscHMapIV       hmap;
1002   PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
1003   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
1004   MPI_Comm          comm;
1005 
1006   PetscFunctionBegin;
1007   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1008   if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1009 
1010   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1011 
1012   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1013   /*-----------------------------------------------------*/
1014   if (ptap->reuse == MAT_REUSE_MATRIX) {
1015     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1016     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1017   }
1018 
1019   po = (Mat_SeqAIJ*) p->B->data;
1020   pd = (Mat_SeqAIJ*) p->A->data;
1021 
1022   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1023   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1024 
1025   ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
1026   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
1027   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1028   cmaxr = 0;
1029   for (i=0; i<pon; i++) {
1030     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
1031   }
1032   cd = (Mat_SeqAIJ*)(c->A)->data;
1033   co = (Mat_SeqAIJ*)(c->B)->data;
1034   for (i=0; i<pn; i++) {
1035     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
1036   }
1037   ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
1038   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
1039   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
1040   for (i=0; i<am && (pon || pn); i++) {
1041     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
1042     nzi  = po->i[i+1] - po->i[i];
1043     dnzi = pd->i[i+1] - pd->i[i];
1044     if (!nzi && !dnzi) continue;
1045     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
1046     voff = 0;
1047     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
1048     if (!voff) continue;
1049 
1050     /* Form the remote rows of C: scatter row i of AP, scaled by Po(i,:) */
1051     poj = po->j + po->i[i];
1052     poa = po->a + po->i[i];
1053     for (j=0; j<nzi; j++) {
1054       c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]];
1055       c_rmtaa = c_rmta + ptap->c_rmti[poj[j]];
1056       for (jj=0; jj<voff; jj++) {
1057         apvaluestmp[jj] = apvalues[jj]*poa[j];
1058         /* If the row is still empty, insert the first entry */
1059         if (!c_rmtc[poj[j]]) {
1060           c_rmtjj[jj] = apindices[jj];
1061           c_rmtaa[jj] = apvaluestmp[jj];
1062           c_rmtc[poj[j]]++;
1063         } else {
1064           ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr);
1065           if (loc>=0){ /* hit */
1066             c_rmtaa[loc] += apvaluestmp[jj];
1067           } else { /* new element */
1068             loc = -(loc+1);
1069             /* Move data backward */
1070             for (kk=c_rmtc[poj[j]]; kk>loc; kk--) {
1071               c_rmtjj[kk] = c_rmtjj[kk-1];
1072               c_rmtaa[kk] = c_rmtaa[kk-1];
1073             }/* End kk */
1074             c_rmtjj[loc] = apindices[jj];
1075             c_rmtaa[loc] = apvaluestmp[jj];
1076             c_rmtc[poj[j]]++;
1077           }
1078         }
1079       } /* End jj */
1080     } /* End j */
1081 
1082     /* Form the local rows of C: scatter row i of AP, scaled by Pd(i,:) */
1083     pdj = pd->j + pd->i[i];
1084     pda = pd->a + pd->i[i];
1085     for (j=0; j<dnzi; j++) {
1086       row = pcstart + pdj[j];
1087       for (jj=0; jj<voff; jj++) {
1088         apvaluestmp[jj] = apvalues[jj]*pda[j];
1089       }/* End kk */
1090       ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
1091     }/* End j */
1092   } /* End i */
1093 
1094   ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
1095   ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
1096   ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);
1097 
1098   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
1099   ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
1100   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr); /* op must match PetscSFReduceBegin() */
1101   ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
1102   ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);
1103 
1104   /* Add contributions from remote */
1105   for (i = 0; i < pn; i++) {
1106     row = i + pcstart;
1107     ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
1108   }
1109   ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);
1110 
1111   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1112   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1113 
1114   ptap->reuse = MAT_REUSE_MATRIX;
1115 
1116   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated with new A and P */
1117   if (ptap->freestruct) {
1118     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
1119   }
1120   PetscFunctionReturn(0);
1121 }
1122 
1123 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C)
1124 {
1125   Mat_APMPI           *ptap;
1126   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
1127   MPI_Comm            comm;
1128   Mat                 Cmpi;
1129   Mat_SeqAIJ          *pd,*po;
1130   MatType             mtype;
1131   PetscSF             sf;
1132   PetscSFNode         *iremote;
1133   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1134   const PetscInt      *rootdegrees;
1135   PetscHSetI          ht,oht,*hta,*hto;
1136   PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1137   PetscInt            owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1138   PetscInt            nalg=2,alg=0;
1139   PetscBool           flg;
1140   const char          *algTypes[2] = {"overlapping","merged"};
1141   PetscErrorCode      ierr;
1142 
1143   PetscFunctionBegin;
1144   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1145 
1146   /* Create symbolic parallel matrix Cmpi */
1147   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1148   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1149   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1150   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1151   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1152 
1153   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
1154   ptap->reuse = MAT_INITIAL_MATRIX;
1155   ptap->algType = 2;
1156 
1157   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1158   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1159 
1160   po = (Mat_SeqAIJ*)p->B->data;
1161   pd = (Mat_SeqAIJ*)p->A->data;
1162 
1163   /* This equals the number of off-diagonal columns in P */
1164   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1165   /* offsets */
1166   ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
1167   /* The number of columns we will send to remote ranks */
1168   ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
1169   ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
1170   for (i=0; i<pon; i++) {
1171     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
1172   }
1173   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
1174   ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
1175   /* Create hash table to merge all columns for C(i, :) */
1176   ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
1177 
1178   ptap->c_rmti[0] = 0;
1179   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1180   for (i=0; i<am && pon; i++) {
1181     /* Form one row of AP */
1182     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1183     /* If the off-diagonal part of this row is empty, there is nothing to send */
1184     nzi = po->i[i+1] - po->i[i];
1185     if (!nzi) continue;
1186 
1187     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,ht);CHKERRQ(ierr);
1188     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
1189     /* If AP is empty, just continue */
1190     if (!htsize) continue;
1191     /* Merge the column pattern of row i of AP into the remote rows of C indicated by Po(i,:) */
1192     poj = po->j + po->i[i];
1193     for (j=0; j<nzi; j++) {
1194       ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr);
1195     }
1196   }
1197 
1198   for (i=0; i<pon; i++) {
1199     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
1200     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1201     c_rmtc[i] = htsize;
1202   }
1203 
1204   ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);
1205 
1206   for (i=0; i<pon; i++) {
1207     off = 0;
1208     ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
1209     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
1210   }
1211   ierr = PetscFree(hta);CHKERRQ(ierr);
1212 
1213   ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr);
1214   for (i=0; i<pon; i++) {
1215     owner = 0; lidx = 0;
1216     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1217     iremote[i].index = lidx;
1218     iremote[i].rank  = owner;
1219   }
1220 
1221   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
1222   ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1223   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1224   ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
1225   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
1226   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1227   /* How many neighbors have contributions to my rows? */
1228   ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
1229   ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
1230   rootspacesize = 0;
1231   for (i = 0; i < pn; i++) {
1232     rootspacesize += rootdegrees[i];
1233   }
1234   ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
1235   ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
1236   /* Get information from the leaves:
1237    * the number of columns other ranks contribute to my rows
1238    */
1239   ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1240   ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1241   ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
1242   ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
1243   /* The number of columns received for each row */
1244   ptap->c_othi[0] = 0;
1245   rootspacesize = 0;
1246   rootspaceoffsets[0] = 0;
1247   for (i = 0; i < pn; i++) {
1248     rcvncols = 0;
1249     for (j = 0; j<rootdegrees[i]; j++) {
1250       rcvncols += rootspace[rootspacesize];
1251       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1252       rootspacesize++;
1253     }
1254     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1255   }
1256   ierr = PetscFree(rootspace);CHKERRQ(ierr);
1257 
1258   ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
1259   ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1260   ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1261   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
1262   ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);
1263 
1264   ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
1265   nleaves = 0;
1266   for (i = 0; i<pon; i++) {
1267     owner = 0; lidx = 0;
1268     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1269     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1270     for (j=0; j<sendncols; j++) {
1271       iremote[nleaves].rank = owner;
1272       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1273     }
1274   }
1275   ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
1276   ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);
1277 
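  /* Second star forest: one leaf per column index to be sent (ptap->c_rmti[pon] of them), each pointing
   * at its slot c_rmtoffsets[i]+j in the owner's c_othj buffer; the ptap->c_othi[pn] receive slots are the roots. */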
1278   ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
1279   ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1280   ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
1281   /* One to one map */
1282   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
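  /* The matching PetscSFReduceEnd() is deferred until after the local pass below, so the remote
   * column indices travel while the diagonal part of P is being processed. */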
1283 
1284   ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
1285   ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
1286   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1287   ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
1288   for (i=0; i<pn; i++) {
1289     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
1290     ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
1291   }
1292   /* Work on local part */
1293   /* 4) Pass 1: Estimate memory for C_loc */
1294   for (i=0; i<am && pn; i++) {
1295     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1296     ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
1297     nzi = pd->i[i+1] - pd->i[i];
1298     if (!nzi) continue;
1299 
1300     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr);
1301     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
1302     ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
1303     if (!(htsize+htosize)) continue;
1304     /* Form C(ii, :) */
1305     pdj = pd->j + pd->i[i];
1306     for (j=0; j<nzi; j++) {
1307       ierr = PetscHSetIAppend(hta[pdj[j]],ht);CHKERRQ(ierr);
1308       ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr);
1309     }
1310   }
1311 
1312   ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
1313   ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);
1314 
1315   /* Get remote data */
1316   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
1317   ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
1318 
1319   for (i = 0; i < pn; i++) {
1320     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1321     rdj = c_othj + ptap->c_othi[i];
1322     for (j = 0; j < nzi; j++) {
1323       col =  rdj[j];
1324       /* diag part */
1325       if (col>=pcstart && col<pcend) {
1326         ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
1327       } else { /* off diag */
1328         ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
1329       }
1330     }
1331     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
1332     dnz[i] = htsize;
1333     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
1334     ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
1335     onz[i] = htsize;
1336     ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
1337   }
1338 
1339   ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
1340   ierr = PetscFree(c_othj);CHKERRQ(ierr);
1341 
1342   /* local sizes and preallocation */
1343   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1344   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
1345   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1346   ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);
1347 
1348   /* attach the supporting struct to Cmpi for reuse */
1349   c = (Mat_MPIAIJ*)Cmpi->data;
1350   c->ap           = ptap;
1351   ptap->duplicate = Cmpi->ops->duplicate;
1352   ptap->destroy   = Cmpi->ops->destroy;
1353   ptap->view      = Cmpi->ops->view;
1354 
1355   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1356   Cmpi->assembled        = PETSC_FALSE;
1357   /* pick an algorithm */
1358   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1359   alg = 0;
1360   ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1361   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1362   switch (alg) {
1363     case 0:
1364       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1365       break;
1366     case 1:
1367       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1368       break;
1369     default:
1370       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1371   }
1372   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1373   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1374   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1375   *C                     = Cmpi;
1376   PetscFunctionReturn(0);
1377 }
1378 
1379 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
1380 {
1381   Mat_APMPI           *ptap;
1382   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
1383   MPI_Comm            comm;
1384   Mat                 Cmpi;
1385   Mat_SeqAIJ          *pd,*po;
1386   MatType             mtype;
1387   PetscSF             sf;
1388   PetscSFNode         *iremote;
1389   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1390   const PetscInt      *rootdegrees;
1391   PetscHSetI          ht,oht,*hta,*hto,*htd;
1392   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1393   PetscInt            owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1394   PetscInt            nalg=2,alg=0;
1395   PetscBool           flg;
1396   const char          *algTypes[2] = {"merged","overlapping"};
1397   PetscErrorCode      ierr;
1398 
1399   PetscFunctionBegin;
1400   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1401 
1402   /* Create symbolic parallel matrix Cmpi */
1403   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1404   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1405   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1406   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1407   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1408 
1409   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
1410   ptap->reuse = MAT_INITIAL_MATRIX;
1411   ptap->algType = 3;
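  /* Compared with MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce() above, this merged variant forms the
   * remote (C_rmt) and local row patterns in a single sweep over the rows of A; see Pass 1 below. */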
1412 
1413   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1414   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1415 
1416   po = (Mat_SeqAIJ*)p->B->data;
1417   pd = (Mat_SeqAIJ*)p->A->data;
1418 
1419   /* This equals the number of off-diagonal columns of P */
1420   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1421   /* offsets */
1422   ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
1423   /* The number of columns we will send to remote ranks */
1424   ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
1425   ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
1426   for (i=0; i<pon; i++) {
1427     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
1428   }
1429   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
1430   ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
1431   /* Create hash table to merge all columns for C(i, :) */
1432   ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
1433   ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
1434   ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
1435   for (i=0; i<pn; i++) {
1436     ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
1437     ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
1438   }
1439   ptap->c_rmti[0] = 0;
1440   /* 2) Pass 1: calculate the size of C_rmt (the matrix that needs to be sent to other processors) */
1441   for (i=0; i<am && (pon || pn); i++) {
1442     /* Form one row of AP */
1443     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1444     ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
1445     /* If this row of P is empty in both its diagonal and off-diagonal parts, skip it */
1446     nzi = po->i[i+1] - po->i[i];
1447     dnzi = pd->i[i+1] - pd->i[i];
1448     if (!nzi && !dnzi) continue;
1449 
1450     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr);
1451     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
1452     ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
1453     /* If AP is empty, just continue */
1454     if (!(htsize+htosize)) continue;
1455 
1456     /* Form remote C(ii, :) */
1457     poj = po->j + po->i[i];
1458     for (j=0; j<nzi; j++) {
1459       ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr);
1460       ierr = PetscHSetIAppend(hta[poj[j]],oht);CHKERRQ(ierr);
1461     }
1462 
1463     /* Form local C(ii, :) */
1464     pdj = pd->j + pd->i[i];
1465     for (j=0; j<dnzi; j++) {
1466       ierr = PetscHSetIAppend(htd[pdj[j]],ht);CHKERRQ(ierr);
1467       ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr);
1468     }
1469   }
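  /* hta[i] now holds the merged (diagonal plus off-diagonal) column set destined for the rank owning
   * off-diagonal column i of P, while htd[i]/hto[i] hold the diagonal/off-diagonal column sets of local row i of C. */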
1470 
1471   ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
1472   ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);
1473 
1474   for (i=0; i<pon; i++) {
1475     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
1476     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1477     c_rmtc[i] = htsize;
1478   }
1479 
1480   ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);
1481 
1482   for (i=0; i<pon; i++) {
1483     off = 0;
1484     ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
1485     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
1486   }
1487   ierr = PetscFree(hta);CHKERRQ(ierr);
1488 
1489   ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr);
1490   for (i=0; i<pon; i++) {
1491     owner = 0; lidx = 0;
1492     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1493     iremote[i].index = lidx;
1494     iremote[i].rank  = owner;
1495   }
1496 
1497   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
1498   ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1499   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1500   ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
1501   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
1502   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1503   /* How many neighbors have contributions to my rows? */
1504   ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
1505   ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
1506   rootspacesize = 0;
1507   for (i = 0; i < pn; i++) {
1508     rootspacesize += rootdegrees[i];
1509   }
1510   ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
1511   ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
1512   /* Get information from the leaves:
1513    * the number of columns other ranks contribute to my rows
1514    */
1515   ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1516   ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1517   ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
1518   ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
1519   /* Accumulate, for each row, the number of columns received from the contributing ranks */
1520   ptap->c_othi[0] = 0;
1521   rootspacesize = 0;
1522   rootspaceoffsets[0] = 0;
1523   for (i = 0; i < pn; i++) {
1524     rcvncols = 0;
1525     for (j = 0; j<rootdegrees[i]; j++) {
1526       rcvncols += rootspace[rootspacesize];
1527       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1528       rootspacesize++;
1529     }
1530     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1531   }
1532   ierr = PetscFree(rootspace);CHKERRQ(ierr);
1533 
1534   ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
1535   ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1536   ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1537   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
1538   ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);
1539 
1540   ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
1541   nleaves = 0;
1542   for (i = 0; i<pon; i++) {
1543     owner = 0; lidx = 0;
1544     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1545     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1546     for (j=0; j<sendncols; j++) {
1547       iremote[nleaves].rank = owner;
1548       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1549     }
1550   }
1551   ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
1552   ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);
1553 
1554   ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
1555   ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1556   ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
1557   /* One to one map */
1558   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
1559   /* Get remote data */
1560   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
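  /* Unlike the non-merged variant, nothing is interleaved between the begin and end of this reduction:
   * the local row patterns were already merged during Pass 1 above. */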
1561   ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
1562   ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
1563   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1564 
1565   for (i = 0; i < pn; i++) {
1566     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1567     rdj = c_othj + ptap->c_othi[i];
1568     for (j = 0; j < nzi; j++) {
1569       col =  rdj[j];
1570       /* diag part */
1571       if (col>=pcstart && col<pcend) {
1572         ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
1573       } else { /* off diag */
1574         ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
1575       }
1576     }
1577     ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
1578     dnz[i] = htsize;
1579     ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
1580     ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
1581     onz[i] = htsize;
1582     ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
1583   }
1584 
1585   ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
1586   ierr = PetscFree(c_othj);CHKERRQ(ierr);
1587 
1588   /* local sizes and preallocation */
1589   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1590   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
1591   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1592   ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);
1593 
1594   /* attach the supporting struct to Cmpi for reuse */
1595   c = (Mat_MPIAIJ*)Cmpi->data;
1596   c->ap           = ptap;
1597   ptap->duplicate = Cmpi->ops->duplicate;
1598   ptap->destroy   = Cmpi->ops->destroy;
1599   ptap->view      = Cmpi->ops->view;
1600 
1601   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1602   Cmpi->assembled        = PETSC_FALSE;
1603   /* pick an algorithm */
1604   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1605   alg = 0;
1606   ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1607   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1608   switch (alg) {
1609     case 0:
1610       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1611       break;
1612     case 1:
1613       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1614       break;
1615     default:
1616       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1617   }
1618   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1619   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1620   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1621   *C                     = Cmpi;
1622   PetscFunctionReturn(0);
1623 }
1624 
1625 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1626 {
1627   PetscErrorCode      ierr;
1628   Mat_APMPI           *ptap;
1629   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1630   MPI_Comm            comm;
1631   PetscMPIInt         size,rank;
1632   Mat                 Cmpi;
1633   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1634   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1635   PetscInt            *lnk,i,k,pnz,row,nsend;
1636   PetscBT             lnkbt;
1637   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1638   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1639   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1640   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1641   MPI_Request         *swaits,*rwaits;
1642   MPI_Status          *sstatus,rstatus;
1643   PetscLayout         rowmap;
1644   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1645   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
1646   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1647   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1648   PetscScalar         *apv;
1649   PetscTable          ta;
1650   MatType             mtype;
1651   const char          *prefix;
1652 #if defined(PETSC_USE_INFO)
1653   PetscReal           apfill;
1654 #endif
1655 
1656   PetscFunctionBegin;
1657   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1658   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1659   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1660 
1661   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1662 
1663   /* create symbolic parallel matrix Cmpi */
1664   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1665   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1666   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1667 
1668   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1669   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1670 
1671   /* create struct Mat_APMPI and attach it to C later */
1672   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
1673   ptap->reuse = MAT_INITIAL_MATRIX;
1674   ptap->algType = 1;
1675 
1676   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1677   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1678   /* get P_loc by taking all local rows of P */
1679   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
1680 
1681   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1682   /* --------------------------------- */
1683   ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
1684   ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
1685 
1686   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1687   /* ----------------------------------------------------------------- */
1688   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1689   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1690 
1691   /* create and initialize a linked list */
1692   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
1693   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1694   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1695   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
1696   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1697 
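  /* lnk is a condensed, sorted linked list over the pN global columns of P; any row of AP is a union
   * of rows of P, so its length never exceeds Crmax. */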
1698   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
1699 
1700   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1701   if (ao) {
1702     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
1703   } else {
1704     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
1705   }
1706   current_space = free_space;
1707   nspacedouble  = 0;
1708 
1709   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
1710   api[0] = 0;
1711   for (i=0; i<am; i++) {
1712     /* diagonal portion: Ad[i,:]*P */
1713     ai = ad->i; pi = p_loc->i;
1714     nzi = ai[i+1] - ai[i];
1715     aj  = ad->j + ai[i];
1716     for (j=0; j<nzi; j++) {
1717       row  = aj[j];
1718       pnz  = pi[row+1] - pi[row];
1719       Jptr = p_loc->j + pi[row];
1720       /* add non-zero cols of P into the sorted linked list lnk */
1721       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1722     }
1723     /* off-diagonal portion: Ao[i,:]*P */
1724     if (ao) {
1725       ai = ao->i; pi = p_oth->i;
1726       nzi = ai[i+1] - ai[i];
1727       aj  = ao->j + ai[i];
1728       for (j=0; j<nzi; j++) {
1729         row  = aj[j];
1730         pnz  = pi[row+1] - pi[row];
1731         Jptr = p_oth->j + pi[row];
1732         ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1733       }
1734     }
1735     apnz     = lnk[0];
1736     api[i+1] = api[i] + apnz;
1737     if (ap_rmax < apnz) ap_rmax = apnz;
1738 
1739     /* if free space is not available, double the total space in the list */
1740     if (current_space->local_remaining<apnz) {
1741       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1742       nspacedouble++;
1743     }
1744 
1745     /* Copy data into free space, then initialize lnk */
1746     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1747 
1748     current_space->array           += apnz;
1749     current_space->local_used      += apnz;
1750     current_space->local_remaining -= apnz;
1751   }
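  /* api[] now holds the CSR row offsets of AP_loc and ap_rmax the largest row length; the column
   * indices still live in the free-space chunks and are made contiguous below. */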
1752   /* Allocate space for apj and apv, initialize apj, and */
1753   /* destroy list of free space and other temporary array(s) */
1754   ierr   = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
1755   ierr   = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
1756   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1757 
1758   /* Create AP_loc for reuse */
1759   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
1760 
1761 #if defined(PETSC_USE_INFO)
1762   if (ao) {
1763     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1764   } else {
1765     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1766   }
1767   ptap->AP_loc->info.mallocs           = nspacedouble;
1768   ptap->AP_loc->info.fill_ratio_given  = fill;
1769   ptap->AP_loc->info.fill_ratio_needed = apfill;
1770 
1771   if (api[am]) {
1772     ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
1773     ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
1774   } else {
1775     ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
1776   }
1777 #endif
1778 
1779   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1780   /* ------------------------------------ */
1781   ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
1782   ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
1783   ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
1784   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
1785 
1786   /* (3) send coj of C_oth to other processors  */
1787   /* ------------------------------------------ */
1788   /* determine row ownership */
1789   ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
1790   rowmap->n  = pn;
1791   rowmap->bs = 1;
1792   ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
1793   owners = rowmap->range;
1794 
1795   /* determine the number of messages to send, their lengths */
1796   ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
1797   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1798   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1799 
1800   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1801   coi   = c_oth->i; coj = c_oth->j;
1802   con   = ptap->C_oth->rmap->n;
1803   proc  = 0;
1804   for (i=0; i<con; i++) {
1805     while (prmap[i] >= owners[proc+1]) proc++;
1806     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1807     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1808   }
1809 
1810   len          = 0; /* max length of buf_si[], see (4) */
1811   owners_co[0] = 0;
1812   nsend        = 0;
1813   for (proc=0; proc<size; proc++) {
1814     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1815     if (len_s[proc]) {
1816       nsend++;
1817       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1818       len         += len_si[proc];
1819     }
1820   }
1821 
1822   /* determine the number and length of messages to receive for coi and coj  */
1823   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
1824   ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);
1825 
1826   /* post the Irecv and Isend of coj */
1827   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1828   ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1829   ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
1830   for (proc=0, k=0; proc<size; proc++) {
1831     if (!len_s[proc]) continue;
1832     i    = owners_co[proc];
1833     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1834     k++;
1835   }
1836 
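  /* The coj messages are now in flight; the local symbolic product below is computed while they travel,
   * and the receives are completed only afterwards. */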
1837   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1838   /* ---------------------------------------- */
1839   ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
1840   ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
1841   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
1842   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1843 
1844   /* receives coj are complete */
1845   for (i=0; i<nrecv; i++) {
1846     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1847   }
1848   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1849   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
1850 
1851   /* add received column indices into ta to update Crmax */
1852   for (k=0; k<nrecv; k++) {/* k-th received message */
1853     Jptr = buf_rj[k];
1854     for (j=0; j<len_r[k]; j++) {
1855       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1856     }
1857   }
1858   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
1859   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1860 
1861   /* (4) send and recv coi */
1862   /*-----------------------*/
1863   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1864   ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1865   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1866   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1867   for (proc=0,k=0; proc<size; proc++) {
1868     if (!len_s[proc]) continue;
1869     /* form outgoing message for i-structure:
1870          buf_si[0]:                 nrows to be sent
1871                [1:nrows]:           row index (global)
1872                [nrows+1:2*nrows+1]: i-structure index
1873     */
1874     /*-------------------------------------------*/
1875     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1876     buf_si_i    = buf_si + nrows+1;
1877     buf_si[0]   = nrows;
1878     buf_si_i[0] = 0;
1879     nrows       = 0;
1880     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1881       nzi = coi[i+1] - coi[i];
1882       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1883       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1884       nrows++;
1885     }
1886     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1887     k++;
1888     buf_si += len_si[proc];
1889   }
1890   for (i=0; i<nrecv; i++) {
1891     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1892   }
1893   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1894   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
1895 
1896   ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
1897   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1898   ierr = PetscFree(swaits);CHKERRQ(ierr);
1899   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1900 
1901   /* (5) compute the local portion of Cmpi      */
1902   /* ------------------------------------------ */
1903   /* set initial free space to Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1904   ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
1905   current_space = free_space;
1906 
1907   ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
1908   for (k=0; k<nrecv; k++) {
1909     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1910     nrows       = *buf_ri_k[k];
1911     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1912     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1913   }
1914 
1915   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
1916   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
1917   for (i=0; i<pn; i++) {
1918     /* add C_loc into Cmpi */
1919     nzi  = c_loc->i[i+1] - c_loc->i[i];
1920     Jptr = c_loc->j + c_loc->i[i];
1921     ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1922 
1923     /* add received col data into lnk */
1924     for (k=0; k<nrecv; k++) { /* k-th received message */
1925       if (i == *nextrow[k]) { /* i-th row */
1926         nzi  = *(nextci[k]+1) - *nextci[k];
1927         Jptr = buf_rj[k] + *nextci[k];
1928         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1929         nextrow[k]++; nextci[k]++;
1930       }
1931     }
1932     nzi = lnk[0];
1933 
1934     /* copy data into free space, then initialize lnk */
1935     ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1936     ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
1937   }
1938   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1939   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1940   ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
1941 
1942   /* local sizes and preallocation */
1943   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1944   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
1945   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1946   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1947 
1948   /* members in merge */
1949   ierr = PetscFree(id_r);CHKERRQ(ierr);
1950   ierr = PetscFree(len_r);CHKERRQ(ierr);
1951   ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
1952   ierr = PetscFree(buf_ri);CHKERRQ(ierr);
1953   ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
1954   ierr = PetscFree(buf_rj);CHKERRQ(ierr);
1955   ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);
1956 
1957   /* attach the supporting struct to Cmpi for reuse */
1958   c = (Mat_MPIAIJ*)Cmpi->data;
1959   c->ap           = ptap;
1960   ptap->duplicate = Cmpi->ops->duplicate;
1961   ptap->destroy   = Cmpi->ops->destroy;
1962   ptap->view      = Cmpi->ops->view;
1963   ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
1964 
1965   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1966   Cmpi->assembled        = PETSC_FALSE;
1967   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1968   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1969   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1970   *C                     = Cmpi;
1971   PetscFunctionReturn(0);
1972 }
1973 
1974 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1975 {
1976   PetscErrorCode    ierr;
1977   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1978   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1979   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1980   Mat_APMPI         *ptap = c->ap;
1981   Mat               AP_loc,C_loc,C_oth;
1982   PetscInt          i,rstart,rend,cm,ncols,row;
1983   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
1984   PetscScalar       *apa;
1985   const PetscInt    *cols;
1986   const PetscScalar *vals;
1987 
1988   PetscFunctionBegin;
1989   if (!ptap) {
1990     MPI_Comm comm;
1991     ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1992     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1993   }
1994 
1995   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1996   /* 1) get R = Pd^T,Ro = Po^T */
1997   if (ptap->reuse == MAT_REUSE_MATRIX) {
1998     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
1999     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
2000   }
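  /* (for MAT_INITIAL_MATRIX, Rd and Ro were already built in MatPtAPSymbolic_MPIAIJ_MPIAIJ()) */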
2001 
2002   /* 2) get AP_loc */
2003   AP_loc = ptap->AP_loc;
2004   ap = (Mat_SeqAIJ*)AP_loc->data;
2005 
2006   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
2007   /*-----------------------------------------------------*/
2008   if (ptap->reuse == MAT_REUSE_MATRIX) {
2009     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
2010     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
2011     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
2012   }
2013 
2014   /* 2-2) compute numeric A_loc*P - dominating part */
2015   /* ---------------------------------------------- */
2016   /* get data from symbolic products */
2017   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
2018   if (ptap->P_oth) {
2019     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
2020   }
2021   apa   = ptap->apa;
2022   api   = ap->i;
2023   apj   = ap->j;
2024   for (i=0; i<am; i++) {
2025     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
2026     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2027     apnz = api[i+1] - api[i];
2028     for (j=0; j<apnz; j++) {
2029       col = apj[j+api[i]];
2030       ap->a[j+ap->i[i]] = apa[col];
2031       apa[col] = 0.0;
2032     }
2033     ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
2034   }
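  /* apa is a dense accumulator of length pN (allocated in the symbolic phase); resetting only the entries
   * just copied keeps it zeroed for the next row without a full PetscMemzero() */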
2035 
2036   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2037   ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
2038   ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
2039   C_loc = ptap->C_loc;
2040   C_oth = ptap->C_oth;
2041 
2042   /* add C_loc and C_oth to C */
2043   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
2044 
2045   /* C_loc -> C */
2046   cm    = C_loc->rmap->N;
2047   c_seq = (Mat_SeqAIJ*)C_loc->data;
2048   cols = c_seq->j;
2049   vals = c_seq->a;
2050 
2051 
2052   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat() can only be used when C->was_assembled is PETSC_FALSE */
2053   /* and there are no off-processor parts. */
2054   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat() */
2055   /* no longer holds, and the more general MatSetValues_MPIAIJ() must be used instead; it looks the column index up in */
2056   /* a table and performs additional bookkeeping. */
2057   if (C->assembled) {
2058     C->was_assembled = PETSC_TRUE;
2059     C->assembled     = PETSC_FALSE;
2060   }
2061   if (C->was_assembled) {
2062     for (i=0; i<cm; i++) {
2063       ncols = c_seq->i[i+1] - c_seq->i[i];
2064       row = rstart + i;
2065       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
2066       cols += ncols; vals += ncols;
2067     }
2068   } else {
2069     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
2070   }
2071 
2072   /* Co -> C, off-processor part */
2073   cm = C_oth->rmap->N;
2074   c_seq = (Mat_SeqAIJ*)C_oth->data;
2075   cols = c_seq->j;
2076   vals = c_seq->a;
2077   for (i=0; i<cm; i++) {
2078     ncols = c_seq->i[i+1] - c_seq->i[i];
2079     row = p->garray[i];
2080     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
2081     cols += ncols; vals += ncols;
2082   }
2083 
2084   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2085   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2086 
2087   ptap->reuse = MAT_REUSE_MATRIX;
2088 
2089   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be recomputed from A and P */
2090   if (ptap->freestruct) {
2091     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
2092   }
2093   PetscFunctionReturn(0);
2094 }
2095