/* petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 3d6a3516742e32e485e84f6dcdca1b62f17153b7) */

/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmap.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

PETSC_HASH_MAP(HMapIV, PetscInt, PetscScalar, PetscHashInt, PetscHashEqual, -1)

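/* PETSC_HASH_MAP() instantiates a khash-based hash map type PetscHMapIV mapping
   PetscInt keys to PetscScalar values (the trailing -1 is the default value for
   missing keys), together with the PetscHMapIVCreate()/Clear()/Resize()/GetPairs()/
   Destroy() interface used by the all-at-once algorithms below. */
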
PETSC_STATIC_INLINE
PetscErrorCode PetscHMapIVAdd(PetscHMapIV ht,PetscInt key,PetscScalar val)
{
  int      ret;
  khiter_t iter;
  PetscFunctionBeginHot;
  PetscValidPointer(ht,1);
  iter = kh_put(HMapIV,ht,key,&ret);
  PetscHashAssert(ret>=0);
  if (ret) kh_val(ht,iter) = val;
  else     kh_val(ht,iter) += val;
  PetscFunctionReturn(0);
}
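
/* PetscHMapIVAdd() is an insert-or-accumulate primitive: kh_put() reports through
   ret whether the key was absent (assign) or already present (add into it).  A
   minimal usage sketch, with made-up key/value data:

     PetscHMapIV ht;
     ierr = PetscHMapIVCreate(&ht);CHKERRQ(ierr);
     ierr = PetscHMapIVAdd(ht,7,2.0);CHKERRQ(ierr);    key 7 new:      ht[7] = 2.0
     ierr = PetscHMapIVAdd(ht,7,3.0);CHKERRQ(ierr);    key 7 existing: ht[7] = 5.0
     ierr = PetscHMapIVDestroy(&ht);CHKERRQ(ierr);

   This is exactly what is needed to accumulate C(i,j) += A(i,k)*P(k,j)
   contributions for a sparse row whose column pattern is not known in advance. */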

PETSC_STATIC_INLINE PETSC_UNUSED
PetscErrorCode PetscHMapIVAddAndScale(PetscHMapIV ht,PetscHMapIV hd,PetscScalar scale)
{
  int         ret;
  PetscInt    key;
  PetscScalar val;
  PetscFunctionBegin;
  PetscValidPointer(ht,1);
  PetscValidPointer(hd,2);
  kh_foreach(hd,key,val,{ khiter_t i;
      i = kh_put(HMapIV,ht,key,&ret);
      PetscHashAssert(ret>=0);
      if (ret) kh_val(ht,i) = val*scale;
      else     kh_val(ht,i) += val*scale;})
  PetscFunctionReturn(0);
}
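
/* PetscHMapIVAddAndScale() merges one hash map into another as ht += scale*hd, a
   hashed AXPY over sparse rows; it is marked PETSC_UNUSED and appears to be kept
   for variants of the all-at-once kernels that scale a whole row of AP at once. */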

/* #define PTAP_PROFILE */

PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI         *ptap=a->ap;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  if (!ptap) {
    /* hack: MatDuplicate() copies oldmat->ops->view to newmat, but newmat is a base mat class with a NULL ptap! */
    A->ops->view = MatView_MPIAIJ;
    ierr = (A->ops->view)(A,viewer);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 1) {
        ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 2) {
        ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      } else if (ptap->algType == 3) {
        ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
      }
    }
  }
  ierr = (ptap->view)(A,viewer);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI           *ptap=a->ap;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  if (!ptap) PetscFunctionReturn(0);

  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
  ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
    ierr = PetscFree(ap->i);CHKERRQ(ierr);
    ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
  } else { /* used by alg_ptap */
    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  }
  ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
  if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);

  merge=ptap->merge;
  if (merge) { /* used by alg_ptap */
    ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->bi);CHKERRQ(ierr);
    ierr = PetscFree(merge->bj);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
    ierr = PetscFree(merge->coi);CHKERRQ(ierr);
    ierr = PetscFree(merge->coj);CHKERRQ(ierr);
    ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
    ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
    ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);

  ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
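
/* Note: the struct freed above caches everything needed to recompute C = P^T*A*P
   cheaply when the values of A or P change.  Freeing it (explicitly through
   MatFreeIntermediateDataStructures(), or automatically through
   -mat_freeintermediatedatastructures) roughly halves the memory footprint, but a
   later MatPtAP() call with MAT_REUSE_MATRIX will then error out, as checked at
   the top of the numeric routines below. */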

PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_APMPI      *ptap=a->ap;

  PetscFunctionBegin;
  ierr = (*A->ops->freeintermediatedatastructures)(A);CHKERRQ(ierr);
  ierr = (*ptap->destroy)(A);CHKERRQ(ierr); /* MatDestroy_MPIAIJ(A) */
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      flg;
  MPI_Comm       comm;
#if !defined(PETSC_HAVE_HYPRE)
  const char          *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"};
  PetscInt            nalg=4;
#else
  const char          *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"};
  PetscInt            nalg=5;
#endif
  PetscInt            pN=P->cmap->N,alg=1; /* set default algorithm */

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

  if (scall == MAT_INITIAL_MATRIX) {
    /* pick an algorithm */
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    if (!flg && pN > 100000) { /* may switch to the scalable algorithm as default */
      MatInfo     Ainfo,Pinfo;
      PetscInt    nz_local;
      PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;

      ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr);
      ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr);
      nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

      if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
      ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr);

      if (alg_scalable) {
        alg = 0; /* the scalable algorithm may be ~50% slower than the nonscalable one */
      }
    }

    switch (alg) {
    case 1:
      /* do R=P^T locally, then C=R*A*P -- nonscalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    case 2:
      /* compute C=P^T*A*P all at once */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    case 3:
      /* compute C=P^T*A*P all at once, merged */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#if defined(PETSC_HAVE_HYPRE)
    case 4:
      /* use BoomerAMGBuildCoarseOperator */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
#endif
    default:
      /* do R=P^T locally, then C=R*A*P -- scalable */
      ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
      break;
    }

    if (alg == 0 || alg == 1 || alg == 2 || alg == 3) {
      Mat_MPIAIJ *c  = (Mat_MPIAIJ*)(*C)->data;
      Mat_APMPI  *ap = c->ap;
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");CHKERRQ(ierr);
      ap->freestruct = PETSC_FALSE;
      ierr = PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures","MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
  }

  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
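
/* A typical calling sequence for the driver above (a sketch; A and P are assembled
   MPIAIJ matrices and the fill estimate 2.0 is an arbitrary example value):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);   symbolic + numeric
     ... change values in A or P, keeping the same nonzero patterns ...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);     numeric only

   The algorithm is chosen at the MAT_INITIAL_MATRIX call, e.g. with the runtime
   option -matptap_via scalable|nonscalable|allatonce|allatonce_merged. */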

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
  Mat_APMPI         *ptap = c->ap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  if (!ptap) {
    MPI_Comm comm;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
    SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
  }

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

  api   = ap->i;
  apj   = ap->j;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    apnz = api[i+1] - api[i];
    apa = ap->a + api[i];
    ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr);
    AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
  if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* always use the scalable version since this is the MPI scalable algorithm */
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled
     is PETSC_FALSE and there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
     in MatSetValues_MPIAIJ_CopyFromCSRFormat is no longer valid. Then the more general function
     MatSetValues_MPIAIJ() has to be used, where the column index is looked up from a table and
     other, more complex, bookkeeping is done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row = rstart + i;
      ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);

  /* C_oth -> C, off-processor part */
  cm = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols = c_seq->j;
  vals = c_seq->a;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row = p->garray[i];
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
  if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated from A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
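
/* On reuse (ptap->reuse == MAT_REUSE_MATRIX) the routine above redoes only numeric
   work: Rd/Ro are re-transposed in place, P_oth/P_loc are re-fetched, AP_loc is
   recomputed row by row with AProw_scalable(), and Rd*AP_loc and Ro*AP_loc are
   re-evaluated into the sparsity patterns built once by
   MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable() below. */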

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  Mat                 Cmpi,P_loc,P_oth;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* arrays of length comm->size, storing send/recv matrix values */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
  MatType             mtype;
  const char          *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* create struct Mat_APMPI and attach it to C later */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy the list of free space and other temporary arrays */
  ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send and their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of the k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ap           = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->view      = Cmpi->ops->view;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
  Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
  *C                     = Cmpi;

  /* map the column indices of C_oth and C_loc back to local numbering for reuse in the numeric phase */
  nout = 0;
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
  if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
  if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);

  PetscFunctionReturn(0);
}
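
/* Worked example of the i-structure message format used in step (4) above, with
   hypothetical data: suppose two rows of Co, local rows 3 and 8 with 4 and 2
   nonzeros respectively, go to one destination rank.  Then len_si = 2*(2+1) = 6 and

     buf_si = [ 2 | 3 8 | 0 4 6 ]
               nrows  row indices  i-structure (running sums of the row lengths)

   The matching coj message from step (3) carries the 4+2 = 6 column indices in the
   same order, so the receiver can reconstruct both rows. */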

PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHSetI dht,PetscHSetI oht)
{
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
  PetscInt             pcstart,pcend,column;
  PetscErrorCode       ierr;

  PetscFunctionBegin;

  pcstart = P->cmap->rstart;
  pcend   = P->cmap->rend;
  /* diagonal portion: Ad[i,:]*P */
  ai = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  for (j=0; j<nzi; j++) {
    row  = aj[j];
    nzpi = pd->i[row+1] - pd->i[row];
    pj  = pd->j + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(dht,pj[k]+pcstart);CHKERRQ(ierr);
    }
  }
  for (j=0; j<nzi; j++) {
    row  = aj[j];
    nzpi = po->i[row+1] - po->i[row];
    pj  = po->j + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHSetIAdd(oht,p->garray[pj[k]]);CHKERRQ(ierr);
    }
  }

  /* off-diagonal part: Ao[i,:]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col=0; col<pnz; col++) {
        column = p_othcols[col];
        if (column>=pcstart && column<pcend) {
          ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
        } else {
          ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}
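
/* The symbolic kernel above computes the sparsity pattern of one row of AP: the
   pattern of AP(i,:) is the union of the patterns of P(k,:) over all nonzero
   columns k of A(i,:), split into the diagonal (dht) and off-diagonal (oht) hash
   sets according to the owned column range [pcstart,pcend) of P. */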

PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi;
  PetscScalar          ra,*aa,*pa;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];

  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    pa = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAdd(hmap,pj[k]+pcstart,ra*pa[k]);CHKERRQ(ierr);
    }
  }
  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    nzpi = po->i[row+1] - po->i[row];
    pj = po->j + po->i[row];
    pa = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAdd(hmap,p->garray[pj[k]],ra*pa[k]);CHKERRQ(ierr);
    }
  }

  /* off-diagonal part: Ao[i,:]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      ra   = aa[j];
      pnz  = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa   = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        ierr = PetscHMapIVAdd(hmap,p_othcols[col],ra*pa[col]);CHKERRQ(ierr);
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}
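
/* The numeric kernel above is the matching sparse row product

     AP(i,:) = sum_k A(i,k) * P(k,:)

   with every contribution funneled through PetscHMapIVAdd(), so repeated column
   indices arising from different k are summed automatically. */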

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *cd,*co,*po,*pd;
  Mat_APMPI         *ptap = c->ap;
  PetscHMapIV       hmap;
  PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
  PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  MPI_Comm          comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  }

  po = (Mat_SeqAIJ*) p->B->data;
  pd = (Mat_SeqAIJ*) p->A->data;

  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && pon; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    nzi = po->i[i+1] - po->i[i];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;
    /*ierr = PetscMemzero(c_rmtc,sizeof(PetscInt)*pon);CHKERRQ(ierr);*/

    /* Form C(ii, :) */
    poj = po->j + po->i[i];
    poa = po->a + po->i[i];
    for (j=0; j<nzi; j++) {
      c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]];
      c_rmtaa = c_rmta + ptap->c_rmti[poj[j]];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* If the row is empty */
        if (!c_rmtc[poj[j]]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[poj[j]]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[poj[j]]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[poj[j]]++;
          }
        }
      } /* End jj */
    } /* End j */
  } /* End i */

  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr); /* destroy before the second PetscHMapIVCreate() below, otherwise the first map leaks */

  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;

  cmaxr = 0;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && pn; i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    nzi = pd->i[i+1] - pd->i[i];
    if (!nzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[i];
    pda = pd->a + pd->i[i];
    for (j=0; j<nzi; j++) {
      row = pcstart + pdj[j];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }
  }

  ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
  ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_SUM);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated from A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
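
/* Note the communication/computation overlap in the routine above: the
   PetscSFReduceBegin()/PetscSFReduceEnd() pairs are split around the local
   assembly loop, so the remote rows (c_rmtj/c_rmta) are in flight while the local
   part of C is computed.  The SF graph is one-to-one (each root slot receives
   exactly one leaf, see the symbolic routine below), so MPIU_REPLACE delivers each
   remote column index and value intact. */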

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *cd,*co,*po,*pd;
  Mat_APMPI         *ptap = c->ap;
  PetscHMapIV       hmap;
  PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  MPI_Comm          comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (!ptap) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  }

  po = (Mat_SeqAIJ*) p->B->data;
  pd = (Mat_SeqAIJ*) p->A->data;

  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  for (i=0; i<am && (pon || pn); i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    nzi  = po->i[i+1] - po->i[i];
    dnzi = pd->i[i+1] - pd->i[i];
    if (!nzi && !dnzi) continue;
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[i];
    poa = po->a + po->i[i];
    for (j=0; j<nzi; j++) {
      c_rmtjj = c_rmtj + ptap->c_rmti[poj[j]];
      c_rmtaa = c_rmta + ptap->c_rmti[poj[j]];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* If the row is empty */
        if (!c_rmtc[poj[j]]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[poj[j]]++;
        } else {
          ierr = PetscFindInt(apindices[jj],c_rmtc[poj[j]],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[poj[j]]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[poj[j]]++;
          }
        }
      } /* End jj */
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[i];
    pda = pd->a + pd->i[i];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }/* End jj */
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }/* End j */
  } /* End i */

  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_SUM);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;

  /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated from A and P */
  if (ptap->freestruct) {
    ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1125 
1126 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C)
1127 {
1128   Mat_APMPI           *ptap;
1129   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
1130   MPI_Comm            comm;
1131   Mat                 Cmpi;
1132   Mat_SeqAIJ          *pd,*po;
1133   MatType             mtype;
1134   PetscSF             sf;
1135   PetscSFNode         *iremote;
1136   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1137   const PetscInt      *rootdegrees;
1138   PetscHSetI          ht,oht,*hta,*hto;
1139   PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1140   PetscInt            owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1141   PetscInt            nalg=2,alg=0;
1142   PetscBool           flg;
1143   const char          *algTypes[2] = {"overlapping","merged"};
1144   PetscErrorCode      ierr;
1145 
1146   PetscFunctionBegin;
1147   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1148 
1149   /* Create symbolic parallel matrix Cmpi */
1150   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1151   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1152   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1153   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1154   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1155 
1156   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
1157   ptap->reuse = MAT_INITIAL_MATRIX;
1158   ptap->algType = 2;
1159 
1160   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1161   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1162 
1163   po = (Mat_SeqAIJ*)p->B->data;
1164   pd = (Mat_SeqAIJ*)p->A->data;
1165 
1166   /* This equals to the number of offdiag columns in P */
1167   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1168   /* offsets */
1169   ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
1170   /* The number of columns we will send to remote ranks */
1171   ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
1172   ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
1173   for (i=0; i<pon; i++) {
1174     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
1175   }
1176   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
1177   ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
1178   /* Create hash table to merge all columns for C(i, :) */
1179   ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
1180 
1181   ptap->c_rmti[0] = 0;
1182   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1183   for (i=0; i<am && pon; i++) {
1184     /* Form one row of AP */
1185     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1186     /* If the off diag is empty, we should not do any calculation */
1187     nzi = po->i[i+1] - po->i[i];
1188     if (!nzi) continue;
1189 
1190     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,ht);CHKERRQ(ierr);
1191     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
1192     /* If AP is empty, just continue */
1193     if (!htsize) continue;
1194     /* Form C(ii, :) */
1195     poj = po->j + po->i[i];
1196     for (j=0; j<nzi; j++) {
1197       ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr);
1198     }
1199   }
1200 
1201   for (i=0; i<pon; i++) {
1202     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
1203     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1204     c_rmtc[i] = htsize;
1205   }
1206 
1207   ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);
1208 
1209   for (i=0; i<pon; i++) {
1210     off = 0;
1211     ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
1212     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
1213   }
1214   ierr = PetscFree(hta);CHKERRQ(ierr);
1215 
1216   ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr);
1217   for (i=0; i<pon; i++) {
1218     owner = 0; lidx = 0;
1219     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1220     iremote[i].index = lidx;
1221     iremote[i].rank  = owner;
1222   }
1223 
1224   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
1225   ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1226   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1227   ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
1228   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
1229   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1230   /* How many neighbors have contributions to my rows? */
1231   ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
1232   ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
1233   rootspacesize = 0;
1234   for (i = 0; i < pn; i++) {
1235     rootspacesize += rootdegrees[i];
1236   }
1237   ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
1238   ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
1239   /* Get information from leaves
1240    * Number of columns other people contribute to my rows
1241    * */
1242   ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1243   ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1244   ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
1245   ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
1246   /* The number of columns is received for each row */
1247   ptap->c_othi[0] = 0;
1248   rootspacesize = 0;
1249   rootspaceoffsets[0] = 0;
1250   for (i = 0; i < pn; i++) {
1251     rcvncols = 0;
1252     for (j = 0; j<rootdegrees[i]; j++) {
1253       rcvncols += rootspace[rootspacesize];
1254       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1255       rootspacesize++;
1256     }
1257     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1258   }
1259   ierr = PetscFree(rootspace);CHKERRQ(ierr);
1260 
1261   ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
1262   ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1263   ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1264   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
1265   ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);
1266 
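  /* Expand the graph to one leaf per column index to be sent: entry j of
   * off-diagonal column i lands at offset c_rmtoffsets[i]+j of the owner's
   * receive buffer c_othj */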
1267   ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
1268   nleaves = 0;
1269   for (i = 0; i<pon; i++) {
1270     owner = 0; lidx = 0;
1271     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1272     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1273     for (j=0; j<sendncols; j++) {
1274       iremote[nleaves].rank = owner;
1275       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1276     }
1277   }
1278   ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
1279   ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);
1280 
1281   ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
1282   ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1283   ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
1284   /* One-to-one map */
1285   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
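  /* Note that the matching PetscSFReduceEnd() is posted only after the local pass
   * below, so this communication overlaps the local symbolic computation */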
1286 
1287   ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
1288   ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
1289   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1290   ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
1291   for (i=0; i<pn; i++) {
1292     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
1293     ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
1294   }
1295   /* Work on local part */
1296   /* 4) Pass 1: Estimate memory for C_loc */
1297   for (i=0; i<am && pn; i++) {
1298     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1299     ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
1300     nzi = pd->i[i+1] - pd->i[i];
1301     if (!nzi) continue;
1302 
1303     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr);
1304     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
1305     ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
1306     if (!(htsize+htosize)) continue;
1307     /* Form C(ii, :) */
1308     pdj = pd->j + pd->i[i];
1309     for (j=0; j<nzi; j++) {
1310       ierr = PetscHSetIAppend(hta[pdj[j]],ht);CHKERRQ(ierr);
1311       ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr);
1312     }
1313   }
1314 
1315   ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
1316   ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);
1317 
1318   /* Get remote data */
1319   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
1320   ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
1321 
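  /* Merge the received remote column indices into the per-row diagonal (hta) and
   * off-diagonal (hto) sets, then record dnz/onz for the preallocation */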
1322   for (i = 0; i < pn; i++) {
1323     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1324     rdj = c_othj + ptap->c_othi[i];
1325     for (j = 0; j < nzi; j++) {
1326       col =  rdj[j];
1327       /* diag part */
1328       if (col>=pcstart && col<pcend) {
1329         ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
1330       } else { /* off diag */
1331         ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
1332       }
1333     }
1334     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
1335     dnz[i] = htsize;
1336     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
1337     ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
1338     onz[i] = htsize;
1339     ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
1340   }
1341 
1342   ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
1343   ierr = PetscFree(c_othj);CHKERRQ(ierr);
1344 
1345   /* local sizes and preallocation */
1346   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1347   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
1348   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1349   ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);
1350 
1351   /* attach the supporting struct to Cmpi for reuse */
1352   c = (Mat_MPIAIJ*)Cmpi->data;
1353   c->ap           = ptap;
1354   ptap->duplicate = Cmpi->ops->duplicate;
1355   ptap->destroy   = Cmpi->ops->destroy;
1356   ptap->view      = Cmpi->ops->view;
1357 
1358   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1359   Cmpi->assembled        = PETSC_FALSE;
1360   /* pick an algorithm */
1361   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1362   alg = 0;
1363   ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1364   ierr = PetscOptionsEnd();CHKERRQ(ierr);
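  /* Illustrative usage (assuming the standard PETSc options mechanism): this code
   * path and its numeric kernel are typically selected at run time with options
   * such as
   *   -matptap_via allatonce -matptap_allatonce_via <type>
   * where <type> is one of the entries of algTypes above */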
1365   switch (alg) {
1366     case 0:
1367       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1368       break;
1369     case 1:
1370       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1371       break;
1372     default:
1373       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1374   }
1375   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1376   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1377   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1378   *C                     = Cmpi;
1379   PetscFunctionReturn(0);
1380 }
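/* Illustrative caller-side sketch (hypothetical user code, not part of this file):
 *   Mat C;
 *   ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);  first build: symbolic + numeric
 *   ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    later rebuilds reuse the attached ptap data
 * The symbolic routines in this file are reached through the MatPtAP() dispatch for MPIAIJ matrices. */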
1381 
1382 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
1383 {
1384   Mat_APMPI           *ptap;
1385   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
1386   MPI_Comm            comm;
1387   Mat                 Cmpi;
1388   Mat_SeqAIJ          *pd,*po;
1389   MatType             mtype;
1390   PetscSF             sf;
1391   PetscSFNode         *iremote;
1392   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1393   const PetscInt      *rootdegrees;
1394   PetscHSetI          ht,oht,*hta,*hto,*htd;
1395   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1396   PetscInt            owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1397   PetscInt            nalg=2,alg=0;
1398   PetscBool           flg;
1399   const char          *algTypes[2] = {"merged","overlapping"};
1400   PetscErrorCode      ierr;
1401 
1402   PetscFunctionBegin;
1403   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1404 
1405   /* Create symbolic parallel matrix Cmpi */
1406   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1407   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1408   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1409   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1410   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1411 
1412   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
1413   ptap->reuse = MAT_INITIAL_MATRIX;
1414   ptap->algType = 3;
1415 
1416   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1417   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1418 
1419   po = (Mat_SeqAIJ*)p->B->data;
1420   pd = (Mat_SeqAIJ*)p->A->data;
1421 
1422   /* This equals the number of off-diagonal columns of P */
1423   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1424   /* offsets */
1425   ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
1426   /* The number of columns we will send to remote ranks */
1427   ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
1428   ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
1429   for (i=0; i<pon; i++) {
1430     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
1431   }
1432   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
1433   ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
1434   /* Create hash sets used to merge all columns of C(i, :) */
1435   ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
1436   ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
1437   ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
1438   for (i=0; i<pn; i++) {
1439     ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
1440     ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
1441   }
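  /* ht/oht hold the diagonal- and off-diagonal-range columns of one row of AP;
   * htd[]/hto[] accumulate, per local row of C, the diagonal-block and
   * off-diagonal-block column indices respectively */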
1442   ptap->c_rmti[0] = 0;
1443   /* 2) Pass 1: calculate the size of C_rmt (the matrix that needs to be sent to other processors) */
1444   for (i=0; i<am && (pon || pn); i++) {
1445     /* Form one row of AP */
1446     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1447     ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
1448     /* If both the diagonal and off-diagonal parts of this row are empty, skip it */
1449     nzi = po->i[i+1] - po->i[i];
1450     dnzi = pd->i[i+1] - pd->i[i];
1451     if (!nzi && !dnzi) continue;
1452 
1453     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,i,ht,oht);CHKERRQ(ierr);
1454     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
1455     ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
1456     /* If AP is empty, just continue */
1457     if (!(htsize+htosize)) continue;
1458 
1459     /* Form remote C(ii, :) */
1460     poj = po->j + po->i[i];
1461     for (j=0; j<nzi; j++) {
1462       ierr = PetscHSetIAppend(hta[poj[j]],ht);CHKERRQ(ierr);
1463       ierr = PetscHSetIAppend(hta[poj[j]],oht);CHKERRQ(ierr);
1464     }
1465 
1466     /* Form local C(ii, :) */
1467     pdj = pd->j + pd->i[i];
1468     for (j=0; j<dnzi; j++) {
1469       ierr = PetscHSetIAppend(htd[pdj[j]],ht);CHKERRQ(ierr);
1470       ierr = PetscHSetIAppend(hto[pdj[j]],oht);CHKERRQ(ierr);
1471     }
1472   }
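  /* Note the asymmetry above: rows destined for remote ranks (hta) keep diagonal and
   * off-diagonal columns together, since the receiving rank re-splits them against
   * its own column ownership range below, while local rows are split immediately */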
1473 
1474   ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
1475   ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);
1476 
1477   for (i=0; i<pon; i++) {
1478     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
1479     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1480     c_rmtc[i] = htsize;
1481   }
1482 
1483   ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);
1484 
1485   for (i=0; i<pon; i++) {
1486     off = 0;
1487     ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
1488     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
1489   }
1490   ierr = PetscFree(hta);CHKERRQ(ierr);
1491 
1492   ierr = PetscCalloc1(pon,&iremote);CHKERRQ(ierr);
1493   for (i=0; i<pon; i++) {
1494     owner = 0; lidx = 0;
1495     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1496     iremote[i].index = lidx;
1497     iremote[i].rank  = owner;
1498   }
1499 
1500   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
1501   ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1502   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1503   ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
1504   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
1505   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1506   /* How many neighbors have contributions to my rows? */
1507   ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
1508   ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
1509   rootspacesize = 0;
1510   for (i = 0; i < pn; i++) {
1511     rootspacesize += rootdegrees[i];
1512   }
1513   ierr = PetscCalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
1514   ierr = PetscCalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
1515   /* Get information from the leaves:
1516    * the number of columns other ranks contribute to my rows
1517    */
1518   ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1519   ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1520   ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
1521   ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
1522   /* Count the number of columns received for each row */
1523   ptap->c_othi[0] = 0;
1524   rootspacesize = 0;
1525   rootspaceoffsets[0] = 0;
1526   for (i = 0; i < pn; i++) {
1527     rcvncols = 0;
1528     for (j = 0; j<rootdegrees[i]; j++) {
1529       rcvncols += rootspace[rootspacesize];
1530       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1531       rootspacesize++;
1532     }
1533     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1534   }
1535   ierr = PetscFree(rootspace);CHKERRQ(ierr);
1536 
1537   ierr = PetscCalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
1538   ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1539   ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1540   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
1541   ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);
1542 
1543   ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
1544   nleaves = 0;
1545   for (i = 0; i<pon; i++) {
1546     owner = 0; lidx = 0;
1547     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[i],&owner,&lidx);CHKERRQ(ierr);
1548     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1549     for (j=0; j<sendncols; j++) {
1550       iremote[nleaves].rank = owner;
1551       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1552     }
1553   }
1554   ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
1555   ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);
1556 
1557   ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
1558   ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1559   ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
1560   /* One-to-one map */
1561   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
1562   /* Get remote data */
1563   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);CHKERRQ(ierr);
1564   ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
1565   ierr = PetscCalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
1566   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1567 
1568   for (i = 0; i < pn; i++) {
1569     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1570     rdj = c_othj + ptap->c_othi[i];
1571     for (j = 0; j < nzi; j++) {
1572       col =  rdj[j];
1573       /* diag part */
1574       if (col>=pcstart && col<pcend) {
1575         ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
1576       } else { /* off diag */
1577         ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
1578       }
1579     }
1580     ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
1581     dnz[i] = htsize;
1582     ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
1583     ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
1584     onz[i] = htsize;
1585     ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
1586   }
1587 
1588   ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
1589   ierr = PetscFree(c_othj);CHKERRQ(ierr);
1590 
1591   /* local sizes and preallocation */
1592   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1593   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
1594   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1595   ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);
1596 
1597   /* attach the supporting struct to Cmpi for reuse */
1598   c = (Mat_MPIAIJ*)Cmpi->data;
1599   c->ap           = ptap;
1600   ptap->duplicate = Cmpi->ops->duplicate;
1601   ptap->destroy   = Cmpi->ops->destroy;
1602   ptap->view      = Cmpi->ops->view;
1603 
1604   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1605   Cmpi->assembled        = PETSC_FALSE;
1606   /* pick an algorithm */
1607   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1608   alg = 0;
1609   ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1610   ierr = PetscOptionsEnd();CHKERRQ(ierr);
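  /* Illustrative usage: -matptap_allatonce_via merged (the default, case 0) keeps
   * the merged numeric kernel, while -matptap_allatonce_via overlapping (case 1)
   * falls back to the plain allatonce kernel */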
1611   switch (alg) {
1612     case 0:
1613       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1614       break;
1615     case 1:
1616       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1617       break;
1618     default:
1619       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1620   }
1621   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1622   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1623   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1624   *C                     = Cmpi;
1625   PetscFunctionReturn(0);
1626 }
1627 
1628 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1629 {
1630   PetscErrorCode      ierr;
1631   Mat_APMPI           *ptap;
1632   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1633   MPI_Comm            comm;
1634   PetscMPIInt         size,rank;
1635   Mat                 Cmpi;
1636   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1637   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1638   PetscInt            *lnk,i,k,pnz,row,nsend;
1639   PetscBT             lnkbt;
1640   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1641   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1642   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1643   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1644   MPI_Request         *swaits,*rwaits;
1645   MPI_Status          *sstatus,rstatus;
1646   PetscLayout         rowmap;
1647   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1648   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
1649   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1650   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1651   PetscScalar         *apv;
1652   PetscTable          ta;
1653   MatType             mtype;
1654   const char          *prefix;
1655 #if defined(PETSC_USE_INFO)
1656   PetscReal           apfill;
1657 #endif
1658 
1659   PetscFunctionBegin;
1660   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1661   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1662   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1663 
1664   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1665 
1666   /* create symbolic parallel matrix Cmpi */
1667   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1668   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1669   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1670 
1671   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1672   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1673 
1674   /* create struct Mat_APMPI and attach it to C later */
1675   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
1676   ptap->reuse = MAT_INITIAL_MATRIX;
1677   ptap->algType = 1;
1678 
1679   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1680   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1681   /* get P_loc by taking all local rows of P */
1682   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
1683 
1684   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1685   /* --------------------------------- */
1686   ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
1687   ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
1688 
1689   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1690   /* ----------------------------------------------------------------- */
1691   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1692   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1693 
1694   /* create and initialize a linked list */
1695   ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for computing AP_loc and Cmpi */
1696   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1697   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1698   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
1699   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1700 
1701   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
1702 
1703   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1704   if (ao) {
1705     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
1706   } else {
1707     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
1708   }
1709   current_space = free_space;
1710   nspacedouble  = 0;
1711 
1712   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
1713   api[0] = 0;
1714   for (i=0; i<am; i++) {
1715     /* diagonal portion: Ad[i,:]*P */
1716     ai = ad->i; pi = p_loc->i;
1717     nzi = ai[i+1] - ai[i];
1718     aj  = ad->j + ai[i];
1719     for (j=0; j<nzi; j++) {
1720       row  = aj[j];
1721       pnz  = pi[row+1] - pi[row];
1722       Jptr = p_loc->j + pi[row];
1723       /* add non-zero cols of P into the sorted linked list lnk */
1724       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1725     }
1726     /* off-diagonal portion: Ao[i,:]*P */
1727     if (ao) {
1728       ai = ao->i; pi = p_oth->i;
1729       nzi = ai[i+1] - ai[i];
1730       aj  = ao->j + ai[i];
1731       for (j=0; j<nzi; j++) {
1732         row  = aj[j];
1733         pnz  = pi[row+1] - pi[row];
1734         Jptr = p_oth->j + pi[row];
1735         ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1736       }
1737     }
1738     apnz     = lnk[0];
1739     api[i+1] = api[i] + apnz;
1740     if (ap_rmax < apnz) ap_rmax = apnz;
1741 
1742     /* if free space is not available, double the total space in the list */
1743     if (current_space->local_remaining<apnz) {
1744       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1745       nspacedouble++;
1746     }
1747 
1748     /* Copy data into free space, then initialize lnk */
1749     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1750 
1751     current_space->array           += apnz;
1752     current_space->local_used      += apnz;
1753     current_space->local_remaining -= apnz;
1754   }
1755   /* Allocate space for apj and apv, initialize apj, and */
1756   /* destroy list of free space and other temporary array(s) */
1757   ierr   = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
1758   ierr   = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
1759   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1760 
1761   /* Create AP_loc for reuse */
1762   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
1763 
1764 #if defined(PETSC_USE_INFO)
1765   if (ao) {
1766     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1767   } else {
1768     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1769   }
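  /* Illustrative numbers: if nnz(AP_loc) = api[am] = 1200 while nnz(A_loc)+nnz(P_loc)
   * is about 600, then apfill is roughly 2.0 and the PetscInfo message below suggests
   * rerunning with fill = 2.0 to avoid the reallocations counted in nspacedouble */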
1770   ptap->AP_loc->info.mallocs           = nspacedouble;
1771   ptap->AP_loc->info.fill_ratio_given  = fill;
1772   ptap->AP_loc->info.fill_ratio_needed = apfill;
1773 
1774   if (api[am]) {
1775     ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
1776     ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
1777   } else {
1778     ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");CHKERRQ(ierr);
1779   }
1780 #endif
1781 
1782   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1783   /* ------------------------------------ */
1784   ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
1785   ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
1786   ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
1787   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
1788 
1789   /* (3) send coj of C_oth to other processors  */
1790   /* ------------------------------------------ */
1791   /* determine row ownership */
1792   ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
1793   rowmap->n  = pn;
1794   rowmap->bs = 1;
1795   ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
1796   owners = rowmap->range;
1797 
1798   /* determine the number of messages to send, their lengths */
1799   ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
1800   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1801   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1802 
1803   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1804   coi   = c_oth->i; coj = c_oth->j;
1805   con   = ptap->C_oth->rmap->n;
1806   proc  = 0;
1807   for (i=0; i<con; i++) {
1808     while (prmap[i] >= owners[proc+1]) proc++;
1809     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1810     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1811   }
1812 
1813   len          = 0; /* max length of buf_si[], see (4) */
1814   owners_co[0] = 0;
1815   nsend        = 0;
1816   for (proc=0; proc<size; proc++) {
1817     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1818     if (len_s[proc]) {
1819       nsend++;
1820       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1821       len         += len_si[proc];
1822     }
1823   }
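  /* Each i-structure message to [proc] has length 2*(nrows+1): one row count, nrows
   * local row indices, and nrows+1 row offsets; see the layout sketch in step (4) below */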
1824 
1825   /* determine the number and length of messages to receive for coi and coj  */
1826   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
1827   ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);
1828 
1829   /* post the Irecv and Isend of coj */
1830   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1831   ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1832   ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
1833   for (proc=0, k=0; proc<size; proc++) {
1834     if (!len_s[proc]) continue;
1835     i    = owners_co[proc];
1836     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1837     k++;
1838   }
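  /* The coj sends are in flight while C_loc = Rd*AP_loc is computed below; the
   * MPI_Waitany (receives) and MPI_Waitall (sends) calls after step (2-2) complete them */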
1839 
1840   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1841   /* ---------------------------------------- */
1842   ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
1843   ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
1844   ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
1845   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1846 
1847   /* receives coj are complete */
1848   for (i=0; i<nrecv; i++) {
1849     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1850   }
1851   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1852   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
1853 
1854   /* add received column indices into ta to update Crmax */
1855   for (k=0; k<nrecv; k++) {/* k-th received message */
1856     Jptr = buf_rj[k];
1857     for (j=0; j<len_r[k]; j++) {
1858       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1859     }
1860   }
1861   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
1862   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1863 
1864   /* (4) send and recv coi */
1865   /*-----------------------*/
1866   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1867   ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1868   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1869   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1870   for (proc=0,k=0; proc<size; proc++) {
1871     if (!len_s[proc]) continue;
1872     /* form outgoing message for i-structure:
1873          buf_si[0]:                 nrows to be sent
1874                [1:nrows]:           row index (global)
1875                [nrows+1:2*nrows+1]: i-structure index
1876     */
1877     /*-------------------------------------------*/
1878     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1879     buf_si_i    = buf_si + nrows+1;
1880     buf_si[0]   = nrows;
1881     buf_si_i[0] = 0;
1882     nrows       = 0;
1883     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1884       nzi = coi[i+1] - coi[i];
1885       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1886       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1887       nrows++;
1888     }
1889     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1890     k++;
1891     buf_si += len_si[proc];
1892   }
1893   for (i=0; i<nrecv; i++) {
1894     ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1895   }
1896   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1897   if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}
1898 
1899   ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
1900   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1901   ierr = PetscFree(swaits);CHKERRQ(ierr);
1902   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1903 
1904   /* (5) compute the local portion of Cmpi      */
1905   /* ------------------------------------------ */
1906   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1907   ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
1908   current_space = free_space;
1909 
1910   ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
1911   for (k=0; k<nrecv; k++) {
1912     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1913     nrows       = *buf_ri_k[k];
1914     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1915     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1916   }
1917 
1918   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
1919   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
1920   for (i=0; i<pn; i++) {
1921     /* add C_loc into Cmpi */
1922     nzi  = c_loc->i[i+1] - c_loc->i[i];
1923     Jptr = c_loc->j + c_loc->i[i];
1924     ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1925 
1926     /* add received col data into lnk */
1927     for (k=0; k<nrecv; k++) { /* k-th received message */
1928       if (i == *nextrow[k]) { /* i-th row */
1929         nzi  = *(nextci[k]+1) - *nextci[k];
1930         Jptr = buf_rj[k] + *nextci[k];
1931         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1932         nextrow[k]++; nextci[k]++;
1933       }
1934     }
1935     nzi = lnk[0];
1936 
1937     /* copy data into free space, then initialize lnk */
1938     ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1939     ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
1940   }
1941   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1942   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1943   ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
1944 
1945   /* local sizes and preallocation */
1946   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1947   if (P->cmap->bs > 0) {
1948     ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
1949     ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
1950   }
1951   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1952   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1953 
1954   /* members in merge */
1955   ierr = PetscFree(id_r);CHKERRQ(ierr);
1956   ierr = PetscFree(len_r);CHKERRQ(ierr);
1957   ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
1958   ierr = PetscFree(buf_ri);CHKERRQ(ierr);
1959   ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
1960   ierr = PetscFree(buf_rj);CHKERRQ(ierr);
1961   ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);
1962 
1963   /* attach the supporting struct to Cmpi for reuse */
1964   c = (Mat_MPIAIJ*)Cmpi->data;
1965   c->ap           = ptap;
1966   ptap->duplicate = Cmpi->ops->duplicate;
1967   ptap->destroy   = Cmpi->ops->destroy;
1968   ptap->view      = Cmpi->ops->view;
1969   ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
1970 
1971   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1972   Cmpi->assembled        = PETSC_FALSE;
1973   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1974   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1975   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1976   *C                     = Cmpi;
1977   PetscFunctionReturn(0);
1978 }
1979 
1980 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1981 {
1982   PetscErrorCode    ierr;
1983   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1984   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1985   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1986   Mat_APMPI         *ptap = c->ap;
1987   Mat               AP_loc,C_loc,C_oth;
1988   PetscInt          i,rstart,rend,cm,ncols,row;
1989   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
1990   PetscScalar       *apa;
1991   const PetscInt    *cols;
1992   const PetscScalar *vals;
1993 
1994   PetscFunctionBegin;
1995   if (!ptap) {
1996     MPI_Comm comm;
1997     ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1998     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1999   }
2000 
2001   ierr = MatZeroEntries(C);CHKERRQ(ierr);
2002   /* 1) get R = Pd^T,Ro = Po^T */
2003   if (ptap->reuse == MAT_REUSE_MATRIX) {
2004     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
2005     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
2006   }
2007 
2008   /* 2) get AP_loc */
2009   AP_loc = ptap->AP_loc;
2010   ap = (Mat_SeqAIJ*)AP_loc->data;
2011 
2012   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
2013   /*-----------------------------------------------------*/
2014   if (ptap->reuse == MAT_REUSE_MATRIX) {
2015     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
2016     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
2017     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
2018   }
2019 
2020   /* 2-2) compute numeric A_loc*P - dominating part */
2021   /* ---------------------------------------------- */
2022   /* get data from symbolic products */
2023   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
2024   if (ptap->P_oth) {
2025     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
2026   }
2027   apa   = ptap->apa;
2028   api   = ap->i;
2029   apj   = ap->j;
2030   for (i=0; i<am; i++) {
2031     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
2032     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2033     apnz = api[i+1] - api[i];
2034     for (j=0; j<apnz; j++) {
2035       col = apj[j+api[i]];
2036       ap->a[j+ap->i[i]] = apa[col];
2037       apa[col] = 0.0;
2038     }
2039     ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
2040   }
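  /* AProw_nonscalable() accumulates row i of A*P in the dense workspace apa of length
   * pN (allocated in the symbolic phase), which is why this variant is reported as
   * nonscalable: its workspace grows with the global column count of P */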
2041 
2042   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2043   ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
2044   ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
2045   C_loc = ptap->C_loc;
2046   C_oth = ptap->C_oth;
2047 
2048   /* add C_loc and C_oth to C */
2049   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
2050 
2051   /* C_loc -> C */
2052   cm    = C_loc->rmap->N;
2053   c_seq = (Mat_SeqAIJ*)C_loc->data;
2054   cols = c_seq->j;
2055   vals = c_seq->a;
2056 
2057 
2058   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
2059   /* when there are no off-processor parts.  */
2060   /* If was_assembled is true, then the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
2061   /* is no longer valid. Then the more general function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
2062   /* a table, and additional bookkeeping is required. */
2063   if (C->assembled) {
2064     C->was_assembled = PETSC_TRUE;
2065     C->assembled     = PETSC_FALSE;
2066   }
2067   if (C->was_assembled) {
2068     for (i=0; i<cm; i++) {
2069       ncols = c_seq->i[i+1] - c_seq->i[i];
2070       row = rstart + i;
2071       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
2072       cols += ncols; vals += ncols;
2073     }
2074   } else {
2075     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
2076   }
2077 
2078   /* Co -> C, off-processor part */
2079   cm = C_oth->rmap->N;
2080   c_seq = (Mat_SeqAIJ*)C_oth->data;
2081   cols = c_seq->j;
2082   vals = c_seq->a;
2083   for (i=0; i<cm; i++) {
2084     ncols = c_seq->i[i+1] - c_seq->i[i];
2085     row = p->garray[i];
2086     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
2087     cols += ncols; vals += ncols;
2088   }
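  /* Rows of C_oth are indexed by p->garray[], i.e. global rows owned by other ranks,
   * so these contributions are communicated during the assembly below */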
2089 
2090   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2091   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2092 
2093   ptap->reuse = MAT_REUSE_MATRIX;
2094 
2095   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated again from A and P */
2096   if (ptap->freestruct) {
2097     ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr);
2098   }
2099   PetscFunctionReturn(0);
2100 }
2101