
/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/
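
/*
  Usage sketch (illustrative, not part of this file): these routines are
  reached through the public MatPtAP() interface, whose signature matches the
  dispatch routine below. The first call with MAT_INITIAL_MATRIX runs the
  symbolic and numeric phases; subsequent calls with MAT_REUSE_MATRIX reuse
  the symbolic data attached to C. A and P are assumed to be assembled
  MPIAIJ matrices with compatible layouts.

      Mat A,P,C;
      ...
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
      ...                                  update values of A, same pattern
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);
      ierr = MatDestroy(&C);CHKERRQ(ierr);
*/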

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>

/* #define PTAP_PROFILE */

extern PetscErrorCode MatDestroy_MPIAIJ(Mat);
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap=a->ptap;

  PetscFunctionBegin;
  if (ptap) {
    Mat_Merge_SeqsToMPI *merge=ptap->merge;
    ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
    ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
    ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
    if (ptap->AP_loc) { /* used by alg_rap */
      Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
      ierr = PetscFree(ap->i);CHKERRQ(ierr);
      ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
      ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
    } else { /* used by alg_ptap */
      ierr = PetscFree(ptap->api);CHKERRQ(ierr);
      ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
    }
    ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
    if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

    if (merge) { /* used by alg_ptap */
      ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
      ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
      ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
      ierr = PetscFree(merge->bi);CHKERRQ(ierr);
      ierr = PetscFree(merge->bj);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
      ierr = PetscFree(merge->coi);CHKERRQ(ierr);
      ierr = PetscFree(merge->coj);CHKERRQ(ierr);
      ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
      ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
      ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
    }

    ierr = ptap->destroy(A);CHKERRQ(ierr);
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a     = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap  = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
  (*M)->ops->destroy   = ptap->destroy;
  (*M)->ops->duplicate = ptap->duplicate;
  PetscFunctionReturn(0);
}
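
/*
  MatDestroy_MPIAIJ_PtAP() and MatDuplicate_MPIAIJ_MatPtAP() rely on a
  save-and-chain pattern: the symbolic routines stash the matrix's original
  destroy/duplicate function pointers in Mat_PtAPMPI before overriding them,
  and the overrides release the PtAP data and then call the saved pointers.
  A minimal sketch of the pattern (illustrative, not PETSc API):

      save:     ptap->destroy = C->ops->destroy;
      override: C->ops->destroy = MatDestroy_MPIAIJ_PtAP;
      chain:    free PtAP data, then return (*ptap->destroy)(C);

  This keeps the override transparent to any code that later destroys or
  duplicates C without knowing that PtAP data is attached.
*/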

PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      rap=PETSC_TRUE; /* do R=P^T locally, then C=R*A*P */
  MPI_Comm       comm;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

  ierr = PetscOptionsGetBool(NULL,NULL,"-matrap",&rap,NULL);CHKERRQ(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    if (rap) { /* do R=P^T locally, then C=R*A*P */
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
    } else { /* do P^T*A*P */
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(A,P,fill,C);CHKERRQ(ierr);
    }
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
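
/*
  The symbolic phase runs only on a MAT_INITIAL_MATRIX call; the numeric
  phase runs on every call, through the ptapnumeric function pointer
  installed by the symbolic routine. A sketch of the intended call sequence
  (illustrative, assuming a multigrid-style setup where the values of A
  change but its nonzero pattern does not):

      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,fill,&C);CHKERRQ(ierr);   symbolic + numeric
      for (it=0; it<nits; it++) {
        ... update values of A ...
        ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,fill,&C);CHKERRQ(ierr);   numeric only
      }

  The run-time option -matrap 0 selects the alg_ptap branch (explicit
  P^T*A*P) instead of the default R*A*P formulation with R = P^T.
*/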

#if defined(PETSC_HAVE_HYPRE)
PETSC_INTERN PetscErrorCode MatPtAPSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
#endif

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
  Mat_PtAPMPI       *ptap = c->ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc were obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  apa   = ptap->apa;
  api   = ap->i;
  apj   = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
    AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);

    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      ap->a[j+ap->i[i]] = apa[j];
      apa[j]            = 0.0;
    }
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row = rstart + i;
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  /* C_oth -> C, off-processor part */
  cm = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row = p->garray[i];
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
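
/*
  The two copy loops above walk a SeqAIJ matrix in raw CSR form: row i owns
  entries ii[i] .. ii[i+1]-1 of the column-index array j and the value array
  a. A minimal standalone sketch of the traversal (illustrative names, only
  the i/j/a convention of Mat_SeqAIJ is assumed):

      void walk_csr(PetscInt m,const PetscInt *ii,const PetscInt *jj,const PetscScalar *aa)
      {
        PetscInt i,k;
        for (i=0; i<m; i++) {                 row i
          for (k=ii[i]; k<ii[i+1]; k++) {     entry (i,jj[k]) has value aa[k]
          }
        }
      }

  The loops above avoid the inner index k by advancing the cols/vals pointers
  by ncols per row, which is equivalent.
*/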

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat_PtAPMPI         *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  Mat                 Cmpi,P_loc,P_oth;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* lengths and sender ids of the messages to be received */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ          *p_loc,*p_oth,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif
#if defined(PTAP_PROFILE)
  PetscLogDouble      t0,t1,t11,t12,t2,t3,t4;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif

  /* create struct Mat_PtAPMPI; it is attached to C at the end of this routine */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t11);CHKERRQ(ierr);
#endif

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)P_loc->data;
  p_oth  = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* used for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;
  nspacedouble  = 0;

  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    ai = ao->i; pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_oth->j + pi[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t12);CHKERRQ(ierr);
#endif

  /* (2-1) compute symbolic C_oth = Ro*AP_loc */
  /* ---------------------------------------- */
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
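
  /*
    Each i-structure message built above is a flat MPIU_INT buffer:

        buf_si[0]                      nrows
        buf_si[1 .. nrows]             local row indices on the receiver
        buf_si[nrows+1 .. 2*nrows+1]   CSR-style offsets into the matching coj message

    A minimal receiver-side decode sketch (illustrative; msg points at one
    received i-structure, cols at the matching coj buffer):

        nrows = msg[0];
        rows  = msg + 1;
        offs  = msg + nrows + 1;
        for (r=0; r<nrows; r++) {
          nz = offs[r+1] - offs[r];
          row rows[r] receives the nz column indices cols[offs[r]] .. cols[offs[r+1]-1]
        }

    This is exactly the walk performed below in step (5) via buf_ri_k, nextrow
    and nextci.
  */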
  /* receives of coi are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif
  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap         = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;

  ierr = PetscCalloc1(ap_rmax,&ptap->apa);CHKERRQ(ierr);
  /* printf("[%d] pN %d, ap_rmax %d\n",rank,pN,ap_rmax); */

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
  *C                     = Cmpi;

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank == 1) {
    printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t12-t11,t1-t12,t2-t1,t3-t2,t4-t3,t4-t0);
  }
#endif
  PetscFunctionReturn(0);
}
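
/*
  The symbolic kernels above follow a two-ingredient pattern: (a) a condensed
  sorted linked list (PetscLLCondensed*) merges the column indices contributed
  by each row product, and (b) a chained free-space allocator
  (PetscFreeSpaceGet) grows geometrically whenever a row does not fit, so the
  total nnz need not be known in advance. A sketch of the control flow,
  stripped of the PETSc helpers (illustrative pseudo-C):

      rowptr[0] = 0;
      for (i=0; i<m; i++) {
        for (each contributing row r) merge_sorted(list,cols_of(r));
        nz          = list_length(list);
        rowptr[i+1] = rowptr[i] + nz;
        if (remaining < nz) grow_space(nz + total_size);   doubling step
        drain list into space; advance space by nz;
      }
      copy the chained space segments into one contiguous j array;

  The fill argument only sets the initial guess; nspacedouble counts how often
  the guess was too small and is reported through PetscInfo() above.
*/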

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat_PtAPMPI         *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  Mat                 Cmpi;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscBT             lnkbt;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* lengths and sender ids of the messages to be received */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ          *p_loc,*p_oth,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
#if defined(PETSC_HAVE_HYPRE)
  const char          *algTypes[3] = {"scalable","nonscalable","hypre"};
  PetscInt            nalg = 3;
#else
  const char          *algTypes[2] = {"scalable","nonscalable"};
  PetscInt            nalg = 2;
#endif
  PetscInt            alg = 0; /* set default algorithm */
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif
#if defined(PTAP_PROFILE)
  PetscLogDouble      t0,t1,t11,t12,t2,t3,t4;
#endif

  PetscFunctionBegin;
  /* pick an algorithm */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[1],&alg,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);

  if (alg == 0) {
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr);
    (*C)->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
    PetscFunctionReturn(0);

#if defined(PETSC_HAVE_HYPRE)
  } else if (alg == 2) {
    /* Use boomerAMGBuildCoarseOperator */
    ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
#endif
  }

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif

  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi */
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_PtAPMPI; it is attached to C at the end of this routine */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t11);CHKERRQ(ierr);
#endif

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* used for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;
  nspacedouble  = 0;

  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    ai = ao->i; pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_oth->j + pi[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo3(ptap->AP_loc,"AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"AP_loc is empty\n");CHKERRQ(ierr);
  }
#endif

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t12);CHKERRQ(ierr);
#endif

  /* (2-1) compute symbolic C_oth = Ro*AP_loc */
  /* ---------------------------------------- */
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  /* receives of coi are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif
  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
  ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap         = ptap;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;

  if (alg == 1) {
    ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
  }

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
  *C                     = Cmpi;

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank == 1) {
    printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t12-t11,t1-t12,t2-t1,t3-t2,t4-t3,t4-t0);
  }
#endif
  PetscFunctionReturn(0);
}
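
/*
  Algorithm selection is a run-time choice, e.g. (command-line sketch):

      mpiexec -n 8 ./app -matptap_via scalable       memory-scalable path
      mpiexec -n 8 ./app -matptap_via nonscalable    dense-accumulator path
      mpiexec -n 8 ./app -matptap_via hypre          only with PETSC_HAVE_HYPRE

  Note the asymmetry in the code above: alg is initialized to 0 ("scalable"),
  so that branch appears to run when the option is absent, while algTypes[1]
  ("nonscalable") is only the default advertised in -help output.
*/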

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
  Mat_PtAPMPI       *ptap = c->ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;
#if defined(PTAP_PROFILE)
  PetscMPIInt       rank;
  MPI_Comm          comm;
  PetscLogDouble    t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
#endif

  PetscFunctionBegin;
  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
  eR = t1 - t0;
#endif

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc were obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  apa   = ptap->apa;
  api   = ap->i;
  apj   = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
  eAP = t2 - t1;
#endif

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
  eCseq = t3 - t2;
#endif

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row = rstart + i;
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  /* C_oth -> C, off-processor part */
  cm = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row = p->garray[i];
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  eCmpi = t4 - t3;

  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (rank==1) {
    ierr = PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n",eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);CHKERRQ(ierr);
  }
#endif
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
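
/*
  The difference between this routine and the _scalable variant above is the
  sparse accumulator apa. Here apa has global width pN (one slot per column of
  P), so AP row values are scattered and gathered by global column index; the
  scalable variant packs each AP row into a buffer of length ap_rmax and never
  allocates O(pN) work space per process. A side-by-side sketch (illustrative,
  following the two inner loops above and in the scalable numeric routine):

      nonscalable:  for j in 0..apnz-1:  col = apj[api[i]+j];
                                         ap->a[api[i]+j] = apa[col]; apa[col] = 0;
      scalable:     for j in 0..apnz-1:  ap->a[api[i]+j] = apa[j];   apa[j]   = 0;

  The dense version trades memory (PetscCalloc1(pN,&ptap->apa) in the symbolic
  phase) for cheaper index handling; the scalable one keeps memory bounded by
  the largest AP row (PetscCalloc1(ap_rmax,&ptap->apa)).
*/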

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*pti,*ptj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
  PetscReal           afill=1.0,afill_tmp;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI; it is attached to C at the end of this routine */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ierr        = PetscNew(&merge);CHKERRQ(ierr);
  ptap->merge = merge;
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
  /*--------------------------------------------------------------------------*/
  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;

  /* create and initialize a linked list */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) - this caused OOM for ex56, np=8k on Intrepid! */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    aj  = ad->j + adi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    aj  = ao->j + aoi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr      = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj) */
  /*------------------------------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* then, compute symbolic Co = (p->B)^T*AP */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                               >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
  nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],api[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pot */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr      = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
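
  /*
    MatGetSymbolicTranspose_SeqAIJ() returns only the i/j pattern of the
    transpose (poti/potj above, pdti/pdtj in step (5)) without values. The
    standard construction is a two-pass counting sort; a minimal sketch
    (illustrative, for an m x n CSR pattern ii/jj):

        for (k=0; k<ii[m]; k++) ti[jj[k]+1]++;                count per column
        for (c=0; c<n; c++)     ti[c+1] += ti[c];             prefix sum
        for (r=0; r<m; r++)
          for (k=ii[r]; k<ii[r+1]; k++) tj[ti[jj[k]]++] = r;  scatter row ids
        then shift ti back by one slot (or build the counts into a copy)

    Rows of the transposed pattern correspond to columns of p->B (or p->A),
    which is why ptJ[j] is read as "row of AP" in the loops above and in
    step (5).
  */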
1224 
1225   /* (3) send j-array (coj) of Co to other processors */
1226   /*--------------------------------------------------*/
1227   ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
1228   len_s        = merge->len_s;
1229   merge->nsend = 0;
1230 
1231 
1232   /* determine row ownership */
1233   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1234   merge->rowmap->n  = pn;
1235   merge->rowmap->bs = 1;
1236 
1237   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1238   owners = merge->rowmap->range;
1239 
1240   /* determine the number of messages to send, their lengths */
1241   ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
1242   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1243   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1244 
1245   proc = 0;
1246   for (i=0; i<pon; i++) {
1247     while (prmap[i] >= owners[proc+1]) proc++;
1248     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1249     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1250   }
1251 
1252   len          = 0; /* max length of buf_si[], see (4) */
1253   owners_co[0] = 0;
1254   for (proc=0; proc<size; proc++) {
1255     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1256     if (len_s[proc]) {
1257       merge->nsend++;
1258       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1259       len         += len_si[proc];
1260     }
1261   }
1262 
1263   /* determine the number and length of messages to receive for coi and coj  */
1264   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1265   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1266 
1267   /* post the Irecv and Isend of coj */
1268   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1269   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1270   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1271   for (proc=0, k=0; proc<size; proc++) {
1272     if (!len_s[proc]) continue;
1273     i    = owners_co[proc];
1274     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1275     k++;
1276   }
1277 
1278   /* receives and sends of coj are complete */
1279   for (i=0; i<merge->nrecv; i++) {
1280     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1281   }
1282   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1283   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1284 
1285   /* (4) send and recv coi */
1286   /*-----------------------*/
1287   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1288   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1289   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1290   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1291   for (proc=0,k=0; proc<size; proc++) {
1292     if (!len_s[proc]) continue;
1293     /* form outgoing message for i-structure:
1294          buf_si[0]:                 nrows to be sent
1295                [1:nrows]:           row index (global)
1296                [nrows+1:2*nrows+1]: i-structure index
1297     */
1298     /*-------------------------------------------*/
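    /* e.g. (hypothetical numbers): sending rows with local indices 2,5,7 that
       hold 4,1,3 nonzeros respectively gives
         buf_si = [3, 2,5,7, 0,4,5,8]
       i.e. nrows, then the local row indices, then the running offsets. */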
1299     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1300     buf_si_i    = buf_si + nrows+1;
1301     buf_si[0]   = nrows;
1302     buf_si_i[0] = 0;
1303     nrows       = 0;
1304     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1305       nzi = coi[i+1] - coi[i];
1306       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1307       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1308       nrows++;
1309     }
1310     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1311     k++;
1312     buf_si += len_si[proc];
1313   }
1314   i = merge->nrecv;
1315   while (i--) {
1316     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1317   }
1318   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1319   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1320 
1321   ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
1322   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1323   ierr = PetscFree(swaits);CHKERRQ(ierr);
1324   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1325 
1326   /* (5) compute the local portion of C (mpi mat) */
1327   /*----------------------------------------------*/
1328   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
1329 
1330   /* allocate pti array and free space for accumulating nonzero column info */
1331   ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
1332   pti[0] = 0;
1333 
1334   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1335   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pi_loc[pm],api[am]));
1336   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1337   current_space = free_space;
1338 
1339   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1340   for (k=0; k<merge->nrecv; k++) {
1341     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1342     nrows       = *buf_ri_k[k];
1343     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1344     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure offset of the k-th received message */
1345   }
1346   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
1347 
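  /* For each local row i of Pt*A*P: merge the pattern of pdt[i,:]*AP with any
     off-process contributions received above. Each received message lists its
     rows in ascending order, so a per-message cursor (nextrow/nextci) suffices
     to pick out the entries that land in row i. */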
1348   for (i=0; i<pn; i++) {
1349     /* add pdt[i,:]*AP into lnk */
1350     pnz = pdti[i+1] - pdti[i];
1351     ptJ = pdtj + pdti[i];
1352     for (j=0; j<pnz; j++) {
1353       row  = ptJ[j];  /* row of AP == col of Pt */
1354       apnz = api[row+1] - api[row];
1355       Jptr = apj + api[row];
1356       /* add non-zero cols of AP into the sorted linked list lnk */
1357       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1358     }
1359 
1360     /* add received col data into lnk */
1361     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1362       if (i == *nextrow[k]) { /* i-th row */
1363         nzi  = *(nextci[k]+1) - *nextci[k];
1364         Jptr = buf_rj[k] + *nextci[k];
1365         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1366         nextrow[k]++; nextci[k]++;
1367       }
1368     }
1369     nnz = lnk[0];
1370 
1371     /* if free space is not available, make more free space */
1372     if (current_space->local_remaining<nnz) {
1373       ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
1374       nspacedouble++;
1375     }
1376     /* copy data into free space, then initialize lnk */
1377     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1378     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1379 
1380     current_space->array           += nnz;
1381     current_space->local_used      += nnz;
1382     current_space->local_remaining -= nnz;
1383 
1384     pti[i+1] = pti[i] + nnz;
1385   }
1386   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
1387   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1388 
1389   ierr      = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
1390   ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
1391   afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
1392   if (afill_tmp > afill) afill = afill_tmp;
1393   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1394 
1395   /* (6) create symbolic parallel matrix Cmpi */
1396   /*------------------------------------------*/
1397   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1398   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1399   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
1400   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1401   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1402   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1403 
1404   merge->bi        = pti;      /* Cseq->i */
1405   merge->bj        = ptj;      /* Cseq->j */
1406   merge->coi       = coi;      /* Co->i   */
1407   merge->coj       = coj;      /* Co->j   */
1408   merge->buf_ri    = buf_ri;
1409   merge->buf_rj    = buf_rj;
1410   merge->owners_co = owners_co;
1411   merge->destroy   = Cmpi->ops->destroy;
1412 
1413   /* attach the supporting struct to Cmpi for reuse */
1414   c           = (Mat_MPIAIJ*)Cmpi->data;
1415   c->ptap     = ptap;
1416   ptap->api   = api;
1417   ptap->apj   = apj;
1418   ptap->duplicate = Cmpi->ops->duplicate;
1419   ptap->destroy   = Cmpi->ops->destroy;
1420 
1421   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1422   Cmpi->assembled        = PETSC_FALSE;
1423   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1424   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
1425   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap;
1426   *C                     = Cmpi;
1427 
1428   /* flag 'scalable' determines which implementation is used:
1429        0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
1430        1: do sparse axpy in MatPtAPNumeric() - might be slower, but uses a sparse array apa */
1431   /* set default scalable */
1432   ptap->scalable = PETSC_FALSE; /* PETSC_TRUE; */
1433 
1434   ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->options,((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
1435   if (!ptap->scalable) {  /* Do dense axpy */
1436     ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
1437   } else {
1438     ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
1439   }
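  /* Memory tradeoff: the dense path allocates apa with pN entries (the global
     column count), the scalable path only ap_rmax+1 (the longest row of A*P).
     A hypothetical run selecting the sparse kernel:
       mpiexec -n 4 ./app -matptap_scalable
     where ./app is any code calling MatPtAP() on MPIAIJ matrices. */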
1440 
1441 #if defined(PETSC_USE_INFO)
1442   if (pti[pn] != 0) {
1443     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1444     ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1445   } else {
1446     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1447   }
1448 #endif
1449   PetscFunctionReturn(0);
1450 }
1451 
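/* Numeric phase of C = P^T*A*P: reuses the structures cached by the symbolic
   phase in c->ptap and ptap->merge to (1) refresh P_loc/P_oth values,
   (2) compute the local products row by row, (3) exchange the off-process
   values coa, and (4) assemble the final MPI matrix C. */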
1452 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,Mat C)
1453 {
1454   PetscErrorCode      ierr;
1455   Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1456   Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1457   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1458   Mat_SeqAIJ          *p_loc,*p_oth;
1459   Mat_PtAPMPI         *ptap;
1460   Mat_Merge_SeqsToMPI *merge;
1461   PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
1462   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
1463   PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
1464   MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
1465   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1466   MPI_Comm            comm;
1467   PetscMPIInt         size,rank,taga,*len_s;
1468   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1469   PetscInt            **buf_ri,**buf_rj;
1470   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1471   MPI_Request         *s_waits,*r_waits;
1472   MPI_Status          *status;
1473   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1474   PetscInt            *api,*apj,*coi,*coj;
1475   PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
1476   PetscBool           scalable;
1477 #if defined(PTAP_PROFILE)
1478   PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
1479 #endif
1480 
1481   PetscFunctionBegin;
1482   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1483 #if defined(PTAP_PROFILE)
1484   ierr = PetscTime(&t0);CHKERRQ(ierr);
1485 #endif
1486   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1487   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1488 
1489   ptap = c->ptap;
1490   if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
1491   merge    = ptap->merge;
1492   apa      = ptap->apa;
1493   scalable = ptap->scalable;
1494 
1495   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1496   /*-----------------------------------------------------*/
1497   if (ptap->reuse == MAT_INITIAL_MATRIX) {
1498     /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
1499     ptap->reuse = MAT_REUSE_MATRIX;
1500   } else { /* update numerical values of P_oth and P_loc */
1501     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1502     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
1503   }
1504 #if defined(PTAP_PROFILE)
1505   ierr = PetscTime(&t1);CHKERRQ(ierr);
1506   eP = t1-t0;
1507 #endif
1514 
1515   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - the dominating part of the work */
1516   /*--------------------------------------------------------------*/
1517   /* get data from symbolic products */
1518   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1519   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1520   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
1521   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1522 
1523   coi  = merge->coi; coj = merge->coj;
1524   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1525 
1526   bi     = merge->bi; bj = merge->bj;
1527   owners = merge->rowmap->range;
1528   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);  /* ba: Cseq->a */
1529 
1530   api = ptap->api; apj = ptap->apj;
1531 
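  /* Both branches compute, for each local row i of A,
         AP[i,:] = Ad[i,:]*P_loc + Ao[i,:]*P_oth
     and then scatter the outer product P[i,:]^T * AP[i,:] into Cd (locally
     owned rows of C) and Co (rows owned by other processes); they differ only
     in how AP[i,:] is accumulated in the work array apa. */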
1532   if (!scalable) { /* Do dense axpy on apa (length pN, stores A[i,:]*P) - nonscalable, but faster (can take about 1/3 of the scalable time) */
1533     ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
1534 #if defined(PTAP_PROFILE)
1535     ierr = PetscTime(&t1);CHKERRQ(ierr);
1536 #endif
1537     for (i=0; i<am; i++) {
1538 #if defined(PTAP_PROFILE)
1539       ierr = PetscTime(&t2_0);CHKERRQ(ierr);
1540 #endif
1541       /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
1542       /*------------------------------------------------------------*/
1543       apJ = apj + api[i];
1544 
1545       /* diagonal portion of A */
1546       anz = adi[i+1] - adi[i];
1547       adj = ad->j + adi[i];
1548       ada = ad->a + adi[i];
1549       for (j=0; j<anz; j++) {
1550         row = adj[j];
1551         pnz = pi_loc[row+1] - pi_loc[row];
1552         pj  = pj_loc + pi_loc[row];
1553         pa  = pa_loc + pi_loc[row];
1554 
1555         /* perform dense axpy */
1556         valtmp = ada[j];
1557         for (k=0; k<pnz; k++) {
1558           apa[pj[k]] += valtmp*pa[k];
1559         }
1560         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1561       }
1562 
1563       /* off-diagonal portion of A */
1564       anz = aoi[i+1] - aoi[i];
1565       aoj = ao->j + aoi[i];
1566       aoa = ao->a + aoi[i];
1567       for (j=0; j<anz; j++) {
1568         row = aoj[j];
1569         pnz = pi_oth[row+1] - pi_oth[row];
1570         pj  = pj_oth + pi_oth[row];
1571         pa  = pa_oth + pi_oth[row];
1572 
1573         /* perform dense axpy */
1574         valtmp = aoa[j];
1575         for (k=0; k<pnz; k++) {
1576           apa[pj[k]] += valtmp*pa[k];
1577         }
1578         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1579       }
1580 #if defined(PTAP_PROFILE)
1581       ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
1582       et2_AP += t2_1 - t2_0;
1583 #endif
1584 
1585       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1586       /*--------------------------------------------------------------*/
1587       apnz = api[i+1] - api[i];
1588       /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
1589       pnz = po->i[i+1] - po->i[i];
1590       poJ = po->j + po->i[i];
1591       pA  = po->a + po->i[i];
1592       for (j=0; j<pnz; j++) {
1593         row = poJ[j];
1594         cnz = coi[row+1] - coi[row];
1595         cj  = coj + coi[row];
1596         ca  = coa + coi[row];
1597         /* perform dense axpy */
1598         valtmp = pA[j];
1599         for (k=0; k<cnz; k++) {
1600           ca[k] += valtmp*apa[cj[k]];
1601         }
1602         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1603       }
1604       /* put the value into Cd (diagonal part) */
1605       pnz = pd->i[i+1] - pd->i[i];
1606       pdJ = pd->j + pd->i[i];
1607       pA  = pd->a + pd->i[i];
1608       for (j=0; j<pnz; j++) {
1609         row = pdJ[j];
1610         cnz = bi[row+1] - bi[row];
1611         cj  = bj + bi[row];
1612         ca  = ba + bi[row];
1613         /* perform dense axpy */
1614         valtmp = pA[j];
1615         for (k=0; k<cnz; k++) {
1616           ca[k] += valtmp*apa[cj[k]];
1617         }
1618         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1619       }
1620 
1621       /* zero the current row of A*P: only its apnz entries were touched, so a full O(pN) memzero is unnecessary */
1622       for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
1623 #if defined(PTAP_PROFILE)
1624       ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
1625       ePtAP += t2_2 - t2_1;
1626 #endif
1627     }
1628   } else { /* Do sparse axpy on apa (length ap_rmax, stores A[i,:]*P) - scalable, but slower */
1629     ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
1630     /*-----------------------------------------------------------------------------------------*/
1631     pA=pa_loc;
1632     for (i=0; i<am; i++) {
1633 #if defined(PTAP_PROFILE)
1634       ierr = PetscTime(&t2_0);CHKERRQ(ierr);
1635 #endif
1636       /* form i-th sparse row of A*P */
1637       apnz = api[i+1] - api[i];
1638       apJ  = apj + api[i];
1639       /* diagonal portion of A */
1640       anz = adi[i+1] - adi[i];
1641       adj = ad->j + adi[i];
1642       ada = ad->a + adi[i];
1643       for (j=0; j<anz; j++) {
1644         row    = adj[j];
1645         pnz    = pi_loc[row+1] - pi_loc[row];
1646         pj     = pj_loc + pi_loc[row];
1647         pa     = pa_loc + pi_loc[row];
1648         valtmp = ada[j];
1649         nextp  = 0;
1650         for (k=0; nextp<pnz; k++) {
1651           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1652             apa[k] += valtmp*pa[nextp++];
1653           }
1654         }
1655         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1656       }
1657       /* off-diagonal portion of A */
1658       anz = aoi[i+1] - aoi[i];
1659       aoj = ao->j + aoi[i];
1660       aoa = ao->a + aoi[i];
1661       for (j=0; j<anz; j++) {
1662         row    = aoj[j];
1663         pnz    = pi_oth[row+1] - pi_oth[row];
1664         pj     = pj_oth + pi_oth[row];
1665         pa     = pa_oth + pi_oth[row];
1666         valtmp = aoa[j];
1667         nextp  = 0;
1668         for (k=0; nextp<pnz; k++) {
1669           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1670             apa[k] += valtmp*pa[nextp++];
1671           }
1672         }
1673         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1674       }
1675 #if defined(PTAP_PROFILE)
1676       ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
1677       et2_AP += t2_1 - t2_0;
1678 #endif
1679 
1680       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1681       /*--------------------------------------------------------------*/
1682       pnz = pi_loc[i+1] - pi_loc[i];
1683       pJ  = pj_loc + pi_loc[i];
1684       for (j=0; j<pnz; j++) {
1685         nextap = 0;
1686         row    = pJ[j]; /* global index */
1687         if (row < pcstart || row >=pcend) { /* put the value into Co */
1688           row = *poJ;
1689           cj  = coj + coi[row];
1690           ca  = coa + coi[row]; poJ++;
1691         } else {                            /* put the value into Cd */
1692           row = *pdJ;
1693           cj  = bj + bi[row];
1694           ca  = ba + bi[row]; pdJ++;
1695         }
1696         valtmp = pA[j];
1697         for (k=0; nextap<apnz; k++) {
1698           if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
1699         }
1700         ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
1701       }
1702       pA += pnz;
1703       /* zero the used part of apa (first apnz entries) before the next row of A*P */
1704       ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
1705 #if defined(PTAP_PROFILE)
1706       ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
1707       ePtAP += t2_2 - t2_1;
1708 #endif
1709     }
1710   }
1711 #if defined(PTAP_PROFILE)
1712   ierr = PetscTime(&t2);CHKERRQ(ierr);
1713 #endif
1714 
1715   /* 3) send and recv matrix values coa */
1716   /*------------------------------------*/
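  /* The receive sources and lengths (merge->id_r, merge->len_r) and the send
     lengths (len_s) were established in the symbolic phase, so only the scalar
     payload coa needs to move here. */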
1717   buf_ri = merge->buf_ri;
1718   buf_rj = merge->buf_rj;
1719   len_s  = merge->len_s;
1720   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1721   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1722 
1723   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1724   for (proc=0,k=0; proc<size; proc++) {
1725     if (!len_s[proc]) continue;
1726     i    = merge->owners_co[proc];
1727     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1728     k++;
1729   }
1730   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1731   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1732 
1733   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1734   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1735   ierr = PetscFree(coa);CHKERRQ(ierr);
1736 #if defined(PTAP_PROFILE)
1737   ierr = PetscTime(&t3);CHKERRQ(ierr);
1738 #endif
1739 
1740   /* 4) insert local Cseq and received values into Cmpi */
1741   /*------------------------------------------------------*/
1742   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1743   for (k=0; k<merge->nrecv; k++) {
1744     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1745     nrows       = *(buf_ri_k[k]);
1746     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1747     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure offset of the k-th received message */
1748   }
1749 
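  /* Both bj_i and each received cj list column indices in ascending order, so
     a two-pointer scan merges the received values into ba_i before the row is
     handed to MatSetValues(). */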
1750   for (i=0; i<cm; i++) {
1751     row  = owners[rank] + i; /* global row index of C_seq */
1752     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1753     ba_i = ba + bi[i];
1754     bnz  = bi[i+1] - bi[i];
1755     /* add received vals into ba */
1756     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1757       /* i-th row */
1758       if (i == *nextrow[k]) {
1759         cnz    = *(nextci[k]+1) - *nextci[k];
1760         cj     = buf_rj[k] + *(nextci[k]);
1761         ca     = abuf_r[k] + *(nextci[k]);
1762         nextcj = 0;
1763         for (j=0; nextcj<cnz; j++) {
1764           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1765             ba_i[j] += ca[nextcj++];
1766           }
1767         }
1768         nextrow[k]++; nextci[k]++;
1769         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1770       }
1771     }
1772     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1773   }
1774   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1775   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1776 
1777   ierr = PetscFree(ba);CHKERRQ(ierr);
1778   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1779   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1780   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1781 #if defined(PTAP_PROFILE)
1782   ierr = PetscTime(&t4);CHKERRQ(ierr);
1783   if (rank==1) {
1784     ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
1785   }
1786 #endif
1787   PetscFunctionReturn(0);
1788 }
1789