xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 748c7196a835988586c187c09a528d7bbbba79a0)
1 
2 /*
3   Defines projective product routines where A is an MPIAIJ matrix
4           C = P^T * A * P
5 */
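/*
  For orientation, a minimal user-level sketch of how these routines are reached; this is a
  hypothetical application snippet (not part of this file) assuming the standard MatPtAP()
  interface and a fill estimate of 2.0:

      Mat            A,P,C;
      PetscErrorCode ierr;
      // ... assemble MPIAIJ matrices A (n x n) and P (n x m) ...
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);  // symbolic + numeric phases
      // ... update numerical values of A, keeping its nonzero pattern ...
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    // numeric phase only
      ierr = MatDestroy(&C);CHKERRQ(ierr);
*/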
6 
7 #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 
13 #define PTAP_PROFILE
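/* When PTAP_PROFILE is defined, the routines below time their major phases with PetscTime()
   and print the per-phase breakdown from one rank (rank 1). */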
14 
15 extern PetscErrorCode MatDestroy_MPIAIJ(Mat);
16 #undef __FUNCT__
17 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
18 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
19 {
20   PetscErrorCode ierr;
21   Mat_MPIAIJ     *a   =(Mat_MPIAIJ*)A->data;
22   Mat_PtAPMPI    *ptap=a->ptap;
23 
24   PetscFunctionBegin;
25   if (ptap) {
26     Mat_Merge_SeqsToMPI *merge=ptap->merge;
27     ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
28     ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
29     ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
30     ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
31     ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
32 
33     ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
34     ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
35     ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
36     ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
37     ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
38 
39     if (ptap->api) {ierr = PetscFree(ptap->api);CHKERRQ(ierr);}
40     if (ptap->apj) {ierr = PetscFree(ptap->apj);CHKERRQ(ierr);}
41     if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}
42     if (merge) {
43       ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
44       ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
45       ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
46       ierr = PetscFree(merge->bi);CHKERRQ(ierr);
47       ierr = PetscFree(merge->bj);CHKERRQ(ierr);
48       ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
49       ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
50       ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
51       ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
52       ierr = PetscFree(merge->coi);CHKERRQ(ierr);
53       ierr = PetscFree(merge->coj);CHKERRQ(ierr);
54       ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
55       ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
56       ierr = merge->destroy(A);CHKERRQ(ierr);
57       ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
58     }
59     ierr = PetscFree(ptap);CHKERRQ(ierr);
60   }
61   PetscFunctionReturn(0);
62 }
63 
64 #undef __FUNCT__
65 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatPtAP"
66 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
67 {
68   PetscErrorCode      ierr;
69   Mat_MPIAIJ          *a     = (Mat_MPIAIJ*)A->data;
70   Mat_PtAPMPI         *ptap  = a->ptap;
71   Mat_Merge_SeqsToMPI *merge = ptap->merge;
72 
73   PetscFunctionBegin;
74   ierr = (*merge->duplicate)(A,op,M);CHKERRQ(ierr);
75 
76   (*M)->ops->destroy   = merge->destroy;
77   (*M)->ops->duplicate = merge->duplicate;
78   PetscFunctionReturn(0);
79 }
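/* Note: the duplicated matrix gets the original MPIAIJ destroy/duplicate methods back (they
   were saved in merge->destroy/merge->duplicate by the symbolic phase), since the ptap
   support structure attached to A is not copied into *M. */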
80 
81 #undef __FUNCT__
82 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
83 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
84 {
85   PetscErrorCode ierr;
86   PetscBool      newalg=PETSC_FALSE;
87 
88   PetscFunctionBegin;
89   ierr = PetscOptionsGetBool(NULL,"-matptap_new",&newalg,NULL);CHKERRQ(ierr);
90   if (scall == MAT_INITIAL_MATRIX) {
91     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
92     if (newalg) {
93       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(A,P,fill,C);CHKERRQ(ierr);
94     } else {
95       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
96     }
97     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
98   }
99   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
100   if (newalg) {
101     ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ_new(A,P,*C);CHKERRQ(ierr);
102   } else {
103     ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
104   }
105   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
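/* The *_new symbolic/numeric pair defined below is selected at runtime with the -matptap_new
   option; otherwise the original MatPtAPSymbolic_MPIAIJ_MPIAIJ/MatPtAPNumeric_MPIAIJ_MPIAIJ
   pair further below is used.  A hypothetical invocation (assuming an application that calls
   MatPtAP):

       mpiexec -n 4 ./app -matptap_new
*/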
108 
109 #undef __FUNCT__
110 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_new"
111 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(Mat A,Mat P,PetscReal fill,Mat *C)
112 {
113   PetscErrorCode ierr;
114   Mat_PtAPMPI    *ptap;
115   Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
116   Mat            AP;
117   Mat_MPIAIJ     *c;
118   MPI_Comm       comm;
119   PetscMPIInt    size,rank;
120   PetscLogDouble      t0,t1,t2,t3,t4;
121 
122   PetscFunctionBegin;
123   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
124   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
125   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
126 
127   /* check if matrix local sizes are compatible -- MV! */
128   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
129     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
130   }
131   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
132     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
133   }
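/* Example of a compatible layout (hypothetical): on 2 ranks, A is 100x100 with both its rows
   and its columns split as [0,50) on rank 0 and [50,100) on rank 1; P (100 x pN) must then own
   rows [0,50) on rank 0 and [50,100) on rank 1, so that A's row map and A's column map both
   match P's row map. */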
134 
135   //ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); //rm later !!!
136   //=============================================================================
137   Mat                 Cmpi;
138   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
139   Mat_SeqAIJ          *p_loc;
140   PetscInt            *pi_loc;
141   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ,nnz;
142   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
143   PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
144   PetscBT             lnkbt;
145   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
146   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
147   PetscInt            len,proc,*dnz,*onz,*owners;
148   PetscInt            nzi,*pti,*ptj;
149   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
150   MPI_Request         *swaits,*rwaits;
151   MPI_Status          *sstatus,rstatus;
152   Mat_Merge_SeqsToMPI *merge;
153   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
154   PetscReal           afill=1.0,afill_tmp;
155   PetscInt            rmax;
156 
157   ierr = PetscTime(&t0);CHKERRQ(ierr);
158   /* create struct Mat_PtAPMPI and attach it to C later */
159   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
160   ierr        = PetscNew(&merge);CHKERRQ(ierr);
161   ptap->merge = merge;
162   ptap->reuse = MAT_INITIAL_MATRIX;
163 
164   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
165   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
166 
167   /* get P_loc by taking all local rows of P */
168   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
169 
170   ptap->reuse = MAT_INITIAL_MATRIX;
171 
172   /* (1) compute symbolic AP = A*P, then get AP_loc */
173   /*--------------------------------------------------------------------------*/
174   ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
175   ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
176 
177   ierr = MatMatMult(A,P,MAT_INITIAL_MATRIX,2.0,&AP);CHKERRQ(ierr);
178   ierr = MatMPIAIJGetLocalMat(AP,MAT_INITIAL_MATRIX,&ptap->AP_loc);CHKERRQ(ierr);
179   ierr = MatDestroy(&AP);CHKERRQ(ierr);
180 
181   /* (2) compute C_loc=Rd*AP_loc, Co=Ro*AP_loc */
182   ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,MAT_INITIAL_MATRIX,2.0,&ptap->C_loc);CHKERRQ(ierr);
183   ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,MAT_INITIAL_MATRIX,2.0,&ptap->C_oth);CHKERRQ(ierr);
184   ierr = PetscTime(&t1);CHKERRQ(ierr);
185 
186   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
187   pi_loc = p_loc->i;
188 
189   Mat_SeqAIJ *ap = (Mat_SeqAIJ*)ptap->AP_loc->data;
190   Mat_SeqAIJ *c_oth  = (Mat_SeqAIJ*)ptap->C_oth->data;
191   api = ap->i; apj = ap->j;
192 
193   /* (2) determine symbolic Co = (p->B)^T*AP to be sent to others (coi,coj); the result is checked against C_oth below */
194   /*-----------------------------------------------------------------*/
195   //coi = c_oth->i; coj =  c_oth->j;
196 
197   Mat_SeqAIJ *po = (Mat_SeqAIJ *)ptap->Ro->data;
198   poti = po->i; potj = po->j;
199 
200   /* then, compute symbolic Co = (p->B)^T*AP */
201   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
202                          >= (num of nonzero rows of C_seq) - pn */
203 #if 1
204   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
205   coi[0] = 0;
206 
207   /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
208   nnz           = fill*(poti[pon] + api[am]);
209   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
210   current_space = free_space;
211 
212   /* create and initialize a linked list */
213   ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);
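  /* Sketch of the condensed linked-list pattern used by the row loops below:
       PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);       - create once per symbolic phase
       PetscLLCondensedAddSorted(nz,cols,lnk,lnkbt);    - merge a sorted column set into lnk
       nnz = lnk[0];                                    - number of distinct columns so far
       PetscLLCondensedClean(pN,nnz,array,lnk,lnkbt);   - copy the sorted row out and reset lnk
       PetscLLDestroy(lnk,lnkbt);                       - destroy once all rows are done */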
214 
215   for (i=0; i<pon; i++) {
216     pnz = poti[i+1] - poti[i];
217     ptJ = potj + poti[i];
218     for (j=0; j<pnz; j++) {
219       row  = ptJ[j]; /* row of AP == col of Pot */
220       apnz = api[row+1] - api[row];
221       Jptr = apj + api[row];
222       /* add non-zero cols of AP into the sorted linked list lnk */
223       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
224     }
225     nnz = lnk[0];
226 
227     /* If free space is not available, double the total space in the list */
228     if (current_space->local_remaining<nnz) {
229       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
230       nspacedouble++;
231     }
232 
233     /* Copy data into free space, and zero out dense rows */
234     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
235 
236     current_space->array           += nnz;
237     current_space->local_used      += nnz;
238     current_space->local_remaining -= nnz;
239 
240     coi[i+1] = coi[i] + nnz;
241     if (coi[i+1] != c_oth->i[i+1]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"%D coi %D != c_oth->i %D",i,coi[i+1],c_oth->i[i+1]);
242   }
243 
244   ierr      = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
245   ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
246   for (i=0; i<coi[pon]; i++) {
247     if (coj[i] != c_oth->j[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"%D coj %D != c_oth->j %D",i,coj[i],c_oth->j[i]);
248   }
249 #endif
250 
251   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
252   if (afill_tmp > afill) afill = afill_tmp;
253 
254   /* (3) send j-array (coj) of Co to other processors */
255   /*--------------------------------------------------*/
256   ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
257   len_s        = merge->len_s;
258   merge->nsend = 0;
259 
260   /* determine row ownership */
261   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
262   merge->rowmap->n  = pn;
263   merge->rowmap->bs = 1;
264 
265   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
266   owners = merge->rowmap->range;
267 
268   /* determine the number of messages to send, their lengths */
269   ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
270   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
271   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
272 
273   proc = 0;
274   for (i=0; i<pon; i++) {
275     while (prmap[i] >= owners[proc+1]) proc++;
276     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
277     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
278   }
279 
280   len          = 0; /* max length of buf_si[], see (4) */
281   owners_co[0] = 0;
282   for (proc=0; proc<size; proc++) {
283     owners_co[proc+1] = owners_co[proc] + len_si[proc];
284     if (len_s[proc]) {
285       merge->nsend++;
286       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
287       len         += len_si[proc];
288     }
289   }
290 
291   /* determine the number and length of messages to receive for coi and coj  */
292   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
293   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
294 
295   /* post the Irecv and Isend of coj */
296   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
297   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
298   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
299   for (proc=0, k=0; proc<size; proc++) {
300     if (!len_s[proc]) continue;
301     i    = owners_co[proc];
302     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
303     k++;
304   }
305 
306   /* receives and sends of coj are complete */
307   for (i=0; i<merge->nrecv; i++) {
308     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
309   }
310   ierr = PetscFree(rwaits);CHKERRQ(ierr);
311   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
312 
313   /* (4) send and recv coi */
314   /*-----------------------*/
315   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
316   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
317   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
318   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
319   for (proc=0,k=0; proc<size; proc++) {
320     if (!len_s[proc]) continue;
321     /* form outgoing message for i-structure:
322          buf_si[0]:                 nrows to be sent
323                [1:nrows]:           row index (local to the receiving proc)
324                [nrows+1:2*nrows+1]: i-structure index
325     */
326     /*-------------------------------------------*/
327     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
328     buf_si_i    = buf_si + nrows+1;
329     buf_si[0]   = nrows;
330     buf_si_i[0] = 0;
331     nrows       = 0;
332     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
333       nzi = coi[i+1] - coi[i];
334       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
335       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
336       nrows++;
337     }
338     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
339     k++;
340     buf_si += len_si[proc];
341   }
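  /* Worked example of one outgoing buf_si message (hypothetical numbers): if 2 rows of Co go
     to [proc], say rows 3 and 7 of [proc]'s portion of C, carrying 4 and 2 nonzeros, then
     len_si[proc] = 2*(2+1) = 6 and
         buf_si = { 2,   3, 7,   0, 4, 6 }
                  nrows  rows   running nonzero counts (i-structure) */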
342   i = merge->nrecv;
343   while (i--) {
344     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
345   }
346   ierr = PetscFree(rwaits);CHKERRQ(ierr);
347   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
348 
349   ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
350   ierr = PetscFree(len_ri);CHKERRQ(ierr);
351   ierr = PetscFree(swaits);CHKERRQ(ierr);
352   ierr = PetscFree(buf_s);CHKERRQ(ierr);
353 
354   ierr = PetscTime(&t2);CHKERRQ(ierr);
355 
356   /* (5) compute the local portion of C (mpi mat) */
357   /*----------------------------------------------*/
358   Mat_SeqAIJ *pd = (Mat_SeqAIJ *)ptap->Rd->data;
359   pdti = pd->i; pdtj = pd->j;
360 
361   /* allocate pti array and free space for accumulating nonzero column info */
362   ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
363   pti[0] = 0;
364 
365   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
366   nnz           = fill*(pi_loc[pm] + api[am]);
367   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
368   current_space = free_space;
369 
370   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
371   for (k=0; k<merge->nrecv; k++) {
372     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
373     nrows       = *buf_ri_k[k];
374     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
375     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
376   }
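  /* With the example message {2, 3,7, 0,4,6} sketched after the send loop above: nrows = 2,
     nextrow[k] walks the row list {3,7} and nextci[k] walks the running counts {0,4,6}; the
     incoming column indices of a row are then buf_rj[k][*nextci[k] .. *(nextci[k]+1)). */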
377   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
378   rmax = 0;
379   for (i=0; i<pn; i++) {
380     /* add pdt[i,:]*AP into lnk */
381     pnz = pdti[i+1] - pdti[i];
382     ptJ = pdtj + pdti[i];
383     for (j=0; j<pnz; j++) {
384       row  = ptJ[j];  /* row of AP == col of Pt */
385       apnz = api[row+1] - api[row];
386       Jptr = apj + api[row];
387       /* add non-zero cols of AP into the sorted linked list lnk */
388       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
389     }
390 
391     /* add received col data into lnk */
392     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
393       if (i == *nextrow[k]) { /* i-th row */
394         nzi  = *(nextci[k]+1) - *nextci[k];
395         Jptr = buf_rj[k] + *nextci[k];
396         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
397         nextrow[k]++; nextci[k]++;
398       }
399     }
400     nnz = lnk[0];
401 
402     /* if free space is not available, make more free space */
403     if (current_space->local_remaining<nnz) {
404       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
405       nspacedouble++;
406     }
407     /* copy data into free space, then initialize lnk */
408     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
409     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
410 
411     current_space->array           += nnz;
412     current_space->local_used      += nnz;
413     current_space->local_remaining -= nnz;
414 
415     pti[i+1] = pti[i] + nnz;
416     if (nnz > rmax) rmax = nnz;
417   }
418   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
419 
420   ierr      = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
421   ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
422   afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
423   if (afill_tmp > afill) afill = afill_tmp;
424   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
425 
426   ierr = PetscTime(&t3);CHKERRQ(ierr);
427 
428   /* (6) create symbolic parallel matrix Cmpi */
429   /*------------------------------------------*/
430   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
431   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
432   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
433   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
434   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
435   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
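  /* MatPreallocateInitialize/Set/Finalize is the standard PETSc macro triple: it allocates and
     zeroes the dnz/onz counters for the local rows, MatPreallocateSet() splits each row's
     column list into diagonal-block and off-diagonal-block counts by ownership range, and
     MatPreallocateFinalize() frees the counters once MatMPIAIJSetPreallocation() above has
     consumed them. */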
436 
437   merge->bi        = pti;      /* Cseq->i */
438   merge->bj        = ptj;      /* Cseq->j */
439   merge->coi       = coi;      /* Co->i   */
440   merge->coj       = coj;      /* Co->j   */
441   merge->buf_ri    = buf_ri;
442   merge->buf_rj    = buf_rj;
443   merge->owners_co = owners_co;
444   merge->destroy   = Cmpi->ops->destroy;
445   merge->duplicate = Cmpi->ops->duplicate;
446 
447   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
448   Cmpi->assembled        = PETSC_FALSE;
449   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
450   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
451   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_new;
452 
453   /* attach the supporting struct to Cmpi for reuse */
454   c           = (Mat_MPIAIJ*)Cmpi->data;
455   c->ptap     = ptap;
456   ptap->api   = NULL;
457   ptap->apj   = NULL;
458   ptap->rmax  = ap_rmax;
459   *C          = Cmpi;
460 
461   /* flag 'scalable' determines which implementation is used:
462        0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
463        1: do sparse axpy in MatPtAPNumeric() - might be slow, but uses a sparse array apa */
464   /* set default scalable */
465   ptap->scalable = PETSC_FALSE; //PETSC_TRUE;
466 
467   ierr = PetscOptionsGetBool(((PetscObject)(*C))->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
468   if (!ptap->scalable) {  /* Do dense axpy */
469     ierr = PetscCalloc1(P->cmap->N,&ptap->apa);CHKERRQ(ierr);
470   } else {
471     //ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
472   }
473 
474   ierr = PetscTime(&t4);CHKERRQ(ierr);
475   if (rank == 1) printf("PtAPSym: %g + %g + %g + %g = %g\n",t1-t0,t2-t1,t3-t2,t4-t3,t4-t0);
476   PetscFunctionReturn(0);
477 }
478 
479 #undef __FUNCT__
480 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_new"
481 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_new(Mat A,Mat P,Mat C)
482 {
483   PetscErrorCode    ierr;
484   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
485   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
486   Mat_PtAPMPI       *ptap = c->ptap;
487   Mat               AP_loc,C_loc,C_oth;
488   PetscInt          i,rstart,rend,cm,ncols,row;
489   PetscMPIInt       rank;
490   MPI_Comm          comm;
491   const PetscInt    *cols;
492   const PetscScalar *vals;
493   PetscLogDouble    t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
494 
495   PetscFunctionBegin;
496   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
497   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
498 
499   ierr = MatZeroEntries(C);CHKERRQ(ierr);
500 
501   /* 1) get R = Pd^T,Ro = Po^T */
502   ierr = PetscTime(&t0);CHKERRQ(ierr);
503   if (ptap->reuse == MAT_REUSE_MATRIX) {
504     ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
505     ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
506   }
507   ierr = PetscTime(&t1);CHKERRQ(ierr);
508   eR = t1 - t0;
509 
510   /* 2) get AP_loc */
511   AP_loc = ptap->AP_loc;
512   Mat_SeqAIJ    *ap=(Mat_SeqAIJ*)AP_loc->data;
513 
514   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
515   /*-----------------------------------------------------*/
516   if (ptap->reuse == MAT_REUSE_MATRIX) {
517     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
518     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
519     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
520   }
521 
522   /* 2-2) update the numerical values of AP_loc = A_loc*P (needed on reuse) */
523   /*--------------------------------------------------------------*/
524   /* get data from symbolic products */
525   Mat_SeqAIJ  *p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
526   Mat_SeqAIJ  *p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
527   PetscInt    *api,*apj,am = A->rmap->n,j,col,apnz;
528   PetscScalar *apa = ptap->apa;
529   if (ptap->reuse == MAT_REUSE_MATRIX) {
530     api = ap->i;
531     apj = ap->j;
532     for (i=0; i<am; i++) {
533       /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
534       AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
535       apnz = api[i+1] - api[i];
536       for (j=0; j<apnz; j++) {
537         col = apj[j+api[i]];
538         ap->a[j+ap->i[i]] = apa[col];
539         apa[col] = 0.0;
540       }
541     }
542   }
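  /* On reuse, AProw_nonscalable() is expected to accumulate row i of Ad*P_loc + Ao*P_oth into
     the dense work array apa (indexed by global column of P); the loop above then scatters
     those values into ap->a following the symbolic pattern and zeroes apa for the next row. */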
543 
544   ierr = PetscTime(&t2);CHKERRQ(ierr);
545   eAP = t2 - t1;
546 
547   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
548   if (ptap->reuse == MAT_REUSE_MATRIX) {
549     ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Rd,AP_loc,MAT_REUSE_MATRIX,2.0,&ptap->C_loc);CHKERRQ(ierr);
550     ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Ro,AP_loc,MAT_REUSE_MATRIX,2.0,&ptap->C_oth);CHKERRQ(ierr);
551   }
552   C_loc = ptap->C_loc;
553   C_oth = ptap->C_oth;
554   //printf("[%d] Co %d, %d\n", rank,Co->rmap->N,Co->cmap->N);
555   ierr = PetscTime(&t3);CHKERRQ(ierr);
556   eCseq = t3 - t2;
557 
558   /* add C_loc and C_oth to C */
559   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
560 
561   /* C_loc -> C */
562   cm = C_loc->rmap->N;
563   Mat_SeqAIJ *c_seq;
564   c_seq = (Mat_SeqAIJ*)C_loc->data;
565   for (i=0; i<cm; i++) {
566     ncols = c_seq->i[i+1] - c_seq->i[i];
567     row = rstart + i;
568     cols = c_seq->j + c_seq->i[i];
569     vals = c_seq->a + c_seq->i[i];
570     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
571   }
572 
573   /* Co -> C, off-processor part */
574   //printf("[%d] p->B %d, %d\n",rank,p->B->rmap->N,p->B->cmap->N);
575   cm = C_oth->rmap->N;
576   c_seq = (Mat_SeqAIJ*)C_oth->data;
577   for (i=0; i<cm; i++) {
578     ncols = c_seq->i[i+1] - c_seq->i[i];
579     row = p->garray[i];
580     cols = c_seq->j + c_seq->i[i];
581     vals = c_seq->a + c_seq->i[i];
582     //printf("[%d] row[%d] = %d\n",rank,i,row);
583     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
584   }
585   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
586   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
587   ierr = PetscTime(&t4);CHKERRQ(ierr);
588   eCmpi = t4 - t3;
589 
590   if (rank==1) {
591     ierr = PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);CHKERRQ(ierr);
592   }
593   ptap->reuse = MAT_REUSE_MATRIX;
594   PetscFunctionReturn(0);
595 }
596 
597 #undef __FUNCT__
598 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
599 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
600 {
601   PetscErrorCode      ierr;
602   Mat                 Cmpi;
603   Mat_PtAPMPI         *ptap;
604   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
605   Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
606   Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
607   Mat_SeqAIJ          *p_loc,*p_oth;
608   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
609   PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
610   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
611   PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
612   PetscBT             lnkbt;
613   MPI_Comm            comm;
614   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
615   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
616   PetscInt            len,proc,*dnz,*onz,*owners;
617   PetscInt            nzi,*pti,*ptj;
618   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
619   MPI_Request         *swaits,*rwaits;
620   MPI_Status          *sstatus,rstatus;
621   Mat_Merge_SeqsToMPI *merge;
622   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
623   PetscReal           afill=1.0,afill_tmp;
624   PetscInt            rmax;
625 
626   PetscFunctionBegin;
627   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
628 
629   /* check if matrix local sizes are compatible */
630   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
631     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
632   }
633   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
634     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
635   }
636 
637   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
638   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
639 
640   /* create struct Mat_PtAPMPI and attach it to C later */
641   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
642   ierr        = PetscNew(&merge);CHKERRQ(ierr);
643   ptap->merge = merge;
644   ptap->reuse = MAT_INITIAL_MATRIX;
645 
646   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
647   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
648 
649   /* get P_loc by taking all local rows of P */
650   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
651 
652   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
653   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
654   pi_loc = p_loc->i; pj_loc = p_loc->j;
655   pi_oth = p_oth->i; pj_oth = p_oth->j;
656 
657   /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
658   /*--------------------------------------------------------------------------*/
659   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
660   api[0] = 0;
661 
662   /* create and initialize a linked list */
663   ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);
664 
665   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)); this ran out of memory (OOM) for ex56 at np=8k on Intrepid! */
666   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
667 
668   current_space = free_space;
669 
670   for (i=0; i<am; i++) {
671     /* diagonal portion of A */
672     nzi = adi[i+1] - adi[i];
673     aj  = ad->j + adi[i];
674     for (j=0; j<nzi; j++) {
675       row  = aj[j];
676       pnz  = pi_loc[row+1] - pi_loc[row];
677       Jptr = pj_loc + pi_loc[row];
678       /* add non-zero cols of P into the sorted linked list lnk */
679       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
680     }
681     /* off-diagonal portion of A */
682     nzi = aoi[i+1] - aoi[i];
683     aj  = ao->j + aoi[i];
684     for (j=0; j<nzi; j++) {
685       row  = aj[j];
686       pnz  = pi_oth[row+1] - pi_oth[row];
687       Jptr = pj_oth + pi_oth[row];
688       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
689     }
690     apnz     = lnk[0];
691     api[i+1] = api[i] + apnz;
692     if (ap_rmax < apnz) ap_rmax = apnz;
693 
694     /* if free space is not available, double the total space in the list */
695     if (current_space->local_remaining<apnz) {
696       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
697       nspacedouble++;
698     }
699 
700     /* Copy data into free space, then initialize lnk */
701     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
702 
703     current_space->array           += apnz;
704     current_space->local_used      += apnz;
705     current_space->local_remaining -= apnz;
706   }
707 
708   /* Allocate space for apj, initialize apj, and */
709   /* destroy list of free space and other temporary array(s) */
710   ierr      = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr);
711   ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
712   afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
713   if (afill_tmp > afill) afill = afill_tmp;
714 
715   /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
716   /*-----------------------------------------------------------------*/
717   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
718 
719   /* then, compute symbolic Co = (p->B)^T*AP */
720   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
721                          >= (num of nonzero rows of C_seq) - pn */
722   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
723   coi[0] = 0;
724 
725   /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
726   nnz           = fill*(poti[pon] + api[am]);
727   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
728   current_space = free_space;
729 
730   for (i=0; i<pon; i++) {
731     pnz = poti[i+1] - poti[i];
732     ptJ = potj + poti[i];
733     for (j=0; j<pnz; j++) {
734       row  = ptJ[j]; /* row of AP == col of Pot */
735       apnz = api[row+1] - api[row];
736       Jptr = apj + api[row];
737       /* add non-zero cols of AP into the sorted linked list lnk */
738       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
739     }
740     nnz = lnk[0];
741 
742     /* If free space is not available, double the total space in the list */
743     if (current_space->local_remaining<nnz) {
744       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
745       nspacedouble++;
746     }
747 
748     /* Copy data into free space, and zero out denserows */
749     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
750 
751     current_space->array           += nnz;
752     current_space->local_used      += nnz;
753     current_space->local_remaining -= nnz;
754 
755     coi[i+1] = coi[i] + nnz;
756   }
757 
758   ierr      = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
759   ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
760   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
761   if (afill_tmp > afill) afill = afill_tmp;
762   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
763 
764   /* (3) send j-array (coj) of Co to other processors */
765   /*--------------------------------------------------*/
766   ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
767   len_s        = merge->len_s;
768   merge->nsend = 0;
769 
770 
771   /* determine row ownership */
772   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
773   merge->rowmap->n  = pn;
774   merge->rowmap->bs = 1;
775 
776   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
777   owners = merge->rowmap->range;
778 
779   /* determine the number of messages to send, their lengths */
780   ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
781   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
782   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
783 
784   proc = 0;
785   for (i=0; i<pon; i++) {
786     while (prmap[i] >= owners[proc+1]) proc++;
787     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
788     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
789   }
790 
791   len          = 0; /* max length of buf_si[], see (4) */
792   owners_co[0] = 0;
793   for (proc=0; proc<size; proc++) {
794     owners_co[proc+1] = owners_co[proc] + len_si[proc];
795     if (len_s[proc]) {
796       merge->nsend++;
797       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
798       len         += len_si[proc];
799     }
800   }
801 
802   /* determine the number and length of messages to receive for coi and coj  */
803   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
804   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
805 
806   /* post the Irecv and Isend of coj */
807   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
808   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
809   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
810   for (proc=0, k=0; proc<size; proc++) {
811     if (!len_s[proc]) continue;
812     i    = owners_co[proc];
813     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
814     k++;
815   }
816 
817   /* receives and sends of coj are complete */
818   for (i=0; i<merge->nrecv; i++) {
819     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
820   }
821   ierr = PetscFree(rwaits);CHKERRQ(ierr);
822   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
823 
824   /* (4) send and recv coi */
825   /*-----------------------*/
826   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
827   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
828   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
829   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
830   for (proc=0,k=0; proc<size; proc++) {
831     if (!len_s[proc]) continue;
832     /* form outgoing message for i-structure:
833          buf_si[0]:                 nrows to be sent
834                [1:nrows]:           row index (local to the receiving proc)
835                [nrows+1:2*nrows+1]: i-structure index
836     */
837     /*-------------------------------------------*/
838     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
839     buf_si_i    = buf_si + nrows+1;
840     buf_si[0]   = nrows;
841     buf_si_i[0] = 0;
842     nrows       = 0;
843     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
844       nzi = coi[i+1] - coi[i];
845       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
846       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
847       nrows++;
848     }
849     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
850     k++;
851     buf_si += len_si[proc];
852   }
853   i = merge->nrecv;
854   while (i--) {
855     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
856   }
857   ierr = PetscFree(rwaits);CHKERRQ(ierr);
858   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
859 
860   ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
861   ierr = PetscFree(len_ri);CHKERRQ(ierr);
862   ierr = PetscFree(swaits);CHKERRQ(ierr);
863   ierr = PetscFree(buf_s);CHKERRQ(ierr);
864 
865   /* (5) compute the local portion of C (mpi mat) */
866   /*----------------------------------------------*/
867   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
868 
869   /* allocate pti array and free space for accumulating nonzero column info */
870   ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
871   pti[0] = 0;
872 
873   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
874   nnz           = fill*(pi_loc[pm] + api[am]);
875   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
876   current_space = free_space;
877 
878   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
879   for (k=0; k<merge->nrecv; k++) {
880     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
881     nrows       = *buf_ri_k[k];
882     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
883     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
884   }
885   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
886   rmax = 0;
887   for (i=0; i<pn; i++) {
888     /* add pdt[i,:]*AP into lnk */
889     pnz = pdti[i+1] - pdti[i];
890     ptJ = pdtj + pdti[i];
891     for (j=0; j<pnz; j++) {
892       row  = ptJ[j];  /* row of AP == col of Pt */
893       apnz = api[row+1] - api[row];
894       Jptr = apj + api[row];
895       /* add non-zero cols of AP into the sorted linked list lnk */
896       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
897     }
898 
899     /* add received col data into lnk */
900     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
901       if (i == *nextrow[k]) { /* i-th row */
902         nzi  = *(nextci[k]+1) - *nextci[k];
903         Jptr = buf_rj[k] + *nextci[k];
904         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
905         nextrow[k]++; nextci[k]++;
906       }
907     }
908     nnz = lnk[0];
909 
910     /* if free space is not available, make more free space */
911     if (current_space->local_remaining<nnz) {
912       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
913       nspacedouble++;
914     }
915     /* copy data into free space, then initialize lnk */
916     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
917     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
918 
919     current_space->array           += nnz;
920     current_space->local_used      += nnz;
921     current_space->local_remaining -= nnz;
922 
923     pti[i+1] = pti[i] + nnz;
924     if (nnz > rmax) rmax = nnz;
925   }
926   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
927   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
928 
929   ierr      = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
930   ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
931   afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
932   if (afill_tmp > afill) afill = afill_tmp;
933   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
934 
935   /* (6) create symbolic parallel matrix Cmpi */
936   /*------------------------------------------*/
937   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
938   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
939   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
940   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
941   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
942   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
943 
944   merge->bi        = pti;      /* Cseq->i */
945   merge->bj        = ptj;      /* Cseq->j */
946   merge->coi       = coi;      /* Co->i   */
947   merge->coj       = coj;      /* Co->j   */
948   merge->buf_ri    = buf_ri;
949   merge->buf_rj    = buf_rj;
950   merge->owners_co = owners_co;
951   merge->destroy   = Cmpi->ops->destroy;
952   merge->duplicate = Cmpi->ops->duplicate;
953 
954   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
955   Cmpi->assembled      = PETSC_FALSE;
956   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
957   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
958 
959   /* attach the supporting struct to Cmpi for reuse */
960   c           = (Mat_MPIAIJ*)Cmpi->data;
961   c->ptap     = ptap;
962   ptap->api   = api;
963   ptap->apj   = apj;
964   ptap->rmax  = ap_rmax;
965   *C          = Cmpi;
966 
967   /* flag 'scalable' determines which implementation is used:
968        0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
969        1: do sparse axpy in MatPtAPNumeric() - might be slow, but uses a sparse array apa */
970   /* set default scalable */
971   ptap->scalable = PETSC_FALSE; //PETSC_TRUE;
972 
973   ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
974   if (!ptap->scalable) {  /* Do dense axpy */
975     ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
976   } else {
977     ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
978   }
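  /* Either variant can be forced at runtime, e.g. with the hypothetical command line

         mpiexec -n 8 ./app -matptap_scalable

     which makes the numeric phase use the sparse work array of length ap_rmax+1 instead of the
     dense array of length pN. */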
979 
980 #if defined(PETSC_USE_INFO)
981   if (pti[pn] != 0) {
982     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
983     ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
984   } else {
985     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
986   }
987 #endif
988   PetscFunctionReturn(0);
989 }
990 
991 #undef __FUNCT__
992 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
993 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
994 {
995   PetscErrorCode      ierr;
996   Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
997   Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
998   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
999   Mat_SeqAIJ          *p_loc,*p_oth;
1000   Mat_PtAPMPI         *ptap;
1001   Mat_Merge_SeqsToMPI *merge;
1002   PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
1003   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
1004   PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
1005   MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
1006   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1007   MPI_Comm            comm;
1008   PetscMPIInt         size,rank,taga,*len_s;
1009   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1010   PetscInt            **buf_ri,**buf_rj;
1011   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1012   MPI_Request         *s_waits,*r_waits;
1013   MPI_Status          *status;
1014   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1015   PetscInt            *api,*apj,*coi,*coj;
1016   PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
1017   PetscBool           scalable;
1018 #if defined(PTAP_PROFILE)
1019   PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
1020 #endif
1021 
1022   PetscFunctionBegin;
1023   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1024 #if defined(PTAP_PROFILE)
1025   ierr = PetscTime(&t0);CHKERRQ(ierr);
1026 #endif
1027   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1028   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1029 
1030   ptap = c->ptap;
1031   if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
1032   merge    = ptap->merge;
1033   apa      = ptap->apa;
1034   scalable = ptap->scalable;
1035 
1036   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1037   /*-----------------------------------------------------*/
1038   if (ptap->reuse == MAT_INITIAL_MATRIX) {
1039     /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
1040     ptap->reuse = MAT_REUSE_MATRIX;
1041   } else { /* update numerical values of P_oth and P_loc */
1042     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1043     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
1044   }
1045 #if defined(PTAP_PROFILE)
1046   ierr = PetscTime(&t1);CHKERRQ(ierr);
1047   eP = t1-t0;
1048 #endif
1049   /*
1050   printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
1051          a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
1052          ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
1053          ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
1054    */
1055 
1056   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
1057   /*--------------------------------------------------------------*/
1058   /* get data from symbolic products */
1059   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1060   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1061   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a;
1062   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1063 
1064   coi  = merge->coi; coj = merge->coj;
1065   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1066 
1067   bi     = merge->bi; bj = merge->bj;
1068   owners = merge->rowmap->range;
1069   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);  /* ba: Cseq->a */
1070 
1071   api = ptap->api; apj = ptap->apj;
1072 
1073   if (!scalable) { /* Do dense axpy on apa (length pN, stores A[i,:]*P) - nonscalable, but faster (can take about 1/3 of the time of the scalable path) */
1074     ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
1075 #if 0
1076     /* ------ 10x slower -------------- */
1077     /*==================================*/
1078     Mat         R = ptap->R;
1079     Mat_SeqAIJ  *r = (Mat_SeqAIJ*)R->data;
1080     PetscInt    *ri=r->i,*rj=r->j,rnz,arow,l,prow,pcol,pN=P->cmap->N;
1081     PetscScalar *ra=r->a,tmp,cdense[pN];
1082 
1083     ierr = PetscMemzero(cdense,pN*sizeof(PetscScalar));CHKERRQ(ierr);
1084     for (i=0; i<cm; i++) { /* each row of C or R */
1085       rnz = ri[i+1] - ri[i];
1086 
1087       for (j=0; j<rnz; j++) { /* each nz of R */
1088         arow = rj[ri[i] + j];
1089 
1090         /* diagonal portion of A */
1091         anz  = ad->i[arow+1] - ad->i[arow];
1092         for (k=0; k<anz; k++) { /* each nz of Ad */
1093           tmp  = ra[ri[i] + j]*ad->a[ad->i[arow] + k];
1094           prow = ad->j[ad->i[arow] + k];
1095           pnz  = pi_loc[prow+1] - pi_loc[prow];
1096 
1097           for (l=0; l<pnz; l++) { /* each nz of P_loc */
1098             pcol = pj_loc[pi_loc[prow] + l];
1099             cdense[pcol] += tmp*pa_loc[pi_loc[prow] + l];
1100           }
1101         }
1102 
1103         /* off-diagonal portion of A */
1104         anz  = ao->i[arow+1] - ao->i[arow];
1105         for (k=0; k<anz; k++) { /* each nz of Ao */
1106           tmp  = ra[ri[i] + j]*ao->a[ao->i[arow] + k];
1107           prow = ao->j[ao->i[arow] + k];
1108           pnz  = pi_oth[prow+1] - pi_oth[prow];
1109 
1110           for (l=0; l<pnz; l++) { /* each nz of P_oth */
1111             pcol = pj_oth[pi_oth[prow] + l];
1112             cdense[pcol] += tmp*pa_oth[pi_oth[prow] + l];
1113           }
1114         }
1115 
1116       } //for (j=0; j<rnz; j++)
1117 
1118       /* copy cdense[] into ca; zero cdense[] */
1119       cnz = bi[i+1] - bi[i];
1120       cj  = bj + bi[i];
1121       ca  = ba + bi[i];
1122       for (j=0; j<cnz; j++) {
1123         ca[j] += cdense[cj[j]];
1124         cdense[cj[j]] = 0.0;
1125       }
1126 #if 0
1127       if (rank == 0) {
1128         printf("[%d] row %d: ",rank,i);
1129         for (j=0; j<pN; j++) printf(" %g,",cdense[j]);
1130         printf("\n");
1131       }
1132       for (j=0; j<pN; j++) cdense[j]=0.0; // zero cdense[]
1133 #endif
1134     } //for (i=0; i<cm; i++) {
1135 #endif
1136 
1137     //==========================================
1138 
1139     ierr = PetscTime(&t1);CHKERRQ(ierr);
1140     for (i=0; i<am; i++) {
1141 #if defined(PTAP_PROFILE)
1142       ierr = PetscTime(&t2_0);CHKERRQ(ierr);
1143 #endif
1144       /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
1145       /*------------------------------------------------------------*/
1146       apJ = apj + api[i];
1147 
1148       /* diagonal portion of A */
1149       anz = adi[i+1] - adi[i];
1150       adj = ad->j + adi[i];
1151       ada = ad->a + adi[i];
1152       for (j=0; j<anz; j++) {
1153         row = adj[j];
1154         pnz = pi_loc[row+1] - pi_loc[row];
1155         pj  = pj_loc + pi_loc[row];
1156         pa  = pa_loc + pi_loc[row];
1157 
1158         /* perform dense axpy */
1159         valtmp = ada[j];
1160         for (k=0; k<pnz; k++) {
1161           apa[pj[k]] += valtmp*pa[k];
1162         }
1163         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1164       }
1165 
1166       /* off-diagonal portion of A */
1167       anz = aoi[i+1] - aoi[i];
1168       aoj = ao->j + aoi[i];
1169       aoa = ao->a + aoi[i];
1170       for (j=0; j<anz; j++) {
1171         row = aoj[j];
1172         pnz = pi_oth[row+1] - pi_oth[row];
1173         pj  = pj_oth + pi_oth[row];
1174         pa  = pa_oth + pi_oth[row];
1175 
1176         /* perform dense axpy */
1177         valtmp = aoa[j];
1178         for (k=0; k<pnz; k++) {
1179           apa[pj[k]] += valtmp*pa[k];
1180         }
1181         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1182       }
1183 #if defined(PTAP_PROFILE)
1184       ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
1185       et2_AP += t2_1 - t2_0;
1186 #endif
1187 
1188       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1189       /*--------------------------------------------------------------*/
1190       apnz = api[i+1] - api[i];
1191       /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
1192       pnz = po->i[i+1] - po->i[i];
1193       poJ = po->j + po->i[i];
1194       pA  = po->a + po->i[i];
1195       for (j=0; j<pnz; j++) {
1196         row = poJ[j];
1197         cnz = coi[row+1] - coi[row];
1198         cj  = coj + coi[row];
1199         ca  = coa + coi[row];
1200         /* perform dense axpy */
1201         valtmp = pA[j];
1202         for (k=0; k<cnz; k++) {
1203           ca[k] += valtmp*apa[cj[k]];
1204         }
1205         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1206       }
1207 #if 1
1208       /* put the value into Cd (diagonal part) */
1209       pnz = pd->i[i+1] - pd->i[i];
1210       pdJ = pd->j + pd->i[i];
1211       pA  = pd->a + pd->i[i];
1212       for (j=0; j<pnz; j++) {
1213         row = pdJ[j];
1214         cnz = bi[row+1] - bi[row];
1215         cj  = bj + bi[row];
1216         ca  = ba + bi[row];
1217         /* perform dense axpy */
1218         valtmp = pA[j];
1219         for (k=0; k<cnz; k++) {
1220           ca[k] += valtmp*apa[cj[k]];
1221         }
1222         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1223       }
1224 #endif
1225       /* zero the current row of A*P */
1226       for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
1227 #if defined(PTAP_PROFILE)
1228       ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
1229       ePtAP += t2_2 - t2_1;
1230 #endif
1231     }
1232 
1233     if (rank == 100) {
1234       for (row=0; row<cm; row++) {
1235         printf("[%d] row %d: ",rank,row);
1236         cnz = bi[row+1] - bi[row];
1237         for (j=0; j<cnz; j++) printf(" %g,",ba[bi[row]+j]);
1238         printf("\n");
1239       }
1240     }
1241 
1242   } else { /* Do sparse axpy on apa (length of ap_rmax, stores A[i,:]*P) - scalable, but slower */
1243     ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
1244     /*-----------------------------------------------------------------------------------------*/
1245     pA=pa_loc;
1246     for (i=0; i<am; i++) {
1247 #if defined(PTAP_PROFILE)
1248       ierr = PetscTime(&t2_0);CHKERRQ(ierr);
1249 #endif
1250       /* form i-th sparse row of A*P */
1251       apnz = api[i+1] - api[i];
1252       apJ  = apj + api[i];
1253       /* diagonal portion of A */
1254       anz = adi[i+1] - adi[i];
1255       adj = ad->j + adi[i];
1256       ada = ad->a + adi[i];
1257       for (j=0; j<anz; j++) {
1258         row    = adj[j];
1259         pnz    = pi_loc[row+1] - pi_loc[row];
1260         pj     = pj_loc + pi_loc[row];
1261         pa     = pa_loc + pi_loc[row];
1262         valtmp = ada[j];
1263         nextp  = 0;
1264         for (k=0; nextp<pnz; k++) {
1265           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1266             apa[k] += valtmp*pa[nextp++];
1267           }
1268         }
1269         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1270       }
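      /* The merge scans above and below rely on apJ (columns of AP[i,:]) and pj (columns of
         the current P row) both being sorted increasingly, and on every column of that P row
         also appearing in AP[i,:] (the symbolic phase built AP[i,:] as the union of exactly
         these P rows), so each scan stops once nextp reaches pnz. */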
1271       /* off-diagonal portion of A */
1272       anz = aoi[i+1] - aoi[i];
1273       aoj = ao->j + aoi[i];
1274       aoa = ao->a + aoi[i];
1275       for (j=0; j<anz; j++) {
1276         row    = aoj[j];
1277         pnz    = pi_oth[row+1] - pi_oth[row];
1278         pj     = pj_oth + pi_oth[row];
1279         pa     = pa_oth + pi_oth[row];
1280         valtmp = aoa[j];
1281         nextp  = 0;
1282         for (k=0; nextp<pnz; k++) {
1283           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1284             apa[k] += valtmp*pa[nextp++];
1285           }
1286         }
1287         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1288       }
1289 #if defined(PTAP_PROFILE)
1290       ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
1291       et2_AP += t2_1 - t2_0;
1292 #endif
1293 
1294       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1295       /*--------------------------------------------------------------*/
1296       pnz = pi_loc[i+1] - pi_loc[i];
1297       pJ  = pj_loc + pi_loc[i];
1298       for (j=0; j<pnz; j++) {
1299         nextap = 0;
1300         row    = pJ[j]; /* global index */
1301         if (row < pcstart || row >=pcend) { /* put the value into Co */
1302           row = *poJ;
1303           cj  = coj + coi[row];
1304           ca  = coa + coi[row]; poJ++;
1305         } else {                            /* put the value into Cd */
1306           row = *pdJ;
1307           cj  = bj + bi[row];
1308           ca  = ba + bi[row]; pdJ++;
1309         }
1310         valtmp = pA[j];
1311         for (k=0; nextap<apnz; k++) {
1312           if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
1313         }
1314         ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
1315       }
1316       pA += pnz;
1317       /* zero the current row info for A*P */
1318       ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
1319 #if defined(PTAP_PROFILE)
1320       ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
1321       ePtAP += t2_2 - t2_1;
1322 #endif
1323     }
1324   }
1325 #if defined(PTAP_PROFILE)
1326   ierr = PetscTime(&t2);CHKERRQ(ierr);
1327 #endif
1328 
1329   /* 3) send and recv matrix values coa */
1330   /*------------------------------------*/
1331   buf_ri = merge->buf_ri;
1332   buf_rj = merge->buf_rj;
1333   len_s  = merge->len_s;
1334   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1335   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1336 
1337   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1338   for (proc=0,k=0; proc<size; proc++) {
1339     if (!len_s[proc]) continue;
1340     i    = merge->owners_co[proc];
1341     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1342     k++;
1343   }
1344   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1345   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1346 
1347   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1348   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1349   ierr = PetscFree(coa);CHKERRQ(ierr);
1350 #if defined(PTAP_PROFILE)
1351   ierr = PetscTime(&t3);CHKERRQ(ierr);
1352 #endif
1353 
1354   /* 4) insert local Cseq and received values into Cmpi */
1355   /*------------------------------------------------------*/
1356   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1357   for (k=0; k<merge->nrecv; k++) {
1358     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1359     nrows       = *(buf_ri_k[k]);
1360     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1361     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1362   }
1363 
1364   for (i=0; i<cm; i++) {
1365     row  = owners[rank] + i; /* global row index of C_seq */
1366     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1367     ba_i = ba + bi[i];
1368     bnz  = bi[i+1] - bi[i];
1369     /* add received vals into ba */
1370     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1371       /* i-th row */
1372       if (i == *nextrow[k]) {
1373         cnz    = *(nextci[k]+1) - *nextci[k];
1374         cj     = buf_rj[k] + *(nextci[k]);
1375         ca     = abuf_r[k] + *(nextci[k]);
1376         nextcj = 0;
1377         for (j=0; nextcj<cnz; j++) {
1378           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1379             ba_i[j] += ca[nextcj++];
1380           }
1381         }
1382         nextrow[k]++; nextci[k]++;
1383         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1384       }
1385     }
1386     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1387   }
1388   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1389   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1390 
1391   ierr = PetscFree(ba);CHKERRQ(ierr);
1392   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1393   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1394   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1395 #if defined(PTAP_PROFILE)
1396   ierr = PetscTime(&t4);CHKERRQ(ierr);
1397   if (rank==1) {
1398     ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
1399   }
1400 #endif
1401   PetscFunctionReturn(0);
1402 }
1403