xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 1a47ec5458ed0a0c3ef4c772e1bac0b9f700f9be)

/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/
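
/*
  Illustrative usage sketch (disabled; not part of this file): how user code
  reaches the kernels below.  MatPtAP() dispatches here when A and P are
  MATMPIAIJ.  The fill value 2.0 is an arbitrary example, and the assembly of
  A and P is assumed to happen elsewhere.
*/
#if 0
static char help[] = "Sketch: form C = P^T*A*P.\n";
#include <petscmat.h>
int main(int argc,char **args)
{
  Mat            A,P,C;
  PetscErrorCode ierr;

  PetscInitialize(&argc,&args,(char*)0,help);
  /* ... create and assemble A (MATMPIAIJ, n x n) and P (n x m) ... */
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr); /* symbolic + numeric */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);   /* numeric only, after values of A or P change */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}
#endif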

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>

#define PTAP_PROFILE

extern PetscErrorCode MatDestroy_MPIAIJ(Mat);
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap=a->ptap;

  PetscFunctionBegin;
  if (ptap) {
    Mat_Merge_SeqsToMPI *merge=ptap->merge;
    ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
    ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */

    ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);

    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree2(ptap->apj,ptap->apv);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);

    /* api and apj were already freed and zeroed above by PetscFree()/PetscFree2() */
    if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}
    if (merge) {
      ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
      ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
      ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
      ierr = PetscFree(merge->bi);CHKERRQ(ierr);
      ierr = PetscFree(merge->bj);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
      ierr = PetscFree(merge->coi);CHKERRQ(ierr);
      ierr = PetscFree(merge->coj);CHKERRQ(ierr);
      ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
      ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
      ierr = merge->destroy(A);CHKERRQ(ierr);
      ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
    } else {
      ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
    }
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatPtAP"
PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a     = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI         *ptap  = a->ptap;
  Mat_Merge_SeqsToMPI *merge = ptap->merge;

  PetscFunctionBegin;
  ierr = (*merge->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = merge->destroy;
  (*M)->ops->duplicate = merge->duplicate;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatPtAP_new"
PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP_new(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
  (*M)->ops->destroy   = ptap->destroy;
  (*M)->ops->duplicate = ptap->duplicate;
  PetscFunctionReturn(0);
}
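
/*
  The duplicate/destroy routines above follow a save-and-override pattern:
  the symbolic phase stashes the matrix's original ops->destroy and
  ops->duplicate in the Mat_PtAPMPI (or Mat_Merge_SeqsToMPI) struct, installs
  its own wrappers, and the wrappers chain back through the saved pointers.
  A minimal generic sketch of the pattern, with hypothetical names unrelated
  to PETSc:
*/
#if 0
typedef struct {
  int (*orig_destroy)(void*);    /* saved function pointer */
} Hook;

static Hook hook;
static int my_destroy(void *obj)
{
  /* ... release the extra data attached to obj ... */
  return hook.orig_destroy(obj); /* then chain to the original */
}
/* install:  hook.orig_destroy = obj->destroy;  obj->destroy = my_destroy; */
#endif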

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      newalg=PETSC_FALSE;
  MPI_Comm       comm;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = PetscOptionsGetBool(NULL,"-matptap_new",&newalg,NULL);CHKERRQ(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    if (newalg) {
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(A,P,fill,C);CHKERRQ(ierr);
    } else {
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
    }
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  if (newalg) {
    ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ_new(A,P,*C);CHKERRQ(ierr);
  } else {
    ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
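
/*
  A minimal sketch of how the -matptap_new switch above would be exercised:
  either on the command line (./app -matptap_new) or programmatically before
  MatPtAP() is called, via PetscOptionsSetValue().  Illustrative fragment only.
*/
#if 0
ierr = PetscOptionsSetValue("-matptap_new","1");CHKERRQ(ierr); /* select the *_new kernels */
ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
#endif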

/* requires array of size pN, thus nonscalable version */
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_new"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  Mat_PtAPMPI    *ptap;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_MPIAIJ     *c;
  MPI_Comm       comm;
  PetscMPIInt    size,rank;
#if defined(PTAP_PROFILE)
  PetscLogDouble      t0,t1,t11,t12,t2,t3,t4;
#endif
  Mat                 Cmpi;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n;
  PetscInt            *lnk,i,k,pnz,row,nnz;
  PetscInt            pN=P->cmap->N,pn=P->cmap->n;
  PetscBT             lnkbt;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscInt            nsend,nrecv;
  PetscMPIInt         *len_r,*id_r;    /* ids and lengths of the messages to receive */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0;
  PetscInt            rmax,*aj,*ai,*pi;
  Mat_SeqAIJ          *p_loc,*p_oth,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscReal           apfill;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t11);CHKERRQ(ierr);
#endif

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list - nonscalable version */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(ad->i[am]+ao->i[am]+p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;
  nspacedouble  = 0;

  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    ai  = ao->i; pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_oth->j + pi[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr   = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr   = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);

  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t12);CHKERRQ(ierr);
#endif

  /* (2) compute symbolic C_loc = Rd*AP_loc, Co = Ro*AP_loc  */
  /* ------------------------------------------------------- */
  ierr  = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr);
  ierr  = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr);
  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = nrecv;
  while (i--) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif
  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* scratch space of length pN to hold the column indices of one row of Cmpi (non-scalable version) */
  ierr          = PetscFreeSpaceGet(pN,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);

  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* (6) create symbolic parallel matrix Cmpi */
  /*------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap         = ptap;
  ptap->api       = api;
  ptap->apj       = apj;
  ptap->apv       = apv;
  ptap->rmax      = ap_rmax;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ_new() */
  ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP_new;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_new;
  *C                     = Cmpi;

#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank == 1) {
    ierr = PetscPrintf(PETSC_COMM_SELF,"PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t12-t11,t1-t12,t2-t1,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
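
/*
  A minimal sketch (disabled) of the symbolic-row pattern used above, using
  only the PetscLLCondensed* calls as they appear in this file: merge several
  sorted index sets into one sorted row, read the count from lnk[0], then
  flush the row into a user buffer.  The arrays rowa/rowb/buf are local to
  the sketch.
*/
#if 0
{
  PetscInt lnk_buf[16],*lnk,nnz;
  PetscBT  lnkbt;
  PetscInt N      = 16;                      /* global column count */
  PetscInt rowa[] = {0,3,5},rowb[] = {3,7};  /* two sorted index sets */

  ierr = PetscLLCondensedCreate(N,N,&lnk,&lnkbt);CHKERRQ(ierr);
  ierr = PetscLLCondensedAddSorted(3,rowa,lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscLLCondensedAddSorted(2,rowb,lnk,lnkbt);CHKERRQ(ierr);
  nnz  = lnk[0];                                                        /* 4: union is {0,3,5,7} */
  ierr = PetscLLCondensedClean(N,nnz,lnk_buf,lnk,lnkbt);CHKERRQ(ierr);  /* lnk_buf = {0,3,5,7}; lnk reset */
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
}
#endif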

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_new"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_new(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
  Mat_PtAPMPI       *ptap = c->ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;
#if defined(PTAP_PROFILE)
  PetscMPIInt       rank;
  MPI_Comm          comm;
  PetscLogDouble    t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
#endif

  PetscFunctionBegin;
  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T, Ro = Po^T */
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
  eR   = t1 - t0;
#endif

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*------------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  apa   = ptap->apa;
  api   = ptap->api;
  apj   = ptap->apj;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad[i,:]*P_loc + Ao[i,:]*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
  eAP  = t2 - t1;
#endif

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr  = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr  = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;
#if defined(PTAP_PROFILE)
  ierr  = PetscTime(&t3);CHKERRQ(ierr);
  eCseq = t3 - t2;
#endif

  /* add C_loc and C_oth into C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = rstart + i;
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr  = PetscTime(&t4);CHKERRQ(ierr);
  eCmpi = t4 - t3;

  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (rank == 1) {
    ierr = PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n",eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);CHKERRQ(ierr);
  }
#endif
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
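
/*
  A minimal self-contained sketch (disabled) of the kind of computation
  AProw_nonscalable() performs above: one sparse row a[i,:] times a CSR matrix
  P, accumulated into a dense work array of length pN and later compressed
  against the symbolic pattern.  All names are local to the sketch.
*/
#if 0
static void sparse_row_times_csr(PetscInt anz,const PetscInt *acols,const PetscScalar *avals,
                                 const PetscInt *pi,const PetscInt *pj,const PetscScalar *pa,
                                 PetscScalar *dense /* length pN, zeroed on entry */)
{
  PetscInt j,k;
  for (j=0; j<anz; j++) {                      /* for each nonzero a[i,row] */
    PetscInt    row = acols[j];
    PetscScalar v   = avals[j];
    for (k=pi[row]; k<pi[row+1]; k++) dense[pj[k]] += v*pa[k]; /* dense axpy with P[row,:] */
  }
}
#endif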

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*pti,*ptj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
  PetscReal           afill=1.0,afill_tmp;
  PetscInt            rmax;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ierr        = PetscNew(&merge);CHKERRQ(ierr);
  ptap->merge = merge;
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
  /*--------------------------------------------------------------------------*/
  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;

  /* create and initialize a linked list */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) - OOM for ex56, np=8k on Intrepid! */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    aj  = ad->j + adi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    aj  = ao->j + aoi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr      = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj) */
  /*------------------------------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* then, compute symbolic Co = (p->B)^T*AP */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                               >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
  nnz           = fill*(poti[pon] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pot */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr      = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* (3) send j-array (coj) of Co to other processors */
  /*--------------------------------------------------*/
  ierr         = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
  len_s        = merge->len_s;
  merge->nsend = 0;

  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  for (i=0; i<merge->nrecv; i++) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of C (mpi mat) */
  /*----------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);

  /* allocate pti array and free space for accumulating nonzero column info */
  ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
  pti[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
  nnz           = fill*(pi_loc[pm] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j];  /* row of AP == col of Pt */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    pti[i+1] = pti[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr      = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* (6) create symbolic parallel matrix Cmpi */
  /*------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  merge->bi        = pti;      /* Cseq->i */
  merge->bj        = ptj;      /* Cseq->j */
  merge->coi       = coi;      /* Co->i   */
  merge->coj       = coj;      /* Co->j   */
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled      = PETSC_FALSE;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c          = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap    = ptap;
  ptap->api  = api;
  ptap->apj  = apj;
  ptap->rmax = ap_rmax;
  *C         = Cmpi;

  /* flag 'scalable' determines which implementation to use:
       0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
       1: do sparse axpy in MatPtAPNumeric() - may be slower, uses a sparse array apa of length ap_rmax+1 */
  /* default: nonscalable */
  ptap->scalable = PETSC_FALSE;

  ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
  if (!ptap->scalable) {  /* Do dense axpy */
    ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
  } else {
    ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_INFO)
  if (pti[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
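
/*
  A minimal sketch (disabled) of the i-structure message layout built in
  step (4) above: for each destination, buf_si packs [nrows, receiver-local
  row indices, row-offset table], so the receiver can walk its incoming rows
  with this one integer buffer plus the separately sent coj/coa arrays.  The
  fragment below is illustrative only; rows[], rownz[], rowlist and offsets
  are hypothetical names local to the sketch.
*/
#if 0
/* pack: nrows rows with receiver-local indices rows[] and nonzero counts rownz[] */
buf_si[0]   = nrows;
buf_si_i    = buf_si + nrows + 1;  /* offset table lives after the row list */
buf_si_i[0] = 0;
for (r=0; r<nrows; r++) {
  buf_si[r+1]   = rows[r];
  buf_si_i[r+1] = buf_si_i[r] + rownz[r];
}
/* unpack on the receiver: */
nrows   = buf_si[0];
rowlist = buf_si + 1;              /* nrows receiver-local row indices  */
offsets = buf_si + nrows + 1;      /* nrows+1 offsets into the coj data */
#endif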

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  Mat_PtAPMPI         *ptap;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
  PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
  MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
  PetscInt            am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *api,*apj,*coi,*coj;
  PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
  PetscBool           scalable;
#if defined(PTAP_PROFILE)
  PetscLogDouble      t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap = c->ptap;
  if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
  merge    = ptap->merge;
  apa      = ptap->apa;
  scalable = ptap->scalable;

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_INITIAL_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
    ptap->reuse = MAT_REUSE_MATRIX;
  } else { /* update numerical values of P_oth and P_loc */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
  eP   = t1-t0;
#endif
  /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j; pJ = pj_loc; pa_loc = p_loc->a;
  pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;

  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);  /* ba: Cseq->a */

  api = ptap->api; apj = ptap->apj;

  if (!scalable) { /* Do dense axpy on apa (length of pN, stores A[i,:]*P) - nonscalable, but faster (could take 1/3 scalable time) */
    ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
#if 0
    /* disabled alternative (measured ~10x slower) */
    Mat         R = ptap->R;
    Mat_SeqAIJ  *r = (Mat_SeqAIJ*)R->data;
    PetscInt    *ri=r->i,*rj=r->j,rnz,arow,l,prow,pcol,pN=P->cmap->N;
    PetscScalar *ra=r->a,tmp,cdense[pN];

    ierr = PetscMemzero(cdense,pN*sizeof(PetscScalar));CHKERRQ(ierr);
    for (i=0; i<cm; i++) { /* each row of C or R */
      rnz = ri[i+1] - ri[i];

      for (j=0; j<rnz; j++) { /* each nz of R */
        arow = rj[ri[i] + j];

        /* diagonal portion of A */
        anz = ad->i[arow+1] - ad->i[arow];
        for (k=0; k<anz; k++) { /* each nz of Ad */
          tmp  = ra[ri[i] + j]*ad->a[ad->i[arow] + k];
          prow = ad->j[ad->i[arow] + k];
          pnz  = pi_loc[prow+1] - pi_loc[prow];

          for (l=0; l<pnz; l++) { /* each nz of P_loc */
            pcol = pj_loc[pi_loc[prow] + l];
            cdense[pcol] += tmp*pa_loc[pi_loc[prow] + l];
          }
        }

        /* off-diagonal portion of A */
        anz = ao->i[arow+1] - ao->i[arow];
        for (k=0; k<anz; k++) { /* each nz of Ao */
          tmp  = ra[ri[i] + j]*ao->a[ao->i[arow] + k];
          prow = ao->j[ao->i[arow] + k];
          pnz  = pi_oth[prow+1] - pi_oth[prow];

          for (l=0; l<pnz; l++) { /* each nz of P_oth */
            pcol = pj_oth[pi_oth[prow] + l];
            cdense[pcol] += tmp*pa_oth[pi_oth[prow] + l];
          }
        }
      } /* for (j=0; j<rnz; j++) */

      /* copy cdense[] into ca; zero cdense[] */
      cnz = bi[i+1] - bi[i];
      cj  = bj + bi[i];
      ca  = ba + bi[i];
      for (j=0; j<cnz; j++) {
        ca[j] += cdense[cj[j]];
        cdense[cj[j]] = 0.0;
      }
    } /* for (i=0; i<cm; i++) */
#endif

#if defined(PTAP_PROFILE)
    ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
      /*------------------------------------------------------------*/
      apJ = apj + api[i];

      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row = adj[j];
        pnz = pi_loc[row+1] - pi_loc[row];
        pj  = pj_loc + pi_loc[row];
        pa  = pa_loc + pi_loc[row];

        /* perform dense axpy */
        valtmp = ada[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }

      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row = aoj[j];
        pnz = pi_oth[row+1] - pi_oth[row];
        pj  = pj_oth + pi_oth[row];
        pa  = pa_oth + pi_oth[row];

        /* perform dense axpy */
        valtmp = aoa[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      apnz = api[i+1] - api[i];
      /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
      pnz = po->i[i+1] - po->i[i];
      poJ = po->j + po->i[i];
      pA  = po->a + po->i[i];
      for (j=0; j<pnz; j++) {
        row = poJ[j];
        cnz = coi[row+1] - coi[row];
        cj  = coj + coi[row];
        ca  = coa + coi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
      /* put the value into Cd (diagonal part) */
      pnz = pd->i[i+1] - pd->i[i];
      pdJ = pd->j + pd->i[i];
      pA  = pd->a + pd->i[i];
      for (j=0; j<pnz; j++) {
        row = pdJ[j];
        cnz = bi[row+1] - bi[row];
        cj  = bj + bi[row];
        ca  = ba + bi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
      /* zero the current row of A*P */
      for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
#if defined(PTAP_PROFILE)
      ierr   = PetscTime(&t2_2);CHKERRQ(ierr);
      ePtAP += t2_2 - t2_1;
#endif
    }
  } else { /* Do sparse axpy on apa (length of ap_rmax, stores A[i,:]*P) - scalable, but slower */
    ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
    /*-----------------------------------------------------------------------------------------*/
    pA = pa_loc;
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* form i-th sparse row of A*P */
      apnz = api[i+1] - api[i];
      apJ  = apj + api[i];
      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row    = adj[j];
        pnz    = pi_loc[row+1] - pi_loc[row];
        pj     = pj_loc + pi_loc[row];
        pa     = pa_loc + pi_loc[row];
        valtmp = ada[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row    = aoj[j];
        pnz    = pi_oth[row+1] - pi_oth[row];
        pj     = pj_oth + pi_oth[row];
        pa     = pa_oth + pi_oth[row];
        valtmp = aoa[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      pnz = pi_loc[i+1] - pi_loc[i];
      pJ  = pj_loc + pi_loc[i];
      for (j=0; j<pnz; j++) {
        nextap = 0;
        row    = pJ[j]; /* global index */
        if (row < pcstart || row >= pcend) { /* put the value into Co */
          row = *poJ;
          cj  = coj + coi[row];
          ca  = coa + coi[row]; poJ++;
        } else {                             /* put the value into Cd */
          row = *pdJ;
          cj  = bj + bi[row];
          ca  = ba + bi[row]; pdJ++;
        }
        valtmp = pA[j];
        for (k=0; nextap<apnz; k++) {
          if (cj[k] == apJ[nextap]) ca[k] += valtmp*apa[nextap++];
        }
        ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
      }
      pA += pnz;
      /* zero the current row info for A*P */
      ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
      ierr   = PetscTime(&t2_2);CHKERRQ(ierr);
      ePtAP += t2_2 - t2_1;
#endif
    }
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];       /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank == 1) {
    ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
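
/*
  A minimal sketch (disabled) of the two accumulation strategies contrasted
  above, on a single row.  Dense axpy scatters into a length-pN work array
  indexed by global column; sparse axpy walks the row's known sorted pattern
  apJ[] and stores values compressed to length apnz.  The fragment is
  illustrative only; apa_dense and apa_sparse are hypothetical names.
*/
#if 0
/* dense: apa_dense has length pN (nonscalable), indexed by column */
for (k=0; k<pnz; k++) apa_dense[pj[k]] += valtmp*pa[k];

/* sparse: apa_sparse has length apnz (scalable); pj[] is a sorted subset of
   the row pattern apJ[], so advance through apJ[] to locate each column */
nextp = 0;
for (k=0; nextp<pnz; k++) {
  if (apJ[k] == pj[nextp]) apa_sparse[k] += valtmp*pa[nextp++];
}
#endif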