xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 15a3b8e2693f5607fd21f236b5b960f29edfdd4f)
1 
2 /*
3   Defines projective product routines where A is an MPIAIJ matrix
4           C = P^T * A * P
5 */
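/*
  Typical usage (a minimal sketch; these routines are normally reached through
  the public MatPtAP() interface rather than called directly):

    Mat C;
    ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);  // symbolic + numeric product
    ...                                                           // change numerical values of A
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);    // reuse the symbolic data
    ierr = MatDestroy(&C);CHKERRQ(ierr);
*/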
6 
7 #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 
13 #define PTAP_PROFILE
14 
15 extern PetscErrorCode MatDestroy_MPIAIJ(Mat);
16 #undef __FUNCT__
17 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
18 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
19 {
20   PetscErrorCode ierr;
21   Mat_MPIAIJ     *a   =(Mat_MPIAIJ*)A->data;
22   Mat_PtAPMPI    *ptap=a->ptap;
23 
24   PetscFunctionBegin;
25   if (ptap) {
26     Mat_Merge_SeqsToMPI *merge=ptap->merge;
27     ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
28     ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
29     ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
30     ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
31     ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
32 
33     ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
34     ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
35     ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
36     ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
37     ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
38 
39     if (ptap->api) {ierr = PetscFree(ptap->api);CHKERRQ(ierr);}
40     if (ptap->apj) {ierr = PetscFree(ptap->apj);CHKERRQ(ierr);}
41     if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}
42     if (merge) {
43       ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
44       ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
45       ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
46       ierr = PetscFree(merge->bi);CHKERRQ(ierr);
47       ierr = PetscFree(merge->bj);CHKERRQ(ierr);
48       ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
49       ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
50       ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
51       ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
52       ierr = PetscFree(merge->coi);CHKERRQ(ierr);
53       ierr = PetscFree(merge->coj);CHKERRQ(ierr);
54       ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
55       ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
56       ierr = merge->destroy(A);CHKERRQ(ierr);
57       ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
58     }
59     ierr = PetscFree(ptap);CHKERRQ(ierr);
60   }
61   PetscFunctionReturn(0);
62 }
63 
64 #undef __FUNCT__
65 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatPtAP"
66 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
67 {
68   PetscErrorCode      ierr;
69   Mat_MPIAIJ          *a     = (Mat_MPIAIJ*)A->data;
70   Mat_PtAPMPI         *ptap  = a->ptap;
71   Mat_Merge_SeqsToMPI *merge = ptap->merge;
72 
73   PetscFunctionBegin;
74   ierr = (*merge->duplicate)(A,op,M);CHKERRQ(ierr);
75 
76   (*M)->ops->destroy   = merge->destroy;
77   (*M)->ops->duplicate = merge->duplicate;
78   PetscFunctionReturn(0);
79 }
80 
81 #undef __FUNCT__
82 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
83 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
84 {
85   PetscErrorCode ierr;
86   PetscBool      newalg=PETSC_FALSE;
87 
88   PetscFunctionBegin;
89   ierr = PetscOptionsGetBool(NULL,"-matptap_new",&newalg,NULL);CHKERRQ(ierr);
90   if (scall == MAT_INITIAL_MATRIX) {
91     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
92     if (newalg) {
93       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(A,P,fill,C);CHKERRQ(ierr);
94     } else {
95       ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
96     }
97     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
98   }
99   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
100   if (newalg) {
101     ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ_new(A,P,*C);CHKERRQ(ierr);
102   } else {
103     ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
104   }
105   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
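/*
  A sketch of how the two code paths in the driver above can be selected at run time
  (the executable name is hypothetical; only the option name comes from the code):

    mpiexec -n 4 ./ex_ptap               # default path: MatPtAPSymbolic/Numeric_MPIAIJ_MPIAIJ
    mpiexec -n 4 ./ex_ptap -matptap_new  # alternative path: ..._MPIAIJ_MPIAIJ_new
*/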
108 
109 #undef __FUNCT__
110 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_new"
111 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(Mat A,Mat P,PetscReal fill,Mat *C)
112 {
113   PetscErrorCode ierr;
114   Mat_PtAPMPI    *ptap;
115   Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
116   Mat            AP;
117   Mat_MPIAIJ     *c;
118 
119   PetscFunctionBegin;
120   MPI_Comm            comm;
121   PetscMPIInt         size,rank;
122   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
123   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
124   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
125 
126   /* check if matrix local sizes are compatible */
127   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
128     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
129   }
130   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
131     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
132   }
133 
134   //ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); //rm later !!!
135   //=============================================================================
136   Mat                 Cmpi;
137   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
138   Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data;
139   Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
140   Mat_SeqAIJ          *p_loc,*p_oth;
141   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
142   PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
143   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
144   PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
145   PetscBT             lnkbt;
146   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
147   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
148   PetscInt            len,proc,*dnz,*onz,*owners;
149   PetscInt            nzi,*pti,*ptj;
150   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
151   MPI_Request         *swaits,*rwaits;
152   MPI_Status          *sstatus,rstatus;
153   Mat_Merge_SeqsToMPI *merge;
154   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
155   PetscReal           afill=1.0,afill_tmp;
156   PetscInt            rmax;
157 
158   /* create struct Mat_PtAPMPI and attach it to C later */
159   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
160   ierr        = PetscNew(&merge);CHKERRQ(ierr);
161   ptap->merge = merge;
162   ptap->reuse = MAT_INITIAL_MATRIX;
163 
164   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
165   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
166 
167   /* get P_loc by taking all local rows of P */
168   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
169 
170   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
171   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
172   pi_loc = p_loc->i; pj_loc = p_loc->j;
173   pi_oth = p_oth->i; pj_oth = p_oth->j;
174 
175   /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
176   /*--------------------------------------------------------------------------*/
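  /* In MPIAIJ storage, A is split as A = [Ad | Ao]: Ad (a->A) holds the locally owned
     columns and its column indices match rows of P_loc, while Ao (a->B) holds the
     off-process columns, compressed through a->garray, and its column indices match
     rows of P_oth.  Row i of A*P is therefore the union of the P_loc rows hit by
     Ad[i,:] and the P_oth rows hit by Ao[i,:]. */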
177   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
178   api[0] = 0;
179 
180   /* create and initialize a linked list */
181   ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);
182 
183   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)); this ran out of memory for ex56 with np=8k on Intrepid! */
184   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
185 
186   current_space = free_space;
187 
188   for (i=0; i<am; i++) {
189     /* diagonal portion of A */
190     nzi = adi[i+1] - adi[i];
191     aj  = ad->j + adi[i];
192     for (j=0; j<nzi; j++) {
193       row  = aj[j];
194       pnz  = pi_loc[row+1] - pi_loc[row];
195       Jptr = pj_loc + pi_loc[row];
196       /* add non-zero cols of P into the sorted linked list lnk */
197       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
198     }
199     /* off-diagonal portion of A */
200     nzi = aoi[i+1] - aoi[i];
201     aj  = ao->j + aoi[i];
202     for (j=0; j<nzi; j++) {
203       row  = aj[j];
204       pnz  = pi_oth[row+1] - pi_oth[row];
205       Jptr = pj_oth + pi_oth[row];
206       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
207     }
208     apnz     = lnk[0];
209     api[i+1] = api[i] + apnz;
210     if (ap_rmax < apnz) ap_rmax = apnz;
211 
212     /* if free space is not available, double the total space in the list */
213     if (current_space->local_remaining<apnz) {
214       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
215       nspacedouble++;
216     }
217 
218     /* Copy data into free space, then initialize lnk */
219     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
220 
221     current_space->array           += apnz;
222     current_space->local_used      += apnz;
223     current_space->local_remaining -= apnz;
224   }
225 
226   /* Allocate space for apj, initialize apj, and */
227   /* destroy list of free space and other temporary array(s) */
228   ierr      = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr);
229   ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
230   afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
231   if (afill_tmp > afill) afill = afill_tmp;
232 
233   /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
234   /*-----------------------------------------------------------------*/
235   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
236 
237   /* then, compute symbolic Co = (p->B)^T*AP */
238   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
239                          >= (num of nonzero rows of C_seq) - pn */
240   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
241   coi[0] = 0;
242 
243   /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
244   nnz           = fill*(poti[pon] + api[am]);
245   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
246   current_space = free_space;
247 
248   for (i=0; i<pon; i++) {
249     pnz = poti[i+1] - poti[i];
250     ptJ = potj + poti[i];
251     for (j=0; j<pnz; j++) {
252       row  = ptJ[j]; /* row of AP == col of Pot */
253       apnz = api[row+1] - api[row];
254       Jptr = apj + api[row];
255       /* add non-zero cols of AP into the sorted linked list lnk */
256       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
257     }
258     nnz = lnk[0];
259 
260     /* If free space is not available, double the total space in the list */
261     if (current_space->local_remaining<nnz) {
262       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
263       nspacedouble++;
264     }
265 
266     /* Copy data into free space, then initialize lnk */
267     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
268 
269     current_space->array           += nnz;
270     current_space->local_used      += nnz;
271     current_space->local_remaining -= nnz;
272 
273     coi[i+1] = coi[i] + nnz;
274   }
275 
276   ierr      = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
277   ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
278   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
279   if (afill_tmp > afill) afill = afill_tmp;
280   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
281 
282   /* (3) send j-array (coj) of Co to other processors */
283   /*--------------------------------------------------*/
284   ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
285   len_s        = merge->len_s;
286   merge->nsend = 0;
287 
288 
289   /* determine row ownership */
290   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
291   merge->rowmap->n  = pn;
292   merge->rowmap->bs = 1;
293 
294   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
295   owners = merge->rowmap->range;
296 
297   /* determine the number of messages to send, their lengths */
298   ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
299   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
300   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
301 
302   proc = 0;
303   for (i=0; i<pon; i++) {
304     while (prmap[i] >= owners[proc+1]) proc++;
305     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
306     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
307   }
308 
309   len          = 0; /* max length of buf_si[], see (4) */
310   owners_co[0] = 0;
311   for (proc=0; proc<size; proc++) {
312     owners_co[proc+1] = owners_co[proc] + len_si[proc];
313     if (len_s[proc]) {
314       merge->nsend++;
315       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
316       len         += len_si[proc];
317     }
318   }
319 
320   /* determine the number and length of messages to receive for coi and coj  */
321   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
322   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
323 
324   /* post the Irecv and Isend of coj */
325   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
326   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
327   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
328   for (proc=0, k=0; proc<size; proc++) {
329     if (!len_s[proc]) continue;
330     i    = owners_co[proc];
331     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
332     k++;
333   }
334 
335   /* receives and sends of coj are complete */
336   for (i=0; i<merge->nrecv; i++) {
337     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
338   }
339   ierr = PetscFree(rwaits);CHKERRQ(ierr);
340   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
341 
342   /* (4) send and recv coi */
343   /*-----------------------*/
344   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
345   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
346   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
347   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
348   for (proc=0,k=0; proc<size; proc++) {
349     if (!len_s[proc]) continue;
350     /* form outgoing message for i-structure:
351          buf_si[0]:                 nrows to be sent
352                [1:nrows]:           row index (local to the receiving process)
353                [nrows+1:2*nrows+1]: i-structure index
354     */
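    /* e.g. two rows with receiver-local indices 4 and 7 carrying 3 and 5 nonzeros give
       buf_si = [2, 4, 7, 0, 3, 8], so len_si[proc] = 2*(nrows+1) = 6 */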
355     /*-------------------------------------------*/
356     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
357     buf_si_i    = buf_si + nrows+1;
358     buf_si[0]   = nrows;
359     buf_si_i[0] = 0;
360     nrows       = 0;
361     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
362       nzi = coi[i+1] - coi[i];
363       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
364       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
365       nrows++;
366     }
367     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
368     k++;
369     buf_si += len_si[proc];
370   }
371   i = merge->nrecv;
372   while (i--) {
373     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
374   }
375   ierr = PetscFree(rwaits);CHKERRQ(ierr);
376   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
377 
378   ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
379   ierr = PetscFree(len_ri);CHKERRQ(ierr);
380   ierr = PetscFree(swaits);CHKERRQ(ierr);
381   ierr = PetscFree(buf_s);CHKERRQ(ierr);
382 
383   /* (5) compute the local portion of C (mpi mat) */
384   /*----------------------------------------------*/
385   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
386 
387   /* allocate pti array and free space for accumulating nonzero column info */
388   ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
389   pti[0] = 0;
390 
391   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
392   nnz           = fill*(pi_loc[pm] + api[am]);
393   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
394   current_space = free_space;
395 
396   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
397   for (k=0; k<merge->nrecv; k++) {
398     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
399     nrows       = *buf_ri_k[k];
400     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
401     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
402   }
403   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
404   rmax = 0;
405   for (i=0; i<pn; i++) {
406     /* add pdt[i,:]*AP into lnk */
407     pnz = pdti[i+1] - pdti[i];
408     ptJ = pdtj + pdti[i];
409     for (j=0; j<pnz; j++) {
410       row  = ptJ[j];  /* row of AP == col of Pt */
411       apnz = api[row+1] - api[row];
412       Jptr = apj + api[row];
413       /* add non-zero cols of AP into the sorted linked list lnk */
414       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
415     }
416 
417     /* add received col data into lnk */
418     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
419       if (i == *nextrow[k]) { /* i-th row */
420         nzi  = *(nextci[k]+1) - *nextci[k];
421         Jptr = buf_rj[k] + *nextci[k];
422         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
423         nextrow[k]++; nextci[k]++;
424       }
425     }
426     nnz = lnk[0];
427 
428     /* if free space is not available, make more free space */
429     if (current_space->local_remaining<nnz) {
430       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
431       nspacedouble++;
432     }
433     /* copy data into free space, then initialize lnk */
434     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
435     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
436 
437     current_space->array           += nnz;
438     current_space->local_used      += nnz;
439     current_space->local_remaining -= nnz;
440 
441     pti[i+1] = pti[i] + nnz;
442     if (nnz > rmax) rmax = nnz;
443   }
444   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
445   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
446 
447   ierr      = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
448   ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
449   afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
450   if (afill_tmp > afill) afill = afill_tmp;
451   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
452 
453   /* (6) create symbolic parallel matrix Cmpi */
454   /*------------------------------------------*/
455   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
456   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
457   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
458   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
459   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
460   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
461 
462   merge->bi        = pti;      /* Cseq->i */
463   merge->bj        = ptj;      /* Cseq->j */
464   merge->coi       = coi;      /* Co->i   */
465   merge->coj       = coj;      /* Co->j   */
466   merge->buf_ri    = buf_ri;
467   merge->buf_rj    = buf_rj;
468   merge->owners_co = owners_co;
469   merge->destroy   = Cmpi->ops->destroy;
470   merge->duplicate = Cmpi->ops->duplicate;
471 
472   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
473   Cmpi->assembled      = PETSC_FALSE;
474   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
475   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
476 
477   /* attach the supporting struct to Cmpi for reuse */
478   c           = (Mat_MPIAIJ*)Cmpi->data;
479   c->ptap     = ptap;
480   ptap->api   = api;
481   ptap->apj   = apj;
482   ptap->rmax  = ap_rmax;
483   *C          = Cmpi;
484 
485   //==============================================================================
486   c = (Mat_MPIAIJ*)(*C)->data;
487   ptap = c->ptap;
488 
489 #if 0
490   /* create struct Mat_PtAPMPI and attach it to C later */
491   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
492 
493   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
494   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
495 
496   /* get P_loc by taking all local rows of P */
497   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
498 #endif
499   //===========================================
500    ptap->reuse = MAT_INITIAL_MATRIX;
501 
502   /* (1) compute symbolic AP = A*P, then get AP_loc */
503   /*--------------------------------------------------------------------------*/
504   ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
505   ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);
506 
507   ierr = MatMatMult(A,P,MAT_INITIAL_MATRIX,2.0,&AP);CHKERRQ(ierr);
508   ierr = MatMPIAIJGetLocalMat(AP,MAT_INITIAL_MATRIX,&ptap->AP_loc);CHKERRQ(ierr);
509   ierr = MatDestroy(&AP);CHKERRQ(ierr);
510 
511   /* (2) compute C_loc=Rd*AP_loc, Co=Ro*AP_loc */
512   ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,MAT_INITIAL_MATRIX,2.0,&ptap->C_loc);CHKERRQ(ierr);
513   ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,MAT_INITIAL_MATRIX,2.0,&ptap->C_oth);CHKERRQ(ierr);
514 
515   /* (6) create symbolic parallel matrix Cmpi */
516   /*------------------------------------------*/
517 
518   Mat C_loc=ptap->C_loc;
519 
520   /* estimate dnz, onz arrays */
521   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
522   PetscInt i;
523   Mat_SeqAIJ *c_loc = (Mat_SeqAIJ*)C_loc->data;
524   for (i=0; i<C_loc->rmap->N; i++) {
525     printf("%d \n",c_loc->ilen[i]);
526     //dnz[i] = c_loc->ilen[i]; if (c_loc->ilen[i] > pn) dnz[i] = pn;
527     //onz[i] = c_loc->ilen[i]; if (c_loc->ilen[i] > pN - pn) onz[i] = pN - pn;
528     dnz[i] = pn; onz[i] = pN - pn;
529   }
530 
531   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
532 
533   /* flag 'scalable' determines which implementation is used:
534        0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
535        1: do sparse axpy in MatPtAPNumeric() - might be slower, uses a sparse array apa */
536   /* set default scalable */
537   ptap->scalable = PETSC_FALSE; //PETSC_TRUE;
538 
539   ierr = PetscOptionsGetBool(((PetscObject)(*C))->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
540   if (!ptap->scalable) {  /* Do dense axpy */
541     ierr = PetscCalloc1(P->cmap->N,&ptap->apa);CHKERRQ(ierr);
542   } else {
543     //ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
544   }
545 
546   //Mat_SeqAIJ    *ap=(Mat_SeqAIJ*)(ptap->AP_loc)->data;
547   //ptap->api   = ap->i;
548   //ptap->apj   = ap->j;
549   //ptap->rmax  = ap_rmax;
550 
551   PetscFunctionReturn(0);
552 }
553 
554 #undef __FUNCT__
555 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_new"
556 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_new(Mat A,Mat P,Mat C)
557 {
558   PetscErrorCode    ierr;
559   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
560   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
561   Mat_PtAPMPI       *ptap = c->ptap;
562   Mat               AP_loc,C_loc,C_oth;
563   PetscInt          i,rstart,rend,cm,ncols,row;
564   PetscMPIInt       rank;
565   MPI_Comm          comm;
566   const PetscInt    *cols;
567   const PetscScalar *vals;
568   PetscLogDouble    t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
569 
570   PetscFunctionBegin;
571   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
572   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
573 
574   ierr = MatZeroEntries(C);CHKERRQ(ierr);
575 
576   /* 1) get R = Pd^T,Ro = Po^T */
577   ierr = PetscTime(&t0);CHKERRQ(ierr);
578   ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
579   ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
580   ierr = PetscTime(&t1);CHKERRQ(ierr);
581   eR = t1 - t0;
582 
583   /* 2) get AP_loc */
584   AP_loc = ptap->AP_loc;
585   Mat_SeqAIJ    *ap=(Mat_SeqAIJ*)AP_loc->data;
586 
587   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
588   /*-----------------------------------------------------*/
589   if (ptap->reuse == MAT_INITIAL_MATRIX) {
590     /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
591     ptap->reuse = MAT_REUSE_MATRIX;
592   } else { /* update numerical values of P_oth and P_loc */
593     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
594     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
595   }
596 
597   /* 2-2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
598   /*--------------------------------------------------------------*/
599   /* get data from symbolic products */
600   Mat_SeqAIJ  *p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
601   Mat_SeqAIJ  *p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
602   PetscInt    *api,*apj,am = A->rmap->n,j,col,apnz;
603   PetscScalar *apa = ptap->apa;
604 
605   api = ap->i;
606   apj = ap->j;
607   for (i=0; i<am; i++) {
608     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
609     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
610     apnz = api[i+1] - api[i];
611     for (j=0; j<apnz; j++) {
612       col = apj[j+api[i]];
613       ap->a[j+ap->i[i]] = apa[col];
614       apa[col] = 0.0;
615     }
616   }
617 
618   ierr = PetscTime(&t2);CHKERRQ(ierr);
619   eAP = t2 - t1;
620 
621   /* 3) C_loc = R*AP_loc, Co = Ro*AP_loc */
622   ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Rd,AP_loc,MAT_REUSE_MATRIX,2.0,&ptap->C_loc);CHKERRQ(ierr);
623   ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Ro,AP_loc,MAT_REUSE_MATRIX,2.0,&ptap->C_oth);CHKERRQ(ierr);
624   C_loc = ptap->C_loc;
625   C_oth = ptap->C_oth;
626   //printf("[%d] Co %d, %d\n", rank,Co->rmap->N,Co->cmap->N);
627   ierr = PetscTime(&t3);CHKERRQ(ierr);
628   eCseq = t3 - t2;
629 
630   /* add C_loc and Co to C */
631   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
632 
633   /* C_loc -> C */
634   cm = C_loc->rmap->N;
635   Mat_SeqAIJ *c_seq;
636   c_seq = (Mat_SeqAIJ*)C_loc->data;
637   for (i=0; i<cm; i++) {
638     ncols = c_seq->i[i+1] - c_seq->i[i];
639     row = rstart + i;
640     cols = c_seq->j + c_seq->i[i];
641     vals = c_seq->a + c_seq->i[i];
642     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
643   }
644 
645   /* Co -> C, off-processor part */
646   //printf("[%d] p->B %d, %d\n",rank,p->B->rmap->N,p->B->cmap->N);
647   cm = C_oth->rmap->N;
648   c_seq = (Mat_SeqAIJ*)C_oth->data;
649   for (i=0; i<cm; i++) {
650     ncols = c_seq->i[i+1] - c_seq->i[i];
651     row = p->garray[i];
652     cols = c_seq->j + c_seq->i[i];
653     vals = c_seq->a + c_seq->i[i];
654     //printf("[%d] row[%d] = %d\n",rank,i,row);
655     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
656   }
657   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
658   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
659   ierr = PetscTime(&t4);CHKERRQ(ierr);
660   eCmpi = t4 - t3;
661 
662   if (rank==1) {
663     ierr = PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);CHKERRQ(ierr);
664   }
665   PetscFunctionReturn(0);
666 }
667 
668 #undef __FUNCT__
669 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
670 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
671 {
672   PetscErrorCode      ierr;
673   Mat                 Cmpi;
674   Mat_PtAPMPI         *ptap;
675   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
676   Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
677   Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
678   Mat_SeqAIJ          *p_loc,*p_oth;
679   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
680   PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
681   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
682   PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
683   PetscBT             lnkbt;
684   MPI_Comm            comm;
685   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
686   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
687   PetscInt            len,proc,*dnz,*onz,*owners;
688   PetscInt            nzi,*pti,*ptj;
689   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
690   MPI_Request         *swaits,*rwaits;
691   MPI_Status          *sstatus,rstatus;
692   Mat_Merge_SeqsToMPI *merge;
693   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
694   PetscReal           afill=1.0,afill_tmp;
695   PetscInt            rmax;
696 
697   PetscFunctionBegin;
698   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
699 
700   /* check if matrix local sizes are compatible */
701   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
702     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
703   }
704   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
705     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
706   }
707 
708   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
709   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
710 
711   /* create struct Mat_PtAPMPI and attach it to C later */
712   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
713   ierr        = PetscNew(&merge);CHKERRQ(ierr);
714   ptap->merge = merge;
715   ptap->reuse = MAT_INITIAL_MATRIX;
716 
717   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
718   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
719 
720   /* get P_loc by taking all local rows of P */
721   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
722 
723   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
724   p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
725   pi_loc = p_loc->i; pj_loc = p_loc->j;
726   pi_oth = p_oth->i; pj_oth = p_oth->j;
727 
728   /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
729   /*--------------------------------------------------------------------------*/
730   ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
731   api[0] = 0;
732 
733   /* create and initialize a linked list */
734   ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);
735 
736   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)); this ran out of memory for ex56, np=8k on Intrepid! */
737   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
738 
739   current_space = free_space;
740 
741   for (i=0; i<am; i++) {
742     /* diagonal portion of A */
743     nzi = adi[i+1] - adi[i];
744     aj  = ad->j + adi[i];
745     for (j=0; j<nzi; j++) {
746       row  = aj[j];
747       pnz  = pi_loc[row+1] - pi_loc[row];
748       Jptr = pj_loc + pi_loc[row];
749       /* add non-zero cols of P into the sorted linked list lnk */
750       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
751     }
752     /* off-diagonal portion of A */
753     nzi = aoi[i+1] - aoi[i];
754     aj  = ao->j + aoi[i];
755     for (j=0; j<nzi; j++) {
756       row  = aj[j];
757       pnz  = pi_oth[row+1] - pi_oth[row];
758       Jptr = pj_oth + pi_oth[row];
759       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
760     }
761     apnz     = lnk[0];
762     api[i+1] = api[i] + apnz;
763     if (ap_rmax < apnz) ap_rmax = apnz;
764 
765     /* if free space is not available, double the total space in the list */
766     if (current_space->local_remaining<apnz) {
767       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
768       nspacedouble++;
769     }
770 
771     /* Copy data into free space, then initialize lnk */
772     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
773 
774     current_space->array           += apnz;
775     current_space->local_used      += apnz;
776     current_space->local_remaining -= apnz;
777   }
778 
779   /* Allocate space for apj, initialize apj, and */
780   /* destroy list of free space and other temporary array(s) */
781   ierr      = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr);
782   ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
783   afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
784   if (afill_tmp > afill) afill = afill_tmp;
785 
786   /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
787   /*-----------------------------------------------------------------*/
788   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
789 
790   /* then, compute symbolic Co = (p->B)^T*AP */
791   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
792                          >= (num of nonzero rows of C_seq) - pn */
793   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
794   coi[0] = 0;
795 
796   /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
797   nnz           = fill*(poti[pon] + api[am]);
798   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
799   current_space = free_space;
800 
801   for (i=0; i<pon; i++) {
802     pnz = poti[i+1] - poti[i];
803     ptJ = potj + poti[i];
804     for (j=0; j<pnz; j++) {
805       row  = ptJ[j]; /* row of AP == col of Pot */
806       apnz = api[row+1] - api[row];
807       Jptr = apj + api[row];
808       /* add non-zero cols of AP into the sorted linked list lnk */
809       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
810     }
811     nnz = lnk[0];
812 
813     /* If free space is not available, double the total space in the list */
814     if (current_space->local_remaining<nnz) {
815       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
816       nspacedouble++;
817     }
818 
819     /* Copy data into free space, then initialize lnk */
820     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
821 
822     current_space->array           += nnz;
823     current_space->local_used      += nnz;
824     current_space->local_remaining -= nnz;
825 
826     coi[i+1] = coi[i] + nnz;
827   }
828 
829   ierr      = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
830   ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
831   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
832   if (afill_tmp > afill) afill = afill_tmp;
833   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
834 
835   /* (3) send j-array (coj) of Co to other processors */
836   /*--------------------------------------------------*/
837   ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
838   len_s        = merge->len_s;
839   merge->nsend = 0;
840 
841 
842   /* determine row ownership */
843   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
844   merge->rowmap->n  = pn;
845   merge->rowmap->bs = 1;
846 
847   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
848   owners = merge->rowmap->range;
849 
850   /* determine the number of messages to send, their lengths */
851   ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
852   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
853   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
854 
855   proc = 0;
856   for (i=0; i<pon; i++) {
857     while (prmap[i] >= owners[proc+1]) proc++;
858     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
859     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
860   }
861 
862   len          = 0; /* max length of buf_si[], see (4) */
863   owners_co[0] = 0;
864   for (proc=0; proc<size; proc++) {
865     owners_co[proc+1] = owners_co[proc] + len_si[proc];
866     if (len_s[proc]) {
867       merge->nsend++;
868       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
869       len         += len_si[proc];
870     }
871   }
872 
873   /* determine the number and length of messages to receive for coi and coj  */
874   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
875   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
876 
877   /* post the Irecv and Isend of coj */
878   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
879   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
880   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
881   for (proc=0, k=0; proc<size; proc++) {
882     if (!len_s[proc]) continue;
883     i    = owners_co[proc];
884     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
885     k++;
886   }
887 
888   /* receives and sends of coj are complete */
889   for (i=0; i<merge->nrecv; i++) {
890     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
891   }
892   ierr = PetscFree(rwaits);CHKERRQ(ierr);
893   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
894 
895   /* (4) send and recv coi */
896   /*-----------------------*/
897   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
898   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
899   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
900   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
901   for (proc=0,k=0; proc<size; proc++) {
902     if (!len_s[proc]) continue;
903     /* form outgoing message for i-structure:
904          buf_si[0]:                 nrows to be sent
905                [1:nrows]:           row index (local to the receiving process)
906                [nrows+1:2*nrows+1]: i-structure index
907     */
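    /* e.g. two rows with receiver-local indices 4 and 7 carrying 3 and 5 nonzeros give
       buf_si = [2, 4, 7, 0, 3, 8], so len_si[proc] = 2*(nrows+1) = 6 */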
908     /*-------------------------------------------*/
909     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
910     buf_si_i    = buf_si + nrows+1;
911     buf_si[0]   = nrows;
912     buf_si_i[0] = 0;
913     nrows       = 0;
914     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
915       nzi = coi[i+1] - coi[i];
916       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
917       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
918       nrows++;
919     }
920     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
921     k++;
922     buf_si += len_si[proc];
923   }
924   i = merge->nrecv;
925   while (i--) {
926     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
927   }
928   ierr = PetscFree(rwaits);CHKERRQ(ierr);
929   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
930 
931   ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
932   ierr = PetscFree(len_ri);CHKERRQ(ierr);
933   ierr = PetscFree(swaits);CHKERRQ(ierr);
934   ierr = PetscFree(buf_s);CHKERRQ(ierr);
935 
936   /* (5) compute the local portion of C (mpi mat) */
937   /*----------------------------------------------*/
938   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
939 
940   /* allocate pti array and free space for accumulating nonzero column info */
941   ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
942   pti[0] = 0;
943 
944   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
945   nnz           = fill*(pi_loc[pm] + api[am]);
946   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
947   current_space = free_space;
948 
949   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
950   for (k=0; k<merge->nrecv; k++) {
951     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
952     nrows       = *buf_ri_k[k];
953     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
954     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
955   }
956   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
957   rmax = 0;
958   for (i=0; i<pn; i++) {
959     /* add pdt[i,:]*AP into lnk */
960     pnz = pdti[i+1] - pdti[i];
961     ptJ = pdtj + pdti[i];
962     for (j=0; j<pnz; j++) {
963       row  = ptJ[j];  /* row of AP == col of Pt */
964       apnz = api[row+1] - api[row];
965       Jptr = apj + api[row];
966       /* add non-zero cols of AP into the sorted linked list lnk */
967       ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
968     }
969 
970     /* add received col data into lnk */
971     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
972       if (i == *nextrow[k]) { /* i-th row */
973         nzi  = *(nextci[k]+1) - *nextci[k];
974         Jptr = buf_rj[k] + *nextci[k];
975         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
976         nextrow[k]++; nextci[k]++;
977       }
978     }
979     nnz = lnk[0];
980 
981     /* if free space is not available, make more free space */
982     if (current_space->local_remaining<nnz) {
983       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
984       nspacedouble++;
985     }
986     /* copy data into free space, then initialize lnk */
987     ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
988     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
989 
990     current_space->array           += nnz;
991     current_space->local_used      += nnz;
992     current_space->local_remaining -= nnz;
993 
994     pti[i+1] = pti[i] + nnz;
995     if (nnz > rmax) rmax = nnz;
996   }
997   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
998   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
999 
1000   ierr      = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
1001   ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
1002   afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
1003   if (afill_tmp > afill) afill = afill_tmp;
1004   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1005 
1006   /* (6) create symbolic parallel matrix Cmpi */
1007   /*------------------------------------------*/
1008   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1009   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1010   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
1011   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1012   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1013   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1014 
1015   merge->bi        = pti;      /* Cseq->i */
1016   merge->bj        = ptj;      /* Cseq->j */
1017   merge->coi       = coi;      /* Co->i   */
1018   merge->coj       = coj;      /* Co->j   */
1019   merge->buf_ri    = buf_ri;
1020   merge->buf_rj    = buf_rj;
1021   merge->owners_co = owners_co;
1022   merge->destroy   = Cmpi->ops->destroy;
1023   merge->duplicate = Cmpi->ops->duplicate;
1024 
1025   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1026   Cmpi->assembled      = PETSC_FALSE;
1027   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
1028   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1029 
1030   /* attach the supporting struct to Cmpi for reuse */
1031   c           = (Mat_MPIAIJ*)Cmpi->data;
1032   c->ptap     = ptap;
1033   ptap->api   = api;
1034   ptap->apj   = apj;
1035   ptap->rmax  = ap_rmax;
1036   *C          = Cmpi;
1037 
1038   /* flag 'scalable' determines which implementation is used:
1039        0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
1040        1: do sparse axpy in MatPtAPNumeric() - might be slower, uses a sparse array apa */
1041   /* set default scalable */
1042   ptap->scalable = PETSC_FALSE; //PETSC_TRUE;
1043 
1044   ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
1045   if (!ptap->scalable) {  /* Do dense axpy */
1046     ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
1047   } else {
1048     ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
1049   }
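  /* The choice can be overridden at run time with -matptap_scalable, possibly prefixed by
     the matrix options prefix (sketch; the executable name is hypothetical):
       mpiexec -n 8 ./ex_ptap -matptap_scalable   # sparse axpy, apa of length ap_rmax+1
     otherwise the dense-axpy path above allocates apa of length pN (the global number of
     columns of P), which does not scale with the number of processes. */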
1050 
1051 #if defined(PETSC_USE_INFO)
1052   if (pti[pn] != 0) {
1053     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1054     ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1055   } else {
1056     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1057   }
1058 #endif
1059   PetscFunctionReturn(0);
1060 }
1061 
1062 #undef __FUNCT__
1063 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
1064 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1065 {
1066   PetscErrorCode      ierr;
1067   Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1068   Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1069   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1070   Mat_SeqAIJ          *p_loc,*p_oth;
1071   Mat_PtAPMPI         *ptap;
1072   Mat_Merge_SeqsToMPI *merge;
1073   PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
1074   PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
1075   PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
1076   MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
1077   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1078   MPI_Comm            comm;
1079   PetscMPIInt         size,rank,taga,*len_s;
1080   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1081   PetscInt            **buf_ri,**buf_rj;
1082   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1083   MPI_Request         *s_waits,*r_waits;
1084   MPI_Status          *status;
1085   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1086   PetscInt            *api,*apj,*coi,*coj;
1087   PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
1088   PetscBool           scalable;
1089 #if defined(PTAP_PROFILE)
1090   PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
1091 #endif
1092 
1093   PetscFunctionBegin;
1094   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1095 #if defined(PTAP_PROFILE)
1096   ierr = PetscTime(&t0);CHKERRQ(ierr);
1097 #endif
1098   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1099   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1100 
1101   ptap = c->ptap;
1102   if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
1103   merge    = ptap->merge;
1104   apa      = ptap->apa;
1105   scalable = ptap->scalable;
1106 
1107   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1108   /*-----------------------------------------------------*/
1109   if (ptap->reuse == MAT_INITIAL_MATRIX) {
1110     /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
1111     ptap->reuse = MAT_REUSE_MATRIX;
1112   } else { /* update numerical values of P_oth and P_loc */
1113     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
1114     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
1115   }
1116 #if defined(PTAP_PROFILE)
1117   ierr = PetscTime(&t1);CHKERRQ(ierr);
1118   eP = t1-t0;
1119 #endif
1120   /*
1121   printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
1122          a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
1123          ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
1124          ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
1125    */
1126 
1127   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
1128   /*--------------------------------------------------------------*/
1129   /* get data from symbolic products */
1130   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1131   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1132   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a;
1133   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1134 
1135   coi  = merge->coi; coj = merge->coj;
1136   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1137 
1138   bi     = merge->bi; bj = merge->bj;
1139   owners = merge->rowmap->range;
1140   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);  /* ba: Cseq->a */
1141 
1142   api = ptap->api; apj = ptap->apj;
1143 
1144   if (!scalable) { /* Do dense axpy on apa (of length pN, storing A[i,:]*P) - nonscalable, but faster (can take about 1/3 of the scalable version's time) */
1145     ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
1146 #if 0
1147     /* ------ 10x slower -------------- */
1148     /*==================================*/
1149     Mat         R = ptap->R;
1150     Mat_SeqAIJ  *r = (Mat_SeqAIJ*)R->data;
1151     PetscInt    *ri=r->i,*rj=r->j,rnz,arow,l,prow,pcol,pN=P->cmap->N;
1152     PetscScalar *ra=r->a,tmp,cdense[pN];
1153 
1154     ierr = PetscMemzero(cdense,pN*sizeof(PetscScalar));CHKERRQ(ierr);
1155     for (i=0; i<cm; i++) { /* each row of C or R */
1156       rnz = ri[i+1] - ri[i];
1157 
1158       for (j=0; j<rnz; j++) { /* each nz of R */
1159         arow = rj[ri[i] + j];
1160 
1161         /* diagonal portion of A */
1162         anz  = ad->i[arow+1] - ad->i[arow];
1163         for (k=0; k<anz; k++) { /* each nz of Ad */
1164           tmp  = ra[ri[i] + j]*ad->a[ad->i[arow] + k];
1165           prow = ad->j[ad->i[arow] + k];
1166           pnz  = pi_loc[prow+1] - pi_loc[prow];
1167 
1168           for (l=0; l<pnz; l++) { /* each nz of P_loc */
1169             pcol = pj_loc[pi_loc[prow] + l];
1170             cdense[pcol] += tmp*pa_loc[pi_loc[prow] + l];
1171           }
1172         }
1173 
1174         /* off-diagonal portion of A */
1175         anz  = ao->i[arow+1] - ao->i[arow];
1176         for (k=0; k<anz; k++) { /* each nz of Ao */
1177           tmp  = ra[ri[i] + j]*ao->a[ao->i[arow] + k];
1178           prow = ao->j[ao->i[arow] + k];
1179           pnz  = pi_oth[prow+1] - pi_oth[prow];
1180 
1181           for (l=0; l<pnz; l++) { /* each nz of P_oth */
1182             pcol = pj_oth[pi_oth[prow] + l];
1183             cdense[pcol] += tmp*pa_oth[pi_oth[prow] + l];
1184           }
1185         }
1186 
1187       } //for (j=0; j<rnz; j++)
1188 
1189       /* copy cdense[] into ca; zero cdense[] */
1190       cnz = bi[i+1] - bi[i];
1191       cj  = bj + bi[i];
1192       ca  = ba + bi[i];
1193       for (j=0; j<cnz; j++) {
1194         ca[j] += cdense[cj[j]];
1195         cdense[cj[j]] = 0.0;
1196       }
1197 #if 0
1198       if (rank == 0) {
1199         printf("[%d] row %d: ",rank,i);
1200         for (j=0; j<pN; j++) printf(" %g,",cdense[j]);
1201         printf("\n");
1202       }
1203       for (j=0; j<pN; j++) cdense[j]=0.0; // zero cdense[]
1204 #endif
1205     } //for (i=0; i<cm; i++) {
1206 #endif
1207 
1208     //==========================================
1209 
1210     ierr = PetscTime(&t1);CHKERRQ(ierr);
1211     for (i=0; i<am; i++) {
1212 #if defined(PTAP_PROFILE)
1213       ierr = PetscTime(&t2_0);CHKERRQ(ierr);
1214 #endif
1215       /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
1216       /*------------------------------------------------------------*/
1217       apJ = apj + api[i];
1218 
1219       /* diagonal portion of A */
1220       anz = adi[i+1] - adi[i];
1221       adj = ad->j + adi[i];
1222       ada = ad->a + adi[i];
1223       for (j=0; j<anz; j++) {
1224         row = adj[j];
1225         pnz = pi_loc[row+1] - pi_loc[row];
1226         pj  = pj_loc + pi_loc[row];
1227         pa  = pa_loc + pi_loc[row];
1228 
1229         /* perform dense axpy */
1230         valtmp = ada[j];
1231         for (k=0; k<pnz; k++) {
1232           apa[pj[k]] += valtmp*pa[k];
1233         }
1234         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1235       }
1236 
1237       /* off-diagonal portion of A */
1238       anz = aoi[i+1] - aoi[i];
1239       aoj = ao->j + aoi[i];
1240       aoa = ao->a + aoi[i];
1241       for (j=0; j<anz; j++) {
1242         row = aoj[j];
1243         pnz = pi_oth[row+1] - pi_oth[row];
1244         pj  = pj_oth + pi_oth[row];
1245         pa  = pa_oth + pi_oth[row];
1246 
1247         /* perform dense axpy */
1248         valtmp = aoa[j];
1249         for (k=0; k<pnz; k++) {
1250           apa[pj[k]] += valtmp*pa[k];
1251         }
1252         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1253       }
1254 #if defined(PTAP_PROFILE)
1255       ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
1256       et2_AP += t2_1 - t2_0;
1257 #endif
1258 
1259       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1260       /*--------------------------------------------------------------*/
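      /* i.e. accumulate the rank-1 contribution  C += P[i,:]^T * AP[i,:]  of local row i,
         scattering into Co (rows of C owned by other processes, reached through po) and
         into Cd/ba (rows owned here, reached through pd) */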
1261       apnz = api[i+1] - api[i];
1262       /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
1263       pnz = po->i[i+1] - po->i[i];
1264       poJ = po->j + po->i[i];
1265       pA  = po->a + po->i[i];
1266       for (j=0; j<pnz; j++) {
1267         row = poJ[j];
1268         cnz = coi[row+1] - coi[row];
1269         cj  = coj + coi[row];
1270         ca  = coa + coi[row];
1271         /* perform dense axpy */
1272         valtmp = pA[j];
1273         for (k=0; k<cnz; k++) {
1274           ca[k] += valtmp*apa[cj[k]];
1275         }
1276         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1277       }
1278 #if 1
1279       /* put the value into Cd (diagonal part) */
1280       pnz = pd->i[i+1] - pd->i[i];
1281       pdJ = pd->j + pd->i[i];
1282       pA  = pd->a + pd->i[i];
1283       for (j=0; j<pnz; j++) {
1284         row = pdJ[j];
1285         cnz = bi[row+1] - bi[row];
1286         cj  = bj + bi[row];
1287         ca  = ba + bi[row];
1288         /* perform dense axpy */
1289         valtmp = pA[j];
1290         for (k=0; k<cnz; k++) {
1291           ca[k] += valtmp*apa[cj[k]];
1292         }
1293         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1294       }
1295 #endif
1296       /* zero the current row of A*P */
1297       for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
1298 #if defined(PTAP_PROFILE)
1299       ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
1300       ePtAP += t2_2 - t2_1;
1301 #endif
1302     }
1303 
1304     if (rank == 100) {
1305       for (row=0; row<cm; row++) {
1306         printf("[%d] row %d: ",rank,row);
1307         cnz = bi[row+1] - bi[row];
1308         for (j=0; j<cnz; j++) printf(" %g,",ba[bi[row]+j]);
1309         printf("\n");
1310       }
1311     }
1312 
1313   } else { /* Do sparse axpy on apa (of length ap_rmax, storing A[i,:]*P) - scalable, but slower */
1314     ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
1315     /*-----------------------------------------------------------------------------------------*/
1316     pA=pa_loc;
1317     for (i=0; i<am; i++) {
1318 #if defined(PTAP_PROFILE)
1319       ierr = PetscTime(&t2_0);CHKERRQ(ierr);
1320 #endif
1321       /* form i-th sparse row of A*P */
1322       apnz = api[i+1] - api[i];
1323       apJ  = apj + api[i];
1324       /* diagonal portion of A */
1325       anz = adi[i+1] - adi[i];
1326       adj = ad->j + adi[i];
1327       ada = ad->a + adi[i];
1328       for (j=0; j<anz; j++) {
1329         row    = adj[j];
1330         pnz    = pi_loc[row+1] - pi_loc[row];
1331         pj     = pj_loc + pi_loc[row];
1332         pa     = pa_loc + pi_loc[row];
1333         valtmp = ada[j];
1334         nextp  = 0;
1335         for (k=0; nextp<pnz; k++) {
1336           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1337             apa[k] += valtmp*pa[nextp++];
1338           }
1339         }
1340         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1341       }
1342       /* off-diagonal portion of A */
1343       anz = aoi[i+1] - aoi[i];
1344       aoj = ao->j + aoi[i];
1345       aoa = ao->a + aoi[i];
1346       for (j=0; j<anz; j++) {
1347         row    = aoj[j];
1348         pnz    = pi_oth[row+1] - pi_oth[row];
1349         pj     = pj_oth + pi_oth[row];
1350         pa     = pa_oth + pi_oth[row];
1351         valtmp = aoa[j];
1352         nextp  = 0;
1353         for (k=0; nextp<pnz; k++) {
1354           if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
1355             apa[k] += valtmp*pa[nextp++];
1356           }
1357         }
1358         ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
1359       }
1360 #if defined(PTAP_PROFILE)
1361       ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
1362       et2_AP += t2_1 - t2_0;
1363 #endif
1364 
1365       /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
1366       /*--------------------------------------------------------------*/
1367       pnz = pi_loc[i+1] - pi_loc[i];
1368       pJ  = pj_loc + pi_loc[i];
1369       for (j=0; j<pnz; j++) {
1370         nextap = 0;
1371         row    = pJ[j]; /* global index */
1372         if (row < pcstart || row >=pcend) { /* put the value into Co */
1373           row = *poJ;
1374           cj  = coj + coi[row];
1375           ca  = coa + coi[row]; poJ++;
1376         } else {                            /* put the value into Cd */
1377           row = *pdJ;
1378           cj  = bj + bi[row];
1379           ca  = ba + bi[row]; pdJ++;
1380         }
1381         valtmp = pA[j];
1382         for (k=0; nextap<apnz; k++) {
1383           if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
1384         }
1385         ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
1386       }
1387       pA += pnz;
1388       /* zero the current row info for A*P */
1389       ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
1390 #if defined(PTAP_PROFILE)
1391       ierr      = PetscTime(&t2_2);CHKERRQ(ierr);
1392       ePtAP += t2_2 - t2_1;
1393 #endif
1394     }
1395   }
1396 #if defined(PTAP_PROFILE)
1397   ierr = PetscTime(&t2);CHKERRQ(ierr);
1398 #endif
1399 
1400   /* 3) send and recv matrix values coa */
1401   /*------------------------------------*/
1402   buf_ri = merge->buf_ri;
1403   buf_rj = merge->buf_rj;
1404   len_s  = merge->len_s;
1405   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1406   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1407 
1408   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1409   for (proc=0,k=0; proc<size; proc++) {
1410     if (!len_s[proc]) continue;
1411     i    = merge->owners_co[proc];
1412     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1413     k++;
1414   }
1415   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1416   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1417 
1418   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1419   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1420   ierr = PetscFree(coa);CHKERRQ(ierr);
1421 #if defined(PTAP_PROFILE)
1422   ierr = PetscTime(&t3);CHKERRQ(ierr);
1423 #endif
1424 
1425   /* 4) insert local Cseq and received values into Cmpi */
1426   /*------------------------------------------------------*/
1427   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1428   for (k=0; k<merge->nrecv; k++) {
1429     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1430     nrows       = *(buf_ri_k[k]);
1431     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1432     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1433   }
1434 
1435   for (i=0; i<cm; i++) {
1436     row  = owners[rank] + i; /* global row index of C_seq */
1437     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1438     ba_i = ba + bi[i];
1439     bnz  = bi[i+1] - bi[i];
1440     /* add received vals into ba */
1441     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1442       /* i-th row */
1443       if (i == *nextrow[k]) {
1444         cnz    = *(nextci[k]+1) - *nextci[k];
1445         cj     = buf_rj[k] + *(nextci[k]);
1446         ca     = abuf_r[k] + *(nextci[k]);
1447         nextcj = 0;
1448         for (j=0; nextcj<cnz; j++) {
1449           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1450             ba_i[j] += ca[nextcj++];
1451           }
1452         }
1453         nextrow[k]++; nextci[k]++;
1454         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1455       }
1456     }
1457     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1458   }
1459   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1460   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1461 
1462   ierr = PetscFree(ba);CHKERRQ(ierr);
1463   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1464   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1465   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1466 #if defined(PTAP_PROFILE)
1467   ierr = PetscTime(&t4);CHKERRQ(ierr);
1468   if (rank==1) {
1469     ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
1470   }
1471 #endif
1472   PetscFunctionReturn(0);
1473 }
1474