
/*
  Defines projective product routines where A is an MPIAIJ matrix:
          C = P^T * A * P
*/
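
/*
  Example usage (a sketch; MatPtAP() is the public entry point that dispatches
  to the routines below, assuming A and P are assembled MATMPIAIJ matrices
  with compatible layouts):

    Mat C;
    ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);   first call: symbolic + numeric
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);     later calls reuse the symbolic data
    ierr = MatDestroy(&C);CHKERRQ(ierr);
*/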

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>

#define PTAP_PROFILE

extern PetscErrorCode MatDestroy_MPIAIJ(Mat);
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a   =(Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap=a->ptap;

  PetscFunctionBegin;
  if (ptap) {
    Mat_Merge_SeqsToMPI *merge=ptap->merge;
    ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
    ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */

    ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);

    if (ptap->api) {ierr = PetscFree(ptap->api);CHKERRQ(ierr);}
    if (ptap->apj) {ierr = PetscFree(ptap->apj);CHKERRQ(ierr);}
    if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}
    if (merge) {
      ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
      ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
      ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
      ierr = PetscFree(merge->bi);CHKERRQ(ierr);
      ierr = PetscFree(merge->bj);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
      ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
      ierr = PetscFree(merge->coi);CHKERRQ(ierr);
      ierr = PetscFree(merge->coj);CHKERRQ(ierr);
      ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
      ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
      ierr = merge->destroy(A);CHKERRQ(ierr);
      ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
    }
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatPtAP"
PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a     = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI         *ptap  = a->ptap;
  Mat_Merge_SeqsToMPI *merge = ptap->merge;

  PetscFunctionBegin;
  ierr = (*merge->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = merge->destroy;
  (*M)->ops->duplicate = merge->duplicate;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      newalg=PETSC_FALSE;

  PetscFunctionBegin;
  ierr = PetscOptionsGetBool(NULL,"-matptap_new",&newalg,NULL);CHKERRQ(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    if (newalg) {
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(A,P,fill,C);CHKERRQ(ierr);
    } else {
      ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
    }
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  if (newalg) {
    ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ_new(A,P,*C);CHKERRQ(ierr);
  } else {
    ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
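
/*
  A usage sketch: since -matptap_new is read with PetscOptionsGetBool() above,
  the experimental algorithm can be selected at run time, e.g.

    ./myapp -matptap_new

  where myapp stands for any driver that ends up calling MatPtAP() on MPIAIJ
  matrices; the executable name is illustrative only.
*/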

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_new"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_new(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  Mat_PtAPMPI    *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
  Mat            AP;
  Mat_MPIAIJ     *c;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); /* TODO: remove later */
  c    = (Mat_MPIAIJ*)(*C)->data;
  ptap = c->ptap;
  /*======================================*/
#if 0
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* check if matrix local sizes are compatible -- MV! */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D,%D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D,%D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
#endif
  /*===========================================*/
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* (1) compute symbolic AP = A*P, then get AP_loc */
  /*--------------------------------------------------------------------------*/
  ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  ierr = MatMatMult(A,P,MAT_INITIAL_MATRIX,2.0,&AP);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(AP,MAT_INITIAL_MATRIX,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&AP);CHKERRQ(ierr);

  /* (2) compute symbolic C_loc = Rd*AP_loc and C_oth = Ro*AP_loc */
  ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,MAT_INITIAL_MATRIX,2.0,&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,MAT_INITIAL_MATRIX,2.0,&ptap->C_oth);CHKERRQ(ierr);
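
  /* Overall scheme (a sketch of the intent of this routine): with R = P^T
     split as Rd = Pd^T and Ro = Po^T, C = R*A*P is assembled from
     C_loc = Rd*AP_loc (locally owned rows of C) and C_oth = Ro*AP_loc
     (rows owned by other processes, shipped via MatSetValues() with
     ADD_VALUES in the numeric phase) */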

#if 0
  /* (6) create symbolic parallel matrix Cmpi */
  /*------------------------------------------*/
  PetscInt pn = P->cmap->n,pN = P->cmap->N,*dnz,*onz;
  Mat      Cmpi,C_loc=ptap->C_loc;

  /* estimate dnz, onz arrays */
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  PetscInt i;
  Mat_SeqAIJ *c_loc = (Mat_SeqAIJ*)C_loc->data;
  for (i=0; i<C_loc->rmap->N; i++) {
    printf("%d \n",c_loc->ilen[i]);
    /* dnz[i] = c_loc->ilen[i]; if (c_loc->ilen[i] > pn) dnz[i] = pn; */
    /* onz[i] = c_loc->ilen[i]; if (c_loc->ilen[i] > pN - pn) onz[i] = pN - pn; */
    dnz[i] = pn; onz[i] = pN - pn;
  }

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSetOption(Cmpi,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_FALSE);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); */
  /* ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); */

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  /* Cmpi->assembled = PETSC_FALSE; */
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;
  /* flag 'scalable' determines which implementation is used:
       0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
       1: do sparse axpy in MatPtAPNumeric() - might be slower, but uses a sparse array apa */
  /* set default scalable */
  ptap->scalable = PETSC_FALSE; /* PETSC_TRUE */

  ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
  if (!ptap->scalable) {  /* do dense axpy */
    ierr = PetscCalloc1(P->cmap->N,&ptap->apa);CHKERRQ(ierr);
  } else {
    /* ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr); */
  }

  /* Mat_SeqAIJ *ap=(Mat_SeqAIJ*)(ptap->AP_loc)->data; */
  /* ptap->api  = ap->i;   */
  /* ptap->apj  = ap->j;   */
  /* ptap->rmax = ap_rmax; */
  ptap->merge = NULL;
  *C          = Cmpi;
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_new"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_new(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_PtAPMPI       *ptap = c->ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscMPIInt       rank;
  MPI_Comm          comm;
  const PetscInt    *cols;
  const PetscScalar *vals;
  PetscLogDouble    t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get Rd = Pd^T and Ro = Po^T */
  ierr = PetscTime(&t0);CHKERRQ(ierr);
  ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  ierr = PetscTime(&t1);CHKERRQ(ierr);
  eR = t1 - t0;

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  Mat_SeqAIJ *ap=(Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_INITIAL_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
    ptap->reuse = MAT_REUSE_MATRIX;
  } else { /* update numerical values of P_oth and P_loc */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  Mat_SeqAIJ  *p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  Mat_SeqAIJ  *p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  PetscInt    *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar *apa = ptap->apa;

  api = ap->i;
  apj = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
  }
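
  /* A sketch of the loop above: if row i of AP has column pattern {0,3} with
     api[i]=5, AProw_nonscalable() accumulates A[i,:]*P into the dense work
     array apa, then apa[0] and apa[3] are copied into ap->a[5] and ap->a[6]
     and zeroed so apa is clean for row i+1 (indices illustrative only) */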

  ierr = PetscTime(&t2);CHKERRQ(ierr);
  eAP = t2 - t1;

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Rd,AP_loc,MAT_REUSE_MATRIX,2.0,&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatMatMult_SeqAIJ_SeqAIJ(ptap->Ro,AP_loc,MAT_REUSE_MATRIX,2.0,&ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;
  /* printf("[%d] Co %d, %d\n", rank,Co->rmap->N,Co->cmap->N); */
  ierr = PetscTime(&t3);CHKERRQ(ierr);
  eCseq = t3 - t2;

  /* add C_loc and C_oth to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm = C_loc->rmap->N;
  Mat_SeqAIJ *c_seq;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = rstart + i;
    cols  = c_seq->j + c_seq->i[i];
    vals  = c_seq->a + c_seq->i[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
  }

  /* C_oth -> C, off-processor part */
  /* printf("[%d] p->B %d, %d\n",rank,p->B->rmap->N,p->B->cmap->N); */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    cols  = c_seq->j + c_seq->i[i];
    vals  = c_seq->a + c_seq->i[i];
    /* printf("[%d] row[%d] = %d\n",rank,i,row); */
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  eCmpi = t4 - t3;

  if (rank==1) {
    ierr = PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *a        =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  Mat_SeqAIJ          *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*pti,*ptj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
  PetscReal           afill=1.0,afill_tmp;
  PetscInt            rmax;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D,%D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D,%D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ierr        = PetscNew(&merge);CHKERRQ(ierr);
  ptap->merge = merge;
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
  /*--------------------------------------------------------------------------*/
  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;

  /* create and initialize a linked list */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);
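
  /* A sketch of how lnk behaves (inferred from its use in this file):
     PetscLLCondensedAddSorted() merges column indices into a sorted,
     duplicate-free list, e.g. adding {1,4} and then {2,4} leaves {1,2,4}
     with lnk[0] = 3; PetscLLCondensedClean() copies the indices out and
     reinitializes the list for the next row */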

  /* initial FreeSpace size is fill*(nnz(A) + nnz(P)); this ran out of memory for ex56 with np=8k on Intrepid! */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);

  current_space = free_space;

  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    aj  = ad->j + adi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    aj  = ao->j + aoi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* allocate space for apj, initialize apj, and
     destroy list of free space and other temporary array(s) */
  ierr      = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
  if (afill_tmp > afill) afill = afill_tmp;
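
  /* A worked example of the fill estimate above (numbers illustrative only):
     if nnz(AP) = api[am] = 900 while nnz(A)+nnz(P) = 300, then
     afill_tmp is roughly 900/300 = 3.0, and passing fill >= 3.0 to a later
     MatPtAP() call would avoid the reallocations counted in nspacedouble */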

  /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj) */
  /*------------------------------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* then, compute symbolic Co = (p->B)^T*AP */
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                            >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
  nnz           = fill*(poti[pon] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pot */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr      = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* (3) send j-array (coj) of Co to other processors */
  /*--------------------------------------------------*/
  ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
  len_s        = merge->len_s;
  merge->nsend = 0;

  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;                   /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }
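
  /* e.g. the length 2*(nrows+1) set above matches the buf_si layout in (4):
     1 count entry + nrows row indices + (nrows+1) i-structure offsets
     = 2*nrows + 2 integers per message */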

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  for (i=0; i<merge->nrecv; i++) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
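    /* e.g. (a sketch): sending 2 rows with local indices {3,7} holding 2 and
       5 nonzeros gives buf_si = [2, 3,7, 0,2,7] - the count, the row
       indices, then the CSR-style offsets */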
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of C (mpi mat) */
  /*----------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);

  /* allocate pti array and free space for accumulating nonzero column info */
  ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
  pti[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
  nnz           = fill*(pi_loc[pm] + api[am]);
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j];  /* row of AP == col of Pt */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    pti[i+1] = pti[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr      = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* (6) create symbolic parallel matrix Cmpi */
  /*------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  merge->bi        = pti;      /* Cseq->i */
  merge->bj        = ptj;      /* Cseq->j */
  merge->coi       = coi;      /* Co->i   */
  merge->coj       = coj;      /* Co->j   */
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled      = PETSC_FALSE;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c           = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap     = ptap;
  ptap->api   = api;
  ptap->apj   = apj;
  ptap->rmax  = ap_rmax;
  *C          = Cmpi;

  /* flag 'scalable' determines which implementation is used:
       0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
       1: do sparse axpy in MatPtAPNumeric() - might be slower, but uses a sparse array apa */
  /* set default scalable */
  ptap->scalable = PETSC_FALSE; /* PETSC_TRUE */

  ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
  if (!ptap->scalable) {  /* do dense axpy */
    ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
  } else {
    ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
  }
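
  /* A usage sketch (option name taken from the call above): running with
       ./myapp -matptap_scalable
     selects the sparse-axpy kernel in MatPtAPNumeric(); myapp stands for any
     driver that calls MatPtAP() and is illustrative only */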

#if defined(PETSC_USE_INFO)
  if (pti[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  Mat_PtAPMPI         *ptap;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
  PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
  MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *api,*apj,*coi,*coj;
  PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
  PetscBool           scalable;
#if defined(PTAP_PROFILE)
  PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap = c->ptap;
  if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
  merge    = ptap->merge;
  apa      = ptap->apa;
  scalable = ptap->scalable;

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_INITIAL_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
    ptap->reuse = MAT_REUSE_MATRIX;
  } else { /* update numerical values of P_oth and P_loc */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
  eP   = t1-t0;
#endif
  /*
  printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
         a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
         ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
         ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
   */

  /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);  /* ba: Cseq->a */

  api = ptap->api; apj = ptap->apj;

  if (!scalable) { /* do dense axpy on apa (length pN, stores A[i,:]*P) - nonscalable, but faster (can take about 1/3 of the scalable time) */
    ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
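
    /* A sketch of the dense-axpy scheme below: apa is a dense work array of
       length pN; for each row i of A, A[i,:]*P is accumulated into apa
       (apa[pj[k]] += a_ij*pa[k]), the contributions to Co and Cd are formed
       by reading apa at the known sparsity patterns coj/bj, and finally the
       touched entries of apa are zeroed for the next row */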
#if 0
    /* ------ 10x slower -------------- */
    /*==================================*/
    Mat         R = ptap->R;
    Mat_SeqAIJ  *r = (Mat_SeqAIJ*)R->data;
    PetscInt    *ri=r->i,*rj=r->j,rnz,arow,l,prow,pcol,pN=P->cmap->N;
    PetscScalar *ra=r->a,tmp,cdense[pN];

    ierr = PetscMemzero(cdense,pN*sizeof(PetscScalar));CHKERRQ(ierr);
    for (i=0; i<cm; i++) { /* each row of C or R */
      rnz = ri[i+1] - ri[i];

      for (j=0; j<rnz; j++) { /* each nz of R */
        arow = rj[ri[i] + j];

        /* diagonal portion of A */
        anz  = ad->i[arow+1] - ad->i[arow];
        for (k=0; k<anz; k++) { /* each nz of Ad */
          tmp  = ra[ri[i] + j]*ad->a[ad->i[arow] + k];
          prow = ad->j[ad->i[arow] + k];
          pnz  = pi_loc[prow+1] - pi_loc[prow];

          for (l=0; l<pnz; l++) { /* each nz of P_loc */
            pcol = pj_loc[pi_loc[prow] + l];
            cdense[pcol] += tmp*pa_loc[pi_loc[prow] + l];
          }
        }

        /* off-diagonal portion of A */
        anz  = ao->i[arow+1] - ao->i[arow];
        for (k=0; k<anz; k++) { /* each nz of Ao */
          tmp  = ra[ri[i] + j]*ao->a[ao->i[arow] + k];
          prow = ao->j[ao->i[arow] + k];
          pnz  = pi_oth[prow+1] - pi_oth[prow];

          for (l=0; l<pnz; l++) { /* each nz of P_oth */
            pcol = pj_oth[pi_oth[prow] + l];
            cdense[pcol] += tmp*pa_oth[pi_oth[prow] + l];
          }
        }

      } /* for (j=0; j<rnz; j++) */

      /* copy cdense[] into ca; zero cdense[] */
      cnz = bi[i+1] - bi[i];
      cj  = bj + bi[i];
      ca  = ba + bi[i];
      for (j=0; j<cnz; j++) {
        ca[j] += cdense[cj[j]];
        cdense[cj[j]] = 0.0;
      }
#if 0
      if (rank == 0) {
        printf("[%d] row %d: ",rank,i);
        for (j=0; j<pN; j++) printf(" %g,",cdense[j]);
        printf("\n");
      }
      for (j=0; j<pN; j++) cdense[j]=0.0; /* zero cdense[] */
#endif
    } /* for (i=0; i<cm; i++) */
#endif

    /*==========================================*/

#if defined(PTAP_PROFILE)
    ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
      /*------------------------------------------------------------*/
      apJ = apj + api[i];

      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row = adj[j];
        pnz = pi_loc[row+1] - pi_loc[row];
        pj  = pj_loc + pi_loc[row];
        pa  = pa_loc + pi_loc[row];

        /* perform dense axpy */
        valtmp = ada[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }

      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row = aoj[j];
        pnz = pi_oth[row+1] - pi_oth[row];
        pj  = pj_oth + pi_oth[row];
        pa  = pa_oth + pi_oth[row];

        /* perform dense axpy */
        valtmp = aoa[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      apnz = api[i+1] - api[i];
      /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
      pnz = po->i[i+1] - po->i[i];
      poJ = po->j + po->i[i];
      pA  = po->a + po->i[i];
      for (j=0; j<pnz; j++) {
        row = poJ[j];
        cnz = coi[row+1] - coi[row];
        cj  = coj + coi[row];
        ca  = coa + coi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* put the value into Cd (diagonal part) */
      pnz = pd->i[i+1] - pd->i[i];
      pdJ = pd->j + pd->i[i];
      pA  = pd->a + pd->i[i];
      for (j=0; j<pnz; j++) {
        row = pdJ[j];
        cnz = bi[row+1] - bi[row];
        cj  = bj + bi[row];
        ca  = ba + bi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* zero the current row of A*P */
      for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
#if defined(PTAP_PROFILE)
      ierr   = PetscTime(&t2_2);CHKERRQ(ierr);
      ePtAP += t2_2 - t2_1;
#endif
    }

    if (rank == 100) {
      for (row=0; row<cm; row++) {
        printf("[%d] row %d: ",rank,row);
        cnz = bi[row+1] - bi[row];
        for (j=0; j<cnz; j++) printf(" %g,",ba[bi[row]+j]);
        printf("\n");
      }
    }

  } else { /* do sparse axpy on apa (length ap_rmax, stores A[i,:]*P) - scalable, but slower */
    ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
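
    /* A sketch of the sparse-axpy scheme below: apa holds only the nonzeros
       of the current row of A*P (length <= ap_rmax), and each contribution is
       merged against the row pattern apJ by walking two sorted index lists
       together (the k/nextp and k/nextap loops), so no pN-length dense work
       array is needed */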
    /*-----------------------------------------------------------------------------------------*/
    pA=pa_loc;
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* form i-th sparse row of A*P */
      apnz = api[i+1] - api[i];
      apJ  = apj + api[i];
      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row    = adj[j];
        pnz    = pi_loc[row+1] - pi_loc[row];
        pj     = pj_loc + pi_loc[row];
        pa     = pa_loc + pi_loc[row];
        valtmp = ada[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row    = aoj[j];
        pnz    = pi_oth[row+1] - pi_oth[row];
        pj     = pj_oth + pi_oth[row];
        pa     = pa_oth + pi_oth[row];
        valtmp = aoa[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      pnz = pi_loc[i+1] - pi_loc[i];
      pJ  = pj_loc + pi_loc[i];
      for (j=0; j<pnz; j++) {
        nextap = 0;
        row    = pJ[j]; /* global index */
        if (row < pcstart || row >= pcend) { /* put the value into Co */
          row = *poJ;
          cj  = coj + coi[row];
          ca  = coa + coi[row]; poJ++;
        } else {                             /* put the value into Cd */
          row = *pdJ;
          cj  = bj + bi[row];
          ca  = ba + bi[row]; pdJ++;
        }
        valtmp = pA[j];
        for (k=0; nextap<apnz; k++) {
          if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
        }
        ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
      }
      pA += pnz;
      /* zero the current row info for A*P */
      ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
      ierr   = PetscTime(&t2_2);CHKERRQ(ierr);
      ePtAP += t2_2 - t2_1;
#endif
    }
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;             /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank==1) {
    ierr = PetscPrintf(MPI_COMM_SELF,"  [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}