xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision 968382fd6276caa3ed3a1e5a3f88bb3b5c8dfc7f)

/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
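
/*
   Usage sketch (illustrative; assumes A and B are assembled MPIAIJ matrices
   with compatible layouts). These kernels are reached through the generic
   MatMatMult() interface:

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);            symbolic + numeric product
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);    numeric phase only, reusing the pattern of C
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/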
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>
#include <petsc-private/vecimpl.h>

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[2] = {"scalable","nonscalable"};
  PetscInt       alg=0; /* set default algorithm */

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
      break;
    default:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
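
/*
   Example (hypothetical command line): pick the symbolic algorithm at runtime,
   e.g.

     mpiexec -n 4 ./app -matmatmult_via nonscalable

   "nonscalable" is often faster but allocates a dense work row of length
   B->cmap->N on every process; the default "scalable" algorithm bounds its
   work space by the widest row of the product instead.
*/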

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
  PetscInt       cm   =C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  if (size >1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
  } else {
    pi_oth=NULL; pj_oth=NULL; pa_oth=NULL;
  }

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;
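
  /*
     Dense-row accumulation (illustrative): apa is a dense work row of length
     P->cmap->N that is kept zeroed between rows. Row i of C is accumulated as

       apa[:] += A(i,j) * P[j,:]   for every nonzero A(i,j),

     e.g. a row of A with nonzeros {(j=2,3.0),(j=5,-1.0)} yields
     apa = 3.0*P[2,:] - 1.0*P[5,:], which is scattered into C below and
     re-zeroed entry by entry.
  */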

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];

      /* perform dense axpy */
      valtmp = ada[j];
      for (k=0; k<pnz; k++) {
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];

      /* perform dense axpy */
      valtmp = aoa[j];
      for (k=0; k<pnz; k++) {
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

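    /*
       apJ = apj + api[i] lists the global columns of row i of AP in ascending
       order. C stores its diagonal block (columns in [cstart, cstart+C->cmap->n))
       apart from its off-diagonal block, so the sorted row is copied out in
       three pieces: columns below cstart, the diagonal block, and the remaining
       columns; each entry of apa is zeroed as it is copied out.
    */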
    /* set values in C */
    apJ  = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscBT            lnkbt;
  PetscScalar        *apa;
  PetscReal          afill;
  PetscInt           nlnk_max,armax,prmax;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1((am+2),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax = ad->rmax+ao->rmax;
  if (size > 1) {
    prmax = PetscMax(p_loc->rmax,p_oth->rmax);
  } else {
    prmax = p_loc->rmax;
  }
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);
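
  /*
     Size heuristic (illustrative): a row of A has at most armax nonzeros and
     each referenced row of P at most prmax nonzeros, so a row of AP can have
     at most armax*prmax distinct columns, e.g. armax=5 and prmax=8 bound the
     row width by 40. The bound is capped at pN, the global column count of P,
     which is also the fallback when the product is zero (empty matrices).
  */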

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
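
  /*
     Role of fill (assumed semantics, as documented for MatMatMult()): fill is
     the caller's estimate of nnz(C)/(nnz(A)+nnz(P)). For example fill=2.0 with
     nnz(A)=1000 and nnz(P)=1500 reserves 5000 entries up front; every shortfall
     below doubles the total space and increments nspacedouble, which is
     reported via PetscInfo at the end of this routine.
  */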

  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1((api[am]+1),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);

  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);

  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m     = A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off-processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIDenseScatter"
/*
    Performs an efficient scatter on the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm;
  PetscMPIInt            tag=((PetscObject)ctx)->tag,ncols=B->cmap->N,nrows=aij->B->cmap->n,imdex,nrowsB=B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to number of columns of aij->B %D",workB->rmap->n,nrows);
  sindices = to->indices;
  sstarts  = to->starts;
  sprocs   = to->procs;
  swaits   = contents->swaits;
  svalues  = contents->svalues;

  rindices = from->indices;
  rstarts  = from->starts;
  rprocs   = from->procs;
  rwaits   = contents->rwaits;
  rvalues  = contents->rvalues;

  ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }
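
  /*
     Packing layout (illustrative): PETSc dense arrays are column-major, so
     entry (r,k) of B's local block is b[r + nrowsB*k]. For each destination
     process, every needed row sindices[sstarts[i]+j] is gathered across all
     ncols columns into a contiguous row-by-row send buffer; the unpack loop
     below mirrors this with leading dimension nrows for workB.
  */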

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
  Mat            workB;

  PetscFunctionBegin;
  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off-processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt       cm          = C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap       = c->ptap;
  PetscScalar    *apa_sparse = ptap->apa;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart = C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1,nextp;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  if (size >1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
  } else {
    pi_oth=NULL; pj_oth=NULL; pa_oth=NULL;
  }

  api = ptap->api;
  apj = ptap->apj;
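
  /*
     Sparse-axpy sketch: apa_sparse holds one compressed row of AP aligned with
     the symbolic column list apJ = apj + api[i]. Since apJ and each row of P
     are both sorted, a scaled P row is merged in by advancing k through apJ
     and nextp through the P row; e.g. with apJ = {1,4,7,9} and a P row with
     columns {4,9} only positions k=1 and k=3 are updated. Work space is
     O(max row nnz of AP) instead of the O(P->cmap->N) dense row used by the
     nonscalable variant.
  */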
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid the O(B->cmap->N) memory requirement */
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space = NULL,current_space=NULL;
  Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscInt           nlnk_max,armax,prmax;
  PetscReal          afill;
  PetscScalar        *apa;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i;
    pj_oth = p_oth->j;
  } else {
    pi_oth = NULL;
    pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1((am+2),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax = ad->rmax+ao->rmax;
  if (size > 1) {
    prmax = PetscMax(p_loc->rmax,p_oth->rmax);
  } else {
    prmax = p_loc->rmax;
  }
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);

  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;
    if (apnz > apnz_max) apnz_max = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1((api[am]+1),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* malloc apa for assembling Cmpi */
  ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);

  ptap->apa = apa;
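
  /*
     Contrast with the nonscalable variant: apa needs only apnz_max entries
     (the widest compressed row of AP on this process) rather than a dense row
     of length pN, which keeps the memory footprint independent of the global
     number of columns of P.
  */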
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
  PetscInt       alg=0; /* set default algorithm */

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
      break;
    case 2:
    {
      Mat         Pt;
      Mat_PtAPMPI *ptap;
      Mat_MPIAIJ  *c;
      ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
      ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
      c        = (Mat_MPIAIJ*)(*C)->data;
      ptap     = c->ptap;
      ptap->Pt = Pt;
      (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
      PetscFunctionReturn(0);
    }
    break;
    default:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
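
/*
   Example (hypothetical command line): route C = P^T*A through an explicit
   parallel transpose followed by MatMatMult,

     mpiexec -n 4 ./app -mattransposematmult_via matmatmult

   As the case 2 branch above shows, the transpose Pt is kept in ptap->Pt so
   that the numeric phase can reuse it.
*/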

/* This routine only works when scall=MAT_REUSE_MATRIX! */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *c    = (Mat_MPIAIJ*)C->data;
  Mat_PtAPMPI    *ptap = c->ptap;
  Mat            Pt    = ptap->Pt;

  PetscFunctionBegin;
  ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
  ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Non-scalable version, use dense axpy */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj,*aJ;
  PetscInt            i,j,k,anz,pnz,row,*cj;
  MatScalar           *ada,*aval,*ca,valtmp;
  PetscInt            am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local arrays of C (mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*-------------------------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1((coi[pon]+1),&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1((bi[cm]+1),&ba);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  ierr = PetscCalloc1((A->cmap->N),&aval);CHKERRQ(ierr); /* non-scalable!!! */
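
  /*
     Accumulation sketch: for each local row i of A the update is the outer
     product C += P[i,:]^T * A[i,:], i.e. C(prow,acol) += P(i,prow)*A(i,acol).
     Row i of A is first expanded into the dense work array aval (length
     A->cmap->N, hence the "non-scalable" tag) so that each nonzero P(i,prow)
     can be applied as a dense axpy along the precomputed pattern of row prow
     of Co or Cd.
  */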

  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] to dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++) {
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero out the entries of aval set by the current row of A */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill=1.0,afill_tmp;
  PetscInt            rstart=P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc,*pdt,*pot;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);

  ptap->A_loc = A_loc;

  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = (PetscInt)(fill*(poti[pon] + ai[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  i     = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size;
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of A_loc == col of POt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);

  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);

  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
  ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
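    /*
       Worked example (illustrative): if rows with global indices {3,7} go to
       [proc] with 4 and 6 nonzeros respectively, then len_si[proc]=6 and
       buf_si = [2, r3, r7, 0, 4, 10], where r3,r7 are the row indices shifted
       to be local to [proc] and the last three entries are CSR-style offsets.
    */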
1265     nrows       = len_si[proc]/2 - 1;
1266     buf_si_i    = buf_si + nrows+1;
1267     buf_si[0]   = nrows;
1268     buf_si_i[0] = 0;
1269     nrows       = 0;
1270     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1271       nzi               = coi[i+1] - coi[i];
1272       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1273       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1274       nrows++;
1275     }
1276     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1277     k++;
1278     buf_si += len_si[proc];
1279   }
1280   i = merge->nrecv;
1281   while (i--) {
1282     PetscMPIInt icompleted;
1283     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1284   }
1285   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1286   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1287   ierr = PetscFree(len_si);CHKERRQ(ierr);
1288   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1289   ierr = PetscFree(swaits);CHKERRQ(ierr);
1290   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1291   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1292 
1293   /* compute the local portion of C (mpi mat) */
1294   /*------------------------------------------*/
1295   /* allocate bi array and free space for accumulating nonzero column info */
1296   ierr  = PetscMalloc1((pn+1),&bi);CHKERRQ(ierr);
1297   bi[0] = 0;
1298 
1299   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1300   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1301   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1302   current_space = free_space;
1303 
1304   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1305   for (k=0; k<merge->nrecv; k++) {
1306     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1307     nrows       = *buf_ri_k[k];
1308     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1309     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1310   }
1311 
1312   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1313   rmax = 0;
1314   for (i=0; i<pn; i++) {
1315     /* add pdt[i,:]*AP into lnk */
1316     pnz = pdti[i+1] - pdti[i];
1317     ptJ = pdtj + pdti[i];
1318     for (j=0; j<pnz; j++) {
1319       row  = ptJ[j];  /* row of AP == col of Pt */
1320       anz  = ai[row+1] - ai[row];
1321       Jptr = aj + ai[row];
1322       /* add non-zero cols of AP into the sorted linked list lnk */
1323       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1324     }
1325 
1326     /* add received col data into lnk */
1327     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1328       if (i == *nextrow[k]) { /* i-th row */
1329         nzi  = *(nextci[k]+1) - *nextci[k];
1330         Jptr = buf_rj[k] + *nextci[k];
1331         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1332         nextrow[k]++; nextci[k]++;
1333       }
1334     }
1335     nnz = lnk[0];
1336 
1337     /* if free space is not available, make more free space */
1338     if (current_space->local_remaining<nnz) {
1339       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1340       nspacedouble++;
1341     }
1342     /* copy data into free space, then initialize lnk */
1343     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1344     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1345 
1346     current_space->array           += nnz;
1347     current_space->local_used      += nnz;
1348     current_space->local_remaining -= nnz;
1349 
1350     bi[i+1] = bi[i] + nnz;
1351     if (nnz > rmax) rmax = nnz;
1352   }
1353   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1354 
1355   ierr = PetscMalloc1((bi[pn]+1),&bj);CHKERRQ(ierr);
1356   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1357 
1358   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1359   if (afill_tmp > afill) afill = afill_tmp;
1360   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1361   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1362   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1363 
1364   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1365   /*----------------------------------------------------------------------------------*/
1366   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1367 
1368   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1369   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1370   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1371   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1372   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1373   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1374   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1375   for (i=0; i<pn; i++) {
1376     row  = i + rstart;
1377     nnz  = bi[i+1] - bi[i];
1378     Jptr = bj + bi[i];
1379     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1380   }
1381   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1382   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1383   ierr = PetscFree(vals);CHKERRQ(ierr);
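  /* The explicit zeros inserted above establish the full nonzero pattern of Cmpi, so the
     numeric routine can later fill the same locations with MatSetValues() without
     triggering new allocations (presumably why assembly happens here rather than in the
     numeric part). */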
1384 
1385   merge->bi        = bi;
1386   merge->bj        = bj;
1387   merge->coi       = coi;
1388   merge->coj       = coj;
1389   merge->buf_ri    = buf_ri;
1390   merge->buf_rj    = buf_rj;
1391   merge->owners_co = owners_co;
1392   merge->destroy   = Cmpi->ops->destroy;
1393   merge->duplicate = Cmpi->ops->duplicate;
1394 
1395   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1396   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1397 
1398   /* attach the supporting struct to Cmpi for reuse */
1399   c           = (Mat_MPIAIJ*)Cmpi->data;
1400   c->ptap     = ptap;
1401   ptap->api   = NULL;
1402   ptap->apj   = NULL;
1403   ptap->merge = merge;
1404   ptap->rmax  = rmax;
1405 
1406   *C = Cmpi;
1407 #if defined(PETSC_USE_INFO)
1408   if (bi[pn] != 0) {
1409     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1410     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1411   } else {
1412     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1413   }
1414 #endif
1415   PetscFunctionReturn(0);
1416 }
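/* Hedged usage sketch (compiled out): how a caller reaches the symbolic/numeric pair
   above through the public API.  C = P^T * A is requested once with MAT_INITIAL_MATRIX
   and then recomputed in place with MAT_REUSE_MATRIX; the fill value 2.0 and the
   function name ExampleTransposeMatMult are illustrative only. */
#if 0
static PetscErrorCode ExampleTransposeMatMult(Mat P,Mat A)
{
  PetscErrorCode ierr;
  Mat            C;

  PetscFunctionBegin;
  ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr); /* symbolic + numeric */
  /* ... update the values of A or P, keeping their nonzero patterns ... */
  ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);   /* numeric only */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif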
1417 
1418 #undef __FUNCT__
1419 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1420 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1421 {
1422   PetscErrorCode      ierr;
1423   Mat_Merge_SeqsToMPI *merge;
1424   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1425   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1426   Mat_PtAPMPI         *ptap;
1427   PetscInt            *adj;
1428   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1429   MatScalar           *ada,*ca,valtmp;
1430   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1431   MPI_Comm            comm;
1432   PetscMPIInt         size,rank,taga,*len_s;
1433   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1434   PetscInt            **buf_ri,**buf_rj;
1435   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local arrays of C (mpi mat) */
1436   MPI_Request         *s_waits,*r_waits;
1437   MPI_Status          *status;
1438   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1439   PetscInt            *ai,*aj,*coi,*coj;
1440   PetscInt            *poJ,*pdJ;
1441   Mat                 A_loc;
1442   Mat_SeqAIJ          *a_loc;
1443 
1444   PetscFunctionBegin;
1445   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1446   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1447   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1448 
1449   ptap  = c->ptap;
1450   merge = ptap->merge;
1451 
1452   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1453   /*------------------------------------------*/
1454   /* get data from symbolic products */
1455   coi    = merge->coi; coj = merge->coj;
1456   ierr   = PetscCalloc1((coi[pon]+1),&coa);CHKERRQ(ierr);
1457   bi     = merge->bi; bj = merge->bj;
1458   owners = merge->rowmap->range;
1459   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1460 
1461   /* get A_loc by taking all local rows of A */
1462   A_loc = ptap->A_loc;
1463   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1464   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1465   ai    = a_loc->i;
1466   aj    = a_loc->j;
1467 
1468   for (i=0; i<am; i++) {
1469     anz = ai[i+1] - ai[i];
1470     adj = aj + ai[i];
1471     ada = a_loc->a + ai[i];
1472 
1473     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1474     /*-------------------------------------------------------------*/
1475     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1476     pnz = po->i[i+1] - po->i[i];
1477     poJ = po->j + po->i[i];
1478     pA  = po->a + po->i[i];
1479     for (j=0; j<pnz; j++) {
1480       row = poJ[j];
1481       cj  = coj + coi[row];
1482       ca  = coa + coi[row];
1483       /* perform sparse axpy */
1484       nexta  = 0;
1485       valtmp = pA[j];
1486       for (k=0; nexta<anz; k++) {
1487         if (cj[k] == adj[nexta]) {
1488           ca[k] += valtmp*ada[nexta];
1489           nexta++;
1490         }
1491       }
1492       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1493     }
1494 
1495     /* put the value into Cd (diagonal part) */
1496     pnz = pd->i[i+1] - pd->i[i];
1497     pdJ = pd->j + pd->i[i];
1498     pA  = pd->a + pd->i[i];
1499     for (j=0; j<pnz; j++) {
1500       row = pdJ[j];
1501       cj  = bj + bi[row];
1502       ca  = ba + bi[row];
1503       /* perform sparse axpy */
1504       nexta  = 0;
1505       valtmp = pA[j];
1506       for (k=0; nexta<anz; k++) {
1507         if (cj[k] == adj[nexta]) {
1508           ca[k] += valtmp*ada[nexta];
1509           nexta++;
1510         }
1511       }
1512       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1513     }
1514   }
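  /* Both sparse-axpy passes above rely on the symbolic pattern being a superset of the
     indices touched here: cj (a row of Co or of C's local part) and adj (a row of A_loc)
     are sorted, and every adj entry was merged into the pattern during the symbolic
     phase, so each two-pointer scan exits with nexta == anz. */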
1515 
1516   /* 3) send and recv matrix values coa */
1517   /*------------------------------------*/
1518   buf_ri = merge->buf_ri;
1519   buf_rj = merge->buf_rj;
1520   len_s  = merge->len_s;
1521   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1522   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1523 
1524   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1525   for (proc=0,k=0; proc<size; proc++) {
1526     if (!len_s[proc]) continue;
1527     i    = merge->owners_co[proc];
1528     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1529     k++;
1530   }
1531   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1532   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1533 
1534   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1535   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1536   ierr = PetscFree(coa);CHKERRQ(ierr);
1537 
1538   /* 4) insert local Cseq and received values into Cmpi */
1539   /*----------------------------------------------------*/
1540   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1541   for (k=0; k<merge->nrecv; k++) {
1542     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1543     nrows       = *(buf_ri_k[k]);
1544     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1545     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the offsets (i-structure) of the k-th received message */
1546   }
1547 
1548   for (i=0; i<cm; i++) {
1549     row  = owners[rank] + i; /* global row index of C_seq */
1550     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1551     ba_i = ba + bi[i];
1552     bnz  = bi[i+1] - bi[i];
1553     /* add received vals into ba */
1554     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1555       /* i-th row */
1556       if (i == *nextrow[k]) {
1557         cnz    = *(nextci[k]+1) - *nextci[k];
1558         cj     = buf_rj[k] + *(nextci[k]);
1559         ca     = abuf_r[k] + *(nextci[k]);
1560         nextcj = 0;
1561         for (j=0; nextcj<cnz; j++) {
1562           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1563             ba_i[j] += ca[nextcj++];
1564           }
1565         }
1566         nextrow[k]++; nextci[k]++;
1567         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1568       }
1569     }
1570     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1571   }
1572   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1573   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1574 
1575   ierr = PetscFree(ba);CHKERRQ(ierr);
1576   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1577   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1578   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1579   PetscFunctionReturn(0);
1580 }
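/* Illustrative sketch (compiled out, hypothetical name SparseAXPY): the inner kernel
   used twice in the numeric routine above, y(cj) += valtmp * x(adj), written as a
   standalone routine.  It assumes cj and adj are sorted and that adj is a subset of
   cj, exactly the invariant the symbolic phase establishes. */
#if 0
static void SparseAXPY(PetscInt anz,const PetscInt *adj,const MatScalar *ada,MatScalar valtmp,
                       const PetscInt *cj,MatScalar *ca)
{
  PetscInt k,nexta = 0;

  for (k=0; nexta<anz; k++) {          /* advance through the (longer) pattern row */
    if (cj[k] == adj[nexta]) {         /* match: accumulate the contribution */
      ca[k] += valtmp*ada[nexta];
      nexta++;
    }
  }
}
#endif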
1581 
1582 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1583    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using PetscLLCondensedCreate_Scalable() */
1584 #undef __FUNCT__
1585 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1586 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1587 {
1588   PetscErrorCode      ierr;
1589   Mat                 Cmpi,A_loc,POt,PDt;
1590   Mat_PtAPMPI         *ptap;
1591   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1592   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1593   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1594   PetscInt            nnz;
1595   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1596   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1597   MPI_Comm            comm;
1598   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1599   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1600   PetscInt            len,proc,*dnz,*onz,*owners;
1601   PetscInt            nzi,*bi,*bj;
1602   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1603   MPI_Request         *swaits,*rwaits;
1604   MPI_Status          *sstatus,rstatus;
1605   Mat_Merge_SeqsToMPI *merge;
1606   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1607   PetscReal           afill  =1.0,afill_tmp;
1608   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1609   PetscScalar         *vals;
1610   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1611 
1612   PetscFunctionBegin;
1613   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1614   /* check if matrix local sizes are compatible */
1615   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1616     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1617   }
1618 
1619   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1620   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1621 
1622   /* create struct Mat_PtAPMPI and attach it to C later */
1623   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1624 
1625   /* get A_loc by taking all local rows of A */
1626   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1627 
1628   ptap->A_loc = A_loc;
1629   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1630   ai          = a_loc->i;
1631   aj          = a_loc->j;
1632 
1633   /* determine symbolic Co=(p->B)^T*A - send to others */
1634   /*----------------------------------------------------*/
1635   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1636   pdt  = (Mat_SeqAIJ*)PDt->data;
1637   pdti = pdt->i; pdtj = pdt->j;
1638 
1639   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1640   pot  = (Mat_SeqAIJ*)POt->data;
1641   poti = pot->i; potj = pot->j;
1642 
1643   /* then, compute symbolic Co = (p->B)^T*A */
1644   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1645                          >= (num of nonzero rows of C_seq) - pn */
1646   ierr   = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr);
1647   coi[0] = 0;
1648 
1649   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1650   nnz           = fill*(poti[pon] + ai[am]);
1651   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1652   current_space = free_space;
1653 
1654   /* create and initialize a linked list */
1655   i     = PetscMax(pdt->rmax,pot->rmax);
1656   Crmax = i*a_loc->rmax*size; /* non-scalable! */
1657   if (!Crmax || Crmax > aN) Crmax = aN;
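  /* Worked example with hypothetical counts: max(pdt->rmax,pot->rmax) = 5 nonzeros per
     Pt row, a_loc->rmax = 7 nonzeros per A_loc row and size = 4 processes give
     Crmax = 5*7*4 = 140; if A has only aN = 100 global columns the bound is clamped to
     100, since no row of C can exceed the global column count. */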
1658   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
1659 
1660   for (i=0; i<pon; i++) {
1661     pnz = poti[i+1] - poti[i];
1662     ptJ = potj + poti[i];
1663     for (j=0; j<pnz; j++) {
1664       row  = ptJ[j]; /* row of A_loc == col of Pot */
1665       anz  = ai[row+1] - ai[row];
1666       Jptr = aj + ai[row];
1667       /* add non-zero cols of AP into the sorted linked list lnk */
1668       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1669     }
1670     nnz = lnk[0];
1671 
1672     /* If free space is not available, double the total space in the list */
1673     if (current_space->local_remaining<nnz) {
1674       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1675       nspacedouble++;
1676     }
1677 
1678     /* copy data into free space, then reinitialize lnk */
1679     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1680 
1681     current_space->array           += nnz;
1682     current_space->local_used      += nnz;
1683     current_space->local_remaining -= nnz;
1684 
1685     coi[i+1] = coi[i] + nnz;
1686   }
1687 
1688   ierr = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr);
1689   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1690 
1691   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1692   if (afill_tmp > afill) afill = afill_tmp;
1693 
1694   /* send j-array (coj) of Co to other processors */
1695   /*----------------------------------------------*/
1696   /* determine row ownership */
1697   ierr = PetscNew(&merge);CHKERRQ(ierr);
1698   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1699 
1700   merge->rowmap->n  = pn;
1701   merge->rowmap->bs = 1;
1702 
1703   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1704   owners = merge->rowmap->range;
1705 
1706   /* determine the number of messages to send, their lengths */
1707   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1708   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1709 
1710   len_s        = merge->len_s;
1711   merge->nsend = 0;
1712 
1713   ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr);
1714   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1715 
1716   proc = 0;
1717   for (i=0; i<pon; i++) {
1718     while (prmap[i] >= owners[proc+1]) proc++;
1719     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1720     len_s[proc] += coi[i+1] - coi[i];
1721   }
1722 
1723   len          = 0; /* max length of buf_si[] */
1724   owners_co[0] = 0;
1725   for (proc=0; proc<size; proc++) {
1726     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1727     if (len_si[proc]) {
1728       merge->nsend++;
1729       len_si[proc] = 2*(len_si[proc] + 1);
1730       len         += len_si[proc];
1731     }
1732   }
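  /* len_si[proc] is rescaled above because each i-structure message carries one row
     count, nrows local row indices and nrows+1 CSR-style offsets:
     1 + nrows + (nrows + 1) = 2*(nrows + 1) integers in total. */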
1733 
1734   /* determine the number and length of messages to receive for coi and coj  */
1735   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1736   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1737 
1738   /* post the Irecv and Isend of coj */
1739   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1740   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1741   ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr);
1742   for (proc=0, k=0; proc<size; proc++) {
1743     if (!len_s[proc]) continue;
1744     i    = owners_co[proc];
1745     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1746     k++;
1747   }
1748 
1749   /* receives and sends of coj are complete */
1750   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1751   for (i=0; i<merge->nrecv; i++) {
1752     PetscMPIInt icompleted;
1753     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1754   }
1755   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1756   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1757 
1758   /* send and recv coi */
1759   /*-------------------*/
1760   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1761   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1762   ierr   = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr);
1763   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1764   for (proc=0,k=0; proc<size; proc++) {
1765     if (!len_s[proc]) continue;
1766     /* form outgoing message for i-structure:
1767          buf_si[0]:                 nrows to be sent
1768                [1:nrows]:           row index (global)
1769                [nrows+1:2*nrows+1]: i-structure index
1770     */
1771     /*-------------------------------------------*/
1772     nrows       = len_si[proc]/2 - 1;
1773     buf_si_i    = buf_si + nrows+1;
1774     buf_si[0]   = nrows;
1775     buf_si_i[0] = 0;
1776     nrows       = 0;
1777     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1778       nzi               = coi[i+1] - coi[i];
1779       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1780       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1781       nrows++;
1782     }
1783     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1784     k++;
1785     buf_si += len_si[proc];
1786   }
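  /* Hypothetical example of one outgoing i-structure: sending nrows = 2 rows with local
     indices 3 and 7 carrying 4 and 2 nonzeros respectively yields
         buf_si = {2, 3, 7, 0, 4, 6}
     i.e. {nrows, rows..., offsets...}; the 6 matching column indices travel in the
     separate coj message posted earlier. */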
1787   i = merge->nrecv;
1788   while (i--) {
1789     PetscMPIInt icompleted;
1790     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1791   }
1792   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1793   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1794   ierr = PetscFree(len_si);CHKERRQ(ierr);
1795   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1796   ierr = PetscFree(swaits);CHKERRQ(ierr);
1797   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1798   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1799 
1800   /* compute the local portion of C (mpi mat) */
1801   /*------------------------------------------*/
1802   /* allocate bi array and free space for accumulating nonzero column info */
1803   ierr  = PetscMalloc1((pn+1),&bi);CHKERRQ(ierr);
1804   bi[0] = 0;
1805 
1806   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1807   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1808   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1809   current_space = free_space;
1810 
1811   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1812   for (k=0; k<merge->nrecv; k++) {
1813     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1814     nrows       = *buf_ri_k[k];
1815     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1816     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the offsets (i-structure) of the k-th received message */
1817   }
1818 
1819   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1820   rmax = 0;
1821   for (i=0; i<pn; i++) {
1822     /* add pdt[i,:]*AP into lnk */
1823     pnz = pdti[i+1] - pdti[i];
1824     ptJ = pdtj + pdti[i];
1825     for (j=0; j<pnz; j++) {
1826       row  = ptJ[j];  /* row of AP == col of Pt */
1827       anz  = ai[row+1] - ai[row];
1828       Jptr = aj + ai[row];
1829       /* add non-zero cols of AP into the sorted linked list lnk */
1830       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1831     }
1832 
1833     /* add received col data into lnk */
1834     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1835       if (i == *nextrow[k]) { /* i-th row */
1836         nzi  = *(nextci[k]+1) - *nextci[k];
1837         Jptr = buf_rj[k] + *nextci[k];
1838         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1839         nextrow[k]++; nextci[k]++;
1840       }
1841     }
1842     nnz = lnk[0];
1843 
1844     /* if free space is not available, make more free space */
1845     if (current_space->local_remaining<nnz) {
1846       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1847       nspacedouble++;
1848     }
1849     /* copy data into free space, then initialize lnk */
1850     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1851     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1852 
1853     current_space->array           += nnz;
1854     current_space->local_used      += nnz;
1855     current_space->local_remaining -= nnz;
1856 
1857     bi[i+1] = bi[i] + nnz;
1858     if (nnz > rmax) rmax = nnz;
1859   }
1860   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1861 
1862   ierr      = PetscMalloc1((bi[pn]+1),&bj);CHKERRQ(ierr);
1863   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1864   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1865   if (afill_tmp > afill) afill = afill_tmp;
1866   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1867   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1868   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1869 
1870   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the numeric part? */
1871   /*----------------------------------------------------------------------------------*/
1872   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1873 
1874   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1875   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1876   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1877   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1878   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1879   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1880   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1881   for (i=0; i<pn; i++) {
1882     row  = i + rstart;
1883     nnz  = bi[i+1] - bi[i];
1884     Jptr = bj + bi[i];
1885     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1886   }
1887   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1888   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1889   ierr = PetscFree(vals);CHKERRQ(ierr);
1890 
1891   merge->bi        = bi;
1892   merge->bj        = bj;
1893   merge->coi       = coi;
1894   merge->coj       = coj;
1895   merge->buf_ri    = buf_ri;
1896   merge->buf_rj    = buf_rj;
1897   merge->owners_co = owners_co;
1898   merge->destroy   = Cmpi->ops->destroy;
1899   merge->duplicate = Cmpi->ops->duplicate;
1900 
1901   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1902   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1903 
1904   /* attach the supporting struct to Cmpi for reuse */
1905   c = (Mat_MPIAIJ*)Cmpi->data;
1906 
1907   c->ptap     = ptap;
1908   ptap->api   = NULL;
1909   ptap->apj   = NULL;
1910   ptap->merge = merge;
1911   ptap->rmax  = rmax;
1912   ptap->apa   = NULL;
1913 
1914   *C = Cmpi;
1915 #if defined(PETSC_USE_INFO)
1916   if (bi[pn] != 0) {
1917     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1918     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1919   } else {
1920     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1921   }
1922 #endif
1923   PetscFunctionReturn(0);
1924 }
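/* Hedged usage note: running an application with the standard PETSc option -info
   (e.g. mpiexec -n 4 ./app -info | grep Fill) surfaces the PetscInfo() reports above,
   and the reported "needed" fill can be fed back as the fill argument of
   MatTransposeMatMult() to avoid reallocations on subsequent runs. */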
1925