
/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
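
/*
   A minimal usage sketch (illustrative only; the variable names are hypothetical).
   Assuming A and B are assembled MATMPIAIJ matrices with compatible layouts,
   the routines in this file are reached through MatMatMult():

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr); /* reuses the symbolic data */
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/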
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>
#include <petsc/private/vecimpl.h>

#if defined(PETSC_HAVE_HYPRE)
PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
#endif

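/*
   Dispatch routine: the symbolic algorithm ("scalable", "nonscalable", or,
   when PETSc is built with hypre, "hypre") is selected by the -matmatmult_via
   option before the numeric phase runs. For example, a hypothetical run
   forcing the memory-scalable variant:

     mpiexec -n 4 ./app -matmatmult_via scalable
*/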
PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
{
  PetscErrorCode ierr;
#if defined(PETSC_HAVE_HYPRE)
  const char     *algTypes[3] = {"scalable","nonscalable","hypre"};
  PetscInt       nalg = 3;
#else
  const char     *algTypes[2] = {"scalable","nonscalable"};
  PetscInt       nalg = 2;
#endif
  PetscInt       alg = 1; /* set default algorithm */
  MPI_Comm       comm;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
    if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);

    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[1],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
      break;
#if defined(PETSC_HAVE_HYPRE)
    case 2:
      ierr = MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);CHKERRQ(ierr);
      break;
#endif
    default:
      ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);

  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}

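/*
   Sketch of the nonscalable numeric kernel below (explanatory, not extra API):
   each row i of C is accumulated into the dense scratch row apa of length
   P->cmap->N, apa = A[i,:]*P, and then scattered back into C in three
   contiguous segments, since the off-diagonal block of C stores the global
   columns < cstart first and the remaining off-diagonal columns last:

     [ C_off: cols < cstart | C_diag: cdnz cols | C_off: remaining cols ]

   Entries of apa are zeroed as they are copied out, leaving the scratch row
   clean for the next i.
*/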
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscScalar    *cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscScalar    *apa,*ca;
  PetscInt       cm   =C->rmap->n;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscInt       *api,*apj,*apJ,i,k;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = NULL;
  if (size > 1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* compute apa = A[i,:]*P */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);

    /* set values in C */
    apJ  = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k++]] = 0.0;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k++]] = 0.0;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k++]] = 0.0;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

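/*
   The symbolic phase below predicts the sparsity pattern of C = A*P and
   reports a fill ratio through MatInfo. As a worked example of the estimate
   computed near the end of the routine (numbers are hypothetical): with
   nnz(AP) = api[am] = 900 and adi[am]+aoi[am]+pi_loc[pm] = 300,
   afill = 900/301 + 1e-5, roughly 3.0, so passing fill >= 3.0 on a later
   MAT_INITIAL_MATRIX call would avoid reallocations.
*/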
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=NULL,current_space=NULL;
  Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscBT            lnkbt;
  PetscScalar        *apa;
  PetscReal          afill;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);

  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);

  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy        = Cmpi->ops->destroy;
  ptap->duplicate      = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

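/*
   AIJ * Dense product. The symbolic routine attaches a "workB" container to
   C: a sequential dense matrix holding the off-process rows of B plus
   send/receive buffers, all reused on every numeric call. A minimal usage
   sketch (illustrative; B must be MATMPIDENSE):

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
*/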
PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;

PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    This is a "dummy function" that handles the case where matrix C was created as a dense matrix
  directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option

  It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except it does not create C
*/
PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode         ierr;
  PetscBool              flg;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");

  /* Handle case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");

  C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);

  ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz   = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m     = A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;

  ierr = PetscNew(&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
                      B->cmap->N*to->starts[to->n],&contents->svalues,
                      from->n,&contents->rwaits,
                      to->n,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Performs an efficient scatter on the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
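/*
   A note on the packing below (illustrative): PETSc dense matrices are
   stored column-major, so entry (r,c) of B lives at b[r + nrowsB*c]. For
   each needed row index sindices[s] the send loop gathers all ncols entries,

     svalues[ncols*s + k] = b[sindices[s] + nrowsB*k],   k = 0,...,ncols-1,

   and the receive side unpacks with the mirrored indexing into workB.
*/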
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx   = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm;
  PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of columns of aij->B %D not equal to number of rows of workB %D",nrows,workB->rmap->n);
  sindices = to->indices;
  sstarts  = to->starts;
  sprocs   = to->procs;
  swaits   = contents->swaits;
  svalues  = contents->svalues;

  rindices = from->indices;
  rstarts  = from->starts;
  rprocs   = from->procs;
  rwaits   = contents->rwaits;
  rvalues  = contents->rvalues;

  ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

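/*
   Numeric phase of the AIJ * Dense product. The local block row of C is
   built in two pieces,

     C_loc = A_diag * B_loc + A_off * B_oth,

   where B_oth (the rows of B owned by other processes that A_off needs) is
   fetched into the cached workB matrix by MatMPIDenseScatter().
*/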
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
  Mat            workB;

  PetscFunctionBegin;
  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

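/*
   Scalable numeric kernel: instead of a dense scratch row of length
   P->cmap->N, row i of AP is accumulated directly into the compressed array
   apa_sparse, which matches the symbolic pattern apj[api[i]..api[i+1]).
   Each contribution A(i,row)*P[row,:] is merged with a sparse axpy: both
   apJ and pj are sorted, so a single forward scan over apJ locates every
   matching column of P.
*/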
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt       cm          = C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap       = c->ptap;
  PetscScalar    *apa_sparse = ptap->apa;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart = C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1,nextp;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
  if (size > 1) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
  } else {
    p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
  }

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++) {
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++) {
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++) {
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable LLCondensed variant to avoid the O(P->cmap->N) dense-workspace memory requirement */
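/*
   The pattern is computed in two passes: a PetscTable pass first bounds the
   widest row (apnz_max), then a condensed linked list of that size merges
   the rows of P selected by each row of A. Only O(apnz_max) scratch space is
   needed, in contrast to the dense scratch row of the nonscalable variant.
*/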
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm;
  PetscMPIInt        size;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space = NULL,current_space=NULL;
  Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscReal          afill;
  PetscScalar        *apa;
  PetscTable         ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  if (size > 1) {
    p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
    pi_oth = p_oth->i; pj_oth = p_oth->j;
  } else {
    p_oth  = NULL;
    pi_oth = NULL; pj_oth = NULL;
  }

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr);

  /* Calculate apnz_max */
  apnz_max = 0;
  for (i=0; i<am; i++) {
    ierr = PetscTableRemoveAll(ta);CHKERRQ(ierr);
    /* diagonal portion of A */
    nzi  = adi[i+1] - adi[i];
    Jptr = adj+adi[i];  /* cols of A_diag */
    MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
    ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
    if (apnz_max < apnz) apnz_max = apnz;

    /*  off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    Jptr = aoj+aoi[i];  /* cols of A_off */
    MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
    ierr = PetscTableGetCount(ta,&apnz);CHKERRQ(ierr);
    if (apnz_max < apnz) apnz_max = apnz;
  }
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  ierr = PetscLLCondensedCreate_Scalable(apnz_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;
  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++) {
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++) {
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* malloc apa for assembly Cmpi */
  ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);

  ptap->apa = apa;
  for (i=0; i<am; i++) {
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
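/*
   Dispatch for C = P^T * A. The algorithm is chosen with
   -mattransposematmult_via {scalable,nonscalable,matmatmult}; for example, a
   hypothetical run forcing the explicit-transpose path:

     mpiexec -n 4 ./app -mattransposematmult_via matmatmult

   which forms Pt = P^T once and then reuses the MatMatMult() machinery.
*/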
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
  PetscInt       alg=0; /* set default algorithm */

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnd();CHKERRQ(ierr);

    ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
    switch (alg) {
    case 1:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
      break;
    case 2:
    {
      Mat         Pt;
      Mat_PtAPMPI *ptap;
      Mat_MPIAIJ  *c;
      ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
      ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
      c        = (Mat_MPIAIJ*)(*C)->data;
      ptap     = c->ptap;
      ptap->Pt = Pt;
      (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
      PetscFunctionReturn(0);
    }
      break;
    default:
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
      break;
    }
    ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* This routine only works when scall=MAT_REUSE_MATRIX! */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
  Mat_PtAPMPI    *ptap= c->ptap;
  Mat            Pt=ptap->Pt;

  PetscFunctionBegin;
  ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
  ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Non-scalable version; uses dense axpy */
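/*
   The routine below computes C = P^T * A row by row of A as a sum of outer
   products,

     C = sum_i P[i,:]^T * A[i,:],

   spreading row i of A into the dense scratch array aval of length A->cmap->N
   (hence "non-scalable") so that each nonzero P(i,j) can scale it with a
   dense axpy into row j of C.
*/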
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj,*aJ;
  PetscInt            i,j,k,anz,pnz,row,*cj;
  MatScalar           *ada,*aval,*ca,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */

  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] to dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++) {
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero the current row of Pt*A */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
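/*
   Outline of the phases below (a summary, matching the code): (1) form A_loc
   and the symbolic transposes of P's diagonal and off-diagonal blocks;
   (2) build Co = (p->B)^T*A, whose rows belong to other processes, and ship
   its i/j structure with MPI_Isend/MPI_Irecv; (3) merge the locally owned
   part with the received rows through a condensed linked list to preallocate
   and assemble the symbolic Cmpi.
*/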
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill  =1.0,afill_tmp;
  PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc, *pdt,*pot;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);

  ptap->A_loc = A_loc;

  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);

  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);

  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
  ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
1200   for (i=0; i<pon; i++) {
1201     while (prmap[i] >= owners[proc+1]) proc++;
1202     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1203     len_s[proc] += coi[i+1] - coi[i];
1204   }
1205 
1206   len          = 0; /* max length of buf_si[] */
1207   owners_co[0] = 0;
1208   for (proc=0; proc<size; proc++) {
1209     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1210     if (len_si[proc]) {
1211       merge->nsend++;
1212       len_si[proc] = 2*(len_si[proc] + 1);
1213       len         += len_si[proc];
1214     }
1215   }
1216 
1217   /* determine the number and length of messages to receive for coi and coj  */
1218   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1219   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1220 
1221   /* post the Irecv and Isend of coj */
1222   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1223   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1224   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1225   for (proc=0, k=0; proc<size; proc++) {
1226     if (!len_s[proc]) continue;
1227     i    = owners_co[proc];
1228     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1229     k++;
1230   }
1231 
1232   /* receives and sends of coj are complete */
1233   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1234   for (i=0; i<merge->nrecv; i++) {
1235     PetscMPIInt icompleted;
1236     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1237   }
1238   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1239   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1240 
1241   /* send and recv coi */
1242   /*-------------------*/
1243   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1244   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1245   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1246   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1247   for (proc=0,k=0; proc<size; proc++) {
1248     if (!len_s[proc]) continue;
1249     /* form outgoing message for i-structure:
1250          buf_si[0]:                 nrows to be sent
1251                [1:nrows]:           row index (global)
1252                [nrows+1:2*nrows+1]: i-structure index
1253     */
1254     /*-------------------------------------------*/
1255     nrows       = len_si[proc]/2 - 1;
1256     buf_si_i    = buf_si + nrows+1;
1257     buf_si[0]   = nrows;
1258     buf_si_i[0] = 0;
1259     nrows       = 0;
1260     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1261       nzi               = coi[i+1] - coi[i];
1262       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1263       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1264       nrows++;
1265     }
1266     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1267     k++;
1268     buf_si += len_si[proc];
1269   }
1270   i = merge->nrecv;
1271   while (i--) {
1272     PetscMPIInt icompleted;
1273     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1274   }
1275   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1276   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1277   ierr = PetscFree(len_si);CHKERRQ(ierr);
1278   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1279   ierr = PetscFree(swaits);CHKERRQ(ierr);
1280   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1281   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1282 
1283   /* compute the local portion of C (mpi mat) */
1284   /*------------------------------------------*/
1285   /* allocate bi array and free space for accumulating nonzero column info */
1286   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1287   bi[0] = 0;
1288 
1289   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1290   nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1291   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1292   current_space = free_space;
1293 
1294   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1295   for (k=0; k<merge->nrecv; k++) {
1296     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1297     nrows       = *buf_ri_k[k];
1298     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1299     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1300   }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*A into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j];  /* row of A == col of Pt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add nonzero cols of A[row,:] into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
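
  /* Illustrative sketch (toy numbers, assumption-labeled): the growth policy of the
     free-space list used above.  On a miss, a chunk of size nnz + total_array_size
     is requested, so capacity at least doubles per reallocation and nspacedouble
     stays logarithmic in the final nnz(C). */
#if 0
  {
    PetscInt capacity = 8,row_nnz = 5;                      /* pretend sizes, not real data */
    while (capacity < 1000) capacity += row_nnz + capacity; /* 8 -> 21 -> 47 -> ...         */
  }
#endif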

  ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);

  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi: the nonzero pattern is assembled here, with zero values, so the numeric phase only fills existing entries */
  /*--------------------------------------------------------------------------------------------------------------------------------------------------*/
  ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    row  = i + rstart;
    nnz  = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi        = bi;
  merge->bj        = bj;
  merge->coi       = coi;
  merge->coj       = coj;
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;

  /* attach the supporting struct to Cmpi for reuse */
  c           = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap     = ptap;
  ptap->api   = NULL;
  ptap->apj   = NULL;
  ptap->merge = merge;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->duplicate = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj;
  PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
  MatScalar           *ada,*ca,valtmp;
  PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ,*pdJ;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc */
  /*------------------------------------------*/
  /* get data from symbolic products */
  coi    = merge->coi; coj = merge->coj;
  ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  for (i=0; i<am; i++) {
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*-------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++) {
      row = poJ[j];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++) {
      row = pdJ[j];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }
  }
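
  /* Illustrative sketch (toy data, assumption-labeled): the sparse axpy above.
     Because the column pattern of A[i,:] is a subset of the symbolic pattern of
     the target row, a single forward scan (cursor k over cj, cursor nexta over
     adj) suffices; no search or sort is needed. */
#if 0
  {
    PetscInt  cjx[5]  = {0,2,5,7,9};        /* symbolic columns of target row    */
    MatScalar cax[5]  = {0,0,0,0,0};
    PetscInt  adjx[3] = {2,7,9};            /* columns of A[i,:] (subset of cjx) */
    MatScalar adax[3] = {1.0,2.0,3.0};
    PetscInt  kk,na = 0;
    MatScalar v = 4.0;                      /* P entry, plays the role of pA[j]  */
    for (kk=0; na<3; kk++) {
      if (cjx[kk] == adjx[na]) cax[kk] += v*adax[na++];
    }
    /* cax is now {0,4,0,8,12}: one outer-product term accumulated into the row */
  }
#endif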

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ(); it differs from
   MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using PetscLLCondensedCreate_Scalable() */
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*a=(Mat_MPIAIJ*)A->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am  =A->rmap->n,pn=P->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill  =1.0,afill_tmp;
  PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc,*pdt,*pot;
  PetscTable          ta;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);

  ptap->A_loc = A_loc;
  a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
  ai          = a_loc->i;
  aj          = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                               >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(A->cmap->n + a->B->cmap->N,aN,&ta);CHKERRQ(ierr);
  MatRowMergeMax_SeqAIJ(a_loc,am,ta);
  ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);

  ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
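
  /* Illustrative sketch (assumption-labeled, toy sizes): why Armax is a safe bound
     for the scalable condensed list.  The table collects the distinct columns
     reachable by merging rows of A_loc, and every symbolic row of Pt*A is a union
     of such rows, so no row can exceed Armax entries. */
#if 0
  {
    /* toy version with 8 global columns; a bitmap stands in for PetscTable */
    PetscInt seen[8] = {0,0,0,0,0,0,0,0},distinct = 0,r,jj;
    for (r=0; r<am; r++) {
      for (jj=ai[r]; jj<ai[r+1]; jj++) {
        if (!seen[aj[jj]]) { seen[aj[jj]] = 1; distinct++; }
      }
    }
    /* distinct plays the role of Armax: an upper bound on any merged row width */
  }
#endif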

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add nonzero cols of A_loc[row,:] into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */

  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);

  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
  ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);

  len_s        = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* total length of buf_s[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* one count, nrows row indices, and nrows+1 i-structure offsets */
      len         += len_si[proc];
    }
  }
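
  /* Worked example for the sizing above: if 3 rows of Co go to one process, the
     i-structure message carries 1 count + 3 row indices + (3+1) offsets
     = 2*(3+1) = 8 integers, which is exactly len_si[proc] after the doubling. */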

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* add received column indices into table to update Armax */
  /* Armax can be as large as aN if a P[row,:] is dense, see src/ksp/ksp/examples/tutorials/ex56.c! */
  for (k=0; k<merge->nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<merge->len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi               = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
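
  /* Illustrative sketch (toy data, assumption-labeled): one message as packed above.
     Sending 2 rows with 3 and 1 nonzeros gives 2*(2+1) = 6 integers: a count, two
     local row indices, and three cumulative offsets the receiver uses to slice the
     matching coj data. */
#if 0
  {
    PetscInt msg[6];
    msg[0] = 2;             /* nrows                        */
    msg[1] = 5; msg[2] = 9; /* local row indices r0, r1     */
    msg[3] = 0; msg[4] = 3; /* i-structure: row r0 has 3... */
    msg[5] = 4;             /* ...and row r1 has 1 column   */
  }
#endif
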
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(A)) */
  nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*A into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j];  /* row of A == col of Pt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add nonzero cols of A[row,:] into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
  ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi: the nonzero pattern is assembled here, with zero values, so the numeric phase only fills existing entries */
  /*--------------------------------------------------------------------------------------------------------------------------------------------------*/
  ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    row  = i + rstart;
    nnz  = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi        = bi;
  merge->bj        = bj;
  merge->coi       = coi;
  merge->coj       = coj;
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;

  c->ptap     = ptap;
  ptap->api   = NULL;
  ptap->apj   = NULL;
  ptap->merge = merge;
  ptap->apa   = NULL;
  ptap->destroy   = Cmpi->ops->destroy;
  ptap->duplicate = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
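/* Usage sketch (illustrative, not part of this file): these kernels are reached
   through the public MatTransposeMatMult() interface; the options-database key
   for algorithm selection is assumed here to be -mattransposematmult_via. */
#if 0
#include <petscmat.h>
int main(int argc,char **argv)
{
  Mat            P,A,C;
  PetscErrorCode ierr;
  ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
  /* ... create and assemble MPIAIJ matrices P and A with matching row layouts ... */
  ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
  ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr); /* reuse symbolic C */
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}
#endif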