xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision e55be12dc9957904bffc077bd8358204ed1851a4)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc/private/vecimpl.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
15 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16 {
17   PetscErrorCode ierr;
18   const char     *algTypes[2] = {"scalable","nonscalable"};
19   PetscInt       alg=1; /* set default algorithm */
20   MPI_Comm       comm;
21 
22   PetscFunctionBegin;
23   if (scall == MAT_INITIAL_MATRIX) {
24     ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
25     if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) {
26       SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
27     }
28 
29     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
30     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);CHKERRQ(ierr);
31     ierr = PetscOptionsEnd();CHKERRQ(ierr);
32 
33     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
34     switch (alg) {
35     case 1:
36       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
37       break;
38     default:
39       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
40       break;
41     }
42     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
43   }
44   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
45   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
46   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
47   PetscFunctionReturn(0);
48 }
49 
50 #undef __FUNCT__
51 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
52 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
53 {
54   PetscErrorCode ierr;
55   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
56   Mat_PtAPMPI    *ptap = a->ptap;
57 
58   PetscFunctionBegin;
59   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
60   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
61   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
62   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
63   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
64   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
65   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
66   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
67   ierr = ptap->destroy(A);CHKERRQ(ierr);
68   ierr = PetscFree(ptap);CHKERRQ(ierr);
69   PetscFunctionReturn(0);
70 }
71 
72 #undef __FUNCT__
73 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
74 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
75 {
76   PetscErrorCode ierr;
77   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
78   Mat_PtAPMPI    *ptap = a->ptap;
79 
80   PetscFunctionBegin;
81   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
82 
83   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
84   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
85   PetscFunctionReturn(0);
86 }
87 
88 #undef __FUNCT__
89 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
90 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
91 {
92   PetscErrorCode ierr;
93   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
94   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
95   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
96   PetscScalar    *cda=cd->a,*coa=co->a;
97   Mat_SeqAIJ     *p_loc,*p_oth;
98   PetscScalar    *apa,*ca;
99   PetscInt       cm   =C->rmap->n;
100   Mat_PtAPMPI    *ptap=c->ptap;
101   PetscInt       *api,*apj,*apJ,i,k;
102   PetscInt       cstart=C->cmap->rstart;
103   PetscInt       cdnz,conz,k0,k1;
104   MPI_Comm       comm;
105   PetscMPIInt    size;
106 
107   PetscFunctionBegin;
108   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
109   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
110 
111   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
112   /*-----------------------------------------------------*/
113   /* update numerical values of P_oth and P_loc */
114   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
115   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
116 
117   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
118   /*----------------------------------------------------------*/
119   /* get data from symbolic products */
120   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
121   p_oth = NULL;
122   if (size >1) {
123     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
124   }
125 
126   /* get apa for storing dense row A[i,:]*P */
127   apa = ptap->apa;
128 
129   api = ptap->api;
130   apj = ptap->apj;
131   for (i=0; i<cm; i++) {
132     /* compute apa = A[i,:]*P */
133     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
134 
135     /* set values in C */
136     apJ  = apj + api[i];
137     cdnz = cd->i[i+1] - cd->i[i];
138     conz = co->i[i+1] - co->i[i];
139 
140     /* 1st off-diagoanl part of C */
141     ca = coa + co->i[i];
142     k  = 0;
143     for (k0=0; k0<conz; k0++) {
144       if (apJ[k] >= cstart) break;
145       ca[k0]      = apa[apJ[k]];
146       apa[apJ[k]] = 0.0;
147       k++;
148     }
149 
150     /* diagonal part of C */
151     ca = cda + cd->i[i];
152     for (k1=0; k1<cdnz; k1++) {
153       ca[k1]      = apa[apJ[k]];
154       apa[apJ[k]] = 0.0;
155       k++;
156     }
157 
158     /* 2nd off-diagoanl part of C */
159     ca = coa + co->i[i];
160     for (; k0<conz; k0++) {
161       ca[k0]      = apa[apJ[k]];
162       apa[apJ[k]] = 0.0;
163       k++;
164     }
165   }
166   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
167   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
168   PetscFunctionReturn(0);
169 }
170 
171 #undef __FUNCT__
172 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
173 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
174 {
175   PetscErrorCode     ierr;
176   MPI_Comm           comm;
177   PetscMPIInt        size;
178   Mat                Cmpi;
179   Mat_PtAPMPI        *ptap;
180   PetscFreeSpaceList free_space=NULL,current_space=NULL;
181   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
182   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
183   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
184   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
185   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
186   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
187   PetscBT            lnkbt;
188   PetscScalar        *apa;
189   PetscReal          afill;
190   PetscTable         ta;
191 
192   PetscFunctionBegin;
193   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
194   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
195 
196   /* create struct Mat_PtAPMPI and attached it to C later */
197   ierr = PetscNew(&ptap);CHKERRQ(ierr);
198 
199   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
200   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
201 
202   /* get P_loc by taking all local rows of P */
203   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
204 
205   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
206   pi_loc = p_loc->i; pj_loc = p_loc->j;
207   if (size > 1) {
208     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
209     pi_oth = p_oth->i; pj_oth = p_oth->j;
210   } else {
211     p_oth = NULL;
212     pi_oth = NULL; pj_oth = NULL;
213   }
214 
215   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
216   /*-------------------------------------------------------------------*/
217   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
218   ptap->api = api;
219   api[0]    = 0;
220 
221   /* create and initialize a linked list */
222   Crmax = p_loc->rmax+p_oth->rmax;
223   ierr = PetscTableCreate(2*Crmax,pN,&ta);CHKERRQ(ierr);
224   for (row=0; row<ptap->P_loc->rmap->N; row++) {
225     nzi = p_loc->i[row+1] - p_loc->i[row];
226     for (j=0; j<nzi; j++) {
227       Jptr = j + p_loc->j + p_loc->i[row];
228       ierr = PetscTableAdd(ta,*Jptr+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
229     }
230   }
231   for (row=0; row<ptap->P_oth->rmap->N; row++) {
232     nzi = p_oth->i[row+1] - p_oth->i[row];
233     for (j=0; j<nzi; j++) {
234       Jptr = j + p_oth->j + p_oth->i[row];
235       ierr = PetscTableAdd(ta,*Jptr+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
236     }
237   }
238   ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
239   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
240 
241   ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
242 
243   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
244   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
245   current_space = free_space;
246 
247   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
248   for (i=0; i<am; i++) {
249     /* diagonal portion of A */
250     nzi = adi[i+1] - adi[i];
251     for (j=0; j<nzi; j++) {
252       row  = *adj++;
253       pnz  = pi_loc[row+1] - pi_loc[row];
254       Jptr = pj_loc + pi_loc[row];
255       /* add non-zero cols of P into the sorted linked list lnk */
256       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
257     }
258     /* off-diagonal portion of A */
259     nzi = aoi[i+1] - aoi[i];
260     for (j=0; j<nzi; j++) {
261       row  = *aoj++;
262       pnz  = pi_oth[row+1] - pi_oth[row];
263       Jptr = pj_oth + pi_oth[row];
264       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
265     }
266 
267     apnz     = lnk[0];
268     api[i+1] = api[i] + apnz;
269 
270     /* if free space is not available, double the total space in the list */
271     if (current_space->local_remaining<apnz) {
272       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
273       nspacedouble++;
274     }
275 
276     /* Copy data into free space, then initialize lnk */
277     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
278     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
279 
280     current_space->array           += apnz;
281     current_space->local_used      += apnz;
282     current_space->local_remaining -= apnz;
283   }
284 
285   /* Allocate space for apj, initialize apj, and */
286   /* destroy list of free space and other temporary array(s) */
287   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
288   apj  = ptap->apj;
289   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
290   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
291 
292   /* malloc apa to store dense row A[i,:]*P */
293   ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);
294 
295   ptap->apa = apa;
296 
297   /* create and assemble symbolic parallel matrix Cmpi */
298   /*----------------------------------------------------*/
299   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
300   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
301   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
302 
303   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
304   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
305   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
306   for (i=0; i<am; i++) {
307     row  = i + rstart;
308     apnz = api[i+1] - api[i];
309     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
310     apj += apnz;
311   }
312   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
313   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
314 
315   ptap->destroy        = Cmpi->ops->destroy;
316   ptap->duplicate      = Cmpi->ops->duplicate;
317   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
318   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
319   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
320 
321   /* attach the supporting struct to Cmpi for reuse */
322   c       = (Mat_MPIAIJ*)Cmpi->data;
323   c->ptap = ptap;
324 
325   *C = Cmpi;
326 
327   /* set MatInfo */
328   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
329   if (afill < 1.0) afill = 1.0;
330   Cmpi->info.mallocs           = nspacedouble;
331   Cmpi->info.fill_ratio_given  = fill;
332   Cmpi->info.fill_ratio_needed = afill;
333 
334 #if defined(PETSC_USE_INFO)
335   if (api[am]) {
336     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
337     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
338   } else {
339     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
340   }
341 #endif
342   PetscFunctionReturn(0);
343 }
344 
345 #undef __FUNCT__
346 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
347 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
348 {
349   PetscErrorCode ierr;
350 
351   PetscFunctionBegin;
352   if (scall == MAT_INITIAL_MATRIX) {
353     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
354     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
355     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
356   }
357   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
358   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
359   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
360   PetscFunctionReturn(0);
361 }
362 
363 typedef struct {
364   Mat         workB;
365   PetscScalar *rvalues,*svalues;
366   MPI_Request *rwaits,*swaits;
367 } MPIAIJ_MPIDense;
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
371 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
372 {
373   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
374   PetscErrorCode  ierr;
375 
376   PetscFunctionBegin;
377   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
378   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
379   ierr = PetscFree(contents);CHKERRQ(ierr);
380   PetscFunctionReturn(0);
381 }
382 
383 #undef __FUNCT__
384 #define __FUNCT__ "MatMatMultNumeric_MPIDense"
385 /*
386     This is a "dummy function" that handles the case where matrix C was created as a dense matrix
387   directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
388 
389   It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except does not create C
390 */
391 PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
392 {
393   PetscErrorCode         ierr;
394   PetscBool              flg;
395   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
396   PetscInt               nz   = aij->B->cmap->n;
397   PetscContainer         container;
398   MPIAIJ_MPIDense        *contents;
399   VecScatter             ctx   = aij->Mvctx;
400   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
401   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
402 
403   PetscFunctionBegin;
404   ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
405   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
406 
407   /* Handle case where where user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX*/
408   ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
409   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
410 
411   C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
412 
413   ierr = PetscNew(&contents);CHKERRQ(ierr);
414   /* Create work matrix used to store off processor rows of B needed for local product */
415   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
416   /* Create work arrays needed */
417   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
418                       B->cmap->N*to->starts[to->n],&contents->svalues,
419                       from->n,&contents->rwaits,
420                       to->n,&contents->swaits);CHKERRQ(ierr);
421 
422   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
423   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
424   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
425   ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
426   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
427 
428   ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
429   PetscFunctionReturn(0);
430 }
431 
432 #undef __FUNCT__
433 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
434 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
435 {
436   PetscErrorCode         ierr;
437   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
438   PetscInt               nz   = aij->B->cmap->n;
439   PetscContainer         container;
440   MPIAIJ_MPIDense        *contents;
441   VecScatter             ctx   = aij->Mvctx;
442   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
443   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
444   PetscInt               m     = A->rmap->n,n=B->cmap->n;
445 
446   PetscFunctionBegin;
447   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
448   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
449   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
450   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
451   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
452   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
453   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
454 
455   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
456 
457   ierr = PetscNew(&contents);CHKERRQ(ierr);
458   /* Create work matrix used to store off processor rows of B needed for local product */
459   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
460   /* Create work arrays needed */
461   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
462                       B->cmap->N*to->starts[to->n],&contents->svalues,
463                       from->n,&contents->rwaits,
464                       to->n,&contents->swaits);CHKERRQ(ierr);
465 
466   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
467   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
468   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
469   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
470   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
471   PetscFunctionReturn(0);
472 }
473 
474 #undef __FUNCT__
475 #define __FUNCT__ "MatMPIDenseScatter"
476 /*
477     Performs an efficient scatter on the rows of B needed by this process; this is
478     a modification of the VecScatterBegin_() routines.
479 */
480 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
481 {
482   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
483   PetscErrorCode         ierr;
484   PetscScalar            *b,*w,*svalues,*rvalues;
485   VecScatter             ctx   = aij->Mvctx;
486   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
487   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
488   PetscInt               i,j,k;
489   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
490   PetscMPIInt            *sprocs,*rprocs,nrecvs;
491   MPI_Request            *swaits,*rwaits;
492   MPI_Comm               comm;
493   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
494   MPI_Status             status;
495   MPIAIJ_MPIDense        *contents;
496   PetscContainer         container;
497   Mat                    workB;
498 
499   PetscFunctionBegin;
500   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
501   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
502   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
503   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
504 
505   workB = *outworkB = contents->workB;
506   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",nrows,workB->cmap->n);
507   sindices = to->indices;
508   sstarts  = to->starts;
509   sprocs   = to->procs;
510   swaits   = contents->swaits;
511   svalues  = contents->svalues;
512 
513   rindices = from->indices;
514   rstarts  = from->starts;
515   rprocs   = from->procs;
516   rwaits   = contents->rwaits;
517   rvalues  = contents->rvalues;
518 
519   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
520   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
521 
522   for (i=0; i<from->n; i++) {
523     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
524   }
525 
526   for (i=0; i<to->n; i++) {
527     /* pack a message at a time */
528     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
529       for (k=0; k<ncols; k++) {
530         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
531       }
532     }
533     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
534   }
535 
536   nrecvs = from->n;
537   while (nrecvs) {
538     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
539     nrecvs--;
540     /* unpack a message at a time */
541     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
542       for (k=0; k<ncols; k++) {
543         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
544       }
545     }
546   }
547   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
548 
549   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
550   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
551   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
552   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
553   PetscFunctionReturn(0);
554 }
555 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
556 
557 #undef __FUNCT__
558 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
559 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
560 {
561   PetscErrorCode ierr;
562   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
563   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
564   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
565   Mat            workB;
566 
567   PetscFunctionBegin;
568   /* diagonal block of A times all local rows of B*/
569   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
570 
571   /* get off processor parts of B needed to complete the product */
572   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
573 
574   /* off-diagonal block of A times nonlocal rows of B */
575   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
576   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
577   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
578   PetscFunctionReturn(0);
579 }
580 
581 #undef __FUNCT__
582 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
583 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
584 {
585   PetscErrorCode ierr;
586   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
587   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
588   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
589   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
590   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
591   Mat_SeqAIJ     *p_loc,*p_oth;
592   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
593   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
594   PetscInt       cm          = C->rmap->n,anz,pnz;
595   Mat_PtAPMPI    *ptap       = c->ptap;
596   PetscScalar    *apa_sparse = ptap->apa;
597   PetscInt       *api,*apj,*apJ,i,j,k,row;
598   PetscInt       cstart = C->cmap->rstart;
599   PetscInt       cdnz,conz,k0,k1,nextp;
600   MPI_Comm       comm;
601   PetscMPIInt    size;
602 
603   PetscFunctionBegin;
604   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
605   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
606 
607   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
608   /*-----------------------------------------------------*/
609   /* update numerical values of P_oth and P_loc */
610   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
611   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
612 
613   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
614   /*----------------------------------------------------------*/
615   /* get data from symbolic products */
616   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
617   pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
618   if (size >1) {
619     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
620     pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
621   } else {
622     p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
623   }
624 
625   api = ptap->api;
626   apj = ptap->apj;
627   for (i=0; i<cm; i++) {
628     apJ = apj + api[i];
629 
630     /* diagonal portion of A */
631     anz = adi[i+1] - adi[i];
632     adj = ad->j + adi[i];
633     ada = ad->a + adi[i];
634     for (j=0; j<anz; j++) {
635       row = adj[j];
636       pnz = pi_loc[row+1] - pi_loc[row];
637       pj  = pj_loc + pi_loc[row];
638       pa  = pa_loc + pi_loc[row];
639       /* perform sparse axpy */
640       valtmp = ada[j];
641       nextp  = 0;
642       for (k=0; nextp<pnz; k++) {
643         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
644           apa_sparse[k] += valtmp*pa[nextp++];
645         }
646       }
647       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
648     }
649 
650     /* off-diagonal portion of A */
651     anz = aoi[i+1] - aoi[i];
652     aoj = ao->j + aoi[i];
653     aoa = ao->a + aoi[i];
654     for (j=0; j<anz; j++) {
655       row = aoj[j];
656       pnz = pi_oth[row+1] - pi_oth[row];
657       pj  = pj_oth + pi_oth[row];
658       pa  = pa_oth + pi_oth[row];
659       /* perform sparse axpy */
660       valtmp = aoa[j];
661       nextp  = 0;
662       for (k=0; nextp<pnz; k++) {
663         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
664           apa_sparse[k] += valtmp*pa[nextp++];
665         }
666       }
667       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
668     }
669 
670     /* set values in C */
671     cdnz = cd->i[i+1] - cd->i[i];
672     conz = co->i[i+1] - co->i[i];
673 
674     /* 1st off-diagoanl part of C */
675     ca = coa + co->i[i];
676     k  = 0;
677     for (k0=0; k0<conz; k0++) {
678       if (apJ[k] >= cstart) break;
679       ca[k0]        = apa_sparse[k];
680       apa_sparse[k] = 0.0;
681       k++;
682     }
683 
684     /* diagonal part of C */
685     ca = cda + cd->i[i];
686     for (k1=0; k1<cdnz; k1++) {
687       ca[k1]        = apa_sparse[k];
688       apa_sparse[k] = 0.0;
689       k++;
690     }
691 
692     /* 2nd off-diagoanl part of C */
693     ca = coa + co->i[i];
694     for (; k0<conz; k0++) {
695       ca[k0]        = apa_sparse[k];
696       apa_sparse[k] = 0.0;
697       k++;
698     }
699   }
700   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
701   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
702   PetscFunctionReturn(0);
703 }
704 
705 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */
706 #undef __FUNCT__
707 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
708 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
709 {
710   PetscErrorCode     ierr;
711   MPI_Comm           comm;
712   PetscMPIInt        size;
713   Mat                Cmpi;
714   Mat_PtAPMPI        *ptap;
715   PetscFreeSpaceList free_space = NULL,current_space=NULL;
716   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
717   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
718   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
719   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
720   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
721   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
722   PetscInt           nlnk_max,armax,prmax;
723   PetscReal          afill;
724   PetscScalar        *apa;
725 
726   PetscFunctionBegin;
727   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
728   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
729 
730   /* create struct Mat_PtAPMPI and attached it to C later */
731   ierr = PetscNew(&ptap);CHKERRQ(ierr);
732 
733   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
734   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
735 
736   /* get P_loc by taking all local rows of P */
737   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
738 
739   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
740   pi_loc = p_loc->i; pj_loc = p_loc->j;
741   if (size > 1) {
742     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
743     pi_oth = p_oth->i; pj_oth = p_oth->j;
744   } else {
745     p_oth  = NULL;
746     pi_oth = NULL; pj_oth = NULL;
747   }
748 
749   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
750   /*-------------------------------------------------------------------*/
751   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
752   ptap->api = api;
753   api[0]    = 0;
754 
755   /* create and initialize a linked list */
756   armax = ad->rmax+ao->rmax;
757   if (size >1) {
758     prmax = PetscMax(p_loc->rmax,p_oth->rmax);
759   } else {
760     prmax = p_loc->rmax;
761   }
762   nlnk_max = armax*prmax;
763   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
764   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
765 
766   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
767   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
768 
769   current_space = free_space;
770 
771   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
772   for (i=0; i<am; i++) {
773     /* diagonal portion of A */
774     nzi = adi[i+1] - adi[i];
775     for (j=0; j<nzi; j++) {
776       row  = *adj++;
777       pnz  = pi_loc[row+1] - pi_loc[row];
778       Jptr = pj_loc + pi_loc[row];
779       /* add non-zero cols of P into the sorted linked list lnk */
780       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
781     }
782     /* off-diagonal portion of A */
783     nzi = aoi[i+1] - aoi[i];
784     for (j=0; j<nzi; j++) {
785       row  = *aoj++;
786       pnz  = pi_oth[row+1] - pi_oth[row];
787       Jptr = pj_oth + pi_oth[row];
788       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
789     }
790 
791     apnz     = *lnk;
792     api[i+1] = api[i] + apnz;
793     if (apnz > apnz_max) apnz_max = apnz;
794 
795     /* if free space is not available, double the total space in the list */
796     if (current_space->local_remaining<apnz) {
797       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
798       nspacedouble++;
799     }
800 
801     /* Copy data into free space, then initialize lnk */
802     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
803     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
804 
805     current_space->array           += apnz;
806     current_space->local_used      += apnz;
807     current_space->local_remaining -= apnz;
808   }
809 
810   /* Allocate space for apj, initialize apj, and */
811   /* destroy list of free space and other temporary array(s) */
812   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
813   apj  = ptap->apj;
814   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
815   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
816 
817   /* create and assemble symbolic parallel matrix Cmpi */
818   /*----------------------------------------------------*/
819   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
820   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
821   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
822   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
823   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
824   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
825 
826   /* malloc apa for assembly Cmpi */
827   ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);
828 
829   ptap->apa = apa;
830   for (i=0; i<am; i++) {
831     row  = i + rstart;
832     apnz = api[i+1] - api[i];
833     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
834     apj += apnz;
835   }
836   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
837   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
838 
839   ptap->destroy             = Cmpi->ops->destroy;
840   ptap->duplicate           = Cmpi->ops->duplicate;
841   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
842   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
843   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
844 
845   /* attach the supporting struct to Cmpi for reuse */
846   c       = (Mat_MPIAIJ*)Cmpi->data;
847   c->ptap = ptap;
848 
849   *C = Cmpi;
850 
851   /* set MatInfo */
852   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
853   if (afill < 1.0) afill = 1.0;
854   Cmpi->info.mallocs           = nspacedouble;
855   Cmpi->info.fill_ratio_given  = fill;
856   Cmpi->info.fill_ratio_needed = afill;
857 
858 #if defined(PETSC_USE_INFO)
859   if (api[am]) {
860     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
861     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
862   } else {
863     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
864   }
865 #endif
866   PetscFunctionReturn(0);
867 }
868 
869 /*-------------------------------------------------------------------------*/
870 #undef __FUNCT__
871 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
872 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
873 {
874   PetscErrorCode ierr;
875   const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
876   PetscInt       alg=0; /* set default algorithm */
877 
878   PetscFunctionBegin;
879   if (scall == MAT_INITIAL_MATRIX) {
880     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
881     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
882     ierr = PetscOptionsEnd();CHKERRQ(ierr);
883 
884     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
885     switch (alg) {
886     case 1:
887       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
888       break;
889     case 2:
890     {
891       Mat         Pt;
892       Mat_PtAPMPI *ptap;
893       Mat_MPIAIJ  *c;
894       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
895       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
896       c        = (Mat_MPIAIJ*)(*C)->data;
897       ptap     = c->ptap;
898       ptap->Pt = Pt;
899       (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
900       PetscFunctionReturn(0);
901     }
902       break;
903     default:
904       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
905       break;
906     }
907     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
908   }
909   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
910   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
911   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
912   PetscFunctionReturn(0);
913 }
914 
915 /* This routine only works when scall=MAT_REUSE_MATRIX! */
916 #undef __FUNCT__
917 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
918 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
919 {
920   PetscErrorCode ierr;
921   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
922   Mat_PtAPMPI    *ptap= c->ptap;
923   Mat            Pt=ptap->Pt;
924 
925   PetscFunctionBegin;
926   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
927   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 /* Non-scalable version, use dense axpy */
932 #undef __FUNCT__
933 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
934 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
935 {
936   PetscErrorCode      ierr;
937   Mat_Merge_SeqsToMPI *merge;
938   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
939   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
940   Mat_PtAPMPI         *ptap;
941   PetscInt            *adj,*aJ;
942   PetscInt            i,j,k,anz,pnz,row,*cj;
943   MatScalar           *ada,*aval,*ca,valtmp;
944   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
945   MPI_Comm            comm;
946   PetscMPIInt         size,rank,taga,*len_s;
947   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
948   PetscInt            **buf_ri,**buf_rj;
949   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
950   MPI_Request         *s_waits,*r_waits;
951   MPI_Status          *status;
952   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
953   PetscInt            *ai,*aj,*coi,*coj;
954   PetscInt            *poJ,*pdJ;
955   Mat                 A_loc;
956   Mat_SeqAIJ          *a_loc;
957 
958   PetscFunctionBegin;
959   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
960   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
961   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
962 
963   ptap  = c->ptap;
964   merge = ptap->merge;
965 
966   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
967   /*--------------------------------------------------------------*/
968   /* get data from symbolic products */
969   coi  = merge->coi; coj = merge->coj;
970   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
971 
972   bi     = merge->bi; bj = merge->bj;
973   owners = merge->rowmap->range;
974   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
975 
976   /* get A_loc by taking all local rows of A */
977   A_loc = ptap->A_loc;
978   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
979   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
980   ai    = a_loc->i;
981   aj    = a_loc->j;
982 
983   ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */
984 
985   for (i=0; i<am; i++) {
986     /* 2-a) put A[i,:] to dense array aval */
987     anz = ai[i+1] - ai[i];
988     adj = aj + ai[i];
989     ada = a_loc->a + ai[i];
990     for (j=0; j<anz; j++) {
991       aval[adj[j]] = ada[j];
992     }
993 
994     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
995     /*--------------------------------------------------------------*/
996     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
997     pnz = po->i[i+1] - po->i[i];
998     poJ = po->j + po->i[i];
999     pA  = po->a + po->i[i];
1000     for (j=0; j<pnz; j++) {
1001       row = poJ[j];
1002       cnz = coi[row+1] - coi[row];
1003       cj  = coj + coi[row];
1004       ca  = coa + coi[row];
1005       /* perform dense axpy */
1006       valtmp = pA[j];
1007       for (k=0; k<cnz; k++) {
1008         ca[k] += valtmp*aval[cj[k]];
1009       }
1010       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1011     }
1012 
1013     /* put the value into Cd (diagonal part) */
1014     pnz = pd->i[i+1] - pd->i[i];
1015     pdJ = pd->j + pd->i[i];
1016     pA  = pd->a + pd->i[i];
1017     for (j=0; j<pnz; j++) {
1018       row = pdJ[j];
1019       cnz = bi[row+1] - bi[row];
1020       cj  = bj + bi[row];
1021       ca  = ba + bi[row];
1022       /* perform dense axpy */
1023       valtmp = pA[j];
1024       for (k=0; k<cnz; k++) {
1025         ca[k] += valtmp*aval[cj[k]];
1026       }
1027       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1028     }
1029 
1030     /* zero the current row of Pt*A */
1031     aJ = aj + ai[i];
1032     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1033   }
1034 
1035   /* 3) send and recv matrix values coa */
1036   /*------------------------------------*/
1037   buf_ri = merge->buf_ri;
1038   buf_rj = merge->buf_rj;
1039   len_s  = merge->len_s;
1040   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1041   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1042 
1043   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1044   for (proc=0,k=0; proc<size; proc++) {
1045     if (!len_s[proc]) continue;
1046     i    = merge->owners_co[proc];
1047     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1048     k++;
1049   }
1050   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1051   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1052 
1053   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1054   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1055   ierr = PetscFree(coa);CHKERRQ(ierr);
1056 
1057   /* 4) insert local Cseq and received values into Cmpi */
1058   /*----------------------------------------------------*/
1059   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1060   for (k=0; k<merge->nrecv; k++) {
1061     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1062     nrows       = *(buf_ri_k[k]);
1063     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1064     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1065   }
1066 
1067   for (i=0; i<cm; i++) {
1068     row  = owners[rank] + i; /* global row index of C_seq */
1069     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1070     ba_i = ba + bi[i];
1071     bnz  = bi[i+1] - bi[i];
1072     /* add received vals into ba */
1073     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1074       /* i-th row */
1075       if (i == *nextrow[k]) {
1076         cnz    = *(nextci[k]+1) - *nextci[k];
1077         cj     = buf_rj[k] + *(nextci[k]);
1078         ca     = abuf_r[k] + *(nextci[k]);
1079         nextcj = 0;
1080         for (j=0; nextcj<cnz; j++) {
1081           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1082             ba_i[j] += ca[nextcj++];
1083           }
1084         }
1085         nextrow[k]++; nextci[k]++;
1086         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1087       }
1088     }
1089     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1090   }
1091   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1092   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1093 
1094   ierr = PetscFree(ba);CHKERRQ(ierr);
1095   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1096   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1097   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1098   ierr = PetscFree(aval);CHKERRQ(ierr);
1099   PetscFunctionReturn(0);
1100 }
1101 
1102 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1103 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1104 #undef __FUNCT__
1105 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
1106 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1107 {
1108   PetscErrorCode      ierr;
1109   Mat                 Cmpi,A_loc,POt,PDt;
1110   Mat_PtAPMPI         *ptap;
1111   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1112   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1113   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1114   PetscInt            nnz;
1115   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1116   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1117   PetscBT             lnkbt;
1118   MPI_Comm            comm;
1119   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1120   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1121   PetscInt            len,proc,*dnz,*onz,*owners;
1122   PetscInt            nzi,*bi,*bj;
1123   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1124   MPI_Request         *swaits,*rwaits;
1125   MPI_Status          *sstatus,rstatus;
1126   Mat_Merge_SeqsToMPI *merge;
1127   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1128   PetscReal           afill  =1.0,afill_tmp;
1129   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1130   PetscScalar         *vals;
1131   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1132 
1133   PetscFunctionBegin;
1134   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1135   /* check if matrix local sizes are compatible */
1136   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1137     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1138   }
1139 
1140   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1141   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1142 
1143   /* create struct Mat_PtAPMPI and attached it to C later */
1144   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1145 
1146   /* get A_loc by taking all local rows of A */
1147   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1148 
1149   ptap->A_loc = A_loc;
1150 
1151   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1152   ai    = a_loc->i;
1153   aj    = a_loc->j;
1154 
1155   /* determine symbolic Co=(p->B)^T*A - send to others */
1156   /*----------------------------------------------------*/
1157   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1158   pdt  = (Mat_SeqAIJ*)PDt->data;
1159   pdti = pdt->i; pdtj = pdt->j;
1160 
1161   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1162   pot  = (Mat_SeqAIJ*)POt->data;
1163   poti = pot->i; potj = pot->j;
1164 
1165   /* then, compute symbolic Co = (p->B)^T*A */
1166   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1167   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1168   coi[0] = 0;
1169 
1170   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1171   nnz           = fill*(poti[pon] + ai[am]);
1172   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1173   current_space = free_space;
1174 
1175   /* create and initialize a linked list */
1176   ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1177 
1178   for (i=0; i<pon; i++) {
1179     pnz = poti[i+1] - poti[i];
1180     ptJ = potj + poti[i];
1181     for (j=0; j<pnz; j++) {
1182       row  = ptJ[j]; /* row of A_loc == col of Pot */
1183       anz  = ai[row+1] - ai[row];
1184       Jptr = aj + ai[row];
1185       /* add non-zero cols of AP into the sorted linked list lnk */
1186       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1187     }
1188     nnz = lnk[0];
1189 
1190     /* If free space is not available, double the total space in the list */
1191     if (current_space->local_remaining<nnz) {
1192       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1193       nspacedouble++;
1194     }
1195 
1196     /* Copy data into free space, and zero out denserows */
1197     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1198 
1199     current_space->array           += nnz;
1200     current_space->local_used      += nnz;
1201     current_space->local_remaining -= nnz;
1202 
1203     coi[i+1] = coi[i] + nnz;
1204   }
1205 
1206   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1207   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1208 
1209   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1210   if (afill_tmp > afill) afill = afill_tmp;
1211 
1212   /* send j-array (coj) of Co to other processors */
1213   /*----------------------------------------------*/
1214   /* determine row ownership */
1215   ierr = PetscNew(&merge);CHKERRQ(ierr);
1216   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1217 
1218   merge->rowmap->n  = pn;
1219   merge->rowmap->bs = 1;
1220 
1221   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1222   owners = merge->rowmap->range;
1223 
1224   /* determine the number of messages to send, their lengths */
1225   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1226   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1227 
1228   len_s        = merge->len_s;
1229   merge->nsend = 0;
1230 
1231   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1232   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1233 
1234   proc = 0;
1235   for (i=0; i<pon; i++) {
1236     while (prmap[i] >= owners[proc+1]) proc++;
1237     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1238     len_s[proc] += coi[i+1] - coi[i];
1239   }
1240 
1241   len          = 0; /* max length of buf_si[] */
1242   owners_co[0] = 0;
1243   for (proc=0; proc<size; proc++) {
1244     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1245     if (len_si[proc]) {
1246       merge->nsend++;
1247       len_si[proc] = 2*(len_si[proc] + 1);
1248       len         += len_si[proc];
1249     }
1250   }
1251 
1252   /* determine the number and length of messages to receive for coi and coj  */
1253   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1254   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1255 
1256   /* post the Irecv and Isend of coj */
1257   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1258   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1259   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1260   for (proc=0, k=0; proc<size; proc++) {
1261     if (!len_s[proc]) continue;
1262     i    = owners_co[proc];
1263     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1264     k++;
1265   }
1266 
1267   /* receives and sends of coj are complete */
1268   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1269   for (i=0; i<merge->nrecv; i++) {
1270     PetscMPIInt icompleted;
1271     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1272   }
1273   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1274   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1275 
1276   /* send and recv coi */
1277   /*-------------------*/
1278   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1279   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1280   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1281   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1282   for (proc=0,k=0; proc<size; proc++) {
1283     if (!len_s[proc]) continue;
1284     /* form outgoing message for i-structure:
1285          buf_si[0]:                 nrows to be sent
1286                [1:nrows]:           row index (global)
1287                [nrows+1:2*nrows+1]: i-structure index
1288     */
1289     /*-------------------------------------------*/
1290     nrows       = len_si[proc]/2 - 1;
1291     buf_si_i    = buf_si + nrows+1;
1292     buf_si[0]   = nrows;
1293     buf_si_i[0] = 0;
1294     nrows       = 0;
1295     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1296       nzi               = coi[i+1] - coi[i];
1297       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1298       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1299       nrows++;
1300     }
1301     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1302     k++;
1303     buf_si += len_si[proc];
1304   }
1305   i = merge->nrecv;
1306   while (i--) {
1307     PetscMPIInt icompleted;
1308     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1309   }
1310   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1311   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1312   ierr = PetscFree(len_si);CHKERRQ(ierr);
1313   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1314   ierr = PetscFree(swaits);CHKERRQ(ierr);
1315   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1316   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1317 
1318   /* compute the local portion of C (mpi mat) */
1319   /*------------------------------------------*/
1320   /* allocate bi array and free space for accumulating nonzero column info */
1321   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1322   bi[0] = 0;
1323 
1324   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1325   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1326   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1327   current_space = free_space;
1328 
1329   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1330   for (k=0; k<merge->nrecv; k++) {
1331     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1332     nrows       = *buf_ri_k[k];
1333     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1334     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1335   }
1336 
1337   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1338   rmax = 0;
1339   for (i=0; i<pn; i++) {
1340     /* add pdt[i,:]*AP into lnk */
1341     pnz = pdti[i+1] - pdti[i];
1342     ptJ = pdtj + pdti[i];
1343     for (j=0; j<pnz; j++) {
1344       row  = ptJ[j];  /* row of AP == col of Pt */
1345       anz  = ai[row+1] - ai[row];
1346       Jptr = aj + ai[row];
1347       /* add non-zero cols of AP into the sorted linked list lnk */
1348       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1349     }
1350 
1351     /* add received col data into lnk */
1352     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1353       if (i == *nextrow[k]) { /* i-th row */
1354         nzi  = *(nextci[k]+1) - *nextci[k];
1355         Jptr = buf_rj[k] + *nextci[k];
1356         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1357         nextrow[k]++; nextci[k]++;
1358       }
1359     }
1360     nnz = lnk[0];
1361 
1362     /* if free space is not available, make more free space */
1363     if (current_space->local_remaining<nnz) {
1364       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1365       nspacedouble++;
1366     }
1367     /* copy data into free space, then initialize lnk */
1368     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1369     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1370 
1371     current_space->array           += nnz;
1372     current_space->local_used      += nnz;
1373     current_space->local_remaining -= nnz;
1374 
1375     bi[i+1] = bi[i] + nnz;
1376     if (nnz > rmax) rmax = nnz;
1377   }
1378   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1379 
1380   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1381   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1382 
1383   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1384   if (afill_tmp > afill) afill = afill_tmp;
1385   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1386   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1387   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1388 
1389   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1390   /*----------------------------------------------------------------------------------*/
1391   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1392 
1393   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1394   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1395   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1396   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1397   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1398   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1399   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1400   for (i=0; i<pn; i++) {
1401     row  = i + rstart;
1402     nnz  = bi[i+1] - bi[i];
1403     Jptr = bj + bi[i];
1404     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1405   }
1406   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1407   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1408   ierr = PetscFree(vals);CHKERRQ(ierr);
1409 
1410   merge->bi        = bi;
1411   merge->bj        = bj;
1412   merge->coi       = coi;
1413   merge->coj       = coj;
1414   merge->buf_ri    = buf_ri;
1415   merge->buf_rj    = buf_rj;
1416   merge->owners_co = owners_co;
1417   merge->destroy   = Cmpi->ops->destroy;
1418   merge->duplicate = Cmpi->ops->duplicate;
1419 
1420   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1421   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1422   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1423 
1424   /* attach the supporting struct to Cmpi for reuse */
1425   c           = (Mat_MPIAIJ*)Cmpi->data;
1426   c->ptap     = ptap;
1427   ptap->api   = NULL;
1428   ptap->apj   = NULL;
1429   ptap->merge = merge;
1430   ptap->rmax  = rmax;
1431 
1432   *C = Cmpi;
1433 #if defined(PETSC_USE_INFO)
1434   if (bi[pn] != 0) {
1435     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1436     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1437   } else {
1438     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1439   }
1440 #endif
1441   PetscFunctionReturn(0);
1442 }
1443 
1444 #undef __FUNCT__
1445 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1446 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1447 {
1448   PetscErrorCode      ierr;
1449   Mat_Merge_SeqsToMPI *merge;
1450   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1451   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1452   Mat_PtAPMPI         *ptap;
1453   PetscInt            *adj;
1454   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1455   MatScalar           *ada,*ca,valtmp;
1456   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1457   MPI_Comm            comm;
1458   PetscMPIInt         size,rank,taga,*len_s;
1459   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1460   PetscInt            **buf_ri,**buf_rj;
1461   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1462   MPI_Request         *s_waits,*r_waits;
1463   MPI_Status          *status;
1464   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1465   PetscInt            *ai,*aj,*coi,*coj;
1466   PetscInt            *poJ,*pdJ;
1467   Mat                 A_loc;
1468   Mat_SeqAIJ          *a_loc;
1469 
1470   PetscFunctionBegin;
1471   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1472   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1473   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1474 
1475   ptap  = c->ptap;
1476   merge = ptap->merge;
1477 
1478   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1479   /*------------------------------------------*/
1480   /* get data from symbolic products */
1481   coi    = merge->coi; coj = merge->coj;
1482   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1483   bi     = merge->bi; bj = merge->bj;
1484   owners = merge->rowmap->range;
1485   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1486 
1487   /* get A_loc by taking all local rows of A */
1488   A_loc = ptap->A_loc;
1489   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1490   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1491   ai    = a_loc->i;
1492   aj    = a_loc->j;
1493 
1494   for (i=0; i<am; i++) {
1495     anz = ai[i+1] - ai[i];
1496     adj = aj + ai[i];
1497     ada = a_loc->a + ai[i];
1498 
1499     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1500     /*-------------------------------------------------------------*/
1501     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1502     pnz = po->i[i+1] - po->i[i];
1503     poJ = po->j + po->i[i];
1504     pA  = po->a + po->i[i];
1505     for (j=0; j<pnz; j++) {
1506       row = poJ[j];
1507       cj  = coj + coi[row];
1508       ca  = coa + coi[row];
1509       /* perform sparse axpy */
1510       nexta  = 0;
1511       valtmp = pA[j];
1512       for (k=0; nexta<anz; k++) {
1513         if (cj[k] == adj[nexta]) {
1514           ca[k] += valtmp*ada[nexta];
1515           nexta++;
1516         }
1517       }
1518       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1519     }
1520 
1521     /* put the value into Cd (diagonal part) */
1522     pnz = pd->i[i+1] - pd->i[i];
1523     pdJ = pd->j + pd->i[i];
1524     pA  = pd->a + pd->i[i];
1525     for (j=0; j<pnz; j++) {
1526       row = pdJ[j];
1527       cj  = bj + bi[row];
1528       ca  = ba + bi[row];
1529       /* perform sparse axpy */
1530       nexta  = 0;
1531       valtmp = pA[j];
1532       for (k=0; nexta<anz; k++) {
1533         if (cj[k] == adj[nexta]) {
1534           ca[k] += valtmp*ada[nexta];
1535           nexta++;
1536         }
1537       }
1538       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1539     }
1540   }
1541 
1542   /* 3) send and recv matrix values coa */
1543   /*------------------------------------*/
1544   buf_ri = merge->buf_ri;
1545   buf_rj = merge->buf_rj;
1546   len_s  = merge->len_s;
1547   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1548   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1549 
1550   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1551   for (proc=0,k=0; proc<size; proc++) {
1552     if (!len_s[proc]) continue;
1553     i    = merge->owners_co[proc];
1554     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1555     k++;
1556   }
1557   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1558   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1559 
1560   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1561   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1562   ierr = PetscFree(coa);CHKERRQ(ierr);
1563 
1564   /* 4) insert local Cseq and received values into Cmpi */
1565   /*----------------------------------------------------*/
1566   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1567   for (k=0; k<merge->nrecv; k++) {
1568     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1569     nrows       = *(buf_ri_k[k]);
1570     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1571     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1572   }
1573 
1574   for (i=0; i<cm; i++) {
1575     row  = owners[rank] + i; /* global row index of C_seq */
1576     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1577     ba_i = ba + bi[i];
1578     bnz  = bi[i+1] - bi[i];
1579     /* add received vals into ba */
1580     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1581       /* i-th row */
1582       if (i == *nextrow[k]) {
1583         cnz    = *(nextci[k]+1) - *nextci[k];
1584         cj     = buf_rj[k] + *(nextci[k]);
1585         ca     = abuf_r[k] + *(nextci[k]);
1586         nextcj = 0;
1587         for (j=0; nextcj<cnz; j++) {
1588           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1589             ba_i[j] += ca[nextcj++];
1590           }
1591         }
1592         nextrow[k]++; nextci[k]++;
1593         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1594       }
1595     }
1596     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1597   }
1598   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1599   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1600 
1601   ierr = PetscFree(ba);CHKERRQ(ierr);
1602   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1603   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1604   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1605   PetscFunctionReturn(0);
1606 }
1607 
1608 #if 0
1609 #undef __FUNCT__
1610 #define __FUNCT__ "MatRowMergeMax_MPIAIJ"
1611 PetscErrorCode MatRowMergeMax_MPIAIJ(Mat A,const PetscInt nexpect,PetscInt maxkey,PetscInt *rmax)
1612 {
1613   PetscErrorCode  ierr;
1614   Mat_MPIAIJ      *a = (Mat_MPIAIJ*)A->data;
1615   Mat_SeqAIJ      *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1616   PetscInt        am=A->rmap->n,*col,row,j,nz,*garray=a->garray;
1617   PetscTable      ta;
1618 
1619   PetscFunctionBegin;
1620   ierr = PetscTableCreate(nexpect,maxkey,&ta);CHKERRQ(ierr);
1621   /* diagonal part */
1622   for (row=0; row<am; row++) {
1623     nz = ad->i[row+1] - ad->i[row];
1624     for (j=0; j<nz; j++) {
1625       col = j + ad->j + ad->i[row];
1626       ierr = PetscTableAdd(ta,*col+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
1627     }
1628   }
1629 
1630   /* off-diagonal part */
1631   for (row=0; row<am; row++) {
1632     nz = ao->i[row+1] - ao->i[row];
1633     for (j=0; j<nz; j++) {
1634       col = j + ao->j + ao->i[row];
1635       ierr = PetscTableAdd(ta,garray[*col]+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
1636     }
1637   }
1638 
1639   ierr = PetscTableGetCount(ta,rmax);CHKERRQ(ierr);
1640   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1641   printf("MatRowMergeMax_MPIAIJ...rmax %d\n",*rmax);
1642   PetscFunctionReturn(0);
1643 }
1644 #endif
1645 
1646 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1647 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1648    differ from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using LLCondensedCreate_Scalable() */
1649 #undef __FUNCT__
1650 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1651 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1652 {
1653   PetscErrorCode      ierr;
1654   Mat                 Cmpi,A_loc,POt,PDt;
1655   Mat_PtAPMPI         *ptap;
1656   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1657   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1658   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1659   PetscInt            nnz;
1660   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1661   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1662   MPI_Comm            comm;
1663   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1664   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1665   PetscInt            len,proc,*dnz,*onz,*owners;
1666   PetscInt            nzi,*bi,*bj;
1667   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1668   MPI_Request         *swaits,*rwaits;
1669   MPI_Status          *sstatus,rstatus;
1670   Mat_Merge_SeqsToMPI *merge;
1671   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1672   PetscReal           afill  =1.0,afill_tmp;
1673   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax,*col,nz;
1674   PetscScalar         *vals;
1675   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1676   PetscTable          ta;
1677 
1678   PetscFunctionBegin;
1679   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1680   /* check if matrix local sizes are compatible */
1681   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1682     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1683   }
1684 
1685   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1686   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1687 
1688   /* create struct Mat_PtAPMPI and attached it to C later */
1689   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1690 
1691   /* get A_loc by taking all local rows of A */
1692   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1693 
1694   ptap->A_loc = A_loc;
1695   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1696   ai          = a_loc->i;
1697   aj          = a_loc->j;
1698 
1699   /* determine symbolic Co=(p->B)^T*A - send to others */
1700   /*----------------------------------------------------*/
1701   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1702   pdt  = (Mat_SeqAIJ*)PDt->data;
1703   pdti = pdt->i; pdtj = pdt->j;
1704 
1705   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1706   pot  = (Mat_SeqAIJ*)POt->data;
1707   poti = pot->i; potj = pot->j;
1708 
1709   /* then, compute symbolic Co = (p->B)^T*A */
1710   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1711                          >= (num of nonzero rows of C_seq) - pn */
1712   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1713   coi[0] = 0;
1714 
1715   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1716   nnz           = fill*(poti[pon] + ai[am]);
1717   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1718   current_space = free_space;
1719 
1720   /* create and initialize a linked list */
1721   ierr = PetscTableCreate(2*a_loc->rmax,aN,&ta);CHKERRQ(ierr);
1722   for (row=0; row<am; row++) {
1723     nz = ai[row+1] - ai[row];
1724     for (j=0; j<nz; j++) {
1725       col = j + aj + ai[row];
1726       ierr = PetscTableAdd(ta,*col+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
1727     }
1728   }
1729   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1730   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1731 
1732   for (i=0; i<pon; i++) {
1733     pnz = poti[i+1] - poti[i];
1734     ptJ = potj + poti[i];
1735     for (j=0; j<pnz; j++) {
1736       row  = ptJ[j]; /* row of A_loc == col of Pot */
1737       anz  = ai[row+1] - ai[row];
1738       Jptr = aj + ai[row];
1739       /* add non-zero cols of AP into the sorted linked list lnk */
1740       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1741     }
1742     nnz = lnk[0];
1743 
1744     /* If free space is not available, double the total space in the list */
1745     if (current_space->local_remaining<nnz) {
1746       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1747       nspacedouble++;
1748     }
1749 
1750     /* Copy data into free space, and zero out denserows */
1751     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1752 
1753     current_space->array           += nnz;
1754     current_space->local_used      += nnz;
1755     current_space->local_remaining -= nnz;
1756 
1757     coi[i+1] = coi[i] + nnz;
1758   }
1759 
1760   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1761   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1762   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1763 
1764   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1765   if (afill_tmp > afill) afill = afill_tmp;
1766 
1767   /* send j-array (coj) of Co to other processors */
1768   /*----------------------------------------------*/
1769   /* determine row ownership */
1770   ierr = PetscNew(&merge);CHKERRQ(ierr);
1771   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1772 
1773   merge->rowmap->n  = pn;
1774   merge->rowmap->bs = 1;
1775 
1776   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1777   owners = merge->rowmap->range;
1778 
1779   /* determine the number of messages to send, their lengths */
1780   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1781   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1782 
1783   len_s        = merge->len_s;
1784   merge->nsend = 0;
1785 
1786   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1787   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1788 
1789   proc = 0;
1790   for (i=0; i<pon; i++) {
1791     while (prmap[i] >= owners[proc+1]) proc++;
1792     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1793     len_s[proc] += coi[i+1] - coi[i];
1794   }
1795 
1796   len          = 0; /* max length of buf_si[] */
1797   owners_co[0] = 0;
1798   for (proc=0; proc<size; proc++) {
1799     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1800     if (len_si[proc]) {
1801       merge->nsend++;
1802       len_si[proc] = 2*(len_si[proc] + 1);
1803       len         += len_si[proc];
1804     }
1805   }
1806 
1807   /* determine the number and length of messages to receive for coi and coj  */
1808   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1809   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1810 
1811   /* post the Irecv and Isend of coj */
1812   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1813   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1814   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1815   for (proc=0, k=0; proc<size; proc++) {
1816     if (!len_s[proc]) continue;
1817     i    = owners_co[proc];
1818     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1819     k++;
1820   }
1821 
1822   /* receives and sends of coj are complete */
1823   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1824   for (i=0; i<merge->nrecv; i++) {
1825     PetscMPIInt icompleted;
1826     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1827   }
1828   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1829   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1830 
1831   /* add received column indices into table to update Armax */
1832   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1833     Jptr = buf_rj[k];
1834     for (j=0; j<merge->len_r[k]; j++) {
1835       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1836     }
1837   }
1838   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1839 
1840   /* send and recv coi */
1841   /*-------------------*/
1842   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1843   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1844   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1845   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1846   for (proc=0,k=0; proc<size; proc++) {
1847     if (!len_s[proc]) continue;
1848     /* form outgoing message for i-structure:
1849          buf_si[0]:                 nrows to be sent
1850                [1:nrows]:           row index (global)
1851                [nrows+1:2*nrows+1]: i-structure index
1852     */
1853     /*-------------------------------------------*/
1854     nrows       = len_si[proc]/2 - 1;
1855     buf_si_i    = buf_si + nrows+1;
1856     buf_si[0]   = nrows;
1857     buf_si_i[0] = 0;
1858     nrows       = 0;
1859     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1860       nzi               = coi[i+1] - coi[i];
1861       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1862       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1863       nrows++;
1864     }
1865     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1866     k++;
1867     buf_si += len_si[proc];
1868   }
1869   i = merge->nrecv;
1870   while (i--) {
1871     PetscMPIInt icompleted;
1872     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1873   }
1874   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1875   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1876   ierr = PetscFree(len_si);CHKERRQ(ierr);
1877   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1878   ierr = PetscFree(swaits);CHKERRQ(ierr);
1879   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1880   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1881 
1882   /* compute the local portion of C (mpi mat) */
1883   /*------------------------------------------*/
1884   /* allocate bi array and free space for accumulating nonzero column info */
1885   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1886   bi[0] = 0;
1887 
1888   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1889   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1890   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1891   current_space = free_space;
1892 
1893   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1894   for (k=0; k<merge->nrecv; k++) {
1895     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1896     nrows       = *buf_ri_k[k];
1897     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1898     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recieved i-structure  */
1899   }
1900 
1901   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1902   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1903   rmax = 0;
1904   for (i=0; i<pn; i++) {
1905     /* add pdt[i,:]*AP into lnk */
1906     pnz = pdti[i+1] - pdti[i];
1907     ptJ = pdtj + pdti[i];
1908     for (j=0; j<pnz; j++) {
1909       row  = ptJ[j];  /* row of AP == col of Pt */
1910       anz  = ai[row+1] - ai[row];
1911       Jptr = aj + ai[row];
1912       /* add non-zero cols of AP into the sorted linked list lnk */
1913       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1914     }
1915 
1916     /* add received col data into lnk */
1917     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1918       if (i == *nextrow[k]) { /* i-th row */
1919         nzi  = *(nextci[k]+1) - *nextci[k];
1920         Jptr = buf_rj[k] + *nextci[k];
1921         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1922         nextrow[k]++; nextci[k]++;
1923       }
1924     }
1925     nnz = lnk[0];
1926 
1927     /* if free space is not available, make more free space */
1928     if (current_space->local_remaining<nnz) {
1929       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1930       nspacedouble++;
1931     }
1932     /* copy data into free space, then initialize lnk */
1933     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1934     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1935 
1936     current_space->array           += nnz;
1937     current_space->local_used      += nnz;
1938     current_space->local_remaining -= nnz;
1939 
1940     bi[i+1] = bi[i] + nnz;
1941     if (nnz > rmax) rmax = nnz;
1942   }
1943   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1944 
1945   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1946   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1947   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1948   if (afill_tmp > afill) afill = afill_tmp;
1949   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1950   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1951 
1952   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1953   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1954 
1955   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1956   /*----------------------------------------------------------------------------------*/
1957   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1958 
1959   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1960   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1961   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1962   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1963   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1964   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1965   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1966   for (i=0; i<pn; i++) {
1967     row  = i + rstart;
1968     nnz  = bi[i+1] - bi[i];
1969     Jptr = bj + bi[i];
1970     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1971   }
1972   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1973   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1974   ierr = PetscFree(vals);CHKERRQ(ierr);
1975 
1976   merge->bi        = bi;
1977   merge->bj        = bj;
1978   merge->coi       = coi;
1979   merge->coj       = coj;
1980   merge->buf_ri    = buf_ri;
1981   merge->buf_rj    = buf_rj;
1982   merge->owners_co = owners_co;
1983   merge->destroy   = Cmpi->ops->destroy;
1984   merge->duplicate = Cmpi->ops->duplicate;
1985 
1986   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1987   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1988   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1989 
1990   /* attach the supporting struct to Cmpi for reuse */
1991   c = (Mat_MPIAIJ*)Cmpi->data;
1992 
1993   c->ptap     = ptap;
1994   ptap->api   = NULL;
1995   ptap->apj   = NULL;
1996   ptap->merge = merge;
1997   ptap->rmax  = rmax;
1998   ptap->apa   = NULL;
1999 
2000   *C = Cmpi;
2001 #if defined(PETSC_USE_INFO)
2002   if (bi[pn] != 0) {
2003     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
2004     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
2005   } else {
2006     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
2007   }
2008 #endif
2009   PetscFunctionReturn(0);
2010 }
2011