xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision f8efd71cd5f4648a8995733c8aefbf78c090be08)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc-private/vecimpl.h>
12 
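/*
  A minimal usage sketch of the public interface that dispatches to the routines in
  this file (illustrative only; A, B, C are placeholder names and are assumed to be
  assembled MATMPIAIJ matrices with compatible layouts):

     Mat A,B,C;
     ... create and assemble A and B ...
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);    symbolic + numeric product
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);      recompute values after A or B changed
     MatDestroy(&C);

  The run-time option -matmatmult_via <scalable,nonscalable> selects between the two
  symbolic algorithms implemented below.
*/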
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
15 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16 {
17   PetscErrorCode ierr;
18   const char     *algTypes[2] = {"scalable","nonscalable"};
19   PetscInt       alg=0; /* set default algorithm */
20 
21   PetscFunctionBegin;
22   if (scall == MAT_INITIAL_MATRIX) {
23     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
24     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr);
25     ierr = PetscOptionsEnd();CHKERRQ(ierr);
26 
27     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
28     switch (alg) {
29     case 1:
30       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
31       break;
32     default:
33       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
34       break;
35     }
36     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
37   }
38   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
39   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
40   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
41   PetscFunctionReturn(0);
42 }
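/*
  Algorithm selection note for MatMatMult_MPIAIJ_MPIAIJ() above: alg=0 ("scalable",
  the default) calls MatMatMultSymbolic_MPIAIJ_MPIAIJ(); alg=1 ("nonscalable") calls
  MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable().  A command-line sketch (the
  executable name ./app is a placeholder):

     mpiexec -n 4 ./app -matmatmult_via scalable      accumulator of length max nnz(AP[i,:])
     mpiexec -n 4 ./app -matmatmult_via nonscalable   dense accumulator of length P->cmap->N
*/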
43 
44 #undef __FUNCT__
45 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
46 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
47 {
48   PetscErrorCode ierr;
49   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
50   Mat_PtAPMPI    *ptap = a->ptap;
51 
52   PetscFunctionBegin;
53   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
54   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
55   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
56   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
57   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
58   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
59   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
60   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
61   ierr = ptap->destroy(A);CHKERRQ(ierr);
62   ierr = PetscFree(ptap);CHKERRQ(ierr);
63   PetscFunctionReturn(0);
64 }
65 
66 #undef __FUNCT__
67 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
68 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
69 {
70   PetscErrorCode ierr;
71   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
72   Mat_PtAPMPI    *ptap = a->ptap;
73 
74   PetscFunctionBegin;
75   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
76 
77   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
78   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
79   PetscFunctionReturn(0);
80 }
81 
82 #undef __FUNCT__
83 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
84 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
85 {
86   PetscErrorCode ierr;
87   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
88   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
89   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
90   PetscInt       *adi=ad->i,*adj,*aoi=ao->i,*aoj;
91   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
92   Mat_SeqAIJ     *p_loc,*p_oth;
93   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
94   PetscScalar    *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
95   PetscInt       cm   =C->rmap->n,anz,pnz;
96   Mat_PtAPMPI    *ptap=c->ptap;
97   PetscInt       *api,*apj,*apJ,i,j,k,row;
98   PetscInt       cstart=C->cmap->rstart;
99   PetscInt       cdnz,conz,k0,k1;
100   MPI_Comm       comm;
101   PetscMPIInt    size;
102 
103   PetscFunctionBegin;
104   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
105   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
106 
107   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
108   /*-----------------------------------------------------*/
109   /* update numerical values of P_oth and P_loc */
110   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
111   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
112 
113   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
114   /*----------------------------------------------------------*/
115   /* get data from symbolic products */
116   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
117   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
118   if (size >1) {
119     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
120     pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
121   }
122 
123   /* get apa for storing dense row A[i,:]*P */
124   apa = ptap->apa;
125 
126   api = ptap->api;
127   apj = ptap->apj;
128   for (i=0; i<cm; i++) {
129     /* diagonal portion of A */
130     anz = adi[i+1] - adi[i];
131     adj = ad->j + adi[i];
132     ada = ad->a + adi[i];
133     for (j=0; j<anz; j++) {
134       row = adj[j];
135       pnz = pi_loc[row+1] - pi_loc[row];
136       pj  = pj_loc + pi_loc[row];
137       pa  = pa_loc + pi_loc[row];
138 
139       /* perform dense axpy */
140       valtmp = ada[j];
141       for (k=0; k<pnz; k++) {
142         apa[pj[k]] += valtmp*pa[k];
143       }
144       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
145     }
146 
147     /* off-diagonal portion of A */
148     anz = aoi[i+1] - aoi[i];
149     aoj = ao->j + aoi[i];
150     aoa = ao->a + aoi[i];
151     for (j=0; j<anz; j++) {
152       row = aoj[j];
153       pnz = pi_oth[row+1] - pi_oth[row];
154       pj  = pj_oth + pi_oth[row];
155       pa  = pa_oth + pi_oth[row];
156 
157       /* perform dense axpy */
158       valtmp = aoa[j];
159       for (k=0; k<pnz; k++) {
160         apa[pj[k]] += valtmp*pa[k];
161       }
162       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
163     }
164 
165     /* set values in C */
166     apJ  = apj + api[i];
167     cdnz = cd->i[i+1] - cd->i[i];
168     conz = co->i[i+1] - co->i[i];
169 
170     /* 1st off-diagonal part of C */
171     ca = coa + co->i[i];
172     k  = 0;
173     for (k0=0; k0<conz; k0++) {
174       if (apJ[k] >= cstart) break;
175       ca[k0]      = apa[apJ[k]];
176       apa[apJ[k]] = 0.0;
177       k++;
178     }
179 
180     /* diagonal part of C */
181     ca = cda + cd->i[i];
182     for (k1=0; k1<cdnz; k1++) {
183       ca[k1]      = apa[apJ[k]];
184       apa[apJ[k]] = 0.0;
185       k++;
186     }
187 
188     /* 2nd off-diagonal part of C */
189     ca = coa + co->i[i];
190     for (; k0<conz; k0++) {
191       ca[k0]      = apa[apJ[k]];
192       apa[apJ[k]] = 0.0;
193       k++;
194     }
195   }
196   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
197   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
198   PetscFunctionReturn(0);
199 }
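/*
  Worked illustration (made-up numbers) of the three-part copy at the end of the loop
  above: the dense accumulator apa holds row i of A*P indexed by global column, and apj
  is sorted by global column, so its leading entries (< cstart) belong to the first
  off-diagonal part of C, the middle entries in the local diagonal-block column range
  belong to cd, and the trailing entries again to co.  E.g. with cstart = 10, local
  column range [10,14), and row nonzeros at columns {2, 7, 10, 13, 20}: columns {2,7}
  fill the first part of co, {10,13} fill cd, and {20} fills the second part of co.
*/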
200 
201 #undef __FUNCT__
202 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
203 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
204 {
205   PetscErrorCode     ierr;
206   MPI_Comm           comm;
207   PetscMPIInt        size;
208   Mat                Cmpi;
209   Mat_PtAPMPI        *ptap;
210   PetscFreeSpaceList free_space=NULL,current_space=NULL;
211   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
212   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
213   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
214   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
215   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
216   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
217   PetscBT            lnkbt;
218   PetscScalar        *apa;
219   PetscReal          afill;
220   PetscInt           nlnk_max,armax,prmax;
221 
222   PetscFunctionBegin;
223   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
224   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
225 
226   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
227     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
228   }
229 
230   /* create struct Mat_PtAPMPI and attach it to C later */
231   ierr = PetscNew(&ptap);CHKERRQ(ierr);
232 
233   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
234   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
235 
236   /* get P_loc by taking all local rows of P */
237   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
238 
239   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
240   pi_loc = p_loc->i; pj_loc = p_loc->j;
241   if (size > 1) {
242     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
243     pi_oth = p_oth->i; pj_oth = p_oth->j;
244   }
245 
246   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
247   /*-------------------------------------------------------------------*/
248   ierr      = PetscMalloc1((am+2),&api);CHKERRQ(ierr);
249   ptap->api = api;
250   api[0]    = 0;
251 
252   /* create and initialize a linked list */
253   armax    = ad->rmax+ao->rmax;
254   if (size >1) {
255     prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
256   } else {
257     prmax = p_loc->rmax;
258   }
259   nlnk_max = armax*prmax;
260   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
261   ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);
262 
263   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
264   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
265 
266   current_space = free_space;
267 
268   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
269   for (i=0; i<am; i++) {
270     /* diagonal portion of A */
271     nzi = adi[i+1] - adi[i];
272     for (j=0; j<nzi; j++) {
273       row  = *adj++;
274       pnz  = pi_loc[row+1] - pi_loc[row];
275       Jptr = pj_loc + pi_loc[row];
276       /* add non-zero cols of P into the sorted linked list lnk */
277       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
278     }
279     /* off-diagonal portion of A */
280     nzi = aoi[i+1] - aoi[i];
281     for (j=0; j<nzi; j++) {
282       row  = *aoj++;
283       pnz  = pi_oth[row+1] - pi_oth[row];
284       Jptr = pj_oth + pi_oth[row];
285       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
286     }
287 
288     apnz     = lnk[0];
289     api[i+1] = api[i] + apnz;
290 
291     /* if free space is not available, double the total space in the list */
292     if (current_space->local_remaining<apnz) {
293       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
294       nspacedouble++;
295     }
296 
297     /* Copy data into free space, then initialize lnk */
298     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
299     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
300 
301     current_space->array           += apnz;
302     current_space->local_used      += apnz;
303     current_space->local_remaining -= apnz;
304   }
305 
306   /* Allocate space for apj, initialize apj, and */
307   /* destroy list of free space and other temporary array(s) */
308   ierr = PetscMalloc1((api[am]+1),&ptap->apj);CHKERRQ(ierr);
309   apj  = ptap->apj;
310   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
311   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
312 
313   /* malloc apa to store dense row A[i,:]*P */
314   ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);
315 
316   ptap->apa = apa;
317 
318   /* create and assemble symbolic parallel matrix Cmpi */
319   /*----------------------------------------------------*/
320   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
321   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
322   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
323 
324   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
325   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
326   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
327   for (i=0; i<am; i++) {
328     row  = i + rstart;
329     apnz = api[i+1] - api[i];
330     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
331     apj += apnz;
332   }
333   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
334   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
335 
336   ptap->destroy        = Cmpi->ops->destroy;
337   ptap->duplicate      = Cmpi->ops->duplicate;
338   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
339   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
340   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
341 
342   /* attach the supporting struct to Cmpi for reuse */
343   c       = (Mat_MPIAIJ*)Cmpi->data;
344   c->ptap = ptap;
345 
346   *C = Cmpi;
347 
348   /* set MatInfo */
349   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
350   if (afill < 1.0) afill = 1.0;
351   Cmpi->info.mallocs           = nspacedouble;
352   Cmpi->info.fill_ratio_given  = fill;
353   Cmpi->info.fill_ratio_needed = afill;
354 
355 #if defined(PETSC_USE_INFO)
356   if (api[am]) {
357     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
358     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
359   } else {
360     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
361   }
362 #endif
363   PetscFunctionReturn(0);
364 }
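/*
  The PetscInfo output above reports the fill ratio actually needed; a usage sketch
  for tuning the preallocation estimate on later products (the value 1.6 is an assumed
  example read from a previous -info run, not a recommendation):

     MatMatMult(A,B,MAT_INITIAL_MATRIX,1.6,&C);
*/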
365 
366 #undef __FUNCT__
367 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
368 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
369 {
370   PetscErrorCode ierr;
371 
372   PetscFunctionBegin;
373   if (scall == MAT_INITIAL_MATRIX) {
374     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
375     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
376     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
377   }
378   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
379   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
380   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
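/*
  Usage sketch for the MPIAIJ * MPIDense product dispatched above (A, B, C are
  placeholder names; B is assumed to be MATMPIDENSE with a row layout matching A's
  column layout):

     Mat C;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);    C is created as MATMPIDENSE
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);      reuses the "workB" container attached to C
*/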
383 
384 typedef struct {
385   Mat         workB;
386   PetscScalar *rvalues,*svalues;
387   MPI_Request *rwaits,*swaits;
388 } MPIAIJ_MPIDense;
389 
390 #undef __FUNCT__
391 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
392 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
393 {
394   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
395   PetscErrorCode  ierr;
396 
397   PetscFunctionBegin;
398   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
399   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
400   ierr = PetscFree(contents);CHKERRQ(ierr);
401   PetscFunctionReturn(0);
402 }
403 
404 #undef __FUNCT__
405 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
406 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
407 {
408   PetscErrorCode         ierr;
409   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
410   PetscInt               nz   = aij->B->cmap->n;
411   PetscContainer         container;
412   MPIAIJ_MPIDense        *contents;
413   VecScatter             ctx   = aij->Mvctx;
414   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
415   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
416   PetscInt               m     = A->rmap->n,n=B->cmap->n;
417 
418   PetscFunctionBegin;
419   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
420   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
421   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
422   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
423   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
424   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
425   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
426 
427   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
428 
429   ierr = PetscNew(&contents);CHKERRQ(ierr);
430   /* Create work matrix used to store off processor rows of B needed for local product */
431   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
432   /* Create work arrays needed */
433   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
434                       B->cmap->N*to->starts[to->n],&contents->svalues,
435                       from->n,&contents->rwaits,
436                       to->n,&contents->swaits);CHKERRQ(ierr);
437 
438   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
439   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
440   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
441   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
442   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
443   PetscFunctionReturn(0);
444 }
445 
446 #undef __FUNCT__
447 #define __FUNCT__ "MatMPIDenseScatter"
448 /*
449     Performs an efficient scatter on the rows of B needed by this process; this is
450     a modification of the VecScatterBegin_() routines.
451 */
452 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
453 {
454   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
455   PetscErrorCode         ierr;
456   PetscScalar            *b,*w,*svalues,*rvalues;
457   VecScatter             ctx   = aij->Mvctx;
458   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
459   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
460   PetscInt               i,j,k;
461   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
462   PetscMPIInt            *sprocs,*rprocs,nrecvs;
463   MPI_Request            *swaits,*rwaits;
464   MPI_Comm               comm;
465   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
466   MPI_Status             status;
467   MPIAIJ_MPIDense        *contents;
468   PetscContainer         container;
469   Mat                    workB;
470 
471   PetscFunctionBegin;
472   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
473   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
474   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
475   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
476 
477   workB = *outworkB = contents->workB;
478   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
479   sindices = to->indices;
480   sstarts  = to->starts;
481   sprocs   = to->procs;
482   swaits   = contents->swaits;
483   svalues  = contents->svalues;
484 
485   rindices = from->indices;
486   rstarts  = from->starts;
487   rprocs   = from->procs;
488   rwaits   = contents->rwaits;
489   rvalues  = contents->rvalues;
490 
491   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
492   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
493 
494   for (i=0; i<from->n; i++) {
495     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
496   }
497 
498   for (i=0; i<to->n; i++) {
499     /* pack a message at a time */
500     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
501       for (k=0; k<ncols; k++) {
502         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
503       }
504     }
505     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
506   }
507 
508   nrecvs = from->n;
509   while (nrecvs) {
510     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
511     nrecvs--;
512     /* unpack a message at a time */
513     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
514       for (k=0; k<ncols; k++) {
515         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
516       }
517     }
518   }
519   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
520 
521   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
522   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
523   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
524   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
525   PetscFunctionReturn(0);
526 }
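/*
  Indexing note for the pack/unpack loops above: the dense arrays are stored
  column-major, so entry (row r, column k) of B is b[r + nrowsB*k] and the matching
  entry of workB is w[r' + nrows*k], where r' is the row's local index in workB.
  For example (made-up sizes), with nrowsB = 5 and ncols = 3, element (r=2, k=1)
  of B is b[2 + 5*1] = b[7].
*/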
527 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
528 
529 #undef __FUNCT__
530 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
531 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
532 {
533   PetscErrorCode ierr;
534   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
535   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
536   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
537   Mat            workB;
538 
539   /* diagonal block of A times all local rows of B */
540   /* diagonal block of A times all local rows of B*/
541   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
542 
543   /* get off processor parts of B needed to complete the product */
544   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
545 
546   /* off-diagonal block of A times nonlocal rows of B */
547   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
548   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
549   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
550   PetscFunctionReturn(0);
551 }
552 
553 #undef __FUNCT__
554 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
555 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
556 {
557   PetscErrorCode ierr;
558   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
559   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
560   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
561   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
562   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
563   Mat_SeqAIJ     *p_loc,*p_oth;
564   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
565   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
566   PetscInt       cm          = C->rmap->n,anz,pnz;
567   Mat_PtAPMPI    *ptap       = c->ptap;
568   PetscScalar    *apa_sparse = ptap->apa;
569   PetscInt       *api,*apj,*apJ,i,j,k,row;
570   PetscInt       cstart = C->cmap->rstart;
571   PetscInt       cdnz,conz,k0,k1,nextp;
572   MPI_Comm       comm;
573   PetscMPIInt    size;
574 
575   PetscFunctionBegin;
576   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
577   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
578 
579   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
580   /*-----------------------------------------------------*/
581   /* update numerical values of P_oth and P_loc */
582   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
583   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
584 
585   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
586   /*----------------------------------------------------------*/
587   /* get data from symbolic products */
588   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
589   pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
590   if (size >1) {
591     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
592     pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
593   }
594 
595   api = ptap->api;
596   apj = ptap->apj;
597   for (i=0; i<cm; i++) {
598     apJ = apj + api[i];
599 
600     /* diagonal portion of A */
601     anz = adi[i+1] - adi[i];
602     adj = ad->j + adi[i];
603     ada = ad->a + adi[i];
604     for (j=0; j<anz; j++) {
605       row = adj[j];
606       pnz = pi_loc[row+1] - pi_loc[row];
607       pj  = pj_loc + pi_loc[row];
608       pa  = pa_loc + pi_loc[row];
609       /* perform sparse axpy */
610       valtmp = ada[j];
611       nextp  = 0;
612       for (k=0; nextp<pnz; k++) {
613         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
614           apa_sparse[k] += valtmp*pa[nextp++];
615         }
616       }
617       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
618     }
619 
620     /* off-diagonal portion of A */
621     anz = aoi[i+1] - aoi[i];
622     aoj = ao->j + aoi[i];
623     aoa = ao->a + aoi[i];
624     for (j=0; j<anz; j++) {
625       row = aoj[j];
626       pnz = pi_oth[row+1] - pi_oth[row];
627       pj  = pj_oth + pi_oth[row];
628       pa  = pa_oth + pi_oth[row];
629       /* perform sparse axpy */
630       valtmp = aoa[j];
631       nextp  = 0;
632       for (k=0; nextp<pnz; k++) {
633         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
634           apa_sparse[k] += valtmp*pa[nextp++];
635         }
636       }
637       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
638     }
639 
640     /* set values in C */
641     cdnz = cd->i[i+1] - cd->i[i];
642     conz = co->i[i+1] - co->i[i];
643 
644     /* 1st off-diagonal part of C */
645     ca = coa + co->i[i];
646     k  = 0;
647     for (k0=0; k0<conz; k0++) {
648       if (apJ[k] >= cstart) break;
649       ca[k0]        = apa_sparse[k];
650       apa_sparse[k] = 0.0;
651       k++;
652     }
653 
654     /* diagonal part of C */
655     ca = cda + cd->i[i];
656     for (k1=0; k1<cdnz; k1++) {
657       ca[k1]        = apa_sparse[k];
658       apa_sparse[k] = 0.0;
659       k++;
660     }
661 
662     /* 2nd off-diagonal part of C */
663     ca = coa + co->i[i];
664     for (; k0<conz; k0++) {
665       ca[k0]        = apa_sparse[k];
666       apa_sparse[k] = 0.0;
667       k++;
668     }
669   }
670   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
671   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
672   PetscFunctionReturn(0);
673 }
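/*
  Sketch of the sparse axpy used above (made-up indices): apJ holds the sorted column
  indices of row i of AP from the symbolic phase and apa_sparse holds the matching
  values, so each row P[row,:] is merged in with a two-pointer scan instead of a dense
  scatter.  If apJ = {1,4,6,9} and pj = {4,9}, the scan adds valtmp*pa[0] at k=1
  (column 4) and valtmp*pa[1] at k=3 (column 9).
*/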
674 
675 /* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable PetscLLCondensed routines to avoid the O(P->cmap->N) memory requirement of the dense accumulator */
676 #undef __FUNCT__
677 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
678 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
679 {
680   PetscErrorCode     ierr;
681   MPI_Comm           comm;
682   PetscMPIInt        size;
683   Mat                Cmpi;
684   Mat_PtAPMPI        *ptap;
685   PetscFreeSpaceList free_space = NULL,current_space=NULL;
686   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
687   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
688   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
689   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
690   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
691   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
692   PetscInt           nlnk_max,armax,prmax;
693   PetscReal          afill;
694   PetscScalar        *apa;
695 
696   PetscFunctionBegin;
697   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
698   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
699 
700   /* create struct Mat_PtAPMPI and attach it to C later */
701   ierr = PetscNew(&ptap);CHKERRQ(ierr);
702 
703   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
704   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
705 
706   /* get P_loc by taking all local rows of P */
707   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
708 
709   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
710   pi_loc = p_loc->i; pj_loc = p_loc->j;
711   if (size > 1) {
712     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
713     pi_oth = p_oth->i;
714     pj_oth = p_oth->j;
715   }
716 
717   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
718   /*-------------------------------------------------------------------*/
719   ierr      = PetscMalloc1((am+2),&api);CHKERRQ(ierr);
720   ptap->api = api;
721   api[0]    = 0;
722 
723   /* create and initialize a linked list */
724   armax = ad->rmax+ao->rmax;
725   if (size >1) {
726     prmax = PetscMax(p_loc->rmax,p_oth->rmax);
727   } else {
728     prmax = p_loc->rmax;
729   }
730   nlnk_max = armax*prmax;
731   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
732   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
733 
734   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
735   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
736 
737   current_space = free_space;
738 
739   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
740   for (i=0; i<am; i++) {
741     /* diagonal portion of A */
742     nzi = adi[i+1] - adi[i];
743     for (j=0; j<nzi; j++) {
744       row  = *adj++;
745       pnz  = pi_loc[row+1] - pi_loc[row];
746       Jptr = pj_loc + pi_loc[row];
747       /* add non-zero cols of P into the sorted linked list lnk */
748       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
749     }
750     /* off-diagonal portion of A */
751     nzi = aoi[i+1] - aoi[i];
752     for (j=0; j<nzi; j++) {
753       row  = *aoj++;
754       pnz  = pi_oth[row+1] - pi_oth[row];
755       Jptr = pj_oth + pi_oth[row];
756       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
757     }
758 
759     apnz     = *lnk;
760     api[i+1] = api[i] + apnz;
761     if (apnz > apnz_max) apnz_max = apnz;
762 
763     /* if free space is not available, double the total space in the list */
764     if (current_space->local_remaining<apnz) {
765       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
766       nspacedouble++;
767     }
768 
769     /* Copy data into free space, then initialize lnk */
770     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
771     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
772 
773     current_space->array           += apnz;
774     current_space->local_used      += apnz;
775     current_space->local_remaining -= apnz;
776   }
777 
778   /* Allocate space for apj, initialize apj, and */
779   /* destroy list of free space and other temporary array(s) */
780   ierr = PetscMalloc1((api[am]+1),&ptap->apj);CHKERRQ(ierr);
781   apj  = ptap->apj;
782   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
783   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
784 
785   /* create and assemble symbolic parallel matrix Cmpi */
786   /*----------------------------------------------------*/
787   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
788   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
789   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
790   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
791   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
792   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
793 
794   /* malloc apa for assembling Cmpi */
795   ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);
796 
797   ptap->apa = apa;
798   for (i=0; i<am; i++) {
799     row  = i + rstart;
800     apnz = api[i+1] - api[i];
801     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
802     apj += apnz;
803   }
804   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
805   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
806 
807   ptap->destroy             = Cmpi->ops->destroy;
808   ptap->duplicate           = Cmpi->ops->duplicate;
809   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
810   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
811   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
812 
813   /* attach the supporting struct to Cmpi for reuse */
814   c       = (Mat_MPIAIJ*)Cmpi->data;
815   c->ptap = ptap;
816 
817   *C = Cmpi;
818 
819   /* set MatInfo */
820   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
821   if (afill < 1.0) afill = 1.0;
822   Cmpi->info.mallocs           = nspacedouble;
823   Cmpi->info.fill_ratio_given  = fill;
824   Cmpi->info.fill_ratio_needed = afill;
825 
826 #if defined(PETSC_USE_INFO)
827   if (api[am]) {
828     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
829     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
830   } else {
831     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
832   }
833 #endif
834   PetscFunctionReturn(0);
835 }
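/*
  Memory note for MatMatMultSymbolic_MPIAIJ_MPIAIJ() above (summary of the code, not a
  measurement): the accumulator apa has length apnz_max, the largest local row nonzero
  count of AP, whereas the nonscalable variant allocates a dense row of length
  P->cmap->N; this keeps the work space independent of the number of global columns of P.
*/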
836 
837 /*-------------------------------------------------------------------------*/
838 #undef __FUNCT__
839 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
840 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
841 {
842   PetscErrorCode ierr;
843   const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
844   PetscInt       alg=0; /* set default algorithm */
845 
846   PetscFunctionBegin;
847   if (scall == MAT_INITIAL_MATRIX) {
848     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
849     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
850     ierr = PetscOptionsEnd();CHKERRQ(ierr);
851 
852     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
853     switch (alg) {
854     case 1:
855       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
856       break;
857     case 2:
858     {
859       Mat         Pt;
860       Mat_PtAPMPI *ptap;
861       Mat_MPIAIJ  *c;
862       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
863       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
864       c        = (Mat_MPIAIJ*)(*C)->data;
865       ptap     = c->ptap;
866       ptap->Pt = Pt;
867       (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
868       PetscFunctionReturn(0);
869     }
870       break;
871     default:
872       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
873       break;
874     }
875     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
876   }
877   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
878   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
879   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
880   PetscFunctionReturn(0);
881 }
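/*
  Usage sketch for the transpose product dispatched above (P, A, C are placeholder
  names):

     Mat C;
     MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);    C = P^T * A

  The run-time option -mattransposematmult_via <scalable,nonscalable,matmatmult>
  selects the algorithm; "matmatmult" forms Pt explicitly with MatTranspose() and then
  calls MatMatMult(Pt,A,...).
*/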
882 
883 /* This routine only works when scall=MAT_REUSE_MATRIX! */
884 #undef __FUNCT__
885 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
886 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
887 {
888   PetscErrorCode ierr;
889   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
890   Mat_PtAPMPI    *ptap= c->ptap;
891   Mat            Pt=ptap->Pt;
892 
893   PetscFunctionBegin;
894   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
895   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
896   PetscFunctionReturn(0);
897 }
898 
899 /* Non-scalable version, use dense axpy */
900 #undef __FUNCT__
901 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
902 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
903 {
904   PetscErrorCode      ierr;
905   Mat_Merge_SeqsToMPI *merge;
906   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
907   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
908   Mat_PtAPMPI         *ptap;
909   PetscInt            *adj,*aJ;
910   PetscInt            i,j,k,anz,pnz,row,*cj;
911   MatScalar           *ada,*aval,*ca,valtmp;
912   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
913   MPI_Comm            comm;
914   PetscMPIInt         size,rank,taga,*len_s;
915   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
916   PetscInt            **buf_ri,**buf_rj;
917   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
918   MPI_Request         *s_waits,*r_waits;
919   MPI_Status          *status;
920   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
921   PetscInt            *ai,*aj,*coi,*coj;
922   PetscInt            *poJ,*pdJ;
923   Mat                 A_loc;
924   Mat_SeqAIJ          *a_loc;
925 
926   PetscFunctionBegin;
927   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
928   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
929   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
930 
931   ptap  = c->ptap;
932   merge = ptap->merge;
933 
934   /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
935   /*--------------------------------------------------------------*/
936   /* get data from symbolic products */
937   coi  = merge->coi; coj = merge->coj;
938   ierr = PetscCalloc1((coi[pon]+1),&coa);CHKERRQ(ierr);
939 
940   bi     = merge->bi; bj = merge->bj;
941   owners = merge->rowmap->range;
942   ierr   = PetscCalloc1((bi[cm]+1),&ba);CHKERRQ(ierr);
943 
944   /* get A_loc by taking all local rows of A */
945   A_loc = ptap->A_loc;
946   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
947   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
948   ai    = a_loc->i;
949   aj    = a_loc->j;
950 
951   ierr = PetscCalloc1((A->cmap->N),&aval);CHKERRQ(ierr); /* non-scalable!!! */
952 
953   for (i=0; i<am; i++) {
954     /* 2-a) put A[i,:] into the dense array aval */
955     anz = ai[i+1] - ai[i];
956     adj = aj + ai[i];
957     ada = a_loc->a + ai[i];
958     for (j=0; j<anz; j++) {
959       aval[adj[j]] = ada[j];
960     }
961 
962     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
963     /*--------------------------------------------------------------*/
964     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
965     pnz = po->i[i+1] - po->i[i];
966     poJ = po->j + po->i[i];
967     pA  = po->a + po->i[i];
968     for (j=0; j<pnz; j++) {
969       row = poJ[j];
970       cnz = coi[row+1] - coi[row];
971       cj  = coj + coi[row];
972       ca  = coa + coi[row];
973       /* perform dense axpy */
974       valtmp = pA[j];
975       for (k=0; k<cnz; k++) {
976         ca[k] += valtmp*aval[cj[k]];
977       }
978       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
979     }
980 
981     /* put the value into Cd (diagonal part) */
982     pnz = pd->i[i+1] - pd->i[i];
983     pdJ = pd->j + pd->i[i];
984     pA  = pd->a + pd->i[i];
985     for (j=0; j<pnz; j++) {
986       row = pdJ[j];
987       cnz = bi[row+1] - bi[row];
988       cj  = bj + bi[row];
989       ca  = ba + bi[row];
990       /* perform dense axpy */
991       valtmp = pA[j];
992       for (k=0; k<cnz; k++) {
993         ca[k] += valtmp*aval[cj[k]];
994       }
995       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
996     }
997 
998     /* zero out the current row of A in the dense array aval */
999     aJ = aj + ai[i];
1000     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1001   }
1002 
1003   /* 3) send and recv matrix values coa */
1004   /*------------------------------------*/
1005   buf_ri = merge->buf_ri;
1006   buf_rj = merge->buf_rj;
1007   len_s  = merge->len_s;
1008   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1009   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1010 
1011   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1012   for (proc=0,k=0; proc<size; proc++) {
1013     if (!len_s[proc]) continue;
1014     i    = merge->owners_co[proc];
1015     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1016     k++;
1017   }
1018   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1019   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1020 
1021   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1022   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1023   ierr = PetscFree(coa);CHKERRQ(ierr);
1024 
1025   /* 4) insert local Cseq and received values into Cmpi */
1026   /*----------------------------------------------------*/
1027   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1028   for (k=0; k<merge->nrecv; k++) {
1029     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1030     nrows       = *(buf_ri_k[k]);
1031     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1032     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1033   }
1034 
1035   for (i=0; i<cm; i++) {
1036     row  = owners[rank] + i; /* global row index of C_seq */
1037     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1038     ba_i = ba + bi[i];
1039     bnz  = bi[i+1] - bi[i];
1040     /* add received vals into ba */
1041     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1042       /* i-th row */
1043       if (i == *nextrow[k]) {
1044         cnz    = *(nextci[k]+1) - *nextci[k];
1045         cj     = buf_rj[k] + *(nextci[k]);
1046         ca     = abuf_r[k] + *(nextci[k]);
1047         nextcj = 0;
1048         for (j=0; nextcj<cnz; j++) {
1049           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1050             ba_i[j] += ca[nextcj++];
1051           }
1052         }
1053         nextrow[k]++; nextci[k]++;
1054         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1055       }
1056     }
1057     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1058   }
1059   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1060   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1061 
1062   ierr = PetscFree(ba);CHKERRQ(ierr);
1063   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1064   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1065   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1066   ierr = PetscFree(aval);CHKERRQ(ierr);
1067   PetscFunctionReturn(0);
1068 }
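/*
  Outer-product view of the loop above (descriptive summary): for each local row i of
  A, the sparse column P[i,:]^T scales a dense copy of A[i,:] (aval) and the result is
  accumulated into the rows of C selected by the nonzeros of P[i,:]; contributions to
  rows owned by other processes are collected in coa and exchanged in step 3.
*/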
1069 
1070 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1071 #undef __FUNCT__
1072 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
1073 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1074 {
1075   PetscErrorCode      ierr;
1076   Mat                 Cmpi,A_loc,POt,PDt;
1077   Mat_PtAPMPI         *ptap;
1078   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1079   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1080   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1081   PetscInt            nnz;
1082   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1083   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1084   PetscBT             lnkbt;
1085   MPI_Comm            comm;
1086   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1087   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1088   PetscInt            len,proc,*dnz,*onz,*owners;
1089   PetscInt            nzi,*bi,*bj;
1090   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1091   MPI_Request         *swaits,*rwaits;
1092   MPI_Status          *sstatus,rstatus;
1093   Mat_Merge_SeqsToMPI *merge;
1094   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1095   PetscReal           afill  =1.0,afill_tmp;
1096   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1097   PetscScalar         *vals;
1098   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1099 
1100   PetscFunctionBegin;
1101   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1102   /* check if matrix local sizes are compatible */
1103   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1104     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1105   }
1106 
1107   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1108   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1109 
1110   /* create struct Mat_PtAPMPI and attach it to C later */
1111   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1112 
1113   /* get A_loc by taking all local rows of A */
1114   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1115 
1116   ptap->A_loc = A_loc;
1117 
1118   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1119   ai    = a_loc->i;
1120   aj    = a_loc->j;
1121 
1122   /* determine symbolic Co=(p->B)^T*A - send to others */
1123   /*----------------------------------------------------*/
1124   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1125   pdt  = (Mat_SeqAIJ*)PDt->data;
1126   pdti = pdt->i; pdtj = pdt->j;
1127 
1128   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1129   pot  = (Mat_SeqAIJ*)POt->data;
1130   poti = pot->i; potj = pot->j;
1131 
1132   /* then, compute symbolic Co = (p->B)^T*A */
1133   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1134   ierr   = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr);
1135   coi[0] = 0;
1136 
1137   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1138   nnz           = fill*(poti[pon] + ai[am]);
1139   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1140   current_space = free_space;
1141 
1142   /* create and initialize a linked list */
1143   i     = PetscMax(pdt->rmax,pot->rmax);
1144   Crmax = i*a_loc->rmax*size;
1145   if (!Crmax || Crmax > aN) Crmax = aN;
1146   ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1147 
1148   for (i=0; i<pon; i++) {
1149     pnz = poti[i+1] - poti[i];
1150     ptJ = potj + poti[i];
1151     for (j=0; j<pnz; j++) {
1152       row  = ptJ[j]; /* row of A_loc == col of Pot */
1153       anz  = ai[row+1] - ai[row];
1154       Jptr = aj + ai[row];
1155       /* add non-zero cols of A into the sorted linked list lnk */
1156       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1157     }
1158     nnz = lnk[0];
1159 
1160     /* If free space is not available, double the total space in the list */
1161     if (current_space->local_remaining<nnz) {
1162       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1163       nspacedouble++;
1164     }
1165 
1166     /* Copy data into free space, and zero out dense rows */
1167     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1168 
1169     current_space->array           += nnz;
1170     current_space->local_used      += nnz;
1171     current_space->local_remaining -= nnz;
1172 
1173     coi[i+1] = coi[i] + nnz;
1174   }
1175 
1176   ierr = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr);
1177   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1178 
1179   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1180   if (afill_tmp > afill) afill = afill_tmp;
1181 
1182   /* send j-array (coj) of Co to other processors */
1183   /*----------------------------------------------*/
1184   /* determine row ownership */
1185   ierr = PetscNew(&merge);CHKERRQ(ierr);
1186   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1187 
1188   merge->rowmap->n  = pn;
1189   merge->rowmap->bs = 1;
1190 
1191   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1192   owners = merge->rowmap->range;
1193 
1194   /* determine the number of messages to send, their lengths */
1195   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1196   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1197 
1198   len_s        = merge->len_s;
1199   merge->nsend = 0;
1200 
1201   ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr);
1202   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1203 
1204   proc = 0;
1205   for (i=0; i<pon; i++) {
1206     while (prmap[i] >= owners[proc+1]) proc++;
1207     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1208     len_s[proc] += coi[i+1] - coi[i];
1209   }
1210 
1211   len          = 0; /* max length of buf_si[] */
1212   owners_co[0] = 0;
1213   for (proc=0; proc<size; proc++) {
1214     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1215     if (len_si[proc]) {
1216       merge->nsend++;
1217       len_si[proc] = 2*(len_si[proc] + 1);
1218       len         += len_si[proc];
1219     }
1220   }
1221 
1222   /* determine the number and length of messages to receive for coi and coj  */
1223   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1224   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1225 
1226   /* post the Irecv and Isend of coj */
1227   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1228   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1229   ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr);
1230   for (proc=0, k=0; proc<size; proc++) {
1231     if (!len_s[proc]) continue;
1232     i    = owners_co[proc];
1233     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1234     k++;
1235   }
1236 
1237   /* receives and sends of coj are complete */
1238   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1239   for (i=0; i<merge->nrecv; i++) {
1240     PetscMPIInt icompleted;
1241     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1242   }
1243   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1244   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1245 
1246   /* send and recv coi */
1247   /*-------------------*/
1248   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1249   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1250   ierr   = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr);
1251   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1252   for (proc=0,k=0; proc<size; proc++) {
1253     if (!len_s[proc]) continue;
1254     /* form outgoing message for i-structure:
1255          buf_si[0]:                 nrows to be sent
1256                [1:nrows]:           row index (global)
1257                [nrows+1:2*nrows+1]: i-structure index
1258     */
1259     /*-------------------------------------------*/
1260     nrows       = len_si[proc]/2 - 1;
1261     buf_si_i    = buf_si + nrows+1;
1262     buf_si[0]   = nrows;
1263     buf_si_i[0] = 0;
1264     nrows       = 0;
1265     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1266       nzi               = coi[i+1] - coi[i];
1267       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1268       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1269       nrows++;
1270     }
1271     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1272     k++;
1273     buf_si += len_si[proc];
1274   }
1275   i = merge->nrecv;
1276   while (i--) {
1277     PetscMPIInt icompleted;
1278     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1279   }
1280   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1281   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1282   ierr = PetscFree(len_si);CHKERRQ(ierr);
1283   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1284   ierr = PetscFree(swaits);CHKERRQ(ierr);
1285   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1286   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1287 
1288   /* compute the local portion of C (mpi mat) */
1289   /*------------------------------------------*/
1290   /* allocate bi array and free space for accumulating nonzero column info */
1291   ierr  = PetscMalloc1((pn+1),&bi);CHKERRQ(ierr);
1292   bi[0] = 0;
1293 
1294   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1295   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1296   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1297   current_space = free_space;
1298 
1299   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1300   for (k=0; k<merge->nrecv; k++) {
1301     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1302     nrows       = *buf_ri_k[k];
1303     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1304     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1305   }
1306 
1307   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1308   rmax = 0;
1309   for (i=0; i<pn; i++) {
1310     /* add pdt[i,:]*A into lnk */
1311     pnz = pdti[i+1] - pdti[i];
1312     ptJ = pdtj + pdti[i];
1313     for (j=0; j<pnz; j++) {
1314       row  = ptJ[j];  /* row of A == col of Pt */
1315       anz  = ai[row+1] - ai[row];
1316       Jptr = aj + ai[row];
1317       /* add non-zero cols of A into the sorted linked list lnk */
1318       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1319     }
1320 
1321     /* add received col data into lnk */
1322     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1323       if (i == *nextrow[k]) { /* i-th row */
1324         nzi  = *(nextci[k]+1) - *nextci[k];
1325         Jptr = buf_rj[k] + *nextci[k];
1326         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1327         nextrow[k]++; nextci[k]++;
1328       }
1329     }
1330     nnz = lnk[0];
1331 
1332     /* if free space is not available, make more free space */
1333     if (current_space->local_remaining<nnz) {
1334       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1335       nspacedouble++;
1336     }
1337     /* copy data into free space, then initialize lnk */
1338     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1339     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1340 
1341     current_space->array           += nnz;
1342     current_space->local_used      += nnz;
1343     current_space->local_remaining -= nnz;
1344 
1345     bi[i+1] = bi[i] + nnz;
1346     if (nnz > rmax) rmax = nnz;
1347   }
1348   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1349 
1350   ierr = PetscMalloc1((bi[pn]+1),&bj);CHKERRQ(ierr);
1351   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1352 
1353   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1354   if (afill_tmp > afill) afill = afill_tmp;
1355   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1356   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1357   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1358 
1359   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part? */
1360   /*----------------------------------------------------------------------------------*/
1361   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1362 
1363   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1364   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1365   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1366   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1367   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1368   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1369   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1370   for (i=0; i<pn; i++) {
1371     row  = i + rstart;
1372     nnz  = bi[i+1] - bi[i];
1373     Jptr = bj + bi[i];
1374     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1375   }
1376   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1377   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1378   ierr = PetscFree(vals);CHKERRQ(ierr);
1379 
1380   merge->bi        = bi;
1381   merge->bj        = bj;
1382   merge->coi       = coi;
1383   merge->coj       = coj;
1384   merge->buf_ri    = buf_ri;
1385   merge->buf_rj    = buf_rj;
1386   merge->owners_co = owners_co;
1387   merge->destroy   = Cmpi->ops->destroy;
1388   merge->duplicate = Cmpi->ops->duplicate;
1389 
1390   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1391   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1392 
1393   /* attach the supporting struct to Cmpi for reuse */
1394   c           = (Mat_MPIAIJ*)Cmpi->data;
1395   c->ptap     = ptap;
1396   ptap->api   = NULL;
1397   ptap->apj   = NULL;
1398   ptap->merge = merge;
1399   ptap->rmax  = rmax;
1400 
1401   *C = Cmpi;
1402 #if defined(PETSC_USE_INFO)
1403   if (bi[pn] != 0) {
1404     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1405     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1406   } else {
1407     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1408   }
1409 #endif
1410   PetscFunctionReturn(0);
1411 }
1412 
1413 #undef __FUNCT__
1414 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1415 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1416 {
1417   PetscErrorCode      ierr;
1418   Mat_Merge_SeqsToMPI *merge;
1419   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1420   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1421   Mat_PtAPMPI         *ptap;
1422   PetscInt            *adj;
1423   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1424   MatScalar           *ada,*ca,valtmp;
1425   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1426   MPI_Comm            comm;
1427   PetscMPIInt         size,rank,taga,*len_s;
1428   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1429   PetscInt            **buf_ri,**buf_rj;
1430   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1431   MPI_Request         *s_waits,*r_waits;
1432   MPI_Status          *status;
1433   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1434   PetscInt            *ai,*aj,*coi,*coj;
1435   PetscInt            *poJ,*pdJ;
1436   Mat                 A_loc;
1437   Mat_SeqAIJ          *a_loc;
1438 
1439   PetscFunctionBegin;
1440   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1441   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1442   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1443 
1444   ptap  = c->ptap;
1445   merge = ptap->merge;
1446 
1447   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1448   /*------------------------------------------*/
1449   /* get data from symbolic products */
1450   coi    = merge->coi; coj = merge->coj;
1451   ierr   = PetscCalloc1((coi[pon]+1),&coa);CHKERRQ(ierr);
1452   bi     = merge->bi; bj = merge->bj;
1453   owners = merge->rowmap->range;
1454   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1455 
1456   /* get A_loc by taking all local rows of A */
1457   A_loc = ptap->A_loc;
1458   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1459   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1460   ai    = a_loc->i;
1461   aj    = a_loc->j;
1462 
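       /* C_seq is accumulated as the sum over local rows i of the outer products P_loc[i,:]^T * A_loc[i,:];
          the diagonal part of P (pd) contributes to locally owned rows (ba), while the off-diagonal part (po)
          contributes to rows owned by other processes (coa), which are sent out in step 3 */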
1463   for (i=0; i<am; i++) {
1464     anz = ai[i+1] - ai[i];
1465     adj = aj + ai[i];
1466     ada = a_loc->a + ai[i];
1467 
1468     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1469     /*-------------------------------------------------------------*/
1470     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1471     pnz = po->i[i+1] - po->i[i];
1472     poJ = po->j + po->i[i];
1473     pA  = po->a + po->i[i];
1474     for (j=0; j<pnz; j++) {
1475       row = poJ[j];
1476       cj  = coj + coi[row];
1477       ca  = coa + coi[row];
1478       /* perform sparse axpy */
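           /* cj[] and adj[] are both sorted, and the symbolic pattern cj[] contains adj[] as a subset,
              so a single forward pass over cj[] matches every entry of A_loc[i,:] */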
1479       nexta  = 0;
1480       valtmp = pA[j];
1481       for (k=0; nexta<anz; k++) {
1482         if (cj[k] == adj[nexta]) {
1483           ca[k] += valtmp*ada[nexta];
1484           nexta++;
1485         }
1486       }
1487       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1488     }
1489 
1490     /* put the value into Cd (diagonal part) */
1491     pnz = pd->i[i+1] - pd->i[i];
1492     pdJ = pd->j + pd->i[i];
1493     pA  = pd->a + pd->i[i];
1494     for (j=0; j<pnz; j++) {
1495       row = pdJ[j];
1496       cj  = bj + bi[row];
1497       ca  = ba + bi[row];
1498       /* perform sparse axpy */
1499       nexta  = 0;
1500       valtmp = pA[j];
1501       for (k=0; nexta<anz; k++) {
1502         if (cj[k] == adj[nexta]) {
1503           ca[k] += valtmp*ada[nexta];
1504           nexta++;
1505         }
1506       }
1507       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1508     }
1509   }
1510 
1511   /* 3) send and recv matrix values coa */
1512   /*------------------------------------*/
1513   buf_ri = merge->buf_ri;
1514   buf_rj = merge->buf_rj;
1515   len_s  = merge->len_s;
1516   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1517   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1518 
1519   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1520   for (proc=0,k=0; proc<size; proc++) {
1521     if (!len_s[proc]) continue;
1522     i    = merge->owners_co[proc];
1523     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1524     k++;
1525   }
1526   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1527   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1528 
1529   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1530   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1531   ierr = PetscFree(coa);CHKERRQ(ierr);
1532 
1533   /* 4) insert local Cseq and received values into Cmpi */
1534   /*----------------------------------------------------*/
1535   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1536   for (k=0; k<merge->nrecv; k++) {
1537     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1538     nrows       = *(buf_ri_k[k]);
1539     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1540     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1541   }
1542 
1543   for (i=0; i<cm; i++) {
1544     row  = owners[rank] + i; /* global row index of C_seq */
1545     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1546     ba_i = ba + bi[i];
1547     bnz  = bi[i+1] - bi[i];
1548     /* add received vals into ba */
1549     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1550       /* i-th row */
1551       if (i == *nextrow[k]) {
1552         cnz    = *(nextci[k]+1) - *nextci[k];
1553         cj     = buf_rj[k] + *(nextci[k]);
1554         ca     = abuf_r[k] + *(nextci[k]);
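             /* the received indices cj[] are sorted and were merged into bj_i[] in the symbolic phase,
                so one forward scan of bj_i[] places every received value */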
1555         nextcj = 0;
1556         for (j=0; nextcj<cnz; j++) {
1557           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1558             ba_i[j] += ca[nextcj++];
1559           }
1560         }
1561         nextrow[k]++; nextci[k]++;
1562         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1563       }
1564     }
1565     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1566   }
1567   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1568   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1569 
1570   ierr = PetscFree(ba);CHKERRQ(ierr);
1571   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1572   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1573   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1574   PetscFunctionReturn(0);
1575 }
1576 
1577 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1578    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using PetscLLCondensedCreate_Scalable() */
1579 #undef __FUNCT__
1580 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1581 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1582 {
1583   PetscErrorCode      ierr;
1584   Mat                 Cmpi,A_loc,POt,PDt;
1585   Mat_PtAPMPI         *ptap;
1586   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1587   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1588   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1589   PetscInt            nnz;
1590   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1591   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1592   MPI_Comm            comm;
1593   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1594   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1595   PetscInt            len,proc,*dnz,*onz,*owners;
1596   PetscInt            nzi,*bi,*bj;
1597   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1598   MPI_Request         *swaits,*rwaits;
1599   MPI_Status          *sstatus,rstatus;
1600   Mat_Merge_SeqsToMPI *merge;
1601   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1602   PetscReal           afill  =1.0,afill_tmp;
1603   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1604   PetscScalar         *vals;
1605   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1606 
1607   PetscFunctionBegin;
1608   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1609   /* check if matrix local sizes are compatible */
1610   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1611     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1612   }
1613 
1614   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1615   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1616 
1617   /* create struct Mat_PtAPMPI and attach it to C later */
1618   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1619 
1620   /* get A_loc by taking all local rows of A */
1621   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1622 
1623   ptap->A_loc = A_loc;
1624   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1625   ai          = a_loc->i;
1626   aj          = a_loc->j;
1627 
1628   /* determine symbolic Co=(p->B)^T*A - send to others */
1629   /*----------------------------------------------------*/
1630   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1631   pdt  = (Mat_SeqAIJ*)PDt->data;
1632   pdti = pdt->i; pdtj = pdt->j;
1633 
1634   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1635   pot  = (Mat_SeqAIJ*)POt->data;
1636   poti = pot->i; potj = pot->j;
1637 
1638   /* then, compute symbolic Co = (p->B)^T*A */
1639   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1640                          >= (num of nonzero rows of C_seq) - pn */
1641   ierr   = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr);
1642   coi[0] = 0;
1643 
1644   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1645   nnz           = fill*(poti[pon] + ai[am]);
1646   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1647   current_space = free_space;
1648 
1649   /* create and initialize a linked list */
1650   i     = PetscMax(pdt->rmax,pot->rmax);
1651   Crmax = i*a_loc->rmax*size; /* non-scalable! */
1652   if (!Crmax || Crmax > aN) Crmax = aN;
1653   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
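       /* the scalable list is sized by Crmax, an estimated upper bound on the nonzeros per row of C,
          rather than by the global column count aN, so no global-length bit array is needed here */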
1654 
1655   for (i=0; i<pon; i++) {
1656     pnz = poti[i+1] - poti[i];
1657     ptJ = potj + poti[i];
1658     for (j=0; j<pnz; j++) {
1659       row  = ptJ[j]; /* row of A_loc == col of Pot */
1660       anz  = ai[row+1] - ai[row];
1661       Jptr = aj + ai[row];
1662       /* add nonzero cols of A_loc into the sorted linked list lnk */
1663       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1664     }
1665     nnz = lnk[0];
1666 
1667     /* If free space is not available, double the total space in the list */
1668     if (current_space->local_remaining<nnz) {
1669       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1670       nspacedouble++;
1671     }
1672 
1673     /* copy data into free space, then initialize lnk */
1674     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1675 
1676     current_space->array           += nnz;
1677     current_space->local_used      += nnz;
1678     current_space->local_remaining -= nnz;
1679 
1680     coi[i+1] = coi[i] + nnz;
1681   }
1682 
1683   ierr = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr);
1684   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1685 
1686   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1687   if (afill_tmp > afill) afill = afill_tmp;
1688 
1689   /* send j-array (coj) of Co to other processors */
1690   /*----------------------------------------------*/
1691   /* determine row ownership */
1692   ierr = PetscNew(&merge);CHKERRQ(ierr);
1693   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1694 
1695   merge->rowmap->n  = pn;
1696   merge->rowmap->bs = 1;
1697 
1698   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1699   owners = merge->rowmap->range;
1700 
1701   /* determine the number of messages to send, their lengths */
1702   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1703   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1704 
1705   len_s        = merge->len_s;
1706   merge->nsend = 0;
1707 
1708   ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr);
1709   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1710 
1711   proc = 0;
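       /* prmap = p->garray maps each row of Co to the global row of C it contributes to;
          scanning the ownership ranges gives the destination process for that row */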
1712   for (i=0; i<pon; i++) {
1713     while (prmap[i] >= owners[proc+1]) proc++;
1714     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1715     len_s[proc] += coi[i+1] - coi[i];
1716   }
1717 
1718   len          = 0; /* max length of buf_si[] */
1719   owners_co[0] = 0;
1720   for (proc=0; proc<size; proc++) {
1721     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1722     if (len_si[proc]) {
1723       merge->nsend++;
1724       len_si[proc] = 2*(len_si[proc] + 1);
1725       len         += len_si[proc];
1726     }
1727   }
1728 
1729   /* determine the number and length of messages to receive for coi and coj  */
1730   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1731   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1732 
1733   /* post the Irecv and Isend of coj */
1734   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1735   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1736   ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr);
1737   for (proc=0, k=0; proc<size; proc++) {
1738     if (!len_s[proc]) continue;
1739     i    = owners_co[proc];
1740     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1741     k++;
1742   }
1743 
1744   /* receives and sends of coj are complete */
1745   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1746   for (i=0; i<merge->nrecv; i++) {
1747     PetscMPIInt icompleted;
1748     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1749   }
1750   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1751   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1752 
1753   /* send and recv coi */
1754   /*-------------------*/
1755   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1756   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1757   ierr   = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr);
1758   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1759   for (proc=0,k=0; proc<size; proc++) {
1760     if (!len_s[proc]) continue;
1761     /* form outgoing message for i-structure:
1762          buf_si[0]:                 nrows to be sent
1763                [1:nrows]:           row index (local to the receiving process)
1764                [nrows+1:2*nrows+1]: i-structure index
1765     */
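         /* e.g. two rows with receiver-local indices 3 and 7 holding 2 and 5 nonzeros give
            buf_si = [2, 3,7, 0,2,7], of length len_si[proc] = 2*(nrows+1) = 6 */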
1766     /*-------------------------------------------*/
1767     nrows       = len_si[proc]/2 - 1;
1768     buf_si_i    = buf_si + nrows+1;
1769     buf_si[0]   = nrows;
1770     buf_si_i[0] = 0;
1771     nrows       = 0;
1772     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1773       nzi               = coi[i+1] - coi[i];
1774       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1775       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1776       nrows++;
1777     }
1778     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1779     k++;
1780     buf_si += len_si[proc];
1781   }
1782   i = merge->nrecv;
1783   while (i--) {
1784     PetscMPIInt icompleted;
1785     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1786   }
1787   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1788   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1789   ierr = PetscFree(len_si);CHKERRQ(ierr);
1790   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1791   ierr = PetscFree(swaits);CHKERRQ(ierr);
1792   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1793   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1794 
1795   /* compute the local portion of C (mpi mat) */
1796   /*------------------------------------------*/
1797   /* allocate bi array and free space for accumulating nonzero column info */
1798   ierr  = PetscMalloc1((pn+1),&bi);CHKERRQ(ierr);
1799   bi[0] = 0;
1800 
1801   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1802   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1803   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1804   current_space = free_space;
1805 
1806   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1807   for (k=0; k<merge->nrecv; k++) {
1808     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1809     nrows       = *buf_ri_k[k];
1810     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1811     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1812   }
1813 
1814   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1815   rmax = 0;
1816   for (i=0; i<pn; i++) {
1817     /* add the nonzero pattern of pdt[i,:]*A_loc into lnk */
1818     pnz = pdti[i+1] - pdti[i];
1819     ptJ = pdtj + pdti[i];
1820     for (j=0; j<pnz; j++) {
1821       row  = ptJ[j];  /* row of A_loc == col of Pt */
1822       anz  = ai[row+1] - ai[row];
1823       Jptr = aj + ai[row];
1824       /* add nonzero cols of A_loc into the sorted linked list lnk */
1825       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1826     }
1827 
1828     /* add received col data into lnk */
1829     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1830       if (i == *nextrow[k]) { /* i-th row */
1831         nzi  = *(nextci[k]+1) - *nextci[k];
1832         Jptr = buf_rj[k] + *nextci[k];
1833         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1834         nextrow[k]++; nextci[k]++;
1835       }
1836     }
1837     nnz = lnk[0];
1838 
1839     /* if free space is not available, make more free space */
1840     if (current_space->local_remaining<nnz) {
1841       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1842       nspacedouble++;
1843     }
1844     /* copy data into free space, then initialize lnk */
1845     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1846     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1847 
1848     current_space->array           += nnz;
1849     current_space->local_used      += nnz;
1850     current_space->local_remaining -= nnz;
1851 
1852     bi[i+1] = bi[i] + nnz;
1853     if (nnz > rmax) rmax = nnz;
1854   }
1855   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1856 
1857   ierr      = PetscMalloc1((bi[pn]+1),&bj);CHKERRQ(ierr);
1858   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1859   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1860   if (afill_tmp > afill) afill = afill_tmp;
1861   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1862   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1863   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1864 
1865   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part?   */
1866   /*----------------------------------------------------------------------------------*/
1867   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1868 
1869   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1870   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1871   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1872   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1873   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1874   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1875   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1876   for (i=0; i<pn; i++) {
1877     row  = i + rstart;
1878     nnz  = bi[i+1] - bi[i];
1879     Jptr = bj + bi[i];
1880     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1881   }
1882   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1883   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1884   ierr = PetscFree(vals);CHKERRQ(ierr);
1885 
1886   merge->bi        = bi;
1887   merge->bj        = bj;
1888   merge->coi       = coi;
1889   merge->coj       = coj;
1890   merge->buf_ri    = buf_ri;
1891   merge->buf_rj    = buf_rj;
1892   merge->owners_co = owners_co;
1893   merge->destroy   = Cmpi->ops->destroy;
1894   merge->duplicate = Cmpi->ops->duplicate;
1895 
1896   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1897   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1898 
1899   /* attach the supporting struct to Cmpi for reuse */
1900   c = (Mat_MPIAIJ*)Cmpi->data;
1901 
1902   c->ptap     = ptap;
1903   ptap->api   = NULL;
1904   ptap->apj   = NULL;
1905   ptap->merge = merge;
1906   ptap->rmax  = rmax;
1907   ptap->apa   = NULL;
1908 
1909   *C = Cmpi;
1910 #if defined(PETSC_USE_INFO)
1911   if (bi[pn] != 0) {
1912     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1913     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1914   } else {
1915     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1916   }
1917 #endif
1918   PetscFunctionReturn(0);
1919 }
1920