xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision c0174eb74783e1ccfa817272a5782dd72b4cfcef)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
14 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
15 {
16   PetscErrorCode ierr;
17 
18   PetscFunctionBegin;
19   if (scall == MAT_INITIAL_MATRIX){
20     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
21     ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
22     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
23   }
24 
25   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
26   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
27   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
28   PetscFunctionReturn(0);
29 }
30 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
/*
   Destroy a matrix created by MatMatMult_MPIAIJ_MPIAIJ: release the
   Mat_PtAPMPI support data attached by the symbolic phase, then invoke the
   original destroy routine that was saved in ptap->destroy.
*/
PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap=a->ptap;

  PetscFunctionBegin;
  /* communication buffers created by MatGetBrowsOfAoCols_MPIAIJ */
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  /* local and off-process pieces of P gathered in the symbolic phase */
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  /* symbolic row structure of A*P and the work array for numeric products */
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  /* run the saved base destroy (frees A->data); ptap was fetched above so
     it is still valid to free afterwards — keep this ordering */
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
51 
52 #undef __FUNCT__
53 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
54 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
55 {
56   PetscErrorCode     ierr;
57   Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data;
58   Mat_PtAPMPI        *ptap=a->ptap;
59 
60   PetscFunctionBegin;
61   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
62   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
63   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
64   PetscFunctionReturn(0);
65 }
66 
#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
/*
   Numeric phase of C = A*P for MPIAIJ matrices (non-scalable version).

   Relies on the Mat_PtAPMPI structure ptap that the symbolic phase attached
   to C: P_loc/P_oth (local and off-process rows of P), the symbolic row
   structure api/apj of A*P, and the dense work row apa of length P->cmap->N.
   Each row A[i,:]*P is accumulated densely into apa, then copied into the
   diagonal and off-diagonal blocks of C, zeroing apa entries on the way so
   the work row is clean for the next i.
*/
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ         *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ         *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt           *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar        *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ         *p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar        *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
  PetscInt           cm=C->rmap->n,anz,pnz;
  Mat_PtAPMPI        *ptap=c->ptap;
  PetscInt           *api,*apj,*apJ,i,j,k,row;
  PetscInt           cstart=C->cmap->rstart;
  PetscInt           cdnz,conz,k0,k1;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc (structure is reused) */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  /* get apa for storing dense row A[i,:]*P (zeroed on entry; the copy-out
     loops below re-zero every entry they touch) */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];

      /* perform dense axpy: apa += A[i,row] * P_loc[row,:] */
      valtmp = ada[j];
      for (k=0; k<pnz; k++){
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];

      /* perform dense axpy: apa += A[i,row] * P_oth[row,:] */
      valtmp = aoa[j];
      for (k=0; k<pnz; k++){
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C: walk the sorted column list apJ of row i once;
       columns < cstart go to the leading off-diagonal block, the next cdnz
       columns to the diagonal block, the remainder back to the off-diagonal
       block.  Each copied entry of apa is reset to zero. */
    apJ = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++){
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++){
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++){
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
181 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
182 {
183   PetscErrorCode       ierr;
184   MPI_Comm             comm=((PetscObject)A)->comm;
185   Mat                  Cmpi;
186   Mat_PtAPMPI          *ptap;
187   PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
188   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*c;
189   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
190   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
191   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
192   PetscInt             *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
193   PetscInt             am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
194   PetscBT              lnkbt;
195   PetscScalar          *apa;
196   PetscReal            afill;
197   PetscBool            scalable=PETSC_FALSE;
198   PetscInt             nlnk_max,armax,prmax;
199 
200   PetscFunctionBegin;
201   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend){
202     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
203   }
204 
205   ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
206     ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
207     if (scalable){
208       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);;CHKERRQ(ierr);
209       PetscFunctionReturn(0);
210     }
211   ierr = PetscOptionsEnd();CHKERRQ(ierr);
212 
213   /* create struct Mat_PtAPMPI and attached it to C later */
214   ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);
215 
216   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
217   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
218 
219   /* get P_loc by taking all local rows of P */
220   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
221 
222   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
223   p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
224   pi_loc = p_loc->i; pj_loc = p_loc->j;
225   pi_oth = p_oth->i; pj_oth = p_oth->j;
226 
227   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
228   /*-------------------------------------------------------------------*/
229   ierr  = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
230   ptap->api = api;
231   api[0]    = 0;
232 
233   /* create and initialize a linked list */
234   armax = ad->rmax+ao->rmax;
235   prmax = PetscMax(p_loc->rmax,p_oth->rmax);
236   nlnk_max = armax*prmax;
237   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
238   ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);
239 
240   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
241   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
242   current_space = free_space;
243 
244   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
245   for (i=0; i<am; i++) {
246     apnz = 0;
247     /* diagonal portion of A */
248     nzi = adi[i+1] - adi[i];
249     for (j=0; j<nzi; j++){
250       row = *adj++;
251       pnz = pi_loc[row+1] - pi_loc[row];
252       Jptr  = pj_loc + pi_loc[row];
253       /* add non-zero cols of P into the sorted linked list lnk */
254       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
255     }
256     /* off-diagonal portion of A */
257     nzi = aoi[i+1] - aoi[i];
258     for (j=0; j<nzi; j++){
259       row = *aoj++;
260       pnz = pi_oth[row+1] - pi_oth[row];
261       Jptr  = pj_oth + pi_oth[row];
262       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
263     }
264 
265     apnz     = lnk[0];
266     api[i+1] = api[i] + apnz;
267 
268     /* if free space is not available, double the total space in the list */
269     if (current_space->local_remaining<apnz) {
270       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
271       nspacedouble++;
272     }
273 
274     /* Copy data into free space, then initialize lnk */
275     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
276     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
277     current_space->array           += apnz;
278     current_space->local_used      += apnz;
279     current_space->local_remaining -= apnz;
280   }
281 
282   /* Allocate space for apj, initialize apj, and */
283   /* destroy list of free space and other temporary array(s) */
284   ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
285   apj  = ptap->apj;
286   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
287   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
288 
289   /* malloc apa to store dense row A[i,:]*P */
290   ierr = PetscMalloc(pN*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
291   ierr = PetscMemzero(apa,pN*sizeof(PetscScalar));CHKERRQ(ierr);
292   ptap->apa = apa;
293 
294   /* create and assemble symbolic parallel matrix Cmpi */
295   /*----------------------------------------------------*/
296   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
297   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
298   ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);
299 
300   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
301   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
302   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
303   for (i=0; i<am; i++){
304     row  = i + rstart;
305     apnz = api[i+1] - api[i];
306     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
307     apj += apnz;
308   }
309   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
310   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
311 
312   ptap->destroy             = Cmpi->ops->destroy;
313   ptap->duplicate           = Cmpi->ops->duplicate;
314   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
315   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
316   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
317 
318   /* attach the supporting struct to Cmpi for reuse */
319   c = (Mat_MPIAIJ*)Cmpi->data;
320   c->ptap  = ptap;
321 
322   *C = Cmpi;
323 
324   /* set MatInfo */
325   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
326   if (afill < 1.0) afill = 1.0;
327   Cmpi->info.mallocs           = nspacedouble;
328   Cmpi->info.fill_ratio_given  = fill;
329   Cmpi->info.fill_ratio_needed = afill;
330 
331 #if defined(PETSC_USE_INFO)
332   if (api[am]) {
333     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
334     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
335   } else {
336     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
337   }
338 #endif
339   PetscFunctionReturn(0);
340 }
341 
342 #undef __FUNCT__
343 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
344 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
345 {
346   PetscErrorCode ierr;
347 
348   PetscFunctionBegin;
349   if (scall == MAT_INITIAL_MATRIX){
350     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
351   }
352   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
353   PetscFunctionReturn(0);
354 }
355 
/* Support data for the MPIAIJ*MPIDense product, composed on C as "workB". */
typedef struct {
  Mat         workB;               /* sequential dense matrix holding the off-process rows of B needed locally */
  PetscScalar *rvalues,*svalues;   /* receive and send buffers for the row scatter */
  MPI_Request *rwaits,*swaits;     /* receive and send request arrays */
} MPIAIJ_MPIDense;
361 
362 #undef __FUNCT__
363 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
364 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
365 {
366   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
367   PetscErrorCode  ierr;
368 
369   PetscFunctionBegin;
370   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
371   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
372   ierr = PetscFree(contents);CHKERRQ(ierr);
373   PetscFunctionReturn(0);
374 }
375 
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
/*
   Symbolic phase of C = A*B with A MPIAIJ, B MPIDense: create the MPIDense
   result C and attach (via a PetscContainer named "workB") the work matrix
   and communication buffers that the numeric phase's row scatter needs.

   NOTE(review): fill is unused here (dense result needs no fill estimate);
   the parameter is kept for the common symbolic interface.
*/
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz = aij->B->cmap->n;   /* # of off-process rows of B needed locally */
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to   = ( VecScatter_MPI_General*) ctx->todata;
  PetscInt               m=A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(((PetscObject)B)->comm,C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,PETSC_NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  (*C)->ops->matmult = MatMatMult_MPIAIJ_MPIDense;

  ierr = PetscNew(MPIAIJ_MPIDense,&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,PETSC_NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed: buffer sizes are (total scattered rows) x
     (global # of columns of B); starts[n] is the total index count */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
                      B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
                      from->n,MPI_Request,&contents->rwaits,
                      to->n,MPI_Request,&contents->swaits);CHKERRQ(ierr);

  /* the container owns 'contents'; its destructor frees everything */
  ierr = PetscContainerCreate(((PetscObject)A)->comm,&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
416 
417 #undef __FUNCT__
418 #define __FUNCT__ "MatMPIDenseScatter"
419 /*
420     Performs an efficient scatter on the rows of B needed by this process; this is
421     a modification of the VecScatterBegin_() routines.
422 */
423 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
424 {
425   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
426   PetscErrorCode         ierr;
427   PetscScalar            *b,*w,*svalues,*rvalues;
428   VecScatter             ctx = aij->Mvctx;
429   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
430   VecScatter_MPI_General *to   = ( VecScatter_MPI_General*) ctx->todata;
431   PetscInt               i,j,k;
432   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
433   PetscMPIInt            *sprocs,*rprocs,nrecvs;
434   MPI_Request            *swaits,*rwaits;
435   MPI_Comm               comm = ((PetscObject)A)->comm;
436   PetscMPIInt            tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
437   MPI_Status             status;
438   MPIAIJ_MPIDense        *contents;
439   PetscContainer         container;
440   Mat                    workB;
441 
442   PetscFunctionBegin;
443   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
444   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
445   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
446 
447   workB = *outworkB = contents->workB;
448   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",nrows,workB->cmap->n);
449   sindices  = to->indices;
450   sstarts   = to->starts;
451   sprocs    = to->procs;
452   swaits    = contents->swaits;
453   svalues   = contents->svalues;
454 
455   rindices  = from->indices;
456   rstarts   = from->starts;
457   rprocs    = from->procs;
458   rwaits    = contents->rwaits;
459   rvalues   = contents->rvalues;
460 
461   ierr = MatGetArray(B,&b);CHKERRQ(ierr);
462   ierr = MatGetArray(workB,&w);CHKERRQ(ierr);
463 
464   for (i=0; i<from->n; i++) {
465     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
466   }
467 
468   for (i=0; i<to->n; i++) {
469     /* pack a message at a time */
470     CHKMEMQ;
471     for (j=0; j<sstarts[i+1]-sstarts[i]; j++){
472       for (k=0; k<ncols; k++) {
473         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
474       }
475     }
476     CHKMEMQ;
477     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
478   }
479 
480   nrecvs = from->n;
481   while (nrecvs) {
482     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
483     nrecvs--;
484     /* unpack a message at a time */
485     CHKMEMQ;
486     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++){
487       for (k=0; k<ncols; k++) {
488         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
489       }
490     }
491     CHKMEMQ;
492   }
493   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
494 
495   ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
496   ierr = MatRestoreArray(workB,&w);CHKERRQ(ierr);
497   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
498   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
499   PetscFunctionReturn(0);
500 }
501 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
502 
#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
/*
   Numeric phase of C = A*B with A MPIAIJ and B MPIDense:
   C_local = A_diag*B_local + A_off*workB, where workB holds the needed
   off-process rows of B gathered by MatMPIDenseScatter().
*/
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode       ierr;
  Mat_MPIAIJ           *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense         *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense         *cdense = (Mat_MPIDense*)C->data;
  Mat                  workB;

  PetscFunctionBegin;

  /* diagonal block of A times all local rows of B; this overwrites cdense->A,
     so it must run before the additive off-diagonal contribution below */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B (added into C) */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
527 
#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
/*
   Scalable numeric phase of C = A*P for MPIAIJ matrices.

   Unlike the non-scalable version, the work array apa_sparse is indexed by
   position within the (sorted) symbolic row apJ of A*P rather than by global
   column, so its length is only the maximum row nnz of AP instead of
   P->cmap->N.  The axpy therefore merges two sorted index lists (apJ and the
   columns of the P row) instead of scattering into a dense row.
*/
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ         *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ         *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt           *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar        *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ         *p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar        *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt           cm=C->rmap->n,anz,pnz;
  Mat_PtAPMPI        *ptap=c->ptap;
  PetscScalar        *apa_sparse=ptap->apa;
  PetscInt           *api,*apj,*apJ,i,j,k,row;
  PetscInt           cstart=C->cmap->rstart;
  PetscInt           cdnz,conz,k0,k1,nextp;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc (structure is reused) */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];   /* sorted column indices of row i of AP */

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy: merge the sorted P-row columns into apJ;
         every pj[] entry is guaranteed to occur in apJ by the symbolic phase */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy (same merge as above, against P_oth rows) */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C: apJ is sorted, so split it into leading off-diagonal
       (< cstart), diagonal, and trailing off-diagonal parts, zeroing
       apa_sparse entries as they are copied out */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++){
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++){
      ca[k1]      = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++){
      ca[k0]      = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
642 
/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid O(BN) memory requirement */
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
/*
   Scalable symbolic phase of C = A*P for MPIAIJ matrices.

   Differences from the non-scalable version: the linked list is the
   _Scalable variant (no PetscBT of length pN), and the numeric work array
   apa has length apnz_max (max row nnz of AP) instead of pN, indexed by
   position within the symbolic row.  The matching numeric kernel is
   MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable.
*/
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  MPI_Comm             comm=((PetscObject)A)->comm;
  Mat                  Cmpi;
  Mat_PtAPMPI          *ptap;
  PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt             i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
  PetscInt             am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscInt             nlnk_max,armax,prmax;
  PetscReal            afill;
  PetscScalar          *apa;

  PetscFunctionBegin;
  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr  = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list; nlnk_max bounds one row of AP by
     (max row nnz of A)*(max row nnz of P), capped at pN */
  armax = ad->rmax+ao->rmax;
  prmax = PetscMax(p_loc->rmax,p_oth->rmax);
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    apnz = 0;
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++){
      row = *adj++;
      pnz = pi_loc[row+1] - pi_loc[row];
      Jptr  = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++){
      row = *aoj++;
      pnz = pi_oth[row+1] - pi_oth[row];
      Jptr  = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;
    /* track the largest row; it sizes the numeric work array apa below */
    if (apnz > apnz_max) apnz_max = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* malloc apa for assembly Cmpi (zeros inserted to fix the structure) */
  ierr = PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,apnz_max*sizeof(PetscScalar));CHKERRQ(ierr);
  ptap->apa = apa;
  for (i=0; i<am; i++){
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /* save base methods and install the MatMatMult-specific ones */
  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap  = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
792 
793 /*-------------------------------------------------------------------------*/
794 #undef __FUNCT__
795 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
796 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
797 {
798   PetscErrorCode ierr;
799   PetscBool      scalable=PETSC_FALSE;
800 
801   PetscFunctionBegin;
802   if (scall == MAT_INITIAL_MATRIX){
803     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
804       ierr = PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
805       if  (scalable){
806         ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);CHKERRQ(ierr);
807       } else {
808         ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
809       }
810     ierr = PetscOptionsEnd();CHKERRQ(ierr);
811   }
812   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
813   PetscFunctionReturn(0);
814 }
815 
816 #undef __FUNCT__
817 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
/*
   Numeric phase of C = P^T * A for MPIAIJ matrices (non-scalable variant).

   Uses the symbolic structures stored in c->ptap->merge by
   MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(). Each process computes its
   contribution rows via outer products P[i,:]^T * A[i,:], accumulating the
   off-diagonal part (rows owned by other processes) into coa and the locally
   owned part into ba; the off-process contributions are then exchanged with
   MPI and merged into C.

   Non-scalable: each row of A is scattered into a dense work array of length
   A->cmap->N (the full global column dimension).
*/
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI          *ptap;
  PetscInt             *adj,*aJ;
  PetscInt             i,j,k,anz,pnz,row,*cj;
  MatScalar            *ada,*aval,*ca,valtmp;
  PetscInt             am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm             comm=((PetscObject)C)->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt             *ai,*aj,*coi,*coj;
  PetscInt             *poJ=po->j,*pdJ=pd->j;
  Mat                  A_loc;
  Mat_SeqAIJ           *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* supporting struct attached to C by the symbolic phase */
  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  /* coa holds values of the off-process part Co; zeroed, filled below, then sent away */
  coi = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);

  /* ba holds values of the locally owned rows of C */
  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai   = a_loc->i;
  aj   = a_loc->j;

  /* dense work row of global width A->cmap->N */
  ierr = PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval);CHKERRQ(ierr); /* non-scalable!!! */
  ierr = PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);

    for (i=0; i<am; i++) {
      /* 2-a) put A[i,:] to dense array aval */
      anz = ai[i+1] - ai[i];
      adj = aj + ai[i];
      ada = a_loc->a + ai[i];
      for (j=0; j<anz; j++){
        aval[adj[j]] = ada[j];
      }

      /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
      /*--------------------------------------------------------------*/
      /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
      pnz = po->i[i+1] - po->i[i];
      poJ = po->j + po->i[i];
      pA  = po->a + po->i[i];
      for (j=0; j<pnz; j++){
        row = poJ[j];
        cnz = coi[row+1] - coi[row];
        cj  = coj + coi[row];
        ca  = coa + coi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*aval[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* put the value into Cd (diagonal part) */
      pnz = pd->i[i+1] - pd->i[i];
      pdJ = pd->j + pd->i[i];
      pA  = pd->a + pd->i[i];
      for (j=0; j<pnz; j++){
        row = pdJ[j];
        cnz = bi[row+1] - bi[row];
        cj  = bj + bi[row];
        ca  = ba + bi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*aval[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* zero the current row of Pt*A */
      /* only the entries written in step 2-a need to be cleared */
      aJ = aj + ai[i];
      for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
    }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure  */
  }

  for (i=0; i<cm; i++) {
    row = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz = *(nextci[k]+1) - *nextci[k];
        cj  = buf_rj[k] + *(nextci[k]);
        ca  = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        /* merge: received cols are a subset of bj_i, both sorted */
        for (j=0; nextcj<cnz; j++){
          if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
987 
988 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
989 #undef __FUNCT__
990 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
/*
   Symbolic phase of C = P^T * A for MPIAIJ matrices (non-scalable variant).

   Builds the nonzero structure of C: locally computes the structure of the
   off-process part Co = (p->B)^T * A_loc and sends its i/j structure to the
   owning processes; then computes the structure of the locally owned rows
   (Pd^T * A_loc merged with the received contributions), preallocates and
   assembles an empty parallel matrix Cmpi, and attaches a Mat_PtAPMPI/
   Mat_Merge_SeqsToMPI struct to Cmpi so the numeric phase can reuse all
   structures and communication patterns.
*/
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI          *ptap;
  PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c;
  PetscInt             *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt             nnz;
  PetscInt             *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt             am=A->rmap->n,pn=P->cmap->n;
  PetscBT              lnkbt;
  MPI_Comm             comm=((PetscObject)A)->comm;
  PetscMPIInt          size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             nzi,*bi,*bj;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *swaits,*rwaits;
  MPI_Status           *sstatus,rstatus;
  Mat_Merge_SeqsToMPI  *merge;
  PetscInt             *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal            afill=1.0,afill_tmp;
  PetscInt             rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar          *vals;
  Mat_SeqAIJ           *a_loc, *pdt,*pot;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
  ptap->A_loc = A_loc;
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai   = a_loc->i;
  aj   = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  /* structure-only transposes of the diagonal and off-diagonal parts of P */
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                         >= (num of nonzero rows of C_seq) - pn */
  ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz           = fill*(poti[pon] + ai[am]);
  /* NOTE(review): the return value of PetscFreeSpaceGet is not checked with CHKERRQ here */
  ierr          = PetscFreeSpaceGet(nnz,&free_space);
  current_space = free_space;

  /* create and initialize a linked list */
  /* Crmax estimates the max row length of C, capped by the global column count aN */
  i = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size;
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  /* track the actual fill ratio for the MatInfo report below */
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n = pn;
  merge->rowmap->bs = 1;
  ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  /* len_si[proc]: number of rows destined to proc; len_s[proc]: number of nonzeros */
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  /* prmap maps local Co rows to global rows; it is sorted, so a single sweep suffices */
  proc = 0;
  for (i=0; i<pon; i++){
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len   = 0;  /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++){
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* i-structure msg: count + rows + row-offsets */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(A)) */
  nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
  /* NOTE(review): the return value of PetscFreeSpaceGet is not checked with CHKERRQ here */
  ierr          = PetscFreeSpaceGet(nnz,&free_space);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure  */
  }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j];  /* row of AP == col of Pt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi = *(nextci[k]+1) - *nextci[k];
        Jptr  = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
  /*----------------------------------------------------------------------------------*/
  /* insert explicit zeros so Cmpi has its final nonzero pattern after assembly */
  ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
  ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++){
    row = i + rstart;
    nnz = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  /* stash everything the numeric phase needs */
  merge->bi            = bi;
  merge->bj            = bj;
  merge->coi           = coi;
  merge->coj           = coj;
  merge->buf_ri        = buf_ri;
  merge->buf_rj        = buf_rj;
  merge->owners_co     = owners_co;
  merge->destroy       = Cmpi->ops->destroy;
  merge->duplicate     = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap        = ptap;
  ptap->api      = PETSC_NULL;
  ptap->apj      = PETSC_NULL;
  ptap->merge    = merge;
  ptap->rmax     = rmax;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
1321 
1322 #undef __FUNCT__
1323 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
/*
   Numeric phase of C = P^T * A for MPIAIJ matrices (scalable variant).

   Same structure as MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ, but instead of
   scattering each row of A into a dense global-width work array it performs a
   sparse axpy that walks the (sorted) column indices of the A row and the
   target C row in lock-step, so no O(A->cmap->N) storage is needed.
*/
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI          *ptap;
  PetscInt             *adj;
  PetscInt             i,j,k,anz,pnz,row,*cj,nexta;
  MatScalar            *ada,*ca,valtmp;
  PetscInt             am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm             comm=((PetscObject)C)->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt             *ai,*aj,*coi,*coj;
  PetscInt             *poJ=po->j,*pdJ=pd->j;
  Mat                  A_loc;
  Mat_SeqAIJ           *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* supporting struct attached to C by the symbolic phase */
  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc */
  /*------------------------------------------*/
  /* get data from symbolic products */
  /* coa: values of the off-process part Co; ba: values of locally owned rows */
  coi = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);
  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai   = a_loc->i;
  aj   = a_loc->j;

  for (i=0; i<am; i++) {
    /* 2-a) get the sparse row A[i,:] (indices adj, values ada) */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*-------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++){
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform sparse axpy */
      /* cols of A[i,:] are a subset of cols of Co[row,:] (both sorted), so
         advance nexta only when the two column indices match */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]){
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++){
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]){
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure  */
  }

  for (i=0; i<cm; i++) {
    row = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];  /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz = *(nextci[k]+1) - *nextci[k];
        cj  = buf_rj[k] + *(nextci[k]);
        ca  = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        /* merge: received cols are a subset of bj_i, both sorted */
        for (j=0; nextcj<cnz; j++){
          if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1490 
1491 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1492 #undef __FUNCT__
1493 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
1494 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
1495 {
1496   PetscErrorCode       ierr;
1497   Mat                  Cmpi,A_loc,POt,PDt;
1498   Mat_PtAPMPI          *ptap;
1499   PetscFreeSpaceList   free_space=PETSC_NULL,current_space=PETSC_NULL;
1500   Mat_MPIAIJ           *p=(Mat_MPIAIJ*)P->data,*c;
1501   PetscInt             *pdti,*pdtj,*poti,*potj,*ptJ;
1502   PetscInt             nnz;
1503   PetscInt             *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1504   PetscInt             am=A->rmap->n,pn=P->cmap->n;
1505   MPI_Comm             comm=((PetscObject)A)->comm;
1506   PetscMPIInt          size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1507   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
1508   PetscInt             len,proc,*dnz,*onz,*owners;
1509   PetscInt             nzi,*bi,*bj;
1510   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1511   MPI_Request          *swaits,*rwaits;
1512   MPI_Status           *sstatus,rstatus;
1513   Mat_Merge_SeqsToMPI  *merge;
1514   PetscInt             *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1515   PetscReal            afill=1.0,afill_tmp;
1516   PetscInt             rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1517   PetscScalar          *vals;
1518   Mat_SeqAIJ           *a_loc, *pdt,*pot;
1519 
1520   PetscFunctionBegin;
1521   /* check if matrix local sizes are compatible */
1522   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
1523     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1524   }
1525 
1526   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1527   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1528 
1529   /* create struct Mat_PtAPMPI and attached it to C later */
1530   ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);
1531 
1532   /* get A_loc by taking all local rows of A */
1533   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1534   ptap->A_loc = A_loc;
1535   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1536   ai   = a_loc->i;
1537   aj   = a_loc->j;
1538 
1539   /* determine symbolic Co=(p->B)^T*A - send to others */
1540   /*----------------------------------------------------*/
1541   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1542   pdt = (Mat_SeqAIJ*)PDt->data;
1543   pdti = pdt->i; pdtj = pdt->j;
1544 
1545   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1546   pot = (Mat_SeqAIJ*)POt->data;
1547   poti = pot->i; potj = pot->j;
1548 
1549   /* then, compute symbolic Co = (p->B)^T*A */
1550   pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1551                          >= (num of nonzero rows of C_seq) - pn */
1552   ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
1553   coi[0] = 0;
1554 
1555   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1556   nnz           = fill*(poti[pon] + ai[am]);
1557   ierr          = PetscFreeSpaceGet(nnz,&free_space);
1558   current_space = free_space;
1559 
1560   /* create and initialize a linked list */
1561   i = PetscMax(pdt->rmax,pot->rmax);
1562   Crmax = i*a_loc->rmax*size; /* non-scalable! */
1563   if (!Crmax || Crmax > aN) Crmax = aN;
1564   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
1565 
1566   for (i=0; i<pon; i++) {
1567     nnz = 0;
1568     pnz = poti[i+1] - poti[i];
1569     ptJ = potj + poti[i];
1570     for (j=0; j<pnz; j++){
1571       row  = ptJ[j]; /* row of A_loc == col of Pot */
1572       anz  = ai[row+1] - ai[row];
1573       Jptr = aj + ai[row];
1574       /* add non-zero cols of AP into the sorted linked list lnk */
1575       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1576     }
1577     nnz = lnk[0];
1578 
1579     /* If free space is not available, double the total space in the list */
1580     if (current_space->local_remaining<nnz) {
1581       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1582       nspacedouble++;
1583     }
1584 
1585     /* Copy data into free space, and zero out denserows */
1586     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1587     current_space->array           += nnz;
1588     current_space->local_used      += nnz;
1589     current_space->local_remaining -= nnz;
1590     coi[i+1] = coi[i] + nnz;
1591   }
1592 
1593   ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
1594   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1595   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1596   if (afill_tmp > afill) afill = afill_tmp;
1597 
1598   /* send j-array (coj) of Co to other processors */
1599   /*----------------------------------------------*/
1600   /* determine row ownership */
1601   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
1602   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1603   merge->rowmap->n = pn;
1604   merge->rowmap->bs = 1;
1605   ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1606   owners = merge->rowmap->range;
1607 
1608   /* determine the number of messages to send, their lengths */
1609   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
1610   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1611   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
1612   len_s = merge->len_s;
1613   merge->nsend = 0;
1614 
1615   ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
1616   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1617 
1618   proc = 0;
1619   for (i=0; i<pon; i++){
1620     while (prmap[i] >= owners[proc+1]) proc++;
1621     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1622     len_s[proc] += coi[i+1] - coi[i];
1623   }
1624 
1625   len   = 0;  /* max length of buf_si[] */
1626   owners_co[0] = 0;
1627   for (proc=0; proc<size; proc++){
1628     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1629     if (len_si[proc]){
1630       merge->nsend++;
1631       len_si[proc] = 2*(len_si[proc] + 1);
1632       len += len_si[proc];
1633     }
1634   }
1635 
1636   /* determine the number and length of messages to receive for coi and coj  */
1637   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1638   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1639 
1640   /* post the Irecv and Isend of coj */
1641   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1642   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1643   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
1644   for (proc=0, k=0; proc<size; proc++){
1645     if (!len_s[proc]) continue;
1646     i = owners_co[proc];
1647     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1648     k++;
1649   }
1650 
1651   /* receives and sends of coj are complete */
1652   ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
1653   for (i=0; i<merge->nrecv; i++){
1654     PetscMPIInt icompleted;
1655     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1656   }
1657   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1658   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1659 
1660   /* send and recv coi */
1661   /*-------------------*/
1662   ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1663   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1664   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
1665   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1666   for (proc=0,k=0; proc<size; proc++){
1667     if (!len_s[proc]) continue;
1668     /* form outgoing message for i-structure:
1669          buf_si[0]:                 nrows to be sent
1670                [1:nrows]:           row index (global)
1671                [nrows+1:2*nrows+1]: i-structure index
1672     */
1673     /*-------------------------------------------*/
1674     nrows = len_si[proc]/2 - 1;
1675     buf_si_i    = buf_si + nrows+1;
1676     buf_si[0]   = nrows;
1677     buf_si_i[0] = 0;
1678     nrows = 0;
1679     for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
1680       nzi = coi[i+1] - coi[i];
1681       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1682       buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */
1683       nrows++;
1684     }
1685     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1686     k++;
1687     buf_si += len_si[proc];
1688   }
1689   i = merge->nrecv;
1690   while (i--) {
1691     PetscMPIInt icompleted;
1692     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1693   }
1694   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1695   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1696   ierr = PetscFree(len_si);CHKERRQ(ierr);
1697   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1698   ierr = PetscFree(swaits);CHKERRQ(ierr);
1699   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1700   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1701 
1702   /* compute the local portion of C (mpi mat) */
1703   /*------------------------------------------*/
1704   /* allocate bi array and free space for accumulating nonzero column info */
1705   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1706   bi[0] = 0;
1707 
1708   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1709   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1710   ierr          = PetscFreeSpaceGet(nnz,&free_space);
1711   current_space = free_space;
1712 
1713   ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
1714   for (k=0; k<merge->nrecv; k++){
1715     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1716     nrows       = *buf_ri_k[k];
1717     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1718     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure  */
1719   }
1720 
1721   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1722   rmax = 0;
1723   for (i=0; i<pn; i++) {
1724     /* add pdt[i,:]*AP into lnk */
1725     pnz = pdti[i+1] - pdti[i];
1726     ptJ = pdtj + pdti[i];
1727     for (j=0; j<pnz; j++){
1728       row  = ptJ[j];  /* row of AP == col of Pt */
1729       anz  = ai[row+1] - ai[row];
1730       Jptr = aj + ai[row];
1731       /* add non-zero cols of AP into the sorted linked list lnk */
1732       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1733     }
1734 
1735     /* add received col data into lnk */
1736     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1737       if (i == *nextrow[k]) { /* i-th row */
1738         nzi = *(nextci[k]+1) - *nextci[k];
1739         Jptr  = buf_rj[k] + *nextci[k];
1740         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1741         nextrow[k]++; nextci[k]++;
1742       }
1743     }
1744     nnz = lnk[0];
1745 
1746     /* if free space is not available, make more free space */
1747     if (current_space->local_remaining<nnz) {
1748       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1749       nspacedouble++;
1750     }
1751     /* copy data into free space, then initialize lnk */
1752     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1753     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1754     current_space->array           += nnz;
1755     current_space->local_used      += nnz;
1756     current_space->local_remaining -= nnz;
1757     bi[i+1] = bi[i] + nnz;
1758     if (nnz > rmax) rmax = nnz;
1759   }
1760   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1761 
1762   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1763   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1764   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1765   if (afill_tmp > afill) afill = afill_tmp;
1766   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1767   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1768   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1769 
1770   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1771   /*----------------------------------------------------------------------------------*/
1772   ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
1773   ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);
1774 
1775   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1776   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1777   ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs); CHKERRQ(ierr);
1778   ierr = MatSetType(Cmpi,MATMPIAIJ); CHKERRQ(ierr);
1779   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1780   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1781   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1782   for (i=0; i<pn; i++){
1783     row = i + rstart;
1784     nnz = bi[i+1] - bi[i];
1785     Jptr = bj + bi[i];
1786     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1787   }
1788   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1789   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1790   ierr = PetscFree(vals);CHKERRQ(ierr);
1791 
1792   merge->bi            = bi;
1793   merge->bj            = bj;
1794   merge->coi           = coi;
1795   merge->coj           = coj;
1796   merge->buf_ri        = buf_ri;
1797   merge->buf_rj        = buf_rj;
1798   merge->owners_co     = owners_co;
1799   merge->destroy       = Cmpi->ops->destroy;
1800   merge->duplicate     = Cmpi->ops->duplicate;
1801 
1802   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
1803   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1804 
1805   /* attach the supporting struct to Cmpi for reuse */
1806   c = (Mat_MPIAIJ*)Cmpi->data;
1807   c->ptap        = ptap;
1808   ptap->api      = PETSC_NULL;
1809   ptap->apj      = PETSC_NULL;
1810   ptap->merge    = merge;
1811   ptap->rmax     = rmax;
1812   ptap->apa      = PETSC_NULL;
1813 
1814   *C = Cmpi;
1815 #if defined(PETSC_USE_INFO)
1816   if (bi[pn] != 0) {
1817     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
1818     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
1819   } else {
1820     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1821   }
1822 #endif
1823   PetscFunctionReturn(0);
1824 }
1825