xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision c12557a13f0fad7313f160073e352ba05c6ae3a8)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc/private/vecimpl.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
15 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16 {
17   PetscErrorCode ierr;
18   const char     *algTypes[2] = {"scalable","nonscalable"};
19   PetscInt       alg=1; /* set default algorithm */
20 
21   PetscFunctionBegin;
22   if (scall == MAT_INITIAL_MATRIX) {
23     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
24     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr);
25     ierr = PetscOptionsEnd();CHKERRQ(ierr);
26 
27     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
28     switch (alg) {
29     case 1:
30       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
31       break;
32     default:
33       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
34       break;
35     }
36     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
37   }
38   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
39   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
40   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
41   PetscFunctionReturn(0);
42 }
43 
44 #undef __FUNCT__
45 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
46 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
47 {
48   PetscErrorCode ierr;
49   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
50   Mat_PtAPMPI    *ptap = a->ptap;
51 
52   PetscFunctionBegin;
53   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
54   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
55   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
56   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
57   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
58   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
59   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
60   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
61   ierr = ptap->destroy(A);CHKERRQ(ierr);
62   ierr = PetscFree(ptap);CHKERRQ(ierr);
63   PetscFunctionReturn(0);
64 }
65 
66 #undef __FUNCT__
67 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
68 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
69 {
70   PetscErrorCode ierr;
71   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
72   Mat_PtAPMPI    *ptap = a->ptap;
73 
74   PetscFunctionBegin;
75   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
76 
77   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
78   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
79   PetscFunctionReturn(0);
80 }
81 
82 /* compute apa = A[i,:]*P */
83 #define AProw(i,ad,ao,p_loc,p_oth,apa) \
84 {\
85   PetscInt    _anz,_pnz,_j,_k,*_ai,*_aj,_row,*_pi,*_pj;      \
86   PetscScalar *_aa,_valtmp,*_pa;                             \
87   /* diagonal portion of A */\
88   _ai  = ad->i;\
89   _anz = _ai[i+1] - _ai[i];\
90   _aj  = ad->j + _ai[i];\
91   _aa  = ad->a + _ai[i];\
92   for (_j=0; _j<_anz; _j++) {\
93     _row = _aj[_j]; \
94     _pi  = p_loc->i;                                 \
95     _pnz = _pi[_row+1] - _pi[_row];         \
96     _pj  = p_loc->j + _pi[_row];                 \
97     _pa  = p_loc->a + _pi[_row];                 \
98     /* perform dense axpy */                    \
99     valtmp = _aa[_j];                           \
100     for (_k=0; _k<_pnz; _k++) {                    \
101       apa[_pj[_k]] += valtmp*_pa[_k];               \
102     }                                           \
103     PetscLogFlops(2.0*_pnz);                    \
104   }                                             \
105   /* off-diagonal portion of A */               \
106   _ai  = ao->i;\
107   _anz = _ai[i+1] - _ai[i];                     \
108   _aj  = ao->j + _ai[i];                         \
109   _aa  = ao->a + _ai[i];                         \
110   for (_j=0; _j<_anz; _j++) {                      \
111     _row = _aj[_j];    \
112     _pi  = p_oth->i;                         \
113     _pnz = _pi[_row+1] - _pi[_row];          \
114     _pj  = p_oth->j + _pi[_row];                  \
115     _pa  = p_oth->a + _pi[_row];                  \
116     /* perform dense axpy */                     \
117     _valtmp = _aa[_j];                             \
118     for (_k=0; _k<_pnz; _k++) {                     \
119       apa[_pj[_k]] += _valtmp*_pa[_k];                \
120     }                                            \
121     PetscLogFlops(2.0*_pnz);                     \
122   }                                              \
123 }
124 
125 #undef __FUNCT__
126 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
127 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
128 {
129   PetscErrorCode ierr;
130   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
131   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
132   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
133   PetscScalar    *cda=cd->a,*coa=co->a;
134   Mat_SeqAIJ     *p_loc,*p_oth;
135   PetscScalar    *apa,valtmp,*ca;
136   PetscInt       cm   =C->rmap->n;
137   Mat_PtAPMPI    *ptap=c->ptap;
138   PetscInt       *api,*apj,*apJ,i,k;
139   PetscInt       cstart=C->cmap->rstart;
140   PetscInt       cdnz,conz,k0,k1;
141   MPI_Comm       comm;
142   PetscMPIInt    size;
143 
144   PetscFunctionBegin;
145   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
146   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
147 
148   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
149   /*-----------------------------------------------------*/
150   /* update numerical values of P_oth and P_loc */
151   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
152   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
153 
154   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
155   /*----------------------------------------------------------*/
156   /* get data from symbolic products */
157   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
158   p_oth = NULL;
159   if (size >1) {
160     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
161   }
162 
163   /* get apa for storing dense row A[i,:]*P */
164   apa = ptap->apa;
165 
166   api = ptap->api;
167   apj = ptap->apj;
168   for (i=0; i<cm; i++) {
169     /* compute apa = A[i,:]*P */
170     AProw(i,ad,ao,p_loc,p_oth,apa);
171 
172     /* set values in C */
173     apJ  = apj + api[i];
174     cdnz = cd->i[i+1] - cd->i[i];
175     conz = co->i[i+1] - co->i[i];
176 
177     /* 1st off-diagoanl part of C */
178     ca = coa + co->i[i];
179     k  = 0;
180     for (k0=0; k0<conz; k0++) {
181       if (apJ[k] >= cstart) break;
182       ca[k0]      = apa[apJ[k]];
183       apa[apJ[k]] = 0.0;
184       k++;
185     }
186 
187     /* diagonal part of C */
188     ca = cda + cd->i[i];
189     for (k1=0; k1<cdnz; k1++) {
190       ca[k1]      = apa[apJ[k]];
191       apa[apJ[k]] = 0.0;
192       k++;
193     }
194 
195     /* 2nd off-diagoanl part of C */
196     ca = coa + co->i[i];
197     for (; k0<conz; k0++) {
198       ca[k0]      = apa[apJ[k]];
199       apa[apJ[k]] = 0.0;
200       k++;
201     }
202   }
203   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
204   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
210 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
211 {
212   PetscErrorCode     ierr;
213   MPI_Comm           comm;
214   PetscMPIInt        size;
215   Mat                Cmpi;
216   Mat_PtAPMPI        *ptap;
217   PetscFreeSpaceList free_space=NULL,current_space=NULL;
218   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
219   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
220   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
221   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
222   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
223   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
224   PetscBT            lnkbt;
225   PetscScalar        *apa;
226   PetscReal          afill;
227   PetscInt           nlnk_max,armax,prmax;
228 
229   PetscFunctionBegin;
230   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
231   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
232 
233   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
234     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
235   }
236 
237   /* create struct Mat_PtAPMPI and attached it to C later */
238   ierr = PetscNew(&ptap);CHKERRQ(ierr);
239 
240   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
241   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
242 
243   /* get P_loc by taking all local rows of P */
244   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
245 
246   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
247   pi_loc = p_loc->i; pj_loc = p_loc->j;
248   if (size > 1) {
249     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
250     pi_oth = p_oth->i; pj_oth = p_oth->j;
251   } else {
252     p_oth = NULL;
253     pi_oth = NULL; pj_oth = NULL;
254   }
255 
256   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
257   /*-------------------------------------------------------------------*/
258   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
259   ptap->api = api;
260   api[0]    = 0;
261 
262   /* create and initialize a linked list */
263   armax    = ad->rmax+ao->rmax;
264   if (size >1) {
265     prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
266   } else {
267     prmax = p_loc->rmax;
268   }
269   nlnk_max = armax*prmax;
270   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
271   ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);
272 
273   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
274   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
275 
276   current_space = free_space;
277 
278   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
279   for (i=0; i<am; i++) {
280     /* diagonal portion of A */
281     nzi = adi[i+1] - adi[i];
282     for (j=0; j<nzi; j++) {
283       row  = *adj++;
284       pnz  = pi_loc[row+1] - pi_loc[row];
285       Jptr = pj_loc + pi_loc[row];
286       /* add non-zero cols of P into the sorted linked list lnk */
287       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
288     }
289     /* off-diagonal portion of A */
290     nzi = aoi[i+1] - aoi[i];
291     for (j=0; j<nzi; j++) {
292       row  = *aoj++;
293       pnz  = pi_oth[row+1] - pi_oth[row];
294       Jptr = pj_oth + pi_oth[row];
295       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
296     }
297 
298     apnz     = lnk[0];
299     api[i+1] = api[i] + apnz;
300 
301     /* if free space is not available, double the total space in the list */
302     if (current_space->local_remaining<apnz) {
303       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
304       nspacedouble++;
305     }
306 
307     /* Copy data into free space, then initialize lnk */
308     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
309     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
310 
311     current_space->array           += apnz;
312     current_space->local_used      += apnz;
313     current_space->local_remaining -= apnz;
314   }
315 
316   /* Allocate space for apj, initialize apj, and */
317   /* destroy list of free space and other temporary array(s) */
318   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
319   apj  = ptap->apj;
320   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
321   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
322 
323   /* malloc apa to store dense row A[i,:]*P */
324   ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);
325 
326   ptap->apa = apa;
327 
328   /* create and assemble symbolic parallel matrix Cmpi */
329   /*----------------------------------------------------*/
330   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
331   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
332   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
333 
334   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
335   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
336   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
337   for (i=0; i<am; i++) {
338     row  = i + rstart;
339     apnz = api[i+1] - api[i];
340     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
341     apj += apnz;
342   }
343   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
344   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
345 
346   ptap->destroy        = Cmpi->ops->destroy;
347   ptap->duplicate      = Cmpi->ops->duplicate;
348   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
349   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
350   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
351 
352   /* attach the supporting struct to Cmpi for reuse */
353   c       = (Mat_MPIAIJ*)Cmpi->data;
354   c->ptap = ptap;
355 
356   *C = Cmpi;
357 
358   /* set MatInfo */
359   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
360   if (afill < 1.0) afill = 1.0;
361   Cmpi->info.mallocs           = nspacedouble;
362   Cmpi->info.fill_ratio_given  = fill;
363   Cmpi->info.fill_ratio_needed = afill;
364 
365 #if defined(PETSC_USE_INFO)
366   if (api[am]) {
367     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
368     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
369   } else {
370     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
371   }
372 #endif
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
378 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (scall == MAT_INITIAL_MATRIX) {
384     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
385     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
386     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
387   }
388   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
389   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
390   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
391   PetscFunctionReturn(0);
392 }
393 
394 typedef struct {
395   Mat         workB;
396   PetscScalar *rvalues,*svalues;
397   MPI_Request *rwaits,*swaits;
398 } MPIAIJ_MPIDense;
399 
400 #undef __FUNCT__
401 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
402 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
403 {
404   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
405   PetscErrorCode  ierr;
406 
407   PetscFunctionBegin;
408   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
409   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
410   ierr = PetscFree(contents);CHKERRQ(ierr);
411   PetscFunctionReturn(0);
412 }
413 
414 #undef __FUNCT__
415 #define __FUNCT__ "MatMatMultNumeric_MPIDense"
416 /*
417     This is a "dummy function" that handles the case where matrix C was created as a dense matrix
418   directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
419 
420   It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except does not create C
421 */
422 PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
423 {
424   PetscErrorCode         ierr;
425   PetscBool              flg;
426   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
427   PetscInt               nz   = aij->B->cmap->n;
428   PetscContainer         container;
429   MPIAIJ_MPIDense        *contents;
430   VecScatter             ctx   = aij->Mvctx;
431   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
432   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
433 
434   PetscFunctionBegin;
435   ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
436   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
437 
438   /* Handle case where where user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX*/
439   ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
440   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
441 
442   C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
443 
444   ierr = PetscNew(&contents);CHKERRQ(ierr);
445   /* Create work matrix used to store off processor rows of B needed for local product */
446   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
447   /* Create work arrays needed */
448   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
449                       B->cmap->N*to->starts[to->n],&contents->svalues,
450                       from->n,&contents->rwaits,
451                       to->n,&contents->swaits);CHKERRQ(ierr);
452 
453   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
454   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
455   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
456   ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
457   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
458 
459   ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
460   PetscFunctionReturn(0);
461 }
462 
463 #undef __FUNCT__
464 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
465 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
466 {
467   PetscErrorCode         ierr;
468   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
469   PetscInt               nz   = aij->B->cmap->n;
470   PetscContainer         container;
471   MPIAIJ_MPIDense        *contents;
472   VecScatter             ctx   = aij->Mvctx;
473   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
474   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
475   PetscInt               m     = A->rmap->n,n=B->cmap->n;
476 
477   PetscFunctionBegin;
478   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
479   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
480   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
481   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
482   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
483   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
484   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
485 
486   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
487 
488   ierr = PetscNew(&contents);CHKERRQ(ierr);
489   /* Create work matrix used to store off processor rows of B needed for local product */
490   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
491   /* Create work arrays needed */
492   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
493                       B->cmap->N*to->starts[to->n],&contents->svalues,
494                       from->n,&contents->rwaits,
495                       to->n,&contents->swaits);CHKERRQ(ierr);
496 
497   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
498   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
499   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
500   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
501   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
502   PetscFunctionReturn(0);
503 }
504 
505 #undef __FUNCT__
506 #define __FUNCT__ "MatMPIDenseScatter"
507 /*
508     Performs an efficient scatter on the rows of B needed by this process; this is
509     a modification of the VecScatterBegin_() routines.
510 */
511 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
512 {
513   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
514   PetscErrorCode         ierr;
515   PetscScalar            *b,*w,*svalues,*rvalues;
516   VecScatter             ctx   = aij->Mvctx;
517   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
518   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
519   PetscInt               i,j,k;
520   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
521   PetscMPIInt            *sprocs,*rprocs,nrecvs;
522   MPI_Request            *swaits,*rwaits;
523   MPI_Comm               comm;
524   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
525   MPI_Status             status;
526   MPIAIJ_MPIDense        *contents;
527   PetscContainer         container;
528   Mat                    workB;
529 
530   PetscFunctionBegin;
531   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
532   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
533   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
534   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
535 
536   workB = *outworkB = contents->workB;
537   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",nrows,workB->cmap->n);
538   sindices = to->indices;
539   sstarts  = to->starts;
540   sprocs   = to->procs;
541   swaits   = contents->swaits;
542   svalues  = contents->svalues;
543 
544   rindices = from->indices;
545   rstarts  = from->starts;
546   rprocs   = from->procs;
547   rwaits   = contents->rwaits;
548   rvalues  = contents->rvalues;
549 
550   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
551   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
552 
553   for (i=0; i<from->n; i++) {
554     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
555   }
556 
557   for (i=0; i<to->n; i++) {
558     /* pack a message at a time */
559     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
560       for (k=0; k<ncols; k++) {
561         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
562       }
563     }
564     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
565   }
566 
567   nrecvs = from->n;
568   while (nrecvs) {
569     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
570     nrecvs--;
571     /* unpack a message at a time */
572     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
573       for (k=0; k<ncols; k++) {
574         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
575       }
576     }
577   }
578   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
579 
580   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
581   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
582   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
583   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
584   PetscFunctionReturn(0);
585 }
586 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
587 
588 #undef __FUNCT__
589 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
590 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
591 {
592   PetscErrorCode ierr;
593   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
594   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
595   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
596   Mat            workB;
597 
598   PetscFunctionBegin;
599   /* diagonal block of A times all local rows of B*/
600   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
601 
602   /* get off processor parts of B needed to complete the product */
603   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
604 
605   /* off-diagonal block of A times nonlocal rows of B */
606   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
607   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
608   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
609   PetscFunctionReturn(0);
610 }
611 
612 #undef __FUNCT__
613 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
614 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
615 {
616   PetscErrorCode ierr;
617   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
618   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
619   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
620   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
621   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
622   Mat_SeqAIJ     *p_loc,*p_oth;
623   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
624   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
625   PetscInt       cm          = C->rmap->n,anz,pnz;
626   Mat_PtAPMPI    *ptap       = c->ptap;
627   PetscScalar    *apa_sparse = ptap->apa;
628   PetscInt       *api,*apj,*apJ,i,j,k,row;
629   PetscInt       cstart = C->cmap->rstart;
630   PetscInt       cdnz,conz,k0,k1,nextp;
631   MPI_Comm       comm;
632   PetscMPIInt    size;
633 
634   PetscFunctionBegin;
635   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
636   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
637 
638   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
639   /*-----------------------------------------------------*/
640   /* update numerical values of P_oth and P_loc */
641   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
642   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
643 
644   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
645   /*----------------------------------------------------------*/
646   /* get data from symbolic products */
647   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
648   pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
649   if (size >1) {
650     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
651     pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
652   } else {
653     p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
654   }
655 
656   api = ptap->api;
657   apj = ptap->apj;
658   for (i=0; i<cm; i++) {
659     apJ = apj + api[i];
660 
661     /* diagonal portion of A */
662     anz = adi[i+1] - adi[i];
663     adj = ad->j + adi[i];
664     ada = ad->a + adi[i];
665     for (j=0; j<anz; j++) {
666       row = adj[j];
667       pnz = pi_loc[row+1] - pi_loc[row];
668       pj  = pj_loc + pi_loc[row];
669       pa  = pa_loc + pi_loc[row];
670       /* perform sparse axpy */
671       valtmp = ada[j];
672       nextp  = 0;
673       for (k=0; nextp<pnz; k++) {
674         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
675           apa_sparse[k] += valtmp*pa[nextp++];
676         }
677       }
678       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
679     }
680 
681     /* off-diagonal portion of A */
682     anz = aoi[i+1] - aoi[i];
683     aoj = ao->j + aoi[i];
684     aoa = ao->a + aoi[i];
685     for (j=0; j<anz; j++) {
686       row = aoj[j];
687       pnz = pi_oth[row+1] - pi_oth[row];
688       pj  = pj_oth + pi_oth[row];
689       pa  = pa_oth + pi_oth[row];
690       /* perform sparse axpy */
691       valtmp = aoa[j];
692       nextp  = 0;
693       for (k=0; nextp<pnz; k++) {
694         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
695           apa_sparse[k] += valtmp*pa[nextp++];
696         }
697       }
698       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
699     }
700 
701     /* set values in C */
702     cdnz = cd->i[i+1] - cd->i[i];
703     conz = co->i[i+1] - co->i[i];
704 
705     /* 1st off-diagoanl part of C */
706     ca = coa + co->i[i];
707     k  = 0;
708     for (k0=0; k0<conz; k0++) {
709       if (apJ[k] >= cstart) break;
710       ca[k0]        = apa_sparse[k];
711       apa_sparse[k] = 0.0;
712       k++;
713     }
714 
715     /* diagonal part of C */
716     ca = cda + cd->i[i];
717     for (k1=0; k1<cdnz; k1++) {
718       ca[k1]        = apa_sparse[k];
719       apa_sparse[k] = 0.0;
720       k++;
721     }
722 
723     /* 2nd off-diagoanl part of C */
724     ca = coa + co->i[i];
725     for (; k0<conz; k0++) {
726       ca[k0]        = apa_sparse[k];
727       apa_sparse[k] = 0.0;
728       k++;
729     }
730   }
731   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
732   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
733   PetscFunctionReturn(0);
734 }
735 
736 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */
737 #undef __FUNCT__
738 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
739 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
740 {
741   PetscErrorCode     ierr;
742   MPI_Comm           comm;
743   PetscMPIInt        size;
744   Mat                Cmpi;
745   Mat_PtAPMPI        *ptap;
746   PetscFreeSpaceList free_space = NULL,current_space=NULL;
747   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
748   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
749   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
750   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
751   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
752   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
753   PetscInt           nlnk_max,armax,prmax;
754   PetscReal          afill;
755   PetscScalar        *apa;
756 
757   PetscFunctionBegin;
758   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
759   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
760 
761   /* create struct Mat_PtAPMPI and attached it to C later */
762   ierr = PetscNew(&ptap);CHKERRQ(ierr);
763 
764   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
765   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
766 
767   /* get P_loc by taking all local rows of P */
768   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
769 
770   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
771   pi_loc = p_loc->i; pj_loc = p_loc->j;
772   if (size > 1) {
773     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
774     pi_oth = p_oth->i; pj_oth = p_oth->j;
775   } else {
776     p_oth  = NULL;
777     pi_oth = NULL; pj_oth = NULL;
778   }
779 
780   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
781   /*-------------------------------------------------------------------*/
782   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
783   ptap->api = api;
784   api[0]    = 0;
785 
786   /* create and initialize a linked list */
787   armax = ad->rmax+ao->rmax;
788   if (size >1) {
789     prmax = PetscMax(p_loc->rmax,p_oth->rmax);
790   } else {
791     prmax = p_loc->rmax;
792   }
793   nlnk_max = armax*prmax;
794   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
795   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
796 
797   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
798   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
799 
800   current_space = free_space;
801 
802   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
803   for (i=0; i<am; i++) {
804     /* diagonal portion of A */
805     nzi = adi[i+1] - adi[i];
806     for (j=0; j<nzi; j++) {
807       row  = *adj++;
808       pnz  = pi_loc[row+1] - pi_loc[row];
809       Jptr = pj_loc + pi_loc[row];
810       /* add non-zero cols of P into the sorted linked list lnk */
811       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
812     }
813     /* off-diagonal portion of A */
814     nzi = aoi[i+1] - aoi[i];
815     for (j=0; j<nzi; j++) {
816       row  = *aoj++;
817       pnz  = pi_oth[row+1] - pi_oth[row];
818       Jptr = pj_oth + pi_oth[row];
819       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
820     }
821 
822     apnz     = *lnk;
823     api[i+1] = api[i] + apnz;
824     if (apnz > apnz_max) apnz_max = apnz;
825 
826     /* if free space is not available, double the total space in the list */
827     if (current_space->local_remaining<apnz) {
828       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
829       nspacedouble++;
830     }
831 
832     /* Copy data into free space, then initialize lnk */
833     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
834     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
835 
836     current_space->array           += apnz;
837     current_space->local_used      += apnz;
838     current_space->local_remaining -= apnz;
839   }
840 
841   /* Allocate space for apj, initialize apj, and */
842   /* destroy list of free space and other temporary array(s) */
843   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
844   apj  = ptap->apj;
845   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
846   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
847 
848   /* create and assemble symbolic parallel matrix Cmpi */
849   /*----------------------------------------------------*/
850   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
851   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
852   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
853   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
854   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
855   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
856 
857   /* malloc apa for assembly Cmpi */
858   ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);
859 
860   ptap->apa = apa;
861   for (i=0; i<am; i++) {
862     row  = i + rstart;
863     apnz = api[i+1] - api[i];
864     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
865     apj += apnz;
866   }
867   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
868   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
869 
870   ptap->destroy             = Cmpi->ops->destroy;
871   ptap->duplicate           = Cmpi->ops->duplicate;
872   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
873   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
874   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
875 
876   /* attach the supporting struct to Cmpi for reuse */
877   c       = (Mat_MPIAIJ*)Cmpi->data;
878   c->ptap = ptap;
879 
880   *C = Cmpi;
881 
882   /* set MatInfo */
883   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
884   if (afill < 1.0) afill = 1.0;
885   Cmpi->info.mallocs           = nspacedouble;
886   Cmpi->info.fill_ratio_given  = fill;
887   Cmpi->info.fill_ratio_needed = afill;
888 
889 #if defined(PETSC_USE_INFO)
890   if (api[am]) {
891     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
892     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr);
893   } else {
894     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
895   }
896 #endif
897   PetscFunctionReturn(0);
898 }
899 
900 /*-------------------------------------------------------------------------*/
901 #undef __FUNCT__
902 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
903 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
904 {
905   PetscErrorCode ierr;
906   const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
907   PetscInt       alg=0; /* set default algorithm */
908 
909   PetscFunctionBegin;
910   if (scall == MAT_INITIAL_MATRIX) {
911     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
912     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
913     ierr = PetscOptionsEnd();CHKERRQ(ierr);
914 
915     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
916     switch (alg) {
917     case 1:
918       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
919       break;
920     case 2:
921     {
922       Mat         Pt;
923       Mat_PtAPMPI *ptap;
924       Mat_MPIAIJ  *c;
925       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
926       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
927       c        = (Mat_MPIAIJ*)(*C)->data;
928       ptap     = c->ptap;
929       ptap->Pt = Pt;
930       (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
931       PetscFunctionReturn(0);
932     }
933       break;
934     default:
935       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
936       break;
937     }
938     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
939   }
940   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
941   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
942   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
943   PetscFunctionReturn(0);
944 }
945 
946 /* This routine only works when scall=MAT_REUSE_MATRIX! */
947 #undef __FUNCT__
948 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
949 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
950 {
951   PetscErrorCode ierr;
952   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
953   Mat_PtAPMPI    *ptap= c->ptap;
954   Mat            Pt=ptap->Pt;
955 
956   PetscFunctionBegin;
957   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
958   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
959   PetscFunctionReturn(0);
960 }
961 
962 /* Non-scalable version, use dense axpy */
963 #undef __FUNCT__
964 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
965 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
966 {
967   PetscErrorCode      ierr;
968   Mat_Merge_SeqsToMPI *merge;
969   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
970   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
971   Mat_PtAPMPI         *ptap;
972   PetscInt            *adj,*aJ;
973   PetscInt            i,j,k,anz,pnz,row,*cj;
974   MatScalar           *ada,*aval,*ca,valtmp;
975   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
976   MPI_Comm            comm;
977   PetscMPIInt         size,rank,taga,*len_s;
978   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
979   PetscInt            **buf_ri,**buf_rj;
980   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
981   MPI_Request         *s_waits,*r_waits;
982   MPI_Status          *status;
983   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
984   PetscInt            *ai,*aj,*coi,*coj;
985   PetscInt            *poJ,*pdJ;
986   Mat                 A_loc;
987   Mat_SeqAIJ          *a_loc;
988 
989   PetscFunctionBegin;
990   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
991   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
992   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
993 
994   ptap  = c->ptap;
995   merge = ptap->merge;
996 
997   /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
998   /*--------------------------------------------------------------*/
999   /* get data from symbolic products */
1000   coi  = merge->coi; coj = merge->coj;
1001   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1002 
1003   bi     = merge->bi; bj = merge->bj;
1004   owners = merge->rowmap->range;
1005   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1006 
1007   /* get A_loc by taking all local rows of A */
1008   A_loc = ptap->A_loc;
1009   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1010   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1011   ai    = a_loc->i;
1012   aj    = a_loc->j;
1013 
1014   ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */
1015 
1016   for (i=0; i<am; i++) {
1017     /* 2-a) put A[i,:] to dense array aval */
1018     anz = ai[i+1] - ai[i];
1019     adj = aj + ai[i];
1020     ada = a_loc->a + ai[i];
1021     for (j=0; j<anz; j++) {
1022       aval[adj[j]] = ada[j];
1023     }
1024 
1025     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1026     /*--------------------------------------------------------------*/
1027     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1028     pnz = po->i[i+1] - po->i[i];
1029     poJ = po->j + po->i[i];
1030     pA  = po->a + po->i[i];
1031     for (j=0; j<pnz; j++) {
1032       row = poJ[j];
1033       cnz = coi[row+1] - coi[row];
1034       cj  = coj + coi[row];
1035       ca  = coa + coi[row];
1036       /* perform dense axpy */
1037       valtmp = pA[j];
1038       for (k=0; k<cnz; k++) {
1039         ca[k] += valtmp*aval[cj[k]];
1040       }
1041       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1042     }
1043 
1044     /* put the value into Cd (diagonal part) */
1045     pnz = pd->i[i+1] - pd->i[i];
1046     pdJ = pd->j + pd->i[i];
1047     pA  = pd->a + pd->i[i];
1048     for (j=0; j<pnz; j++) {
1049       row = pdJ[j];
1050       cnz = bi[row+1] - bi[row];
1051       cj  = bj + bi[row];
1052       ca  = ba + bi[row];
1053       /* perform dense axpy */
1054       valtmp = pA[j];
1055       for (k=0; k<cnz; k++) {
1056         ca[k] += valtmp*aval[cj[k]];
1057       }
1058       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1059     }
1060 
1061     /* zero the current row of Pt*A */
1062     aJ = aj + ai[i];
1063     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1064   }
1065 
1066   /* 3) send and recv matrix values coa */
1067   /*------------------------------------*/
1068   buf_ri = merge->buf_ri;
1069   buf_rj = merge->buf_rj;
1070   len_s  = merge->len_s;
1071   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1072   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1073 
1074   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1075   for (proc=0,k=0; proc<size; proc++) {
1076     if (!len_s[proc]) continue;
1077     i    = merge->owners_co[proc];
1078     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1079     k++;
1080   }
1081   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1082   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1083 
1084   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1085   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1086   ierr = PetscFree(coa);CHKERRQ(ierr);
1087 
1088   /* 4) insert local Cseq and received values into Cmpi */
1089   /*----------------------------------------------------*/
1090   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1091   for (k=0; k<merge->nrecv; k++) {
1092     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1093     nrows       = *(buf_ri_k[k]);
1094     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1095     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1096   }
1097 
1098   for (i=0; i<cm; i++) {
1099     row  = owners[rank] + i; /* global row index of C_seq */
1100     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1101     ba_i = ba + bi[i];
1102     bnz  = bi[i+1] - bi[i];
1103     /* add received vals into ba */
1104     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1105       /* i-th row */
1106       if (i == *nextrow[k]) {
1107         cnz    = *(nextci[k]+1) - *nextci[k];
1108         cj     = buf_rj[k] + *(nextci[k]);
1109         ca     = abuf_r[k] + *(nextci[k]);
1110         nextcj = 0;
1111         for (j=0; nextcj<cnz; j++) {
1112           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1113             ba_i[j] += ca[nextcj++];
1114           }
1115         }
1116         nextrow[k]++; nextci[k]++;
1117         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1118       }
1119     }
1120     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1121   }
1122   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1123   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1124 
1125   ierr = PetscFree(ba);CHKERRQ(ierr);
1126   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1127   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1128   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1129   ierr = PetscFree(aval);CHKERRQ(ierr);
1130   PetscFunctionReturn(0);
1131 }
1132 
1133 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1134 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1135 #undef __FUNCT__
1136 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
1137 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1138 {
1139   PetscErrorCode      ierr;
1140   Mat                 Cmpi,A_loc,POt,PDt;
1141   Mat_PtAPMPI         *ptap;
1142   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1143   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1144   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1145   PetscInt            nnz;
1146   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1147   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1148   PetscBT             lnkbt;
1149   MPI_Comm            comm;
1150   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1151   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1152   PetscInt            len,proc,*dnz,*onz,*owners;
1153   PetscInt            nzi,*bi,*bj;
1154   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1155   MPI_Request         *swaits,*rwaits;
1156   MPI_Status          *sstatus,rstatus;
1157   Mat_Merge_SeqsToMPI *merge;
1158   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1159   PetscReal           afill  =1.0,afill_tmp;
1160   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1161   PetscScalar         *vals;
1162   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1163 
1164   PetscFunctionBegin;
1165   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1166   /* check if matrix local sizes are compatible */
1167   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1168     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1169   }
1170 
1171   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1172   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1173 
1174   /* create struct Mat_PtAPMPI and attached it to C later */
1175   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1176 
1177   /* get A_loc by taking all local rows of A */
1178   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1179 
1180   ptap->A_loc = A_loc;
1181 
1182   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1183   ai    = a_loc->i;
1184   aj    = a_loc->j;
1185 
1186   /* determine symbolic Co=(p->B)^T*A - send to others */
1187   /*----------------------------------------------------*/
1188   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1189   pdt  = (Mat_SeqAIJ*)PDt->data;
1190   pdti = pdt->i; pdtj = pdt->j;
1191 
1192   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1193   pot  = (Mat_SeqAIJ*)POt->data;
1194   poti = pot->i; potj = pot->j;
1195 
1196   /* then, compute symbolic Co = (p->B)^T*A */
1197   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1198   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1199   coi[0] = 0;
1200 
1201   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1202   nnz           = fill*(poti[pon] + ai[am]);
1203   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1204   current_space = free_space;
1205 
1206   /* create and initialize a linked list */
1207   i     = PetscMax(pdt->rmax,pot->rmax);
1208   Crmax = i*a_loc->rmax*size;
1209   if (!Crmax || Crmax > aN) Crmax = aN;
1210   ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1211 
1212   for (i=0; i<pon; i++) {
1213     pnz = poti[i+1] - poti[i];
1214     ptJ = potj + poti[i];
1215     for (j=0; j<pnz; j++) {
1216       row  = ptJ[j]; /* row of A_loc == col of Pot */
1217       anz  = ai[row+1] - ai[row];
1218       Jptr = aj + ai[row];
1219       /* add non-zero cols of AP into the sorted linked list lnk */
1220       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1221     }
1222     nnz = lnk[0];
1223 
1224     /* If free space is not available, double the total space in the list */
1225     if (current_space->local_remaining<nnz) {
1226       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1227       nspacedouble++;
1228     }
1229 
1230     /* Copy data into free space, and zero out denserows */
1231     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1232 
1233     current_space->array           += nnz;
1234     current_space->local_used      += nnz;
1235     current_space->local_remaining -= nnz;
1236 
1237     coi[i+1] = coi[i] + nnz;
1238   }
1239 
1240   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1241   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1242 
1243   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1244   if (afill_tmp > afill) afill = afill_tmp;
1245 
1246   /* send j-array (coj) of Co to other processors */
1247   /*----------------------------------------------*/
1248   /* determine row ownership */
1249   ierr = PetscNew(&merge);CHKERRQ(ierr);
1250   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1251 
1252   merge->rowmap->n  = pn;
1253   merge->rowmap->bs = 1;
1254 
1255   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1256   owners = merge->rowmap->range;
1257 
1258   /* determine the number of messages to send, their lengths */
1259   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1260   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1261 
1262   len_s        = merge->len_s;
1263   merge->nsend = 0;
1264 
1265   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1266   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1267 
1268   proc = 0;
1269   for (i=0; i<pon; i++) {
1270     while (prmap[i] >= owners[proc+1]) proc++;
1271     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1272     len_s[proc] += coi[i+1] - coi[i];
1273   }
1274 
1275   len          = 0; /* max length of buf_si[] */
1276   owners_co[0] = 0;
1277   for (proc=0; proc<size; proc++) {
1278     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1279     if (len_si[proc]) {
1280       merge->nsend++;
1281       len_si[proc] = 2*(len_si[proc] + 1);
1282       len         += len_si[proc];
1283     }
1284   }
1285 
1286   /* determine the number and length of messages to receive for coi and coj  */
1287   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1288   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1289 
1290   /* post the Irecv and Isend of coj */
1291   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1292   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1293   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1294   for (proc=0, k=0; proc<size; proc++) {
1295     if (!len_s[proc]) continue;
1296     i    = owners_co[proc];
1297     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1298     k++;
1299   }
1300 
1301   /* receives and sends of coj are complete */
1302   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1303   for (i=0; i<merge->nrecv; i++) {
1304     PetscMPIInt icompleted;
1305     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1306   }
1307   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1308   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1309 
1310   /* send and recv coi */
1311   /*-------------------*/
1312   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1313   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1314   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1315   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1316   for (proc=0,k=0; proc<size; proc++) {
1317     if (!len_s[proc]) continue;
1318     /* form outgoing message for i-structure:
1319          buf_si[0]:                 nrows to be sent
1320                [1:nrows]:           row index (global)
1321                [nrows+1:2*nrows+1]: i-structure index
1322     */
1323     /*-------------------------------------------*/
1324     nrows       = len_si[proc]/2 - 1;
1325     buf_si_i    = buf_si + nrows+1;
1326     buf_si[0]   = nrows;
1327     buf_si_i[0] = 0;
1328     nrows       = 0;
1329     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1330       nzi               = coi[i+1] - coi[i];
1331       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1332       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1333       nrows++;
1334     }
1335     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1336     k++;
1337     buf_si += len_si[proc];
1338   }
1339   i = merge->nrecv;
1340   while (i--) {
1341     PetscMPIInt icompleted;
1342     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1343   }
1344   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1345   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1346   ierr = PetscFree(len_si);CHKERRQ(ierr);
1347   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1348   ierr = PetscFree(swaits);CHKERRQ(ierr);
1349   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1350   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1351 
1352   /* compute the local portion of C (mpi mat) */
1353   /*------------------------------------------*/
1354   /* allocate bi array and free space for accumulating nonzero column info */
1355   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1356   bi[0] = 0;
1357 
1358   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1359   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1360   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1361   current_space = free_space;
1362 
1363   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1364   for (k=0; k<merge->nrecv; k++) {
1365     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1366     nrows       = *buf_ri_k[k];
1367     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1368     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1369   }
1370 
1371   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1372   rmax = 0;
1373   for (i=0; i<pn; i++) {
1374     /* add pdt[i,:]*AP into lnk */
1375     pnz = pdti[i+1] - pdti[i];
1376     ptJ = pdtj + pdti[i];
1377     for (j=0; j<pnz; j++) {
1378       row  = ptJ[j];  /* row of AP == col of Pt */
1379       anz  = ai[row+1] - ai[row];
1380       Jptr = aj + ai[row];
1381       /* add non-zero cols of AP into the sorted linked list lnk */
1382       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1383     }
1384 
1385     /* add received col data into lnk */
1386     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1387       if (i == *nextrow[k]) { /* i-th row */
1388         nzi  = *(nextci[k]+1) - *nextci[k];
1389         Jptr = buf_rj[k] + *nextci[k];
1390         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1391         nextrow[k]++; nextci[k]++;
1392       }
1393     }
1394     nnz = lnk[0];
1395 
1396     /* if free space is not available, make more free space */
1397     if (current_space->local_remaining<nnz) {
1398       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1399       nspacedouble++;
1400     }
1401     /* copy data into free space, then initialize lnk */
1402     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1403     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1404 
1405     current_space->array           += nnz;
1406     current_space->local_used      += nnz;
1407     current_space->local_remaining -= nnz;
1408 
1409     bi[i+1] = bi[i] + nnz;
1410     if (nnz > rmax) rmax = nnz;
1411   }
1412   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1413 
1414   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1415   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1416 
1417   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1418   if (afill_tmp > afill) afill = afill_tmp;
1419   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1420   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1421   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1422 
1423   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1424   /*----------------------------------------------------------------------------------*/
1425   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1426 
1427   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1428   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1429   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1430   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1431   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1432   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1433   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1434   for (i=0; i<pn; i++) {
1435     row  = i + rstart;
1436     nnz  = bi[i+1] - bi[i];
1437     Jptr = bj + bi[i];
1438     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1439   }
1440   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1441   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1442   ierr = PetscFree(vals);CHKERRQ(ierr);
1443 
1444   merge->bi        = bi;
1445   merge->bj        = bj;
1446   merge->coi       = coi;
1447   merge->coj       = coj;
1448   merge->buf_ri    = buf_ri;
1449   merge->buf_rj    = buf_rj;
1450   merge->owners_co = owners_co;
1451   merge->destroy   = Cmpi->ops->destroy;
1452   merge->duplicate = Cmpi->ops->duplicate;
1453 
1454   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1455   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1456   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1457 
1458   /* attach the supporting struct to Cmpi for reuse */
1459   c           = (Mat_MPIAIJ*)Cmpi->data;
1460   c->ptap     = ptap;
1461   ptap->api   = NULL;
1462   ptap->apj   = NULL;
1463   ptap->merge = merge;
1464   ptap->rmax  = rmax;
1465 
1466   *C = Cmpi;
1467 #if defined(PETSC_USE_INFO)
1468   if (bi[pn] != 0) {
1469     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1470     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1471   } else {
1472     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1473   }
1474 #endif
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 #undef __FUNCT__
1479 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
1480 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1481 {
1482   PetscErrorCode      ierr;
1483   Mat_Merge_SeqsToMPI *merge;
1484   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1485   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1486   Mat_PtAPMPI         *ptap;
1487   PetscInt            *adj;
1488   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1489   MatScalar           *ada,*ca,valtmp;
1490   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1491   MPI_Comm            comm;
1492   PetscMPIInt         size,rank,taga,*len_s;
1493   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1494   PetscInt            **buf_ri,**buf_rj;
1495   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1496   MPI_Request         *s_waits,*r_waits;
1497   MPI_Status          *status;
1498   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1499   PetscInt            *ai,*aj,*coi,*coj;
1500   PetscInt            *poJ,*pdJ;
1501   Mat                 A_loc;
1502   Mat_SeqAIJ          *a_loc;
1503 
1504   PetscFunctionBegin;
1505   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1506   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1507   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1508 
1509   ptap  = c->ptap;
1510   merge = ptap->merge;
1511 
1512   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1513   /*------------------------------------------*/
1514   /* get data from symbolic products */
1515   coi    = merge->coi; coj = merge->coj;
1516   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1517   bi     = merge->bi; bj = merge->bj;
1518   owners = merge->rowmap->range;
1519   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1520 
1521   /* get A_loc by taking all local rows of A */
1522   A_loc = ptap->A_loc;
1523   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1524   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1525   ai    = a_loc->i;
1526   aj    = a_loc->j;
1527 
1528   for (i=0; i<am; i++) {
1529     anz = ai[i+1] - ai[i];
1530     adj = aj + ai[i];
1531     ada = a_loc->a + ai[i];
1532 
1533     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1534     /*-------------------------------------------------------------*/
1535     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1536     pnz = po->i[i+1] - po->i[i];
1537     poJ = po->j + po->i[i];
1538     pA  = po->a + po->i[i];
1539     for (j=0; j<pnz; j++) {
1540       row = poJ[j];
1541       cj  = coj + coi[row];
1542       ca  = coa + coi[row];
1543       /* perform sparse axpy */
1544       nexta  = 0;
1545       valtmp = pA[j];
1546       for (k=0; nexta<anz; k++) {
1547         if (cj[k] == adj[nexta]) {
1548           ca[k] += valtmp*ada[nexta];
1549           nexta++;
1550         }
1551       }
1552       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1553     }
1554 
1555     /* put the value into Cd (diagonal part) */
1556     pnz = pd->i[i+1] - pd->i[i];
1557     pdJ = pd->j + pd->i[i];
1558     pA  = pd->a + pd->i[i];
1559     for (j=0; j<pnz; j++) {
1560       row = pdJ[j];
1561       cj  = bj + bi[row];
1562       ca  = ba + bi[row];
1563       /* perform sparse axpy */
1564       nexta  = 0;
1565       valtmp = pA[j];
1566       for (k=0; nexta<anz; k++) {
1567         if (cj[k] == adj[nexta]) {
1568           ca[k] += valtmp*ada[nexta];
1569           nexta++;
1570         }
1571       }
1572       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1573     }
1574   }
1575 
1576   /* 3) send and recv matrix values coa */
1577   /*------------------------------------*/
1578   buf_ri = merge->buf_ri;
1579   buf_rj = merge->buf_rj;
1580   len_s  = merge->len_s;
1581   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1582   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1583 
1584   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1585   for (proc=0,k=0; proc<size; proc++) {
1586     if (!len_s[proc]) continue;
1587     i    = merge->owners_co[proc];
1588     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1589     k++;
1590   }
1591   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1592   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1593 
1594   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1595   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1596   ierr = PetscFree(coa);CHKERRQ(ierr);
1597 
1598   /* 4) insert local Cseq and received values into Cmpi */
1599   /*----------------------------------------------------*/
1600   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1601   for (k=0; k<merge->nrecv; k++) {
1602     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1603     nrows       = *(buf_ri_k[k]);
1604     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
1605     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
1606   }
1607 
1608   for (i=0; i<cm; i++) {
1609     row  = owners[rank] + i; /* global row index of C_seq */
1610     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1611     ba_i = ba + bi[i];
1612     bnz  = bi[i+1] - bi[i];
1613     /* add received vals into ba */
1614     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1615       /* i-th row */
1616       if (i == *nextrow[k]) {
1617         cnz    = *(nextci[k]+1) - *nextci[k];
1618         cj     = buf_rj[k] + *(nextci[k]);
1619         ca     = abuf_r[k] + *(nextci[k]);
1620         nextcj = 0;
1621         for (j=0; nextcj<cnz; j++) {
1622           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1623             ba_i[j] += ca[nextcj++];
1624           }
1625         }
1626         nextrow[k]++; nextci[k]++;
1627         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1628       }
1629     }
1630     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1631   }
1632   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1633   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1634 
1635   ierr = PetscFree(ba);CHKERRQ(ierr);
1636   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1637   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1638   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1639   PetscFunctionReturn(0);
1640 }
1641 
1642 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1643 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1644    differ from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using LLCondensedCreate_Scalable() */
1645 #undef __FUNCT__
1646 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1647 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1648 {
1649   PetscErrorCode      ierr;
1650   Mat                 Cmpi,A_loc,POt,PDt;
1651   Mat_PtAPMPI         *ptap;
1652   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1653   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1654   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1655   PetscInt            nnz;
1656   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1657   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1658   MPI_Comm            comm;
1659   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1660   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1661   PetscInt            len,proc,*dnz,*onz,*owners;
1662   PetscInt            nzi,*bi,*bj;
1663   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1664   MPI_Request         *swaits,*rwaits;
1665   MPI_Status          *sstatus,rstatus;
1666   Mat_Merge_SeqsToMPI *merge;
1667   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1668   PetscReal           afill  =1.0,afill_tmp;
1669   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1670   PetscScalar         *vals;
1671   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1672 
1673   PetscFunctionBegin;
1674   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1675   /* check if matrix local sizes are compatible */
1676   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1677     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1678   }
1679 
1680   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1681   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1682 
1683   /* create struct Mat_PtAPMPI and attached it to C later */
1684   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1685 
1686   /* get A_loc by taking all local rows of A */
1687   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1688 
1689   ptap->A_loc = A_loc;
1690   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1691   ai          = a_loc->i;
1692   aj          = a_loc->j;
1693 
1694   /* determine symbolic Co=(p->B)^T*A - send to others */
1695   /*----------------------------------------------------*/
1696   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1697   pdt  = (Mat_SeqAIJ*)PDt->data;
1698   pdti = pdt->i; pdtj = pdt->j;
1699 
1700   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1701   pot  = (Mat_SeqAIJ*)POt->data;
1702   poti = pot->i; potj = pot->j;
1703 
1704   /* then, compute symbolic Co = (p->B)^T*A */
1705   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1706                          >= (num of nonzero rows of C_seq) - pn */
1707   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1708   coi[0] = 0;
1709 
1710   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1711   nnz           = fill*(poti[pon] + ai[am]);
1712   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1713   current_space = free_space;
1714 
1715   /* create and initialize a linked list */
1716   i     = PetscMax(pdt->rmax,pot->rmax);
1717   Crmax = i*a_loc->rmax*size; /* non-scalable! */
1718   if (!Crmax || Crmax > aN) Crmax = aN;
1719   ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
1720 
1721   for (i=0; i<pon; i++) {
1722     pnz = poti[i+1] - poti[i];
1723     ptJ = potj + poti[i];
1724     for (j=0; j<pnz; j++) {
1725       row  = ptJ[j]; /* row of A_loc == col of Pot */
1726       anz  = ai[row+1] - ai[row];
1727       Jptr = aj + ai[row];
1728       /* add non-zero cols of AP into the sorted linked list lnk */
1729       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1730     }
1731     nnz = lnk[0];
1732 
1733     /* If free space is not available, double the total space in the list */
1734     if (current_space->local_remaining<nnz) {
1735       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1736       nspacedouble++;
1737     }
1738 
1739     /* Copy data into free space, and zero out denserows */
1740     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1741 
1742     current_space->array           += nnz;
1743     current_space->local_used      += nnz;
1744     current_space->local_remaining -= nnz;
1745 
1746     coi[i+1] = coi[i] + nnz;
1747   }
1748 
1749   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1750   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1751 
1752   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1753   if (afill_tmp > afill) afill = afill_tmp;
1754 
1755   /* send j-array (coj) of Co to other processors */
1756   /*----------------------------------------------*/
1757   /* determine row ownership */
1758   ierr = PetscNew(&merge);CHKERRQ(ierr);
1759   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1760 
1761   merge->rowmap->n  = pn;
1762   merge->rowmap->bs = 1;
1763 
1764   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1765   owners = merge->rowmap->range;
1766 
1767   /* determine the number of messages to send, their lengths */
1768   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1769   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1770 
1771   len_s        = merge->len_s;
1772   merge->nsend = 0;
1773 
1774   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1775   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1776 
1777   proc = 0;
1778   for (i=0; i<pon; i++) {
1779     while (prmap[i] >= owners[proc+1]) proc++;
1780     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1781     len_s[proc] += coi[i+1] - coi[i];
1782   }
1783 
1784   len          = 0; /* max length of buf_si[] */
1785   owners_co[0] = 0;
1786   for (proc=0; proc<size; proc++) {
1787     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1788     if (len_si[proc]) {
1789       merge->nsend++;
1790       len_si[proc] = 2*(len_si[proc] + 1);
1791       len         += len_si[proc];
1792     }
1793   }
1794 
1795   /* determine the number and length of messages to receive for coi and coj  */
1796   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1797   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1798 
1799   /* post the Irecv and Isend of coj */
1800   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1801   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1802   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1803   for (proc=0, k=0; proc<size; proc++) {
1804     if (!len_s[proc]) continue;
1805     i    = owners_co[proc];
1806     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1807     k++;
1808   }
1809 
1810   /* receives and sends of coj are complete */
1811   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1812   for (i=0; i<merge->nrecv; i++) {
1813     PetscMPIInt icompleted;
1814     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1815   }
1816   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1817   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1818 
1819   /* send and recv coi */
1820   /*-------------------*/
1821   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1822   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1823   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1824   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1825   for (proc=0,k=0; proc<size; proc++) {
1826     if (!len_s[proc]) continue;
1827     /* form outgoing message for i-structure:
1828          buf_si[0]:                 nrows to be sent
1829                [1:nrows]:           row index (global)
1830                [nrows+1:2*nrows+1]: i-structure index
1831     */
1832     /*-------------------------------------------*/
1833     nrows       = len_si[proc]/2 - 1;
1834     buf_si_i    = buf_si + nrows+1;
1835     buf_si[0]   = nrows;
1836     buf_si_i[0] = 0;
1837     nrows       = 0;
1838     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1839       nzi               = coi[i+1] - coi[i];
1840       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1841       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1842       nrows++;
1843     }
1844     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1845     k++;
1846     buf_si += len_si[proc];
1847   }
1848   i = merge->nrecv;
1849   while (i--) {
1850     PetscMPIInt icompleted;
1851     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1852   }
1853   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1854   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1855   ierr = PetscFree(len_si);CHKERRQ(ierr);
1856   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1857   ierr = PetscFree(swaits);CHKERRQ(ierr);
1858   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1859   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1860 
1861   /* compute the local portion of C (mpi mat) */
1862   /*------------------------------------------*/
1863   /* allocate bi array and free space for accumulating nonzero column info */
1864   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1865   bi[0] = 0;
1866 
1867   /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1868   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1869   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1870   current_space = free_space;
1871 
1872   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1873   for (k=0; k<merge->nrecv; k++) {
1874     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1875     nrows       = *buf_ri_k[k];
1876     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1877     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recieved i-structure  */
1878   }
1879 
1880   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1881   rmax = 0;
1882   for (i=0; i<pn; i++) {
1883     /* add pdt[i,:]*AP into lnk */
1884     pnz = pdti[i+1] - pdti[i];
1885     ptJ = pdtj + pdti[i];
1886     for (j=0; j<pnz; j++) {
1887       row  = ptJ[j];  /* row of AP == col of Pt */
1888       anz  = ai[row+1] - ai[row];
1889       Jptr = aj + ai[row];
1890       /* add non-zero cols of AP into the sorted linked list lnk */
1891       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1892     }
1893 
1894     /* add received col data into lnk */
1895     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1896       if (i == *nextrow[k]) { /* i-th row */
1897         nzi  = *(nextci[k]+1) - *nextci[k];
1898         Jptr = buf_rj[k] + *nextci[k];
1899         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1900         nextrow[k]++; nextci[k]++;
1901       }
1902     }
1903     nnz = lnk[0];
1904 
1905     /* if free space is not available, make more free space */
1906     if (current_space->local_remaining<nnz) {
1907       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1908       nspacedouble++;
1909     }
1910     /* copy data into free space, then initialize lnk */
1911     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1912     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1913 
1914     current_space->array           += nnz;
1915     current_space->local_used      += nnz;
1916     current_space->local_remaining -= nnz;
1917 
1918     bi[i+1] = bi[i] + nnz;
1919     if (nnz > rmax) rmax = nnz;
1920   }
1921   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); //mpiexec -n 2 ./ex94 -f0 $D/tiny -f1 $D/tiny
1922 
1923   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1924   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1925   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1926   if (afill_tmp > afill) afill = afill_tmp;
1927   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1928   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1929   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1930 
1931   /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part   */
1932   /*----------------------------------------------------------------------------------*/
1933   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1934 
1935   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1936   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1937   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1938   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1939   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1940   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1941   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1942   for (i=0; i<pn; i++) {
1943     row  = i + rstart;
1944     nnz  = bi[i+1] - bi[i];
1945     Jptr = bj + bi[i];
1946     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1947   }
1948   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1949   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1950   ierr = PetscFree(vals);CHKERRQ(ierr);
1951 
1952   merge->bi        = bi;
1953   merge->bj        = bj;
1954   merge->coi       = coi;
1955   merge->coj       = coj;
1956   merge->buf_ri    = buf_ri;
1957   merge->buf_rj    = buf_rj;
1958   merge->owners_co = owners_co;
1959   merge->destroy   = Cmpi->ops->destroy;
1960   merge->duplicate = Cmpi->ops->duplicate;
1961 
1962   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1963   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1964   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1965 
1966   /* attach the supporting struct to Cmpi for reuse */
1967   c = (Mat_MPIAIJ*)Cmpi->data;
1968 
1969   c->ptap     = ptap;
1970   ptap->api   = NULL;
1971   ptap->apj   = NULL;
1972   ptap->merge = merge;
1973   ptap->rmax  = rmax;
1974   ptap->apa   = NULL;
1975 
1976   *C = Cmpi;
1977 #if defined(PETSC_USE_INFO)
1978   if (bi[pn] != 0) {
1979     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1980     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1981   } else {
1982     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1983   }
1984 #endif
1985   PetscFunctionReturn(0);
1986 }
1987