/* xref: /petsc/src/mat/impls/aij/mpi/mpimatmatmult.c (revision 78d30b94ba0e23c9ea405755060bbf738523281e) */
1 
2 /*
3   Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4           C = A * B
5 */
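/*
  Usage sketch (illustrative only; assumes A and B are assembled MATMPIAIJ matrices
  with compatible parallel layouts):

      Mat C;
      ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);   symbolic + numeric product
      ... update the numerical values of A and/or B (same nonzero pattern) ...
      ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);     numeric product only, reuses C
      ierr = MatDestroy(&C);CHKERRQ(ierr);

  The symbolic algorithm can be selected at run time with
      -matmatmult_via scalable | nonscalable
  (handled in MatMatMult_MPIAIJ_MPIAIJ() below).
*/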
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <../src/mat/impls/aij/mpi/mpiaij.h>
9 #include <petscbt.h>
10 #include <../src/mat/impls/dense/mpi/mpidense.h>
11 #include <petsc/private/vecimpl.h>
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
15 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16 {
17   PetscErrorCode ierr;
18   const char     *algTypes[2] = {"scalable","nonscalable"};
19   PetscInt       alg=1; /* set default algorithm */
20 
21   PetscFunctionBegin;
22   if (scall == MAT_INITIAL_MATRIX) {
23     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
24     ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);CHKERRQ(ierr);
25     ierr = PetscOptionsEnd();CHKERRQ(ierr);
26 
27     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
28     switch (alg) {
29     case 1:
30       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr);
31       break;
32     default:
33       ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
34       break;
35     }
36     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
37   }
38   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
39   ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
40   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
41   PetscFunctionReturn(0);
42 }
43 
44 #undef __FUNCT__
45 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
46 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
47 {
48   PetscErrorCode ierr;
49   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
50   Mat_PtAPMPI    *ptap = a->ptap;
51 
52   PetscFunctionBegin;
53   ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
54   ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
55   ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
56   ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
57   ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);
58   ierr = PetscFree(ptap->api);CHKERRQ(ierr);
59   ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
60   ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
61   ierr = ptap->destroy(A);CHKERRQ(ierr);
62   ierr = PetscFree(ptap);CHKERRQ(ierr);
63   PetscFunctionReturn(0);
64 }
65 
66 #undef __FUNCT__
67 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
68 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
69 {
70   PetscErrorCode ierr;
71   Mat_MPIAIJ     *a    = (Mat_MPIAIJ*)A->data;
72   Mat_PtAPMPI    *ptap = a->ptap;
73 
74   PetscFunctionBegin;
75   ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
76 
77   (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
78   (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
79   PetscFunctionReturn(0);
80 }
81 
82 #undef __FUNCT__
83 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
84 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
85 {
86   PetscErrorCode ierr;
87   Mat_MPIAIJ     *a  =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
88   Mat_SeqAIJ     *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
89   Mat_SeqAIJ     *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
90   PetscScalar    *cda=cd->a,*coa=co->a;
91   Mat_SeqAIJ     *p_loc,*p_oth;
92   PetscScalar    *apa,*ca;
93   PetscInt       cm   =C->rmap->n;
94   Mat_PtAPMPI    *ptap=c->ptap;
95   PetscInt       *api,*apj,*apJ,i,k;
96   PetscInt       cstart=C->cmap->rstart;
97   PetscInt       cdnz,conz,k0,k1;
98   MPI_Comm       comm;
99   PetscMPIInt    size;
100 
101   PetscFunctionBegin;
102   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
103   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
104 
105   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
106   /*-----------------------------------------------------*/
107   /* update numerical values of P_oth and P_loc */
108   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
109   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
110 
111   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
112   /*----------------------------------------------------------*/
113   /* get data from symbolic products */
114   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
115   p_oth = NULL;
116   if (size >1) {
117     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
118   }
119 
120   /* get apa for storing dense row A[i,:]*P */
121   apa = ptap->apa;
122 
123   api = ptap->api;
124   apj = ptap->apj;
125   for (i=0; i<cm; i++) {
126     /* compute apa = A[i,:]*P */
127     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
128 
129     /* set values in C */
130     apJ  = apj + api[i];
131     cdnz = cd->i[i+1] - cd->i[i];
132     conz = co->i[i+1] - co->i[i];
133 
134     /* 1st off-diagonal part of C */
135     ca = coa + co->i[i];
136     k  = 0;
137     for (k0=0; k0<conz; k0++) {
138       if (apJ[k] >= cstart) break;
139       ca[k0]      = apa[apJ[k]];
140       apa[apJ[k]] = 0.0;
141       k++;
142     }
143 
144     /* diagonal part of C */
145     ca = cda + cd->i[i];
146     for (k1=0; k1<cdnz; k1++) {
147       ca[k1]      = apa[apJ[k]];
148       apa[apJ[k]] = 0.0;
149       k++;
150     }
151 
152     /* 2nd off-diagonal part of C */
153     ca = coa + co->i[i];
154     for (; k0<conz; k0++) {
155       ca[k0]      = apa[apJ[k]];
156       apa[apJ[k]] = 0.0;
157       k++;
158     }
159   }
160   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 #undef __FUNCT__
166 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
167 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
168 {
169   PetscErrorCode     ierr;
170   MPI_Comm           comm;
171   PetscMPIInt        size;
172   Mat                Cmpi;
173   Mat_PtAPMPI        *ptap;
174   PetscFreeSpaceList free_space=NULL,current_space=NULL;
175   Mat_MPIAIJ         *a        =(Mat_MPIAIJ*)A->data,*c;
176   Mat_SeqAIJ         *ad       =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
177   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
178   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
179   PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
180   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
181   PetscBT            lnkbt;
182   PetscScalar        *apa;
183   PetscReal          afill;
184   PetscInt           nlnk_max,armax,prmax;
185 
186   PetscFunctionBegin;
187   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
188   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
189 
190   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
191     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
192   }
193 
194   /* create struct Mat_PtAPMPI and attach it to C later */
195   ierr = PetscNew(&ptap);CHKERRQ(ierr);
196 
197   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
198   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
199 
200   /* get P_loc by taking all local rows of P */
201   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
202 
203   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
204   pi_loc = p_loc->i; pj_loc = p_loc->j;
205   if (size > 1) {
206     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
207     pi_oth = p_oth->i; pj_oth = p_oth->j;
208   } else {
209     p_oth = NULL;
210     pi_oth = NULL; pj_oth = NULL;
211   }
212 
213   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
214   /*-------------------------------------------------------------------*/
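  /*
     Block view used throughout this routine (standard MPIAIJ diagonal/off-diagonal split;
     notation only):

         A_loc = [ A_diag | A_off ],   P_loc = locally owned rows of P,
                                       P_oth = remote rows of P referenced by A_off,

     so the locally owned rows of the product are  AP = A_diag*P_loc + A_off*P_oth,
     which the row loop below assembles symbolically.
  */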
215   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
216   ptap->api = api;
217   api[0]    = 0;
218 
219   /* create and initialize a linked list */
220   armax    = ad->rmax+ao->rmax;
221   if (size >1) {
222     prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
223   } else {
224     prmax = p_loc->rmax;
225   }
226   nlnk_max = armax*prmax; /* not good -- needs fix!!! */
227   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
228   ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);
229 
230   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
231   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
232 
233   current_space = free_space;
234 
235   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
236   for (i=0; i<am; i++) {
237     /* diagonal portion of A */
238     nzi = adi[i+1] - adi[i];
239     for (j=0; j<nzi; j++) {
240       row  = *adj++;
241       pnz  = pi_loc[row+1] - pi_loc[row];
242       Jptr = pj_loc + pi_loc[row];
243       /* add non-zero cols of P into the sorted linked list lnk */
244       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
245     }
246     /* off-diagonal portion of A */
247     nzi = aoi[i+1] - aoi[i];
248     for (j=0; j<nzi; j++) {
249       row  = *aoj++;
250       pnz  = pi_oth[row+1] - pi_oth[row];
251       Jptr = pj_oth + pi_oth[row];
252       ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
253     }
254 
255     apnz     = lnk[0];
256     api[i+1] = api[i] + apnz;
257 
258     /* if free space is not available, double the total space in the list */
259     if (current_space->local_remaining<apnz) {
260       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
261       nspacedouble++;
262     }
263 
264     /* Copy data into free space, then initialize lnk */
265     ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
266     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
267 
268     current_space->array           += apnz;
269     current_space->local_used      += apnz;
270     current_space->local_remaining -= apnz;
271   }
272 
273   /* Allocate space for apj, initialize apj, and */
274   /* destroy list of free space and other temporary array(s) */
275   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
276   apj  = ptap->apj;
277   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
278   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
279 
280   /* malloc apa to store dense row A[i,:]*P */
281   ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr);
282 
283   ptap->apa = apa;
284 
285   /* create and assemble symbolic parallel matrix Cmpi */
286   /*----------------------------------------------------*/
287   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
288   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
289   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
290 
291   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
292   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
293   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
294   for (i=0; i<am; i++) {
295     row  = i + rstart;
296     apnz = api[i+1] - api[i];
297     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
298     apj += apnz;
299   }
300   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
301   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
302 
303   ptap->destroy        = Cmpi->ops->destroy;
304   ptap->duplicate      = Cmpi->ops->duplicate;
305   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
306   Cmpi->ops->destroy   = MatDestroy_MPIAIJ_MatMatMult;
307   Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
308 
309   /* attach the supporting struct to Cmpi for reuse */
310   c       = (Mat_MPIAIJ*)Cmpi->data;
311   c->ptap = ptap;
312 
313   *C = Cmpi;
314 
315   /* set MatInfo */
316   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
317   if (afill < 1.0) afill = 1.0;
318   Cmpi->info.mallocs           = nspacedouble;
319   Cmpi->info.fill_ratio_given  = fill;
320   Cmpi->info.fill_ratio_needed = afill;
321 
322 #if defined(PETSC_USE_INFO)
323   if (api[am]) {
324     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
325     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
326   } else {
327     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
328   }
329 #endif
330   PetscFunctionReturn(0);
331 }
332 
333 #undef __FUNCT__
334 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
335 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
336 {
337   PetscErrorCode ierr;
338 
339   PetscFunctionBegin;
340   if (scall == MAT_INITIAL_MATRIX) {
341     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
342     ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
343     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
344   }
345   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
346   ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
347   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 typedef struct {
352   Mat         workB;
353   PetscScalar *rvalues,*svalues;
354   MPI_Request *rwaits,*swaits;
355 } MPIAIJ_MPIDense;
356 
357 #undef __FUNCT__
358 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
359 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
360 {
361   MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
362   PetscErrorCode  ierr;
363 
364   PetscFunctionBegin;
365   ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
366   ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
367   ierr = PetscFree(contents);CHKERRQ(ierr);
368   PetscFunctionReturn(0);
369 }
370 
371 #undef __FUNCT__
372 #define __FUNCT__ "MatMatMultNumeric_MPIDense"
373 /*
374     This is a "dummy function" that handles the case where matrix C was created as a dense matrix
375   directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
376 
377   It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C
378 */
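/*
  Illustrative sketch of the situation handled here (assumes A is MATMPIAIJ, B is MATMPIDENSE,
  and m, n, M, N are the local/global sizes of the product):

      Mat C;
      ierr = MatCreateDense(comm,m,n,M,N,NULL,&C);CHKERRQ(ierr);                user-created dense C
      ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);   no prior MAT_INITIAL_MATRIX call

  On that call this routine attaches the "workB" container to C, resets
  C->ops->matmultnumeric to MatMatMultNumeric_MPIAIJ_MPIDense(), and then invokes it.
*/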
379 PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
380 {
381   PetscErrorCode         ierr;
382   PetscBool              flg;
383   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
384   PetscInt               nz   = aij->B->cmap->n;
385   PetscContainer         container;
386   MPIAIJ_MPIDense        *contents;
387   VecScatter             ctx   = aij->Mvctx;
388   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
389   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
390 
391   PetscFunctionBegin;
392   ierr = PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);CHKERRQ(ierr);
393   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
394 
395   /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
396   ierr = PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);CHKERRQ(ierr);
397   if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
398 
399   C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
400 
401   ierr = PetscNew(&contents);CHKERRQ(ierr);
402   /* Create work matrix used to store off processor rows of B needed for local product */
403   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
404   /* Create work arrays needed */
405   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
406                       B->cmap->N*to->starts[to->n],&contents->svalues,
407                       from->n,&contents->rwaits,
408                       to->n,&contents->swaits);CHKERRQ(ierr);
409 
410   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
411   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
412   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
413   ierr = PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);CHKERRQ(ierr);
414   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
415 
416   ierr = (*C->ops->matmultnumeric)(A,B,C);CHKERRQ(ierr);
417   PetscFunctionReturn(0);
418 }
419 
420 #undef __FUNCT__
421 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
422 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
423 {
424   PetscErrorCode         ierr;
425   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
426   PetscInt               nz   = aij->B->cmap->n;
427   PetscContainer         container;
428   MPIAIJ_MPIDense        *contents;
429   VecScatter             ctx   = aij->Mvctx;
430   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
431   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
432   PetscInt               m     = A->rmap->n,n=B->cmap->n;
433 
434   PetscFunctionBegin;
435   ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr);
436   ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
437   ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr);
438   ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
439   ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr);
440   ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
441   ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
442 
443   (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
444 
445   ierr = PetscNew(&contents);CHKERRQ(ierr);
446   /* Create work matrix used to store off processor rows of B needed for local product */
447   ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr);
448   /* Create work arrays needed */
449   ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
450                       B->cmap->N*to->starts[to->n],&contents->svalues,
451                       from->n,&contents->rwaits,
452                       to->n,&contents->swaits);CHKERRQ(ierr);
453 
454   ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr);
455   ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
456   ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
457   ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
458   ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
459   PetscFunctionReturn(0);
460 }
461 
462 #undef __FUNCT__
463 #define __FUNCT__ "MatMPIDenseScatter"
464 /*
465     Performs an efficient scatter on the rows of B needed by this process; this is
466     a modification of the VecScatterBegin_() routines.
467 */
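/*
    Worked example of the packing/unpacking below (tiny illustrative sizes).  Dense
    arrays are stored column-major, so entry (row r, column k) of B lives at
    b[r + nrowsB*k].  With ncols = 2 and one row r = sindices[sstarts[i]+j] to send,
    that row is packed contiguously into the outgoing buffer:

        svalues[ncols*(sstarts[i]+j) + 0] = b[r + nrowsB*0]    column 0
        svalues[ncols*(sstarts[i]+j) + 1] = b[r + nrowsB*1]    column 1

    and the receiver unpacks symmetrically into workB: w[rindices[...] + nrows*k].
*/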
468 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
469 {
470   Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
471   PetscErrorCode         ierr;
472   PetscScalar            *b,*w,*svalues,*rvalues;
473   VecScatter             ctx   = aij->Mvctx;
474   VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
475   VecScatter_MPI_General *to   = (VecScatter_MPI_General*) ctx->todata;
476   PetscInt               i,j,k;
477   PetscInt               *sindices,*sstarts,*rindices,*rstarts;
478   PetscMPIInt            *sprocs,*rprocs,nrecvs;
479   MPI_Request            *swaits,*rwaits;
480   MPI_Comm               comm;
481   PetscMPIInt            tag  = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
482   MPI_Status             status;
483   MPIAIJ_MPIDense        *contents;
484   PetscContainer         container;
485   Mat                    workB;
486 
487   PetscFunctionBegin;
488   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
489   ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
490   if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
491   ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);
492 
493   workB = *outworkB = contents->workB;
494   if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
495   sindices = to->indices;
496   sstarts  = to->starts;
497   sprocs   = to->procs;
498   swaits   = contents->swaits;
499   svalues  = contents->svalues;
500 
501   rindices = from->indices;
502   rstarts  = from->starts;
503   rprocs   = from->procs;
504   rwaits   = contents->rwaits;
505   rvalues  = contents->rvalues;
506 
507   ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr);
508   ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr);
509 
510   for (i=0; i<from->n; i++) {
511     ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
512   }
513 
514   for (i=0; i<to->n; i++) {
515     /* pack a message at a time */
516     for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
517       for (k=0; k<ncols; k++) {
518         svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
519       }
520     }
521     ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
522   }
523 
524   nrecvs = from->n;
525   while (nrecvs) {
526     ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
527     nrecvs--;
528     /* unpack a message at a time */
529     for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
530       for (k=0; k<ncols; k++) {
531         w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
532       }
533     }
534   }
535   if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}
536 
537   ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr);
538   ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr);
539   ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
540   ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
541   PetscFunctionReturn(0);
542 }
543 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
544 
545 #undef __FUNCT__
546 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
547 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
548 {
549   PetscErrorCode ierr;
550   Mat_MPIAIJ     *aij    = (Mat_MPIAIJ*)A->data;
551   Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
552   Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
553   Mat            workB;
554 
555   PetscFunctionBegin;
556   /* diagonal block of A times all local rows of B */
557   ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);
558 
559   /* get off processor parts of B needed to complete the product */
560   ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);
561 
562   /* off-diagonal block of A times nonlocal rows of B */
563   ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
564   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
565   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
566   PetscFunctionReturn(0);
567 }
568 
569 #undef __FUNCT__
570 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
571 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
572 {
573   PetscErrorCode ierr;
574   Mat_MPIAIJ     *a   = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
575   Mat_SeqAIJ     *ad  = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
576   Mat_SeqAIJ     *cd  = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
577   PetscInt       *adi = ad->i,*adj,*aoi=ao->i,*aoj;
578   PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
579   Mat_SeqAIJ     *p_loc,*p_oth;
580   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
581   PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
582   PetscInt       cm          = C->rmap->n,anz,pnz;
583   Mat_PtAPMPI    *ptap       = c->ptap;
584   PetscScalar    *apa_sparse = ptap->apa;
585   PetscInt       *api,*apj,*apJ,i,j,k,row;
586   PetscInt       cstart = C->cmap->rstart;
587   PetscInt       cdnz,conz,k0,k1,nextp;
588   MPI_Comm       comm;
589   PetscMPIInt    size;
590 
591   PetscFunctionBegin;
592   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
593   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
594 
595   /* 1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
596   /*-----------------------------------------------------*/
597   /* update numerical values of P_oth and P_loc */
598   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
599   ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
600 
601   /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
602   /*----------------------------------------------------------*/
603   /* get data from symbolic products */
604   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
605   pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
606   if (size >1) {
607     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
608     pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
609   } else {
610     p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
611   }
612 
613   api = ptap->api;
614   apj = ptap->apj;
615   for (i=0; i<cm; i++) {
616     apJ = apj + api[i];
617 
618     /* diagonal portion of A */
619     anz = adi[i+1] - adi[i];
620     adj = ad->j + adi[i];
621     ada = ad->a + adi[i];
622     for (j=0; j<anz; j++) {
623       row = adj[j];
624       pnz = pi_loc[row+1] - pi_loc[row];
625       pj  = pj_loc + pi_loc[row];
626       pa  = pa_loc + pi_loc[row];
627       /* perform sparse axpy */
628       valtmp = ada[j];
629       nextp  = 0;
630       for (k=0; nextp<pnz; k++) {
631         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
632           apa_sparse[k] += valtmp*pa[nextp++];
633         }
634       }
635       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
636     }
637 
638     /* off-diagonal portion of A */
639     anz = aoi[i+1] - aoi[i];
640     aoj = ao->j + aoi[i];
641     aoa = ao->a + aoi[i];
642     for (j=0; j<anz; j++) {
643       row = aoj[j];
644       pnz = pi_oth[row+1] - pi_oth[row];
645       pj  = pj_oth + pi_oth[row];
646       pa  = pa_oth + pi_oth[row];
647       /* perform sparse axpy */
648       valtmp = aoa[j];
649       nextp  = 0;
650       for (k=0; nextp<pnz; k++) {
651         if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
652           apa_sparse[k] += valtmp*pa[nextp++];
653         }
654       }
655       ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
656     }
657 
658     /* set values in C */
659     cdnz = cd->i[i+1] - cd->i[i];
660     conz = co->i[i+1] - co->i[i];
661 
662     /* 1st off-diagonal part of C */
663     ca = coa + co->i[i];
664     k  = 0;
665     for (k0=0; k0<conz; k0++) {
666       if (apJ[k] >= cstart) break;
667       ca[k0]        = apa_sparse[k];
668       apa_sparse[k] = 0.0;
669       k++;
670     }
671 
672     /* diagonal part of C */
673     ca = cda + cd->i[i];
674     for (k1=0; k1<cdnz; k1++) {
675       ca[k1]        = apa_sparse[k];
676       apa_sparse[k] = 0.0;
677       k++;
678     }
679 
680     /* 2nd off-diagonal part of C */
681     ca = coa + co->i[i];
682     for (; k0<conz; k0++) {
683       ca[k0]        = apa_sparse[k];
684       apa_sparse[k] = 0.0;
685       k++;
686     }
687   }
688   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
689   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
690   PetscFunctionReturn(0);
691 }
692 
693 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable PetscLLCondensed routines to avoid the O(N) memory requirement, N being the global number of columns of B */
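/*
   Memory sketch of the difference between the two symbolic routines (both allocations
   appear verbatim in this file):

       nonscalable:  PetscCalloc1(pN,&apa)         dense work row of length P->cmap->N (global)
       scalable:     PetscCalloc1(apnz_max,&apa)   only as long as the densest row of AP

   so the workspace of the scalable variant does not grow with the global column count.
*/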
694 #undef __FUNCT__
695 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
696 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
697 {
698   PetscErrorCode     ierr;
699   MPI_Comm           comm;
700   PetscMPIInt        size;
701   Mat                Cmpi;
702   Mat_PtAPMPI        *ptap;
703   PetscFreeSpaceList free_space = NULL,current_space=NULL;
704   Mat_MPIAIJ         *a         = (Mat_MPIAIJ*)A->data,*c;
705   Mat_SeqAIJ         *ad        = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
706   PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
707   PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
708   PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
709   PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
710   PetscInt           nlnk_max,armax,prmax;
711   PetscReal          afill;
712   PetscScalar        *apa;
713 
714   PetscFunctionBegin;
715   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
716   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
717 
718   /* create struct Mat_PtAPMPI and attach it to C later */
719   ierr = PetscNew(&ptap);CHKERRQ(ierr);
720 
721   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
722   ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
723 
724   /* get P_loc by taking all local rows of P */
725   ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
726 
727   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
728   pi_loc = p_loc->i; pj_loc = p_loc->j;
729   if (size > 1) {
730     p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
731     pi_oth = p_oth->i; pj_oth = p_oth->j;
732   } else {
733     p_oth  = NULL;
734     pi_oth = NULL; pj_oth = NULL;
735   }
736 
737   /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
738   /*-------------------------------------------------------------------*/
739   ierr      = PetscMalloc1(am+2,&api);CHKERRQ(ierr);
740   ptap->api = api;
741   api[0]    = 0;
742 
743   /* create and initialize a linked list */
744   armax = ad->rmax+ao->rmax;
745   if (size >1) {
746     prmax = PetscMax(p_loc->rmax,p_oth->rmax);
747   } else {
748     prmax = p_loc->rmax;
749   }
750   nlnk_max = armax*prmax;
751   if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
752   ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);
753 
754   /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
755   ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
756 
757   current_space = free_space;
758 
759   ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
760   for (i=0; i<am; i++) {
761     /* diagonal portion of A */
762     nzi = adi[i+1] - adi[i];
763     for (j=0; j<nzi; j++) {
764       row  = *adj++;
765       pnz  = pi_loc[row+1] - pi_loc[row];
766       Jptr = pj_loc + pi_loc[row];
767       /* add non-zero cols of P into the sorted linked list lnk */
768       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
769     }
770     /* off-diagonal portion of A */
771     nzi = aoi[i+1] - aoi[i];
772     for (j=0; j<nzi; j++) {
773       row  = *aoj++;
774       pnz  = pi_oth[row+1] - pi_oth[row];
775       Jptr = pj_oth + pi_oth[row];
776       ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
777     }
778 
779     apnz     = *lnk;
780     api[i+1] = api[i] + apnz;
781     if (apnz > apnz_max) apnz_max = apnz;
782 
783     /* if free space is not available, double the total space in the list */
784     if (current_space->local_remaining<apnz) {
785       ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
786       nspacedouble++;
787     }
788 
789     /* Copy data into free space, then initialize lnk */
790     ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
791     ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
792 
793     current_space->array           += apnz;
794     current_space->local_used      += apnz;
795     current_space->local_remaining -= apnz;
796   }
797 
798   /* Allocate space for apj, initialize apj, and */
799   /* destroy list of free space and other temporary array(s) */
800   ierr = PetscMalloc1(api[am]+1,&ptap->apj);CHKERRQ(ierr);
801   apj  = ptap->apj;
802   ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
803   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
804 
805   /* create and assemble symbolic parallel matrix Cmpi */
806   /*----------------------------------------------------*/
807   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
808   ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
809   ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr);
810   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
811   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
812   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
813 
814   /* malloc apa for assembly Cmpi */
815   ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr);
816 
817   ptap->apa = apa;
818   for (i=0; i<am; i++) {
819     row  = i + rstart;
820     apnz = api[i+1] - api[i];
821     ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
822     apj += apnz;
823   }
824   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
825   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
826 
827   ptap->destroy             = Cmpi->ops->destroy;
828   ptap->duplicate           = Cmpi->ops->duplicate;
829   Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
830   Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
831   Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;
832 
833   /* attach the supporting struct to Cmpi for reuse */
834   c       = (Mat_MPIAIJ*)Cmpi->data;
835   c->ptap = ptap;
836 
837   *C = Cmpi;
838 
839   /* set MatInfo */
840   afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
841   if (afill < 1.0) afill = 1.0;
842   Cmpi->info.mallocs           = nspacedouble;
843   Cmpi->info.fill_ratio_given  = fill;
844   Cmpi->info.fill_ratio_needed = afill;
845 
846 #if defined(PETSC_USE_INFO)
847   if (api[am]) {
848     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
849     ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
850   } else {
851     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
852   }
853 #endif
854   PetscFunctionReturn(0);
855 }
856 
857 /*-------------------------------------------------------------------------*/
858 #undef __FUNCT__
859 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
860 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
861 {
862   PetscErrorCode ierr;
863   const char     *algTypes[3] = {"scalable","nonscalable","matmatmult"};
864   PetscInt       alg=0; /* set default algorithm */
865 
866   PetscFunctionBegin;
867   if (scall == MAT_INITIAL_MATRIX) {
868     ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
869     ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr);
870     ierr = PetscOptionsEnd();CHKERRQ(ierr);
871 
872     ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
873     switch (alg) {
874     case 1:
875       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr);
876       break;
877     case 2:
878     {
879       Mat         Pt;
880       Mat_PtAPMPI *ptap;
881       Mat_MPIAIJ  *c;
882       ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr);
883       ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr);
884       c        = (Mat_MPIAIJ*)(*C)->data;
885       ptap     = c->ptap;
886       ptap->Pt = Pt;
887       (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
888       PetscFunctionReturn(0);
889     }
890       break;
891     default:
892       ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
893       break;
894     }
895     ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr);
896   }
897   ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
898   ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
899   ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr);
900   PetscFunctionReturn(0);
901 }
902 
903 /* This routine only works when scall=MAT_REUSE_MATRIX! */
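/*
   This numeric routine is installed by MatTransposeMatMult_MPIAIJ_MPIAIJ() when the
   "matmatmult" algorithm is selected, e.g. (illustrative option usage):

       -mattransposematmult_via matmatmult

   The symbolic phase already formed Pt = P^T and stored it in ptap->Pt, so the numeric
   phase below only refreshes Pt and calls MatMatMultNumeric(Pt,A,C).
*/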
904 #undef __FUNCT__
905 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult"
906 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
907 {
908   PetscErrorCode ierr;
909   Mat_MPIAIJ     *c=(Mat_MPIAIJ*)C->data;
910   Mat_PtAPMPI    *ptap= c->ptap;
911   Mat            Pt=ptap->Pt;
912 
913   PetscFunctionBegin;
914   ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr);
915   ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr);
916   PetscFunctionReturn(0);
917 }
918 
919 /* Non-scalable version, uses a dense axpy */
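/*
   Sketch of the accumulation performed below (notation only): for each locally owned row i of A,

       C += P_loc[i,:]^T * A[i,:]

   where A[i,:] is first scattered into the dense work array aval of length A->cmap->N,
   which is why this version is labelled non-scalable.
*/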
920 #undef __FUNCT__
921 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable"
922 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
923 {
924   PetscErrorCode      ierr;
925   Mat_Merge_SeqsToMPI *merge;
926   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
927   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
928   Mat_PtAPMPI         *ptap;
929   PetscInt            *adj,*aJ;
930   PetscInt            i,j,k,anz,pnz,row,*cj;
931   MatScalar           *ada,*aval,*ca,valtmp;
932   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
933   MPI_Comm            comm;
934   PetscMPIInt         size,rank,taga,*len_s;
935   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
936   PetscInt            **buf_ri,**buf_rj;
937   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
938   MPI_Request         *s_waits,*r_waits;
939   MPI_Status          *status;
940   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
941   PetscInt            *ai,*aj,*coi,*coj;
942   PetscInt            *poJ,*pdJ;
943   Mat                 A_loc;
944   Mat_SeqAIJ          *a_loc;
945 
946   PetscFunctionBegin;
947   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
948   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
949   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
950 
951   ptap  = c->ptap;
952   merge = ptap->merge;
953 
954   /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
955   /*--------------------------------------------------------------*/
956   /* get data from symbolic products */
957   coi  = merge->coi; coj = merge->coj;
958   ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
959 
960   bi     = merge->bi; bj = merge->bj;
961   owners = merge->rowmap->range;
962   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
963 
964   /* get A_loc by taking all local rows of A */
965   A_loc = ptap->A_loc;
966   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
967   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
968   ai    = a_loc->i;
969   aj    = a_loc->j;
970 
971   ierr = PetscCalloc1(A->cmap->N,&aval);CHKERRQ(ierr); /* non-scalable!!! */
972 
973   for (i=0; i<am; i++) {
974     /* 2-a) put A[i,:] to dense array aval */
975     anz = ai[i+1] - ai[i];
976     adj = aj + ai[i];
977     ada = a_loc->a + ai[i];
978     for (j=0; j<anz; j++) {
979       aval[adj[j]] = ada[j];
980     }
981 
982     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
983     /*--------------------------------------------------------------*/
984     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
985     pnz = po->i[i+1] - po->i[i];
986     poJ = po->j + po->i[i];
987     pA  = po->a + po->i[i];
988     for (j=0; j<pnz; j++) {
989       row = poJ[j];
990       cnz = coi[row+1] - coi[row];
991       cj  = coj + coi[row];
992       ca  = coa + coi[row];
993       /* perform dense axpy */
994       valtmp = pA[j];
995       for (k=0; k<cnz; k++) {
996         ca[k] += valtmp*aval[cj[k]];
997       }
998       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
999     }
1000 
1001     /* put the value into Cd (diagonal part) */
1002     pnz = pd->i[i+1] - pd->i[i];
1003     pdJ = pd->j + pd->i[i];
1004     pA  = pd->a + pd->i[i];
1005     for (j=0; j<pnz; j++) {
1006       row = pdJ[j];
1007       cnz = bi[row+1] - bi[row];
1008       cj  = bj + bi[row];
1009       ca  = ba + bi[row];
1010       /* perform dense axpy */
1011       valtmp = pA[j];
1012       for (k=0; k<cnz; k++) {
1013         ca[k] += valtmp*aval[cj[k]];
1014       }
1015       ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1016     }
1017 
1018     /* zero out the dense copy of A[i,:] so aval can be reused for the next row */
1019     aJ = aj + ai[i];
1020     for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1021   }
1022 
1023   /* 3) send and recv matrix values coa */
1024   /*------------------------------------*/
1025   buf_ri = merge->buf_ri;
1026   buf_rj = merge->buf_rj;
1027   len_s  = merge->len_s;
1028   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1029   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1030 
1031   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1032   for (proc=0,k=0; proc<size; proc++) {
1033     if (!len_s[proc]) continue;
1034     i    = merge->owners_co[proc];
1035     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1036     k++;
1037   }
1038   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1039   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1040 
1041   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1042   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1043   ierr = PetscFree(coa);CHKERRQ(ierr);
1044 
1045   /* 4) insert local Cseq and received values into Cmpi */
1046   /*----------------------------------------------------*/
1047   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1048   for (k=0; k<merge->nrecv; k++) {
1049     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1050     nrows       = *(buf_ri_k[k]);
1051     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1052     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure  */
1053   }
1054 
1055   for (i=0; i<cm; i++) {
1056     row  = owners[rank] + i; /* global row index of C_seq */
1057     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1058     ba_i = ba + bi[i];
1059     bnz  = bi[i+1] - bi[i];
1060     /* add received vals into ba */
1061     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1062       /* i-th row */
1063       if (i == *nextrow[k]) {
1064         cnz    = *(nextci[k]+1) - *nextci[k];
1065         cj     = buf_rj[k] + *(nextci[k]);
1066         ca     = abuf_r[k] + *(nextci[k]);
1067         nextcj = 0;
1068         for (j=0; nextcj<cnz; j++) {
1069           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1070             ba_i[j] += ca[nextcj++];
1071           }
1072         }
1073         nextrow[k]++; nextci[k]++;
1074         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1075       }
1076     }
1077     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1078   }
1079   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1080   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1081 
1082   ierr = PetscFree(ba);CHKERRQ(ierr);
1083   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1084   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1085   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1086   ierr = PetscFree(aval);CHKERRQ(ierr);
1087   PetscFunctionReturn(0);
1088 }
1089 
1090 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1091 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1092 #undef __FUNCT__
1093 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable"
1094 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1095 {
1096   PetscErrorCode      ierr;
1097   Mat                 Cmpi,A_loc,POt,PDt;
1098   Mat_PtAPMPI         *ptap;
1099   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1100   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1101   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1102   PetscInt            nnz;
1103   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1104   PetscInt            am=A->rmap->n,pn=P->cmap->n;
1105   PetscBT             lnkbt;
1106   MPI_Comm            comm;
1107   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1108   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1109   PetscInt            len,proc,*dnz,*onz,*owners;
1110   PetscInt            nzi,*bi,*bj;
1111   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1112   MPI_Request         *swaits,*rwaits;
1113   MPI_Status          *sstatus,rstatus;
1114   Mat_Merge_SeqsToMPI *merge;
1115   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1116   PetscReal           afill  =1.0,afill_tmp;
1117   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1118   PetscScalar         *vals;
1119   Mat_SeqAIJ          *a_loc, *pdt,*pot;
1120 
1121   PetscFunctionBegin;
1122   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1123   /* check if matrix local sizes are compatible */
1124   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1125     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1126   }
1127 
1128   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1129   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1130 
1131   /* create struct Mat_PtAPMPI and attach it to C later */
1132   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1133 
1134   /* get A_loc by taking all local rows of A */
1135   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1136 
1137   ptap->A_loc = A_loc;
1138 
1139   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1140   ai    = a_loc->i;
1141   aj    = a_loc->j;
1142 
1143   /* determine symbolic Co=(p->B)^T*A - send to others */
1144   /*----------------------------------------------------*/
1145   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1146   pdt  = (Mat_SeqAIJ*)PDt->data;
1147   pdti = pdt->i; pdtj = pdt->j;
1148 
1149   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1150   pot  = (Mat_SeqAIJ*)POt->data;
1151   poti = pot->i; potj = pot->j;
1152 
1153   /* then, compute symbolic Co = (p->B)^T*A */
1154   pon    = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1155   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1156   coi[0] = 0;
1157 
1158   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1159   nnz           = fill*(poti[pon] + ai[am]);
1160   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1161   current_space = free_space;
1162 
1163   /* create and initialize a linked list */
1164   ierr = PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);CHKERRQ(ierr);
1165 
1166   for (i=0; i<pon; i++) {
1167     pnz = poti[i+1] - poti[i];
1168     ptJ = potj + poti[i];
1169     for (j=0; j<pnz; j++) {
1170       row  = ptJ[j]; /* row of A_loc == col of POt */
1171       anz  = ai[row+1] - ai[row];
1172       Jptr = aj + ai[row];
1173       /* add non-zero cols of AP into the sorted linked list lnk */
1174       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1175     }
1176     nnz = lnk[0];
1177 
1178     /* If free space is not available, double the total space in the list */
1179     if (current_space->local_remaining<nnz) {
1180       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1181       nspacedouble++;
1182     }
1183 
1184     /* Copy data into free space, and zero out dense rows */
1185     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1186 
1187     current_space->array           += nnz;
1188     current_space->local_used      += nnz;
1189     current_space->local_remaining -= nnz;
1190 
1191     coi[i+1] = coi[i] + nnz;
1192   }
1193 
1194   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1195   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1196 
1197   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1198   if (afill_tmp > afill) afill = afill_tmp;
1199 
1200   /* send j-array (coj) of Co to other processors */
1201   /*----------------------------------------------*/
1202   /* determine row ownership */
1203   ierr = PetscNew(&merge);CHKERRQ(ierr);
1204   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1205 
1206   merge->rowmap->n  = pn;
1207   merge->rowmap->bs = 1;
1208 
1209   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1210   owners = merge->rowmap->range;
1211 
1212   /* determine the number of messages to send, their lengths */
1213   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1214   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1215 
1216   len_s        = merge->len_s;
1217   merge->nsend = 0;
1218 
1219   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1220   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1221 
1222   proc = 0;
1223   for (i=0; i<pon; i++) {
1224     while (prmap[i] >= owners[proc+1]) proc++;
1225     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1226     len_s[proc] += coi[i+1] - coi[i];
1227   }
1228 
1229   len          = 0; /* max length of buf_si[] */
1230   owners_co[0] = 0;
1231   for (proc=0; proc<size; proc++) {
1232     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1233     if (len_si[proc]) {
1234       merge->nsend++;
1235       len_si[proc] = 2*(len_si[proc] + 1);
1236       len         += len_si[proc];
1237     }
1238   }
1239 
1240   /* determine the number and length of messages to receive for coi and coj  */
1241   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1242   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1243 
1244   /* post the Irecv and Isend of coj */
1245   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1246   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1247   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1248   for (proc=0, k=0; proc<size; proc++) {
1249     if (!len_s[proc]) continue;
1250     i    = owners_co[proc];
1251     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1252     k++;
1253   }
1254 
1255   /* receives and sends of coj are complete */
1256   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1257   for (i=0; i<merge->nrecv; i++) {
1258     PetscMPIInt icompleted;
1259     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1260   }
1261   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1262   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1263 
1264   /* send and recv coi */
1265   /*-------------------*/
1266   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1267   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1268   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1269   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1270   for (proc=0,k=0; proc<size; proc++) {
1271     if (!len_s[proc]) continue;
1272     /* form outgoing message for i-structure:
1273          buf_si[0]:                 nrows to be sent
1274                [1:nrows]:           row index (global)
1275                [nrows+1:2*nrows+1]: i-structure index
1276     */
1277     /*-------------------------------------------*/
1278     nrows       = len_si[proc]/2 - 1;
1279     buf_si_i    = buf_si + nrows+1;
1280     buf_si[0]   = nrows;
1281     buf_si_i[0] = 0;
1282     nrows       = 0;
1283     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1284       nzi               = coi[i+1] - coi[i];
1285       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1286       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1287       nrows++;
1288     }
1289     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1290     k++;
1291     buf_si += len_si[proc];
1292   }
1293   i = merge->nrecv;
1294   while (i--) {
1295     PetscMPIInt icompleted;
1296     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1297   }
1298   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1299   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1300   ierr = PetscFree(len_si);CHKERRQ(ierr);
1301   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1302   ierr = PetscFree(swaits);CHKERRQ(ierr);
1303   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1304   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1305 
1306   /* compute the local portion of C (mpi mat) */
1307   /*------------------------------------------*/
1308   /* allocate bi array and free space for accumulating nonzero column info */
1309   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1310   bi[0] = 0;
1311 
1312   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1313   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1314   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1315   current_space = free_space;
1316 
1317   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1318   for (k=0; k<merge->nrecv; k++) {
1319     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1320     nrows       = *buf_ri_k[k];
1321     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1322     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure  */
1323   }
1324 
1325   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1326   rmax = 0;
1327   for (i=0; i<pn; i++) {
1328     /* add pdt[i,:]*AP into lnk */
1329     pnz = pdti[i+1] - pdti[i];
1330     ptJ = pdtj + pdti[i];
1331     for (j=0; j<pnz; j++) {
1332       row  = ptJ[j];  /* row of AP == col of Pt */
1333       anz  = ai[row+1] - ai[row];
1334       Jptr = aj + ai[row];
1335       /* add non-zero cols of AP into the sorted linked list lnk */
1336       ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1337     }
1338 
1339     /* add received col data into lnk */
1340     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1341       if (i == *nextrow[k]) { /* i-th row */
1342         nzi  = *(nextci[k]+1) - *nextci[k];
1343         Jptr = buf_rj[k] + *nextci[k];
1344         ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
1345         nextrow[k]++; nextci[k]++;
1346       }
1347     }
1348     nnz = lnk[0];
1349 
1350     /* if free space is not available, make more free space */
1351     if (current_space->local_remaining<nnz) {
1352       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1353       nspacedouble++;
1354     }
1355     /* copy data into free space, then initialize lnk */
1356     ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
1357     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1358 
1359     current_space->array           += nnz;
1360     current_space->local_used      += nnz;
1361     current_space->local_remaining -= nnz;
1362 
1363     bi[i+1] = bi[i] + nnz;
1364     if (nnz > rmax) rmax = nnz;
1365   }
1366   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1367 
1368   ierr = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1369   ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1370 
1371   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1372   if (afill_tmp > afill) afill = afill_tmp;
1373   ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
1374   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1375   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1376 
1377   /* create symbolic parallel matrix Cmpi (TODO: why can it not be assembled in the numeric part?) */
1378   /*----------------------------------------------------------------------------------*/
1379   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
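       /* vals is a zeroed buffer long enough for the densest row (rmax); inserting it row
          by row below only fixes the nonzero pattern of Cmpi, the actual entries are
          computed later by the numeric routine */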
1380 
1381   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1382   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1383   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1384   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1385   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1386   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1387   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1388   for (i=0; i<pn; i++) {
1389     row  = i + rstart;
1390     nnz  = bi[i+1] - bi[i];
1391     Jptr = bj + bi[i];
1392     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1393   }
1394   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1395   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1396   ierr = PetscFree(vals);CHKERRQ(ierr);
1397 
1398   merge->bi        = bi;
1399   merge->bj        = bj;
1400   merge->coi       = coi;
1401   merge->coj       = coj;
1402   merge->buf_ri    = buf_ri;
1403   merge->buf_rj    = buf_rj;
1404   merge->owners_co = owners_co;
1405   merge->destroy   = Cmpi->ops->destroy;
1406   merge->duplicate = Cmpi->ops->duplicate;
1407 
1408   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1409   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1410   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1411 
1412   /* attach the supporting struct to Cmpi for reuse */
1413   c           = (Mat_MPIAIJ*)Cmpi->data;
1414   c->ptap     = ptap;
1415   ptap->api   = NULL;
1416   ptap->apj   = NULL;
1417   ptap->merge = merge;
1418   ptap->rmax  = rmax;
1419 
1420   *C = Cmpi;
1421 #if defined(PETSC_USE_INFO)
1422   if (bi[pn] != 0) {
1423     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1424     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1425   } else {
1426     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1427   }
1428 #endif
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 #undef __FUNCT__
1433 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
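     /* Numeric phase of C = P^T*A (a sketch of the steps carried out below): compute the
        local product P_loc^T*A_loc, send the off-process contributions (coa) to their
        owning processes, then merge local and received values into the preallocated C */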
1434 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1435 {
1436   PetscErrorCode      ierr;
1437   Mat_Merge_SeqsToMPI *merge;
1438   Mat_MPIAIJ          *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1439   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1440   Mat_PtAPMPI         *ptap;
1441   PetscInt            *adj;
1442   PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
1443   MatScalar           *ada,*ca,valtmp;
1444   PetscInt            am  =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1445   MPI_Comm            comm;
1446   PetscMPIInt         size,rank,taga,*len_s;
1447   PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1448   PetscInt            **buf_ri,**buf_rj;
1449   PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local array of C(mpi mat) */
1450   MPI_Request         *s_waits,*r_waits;
1451   MPI_Status          *status;
1452   MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
1453   PetscInt            *ai,*aj,*coi,*coj;
1454   PetscInt            *poJ,*pdJ;
1455   Mat                 A_loc;
1456   Mat_SeqAIJ          *a_loc;
1457 
1458   PetscFunctionBegin;
1459   ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
1460   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1461   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1462 
1463   ptap  = c->ptap;
1464   merge = ptap->merge;
1465 
1466   /* 2) compute numeric C_seq = P_loc^T*A_loc */
1467   /*------------------------------------------*/
1468   /* get data from symbolic products */
1469   coi    = merge->coi; coj = merge->coj;
1470   ierr   = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);
1471   bi     = merge->bi; bj = merge->bj;
1472   owners = merge->rowmap->range;
1473   ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr);
1474 
1475   /* get A_loc by taking all local rows of A */
1476   A_loc = ptap->A_loc;
1477   ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
1478   a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1479   ai    = a_loc->i;
1480   aj    = a_loc->j;
1481 
1482   for (i=0; i<am; i++) {
1483     anz = ai[i+1] - ai[i];
1484     adj = aj + ai[i];
1485     ada = a_loc->a + ai[i];
1486 
1487     /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1488     /*-------------------------------------------------------------*/
1489     /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1490     pnz = po->i[i+1] - po->i[i];
1491     poJ = po->j + po->i[i];
1492     pA  = po->a + po->i[i];
1493     for (j=0; j<pnz; j++) {
1494       row = poJ[j];
1495       cj  = coj + coi[row];
1496       ca  = coa + coi[row];
1497       /* perform sparse axpy */
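           /* cj[] was built in the symbolic phase as a sorted superset of adj[]; both are
              sorted, so a single pass suffices: advance k over cj[] and nexta over adj[],
              accumulating ca[k] += valtmp*ada[nexta] whenever the column indices agree */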
1498       nexta  = 0;
1499       valtmp = pA[j];
1500       for (k=0; nexta<anz; k++) {
1501         if (cj[k] == adj[nexta]) {
1502           ca[k] += valtmp*ada[nexta];
1503           nexta++;
1504         }
1505       }
1506       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1507     }
1508 
1509     /* put the value into Cd (diagonal part) */
1510     pnz = pd->i[i+1] - pd->i[i];
1511     pdJ = pd->j + pd->i[i];
1512     pA  = pd->a + pd->i[i];
1513     for (j=0; j<pnz; j++) {
1514       row = pdJ[j];
1515       cj  = bj + bi[row];
1516       ca  = ba + bi[row];
1517       /* perform sparse axpy */
1518       nexta  = 0;
1519       valtmp = pA[j];
1520       for (k=0; nexta<anz; k++) {
1521         if (cj[k] == adj[nexta]) {
1522           ca[k] += valtmp*ada[nexta];
1523           nexta++;
1524         }
1525       }
1526       ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
1527     }
1528   }
1529 
1530   /* 3) send and recv matrix values coa */
1531   /*------------------------------------*/
1532   buf_ri = merge->buf_ri;
1533   buf_rj = merge->buf_rj;
1534   len_s  = merge->len_s;
1535   ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
1536   ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1537 
1538   ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
1539   for (proc=0,k=0; proc<size; proc++) {
1540     if (!len_s[proc]) continue;
1541     i    = merge->owners_co[proc];
1542     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1543     k++;
1544   }
1545   if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
1546   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}
1547 
1548   ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
1549   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1550   ierr = PetscFree(coa);CHKERRQ(ierr);
1551 
1552   /* 4) insert local Cseq and received values into Cmpi */
1553   /*----------------------------------------------------*/
1554   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1555   for (k=0; k<merge->nrecv; k++) {
1556     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1557     nrows       = *(buf_ri_k[k]);
1558     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
1559     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1560   }
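       /* Merge (sketch): for each local row i of C, fold any matching received values into
          ba_i first; bj_i is a sorted superset of every received cj for this row, so a
          one-pass merge works, and a single MatSetValues() call then inserts the full row */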
1561 
1562   for (i=0; i<cm; i++) {
1563     row  = owners[rank] + i; /* global row index of C */
1564     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
1565     ba_i = ba + bi[i];
1566     bnz  = bi[i+1] - bi[i];
1567     /* add received vals into ba */
1568     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1569       /* i-th row */
1570       if (i == *nextrow[k]) {
1571         cnz    = *(nextci[k]+1) - *nextci[k];
1572         cj     = buf_rj[k] + *(nextci[k]);
1573         ca     = abuf_r[k] + *(nextci[k]);
1574         nextcj = 0;
1575         for (j=0; nextcj<cnz; j++) {
1576           if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1577             ba_i[j] += ca[nextcj++];
1578           }
1579         }
1580         nextrow[k]++; nextci[k]++;
1581         ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
1582       }
1583     }
1584     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
1585   }
1586   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1587   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1588 
1589   ierr = PetscFree(ba);CHKERRQ(ierr);
1590   ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
1591   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1592   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1593   PetscFunctionReturn(0);
1594 }
1595 
1596 #if 0
1597 #undef __FUNCT__
1598 #define __FUNCT__ "MatRowMergeMax_MPIAIJ"
1599 PetscErrorCode MatRowMergeMax_MPIAIJ(Mat A,const PetscInt nexpect,PetscInt maxkey,PetscInt *rmax)
1600 {
1601   PetscErrorCode  ierr;
1602   Mat_MPIAIJ      *a = (Mat_MPIAIJ*)A->data;
1603   Mat_SeqAIJ      *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1604   PetscInt        am=A->rmap->n,*col,row,j,nz,*garray=a->garray;
1605   PetscTable      ta;
1606 
1607   PetscFunctionBegin;
1608   ierr = PetscTableCreate(nexpect,maxkey,&ta);CHKERRQ(ierr);
1609   /* diagonal part */
1610   for (row=0; row<am; row++) {
1611     nz = ad->i[row+1] - ad->i[row];
1612     for (j=0; j<nz; j++) {
1613       col = j + ad->j + ad->i[row];
1614       ierr = PetscTableAdd(ta,*col+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
1615     }
1616   }
1617 
1618   /* off-diagonal part */
1619   for (row=0; row<am; row++) {
1620     nz = ao->i[row+1] - ao->i[row];
1621     for (j=0; j<nz; j++) {
1622       col = j + ao->j + ao->i[row];
1623       ierr = PetscTableAdd(ta,garray[*col]+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
1624     }
1625   }
1626 
1627   ierr = PetscTableGetCount(ta,rmax);CHKERRQ(ierr);
1628   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1629   ierr = PetscPrintf(PETSC_COMM_SELF,"MatRowMergeMax_MPIAIJ...rmax %D\n",*rmax);CHKERRQ(ierr);
1630   PetscFunctionReturn(0);
1631 }
1632 #endif
1633 
1634 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1635 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1636    it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable() in using PetscLLCondensedCreate_Scalable() */
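     /* A minimal user-level sketch of how these kernels are reached (assuming standard
        PETSc usage; which symbolic/numeric variant is chosen depends on run-time options):
          MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,fill,&C);   symbolic + first numeric
          MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,fill,&C);     numeric only, reuses C
          MatDestroy(&C);
     */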
1637 #undef __FUNCT__
1638 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
1639 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1640 {
1641   PetscErrorCode      ierr;
1642   Mat                 Cmpi,A_loc,POt,PDt;
1643   Mat_PtAPMPI         *ptap;
1644   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1645   Mat_MPIAIJ          *p        =(Mat_MPIAIJ*)P->data,*c;
1646   PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
1647   PetscInt            nnz;
1648   PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1649   PetscInt            am  =A->rmap->n,pn=P->cmap->n;
1650   MPI_Comm            comm;
1651   PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1652   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1653   PetscInt            len,proc,*dnz,*onz,*owners;
1654   PetscInt            nzi,*bi,*bj;
1655   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1656   MPI_Request         *swaits,*rwaits;
1657   MPI_Status          *sstatus,rstatus;
1658   Mat_Merge_SeqsToMPI *merge;
1659   PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1660   PetscReal           afill  =1.0,afill_tmp;
1661   PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax,*col,nz;
1662   PetscScalar         *vals;
1663   Mat_SeqAIJ          *a_loc,*pdt,*pot;
1664   PetscTable          ta;
1665 
1666   PetscFunctionBegin;
1667   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1668   /* check if matrix local sizes are compatible */
1669   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1670     SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1671   }
1672 
1673   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1674   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1675 
1676   /* create struct Mat_PtAPMPI and attach it to C later */
1677   ierr = PetscNew(&ptap);CHKERRQ(ierr);
1678 
1679   /* get A_loc by taking all local rows of A */
1680   ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
1681 
1682   ptap->A_loc = A_loc;
1683   a_loc       = (Mat_SeqAIJ*)(A_loc)->data;
1684   ai          = a_loc->i;
1685   aj          = a_loc->j;
1686 
1687   /* determine symbolic Co=(p->B)^T*A - send to others */
1688   /*----------------------------------------------------*/
1689   ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
1690   pdt  = (Mat_SeqAIJ*)PDt->data;
1691   pdti = pdt->i; pdtj = pdt->j;
1692 
1693   ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
1694   pot  = (Mat_SeqAIJ*)POt->data;
1695   poti = pot->i; potj = pot->j;
1696 
1697   /* then, compute symbolic Co = (p->B)^T*A */
1698   pon    = (p->B)->cmap->n; /* total number of rows of Co to be sent to other processes;
1699                                >= (number of nonzero rows of C_seq) - pn */
1700   ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
1701   coi[0] = 0;
1702 
1703   /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1704   nnz           = fill*(poti[pon] + ai[am]);
1705   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1706   current_space = free_space;
1707 
1708   /* create and initialize a linked list; a PetscTable first counts the distinct nonzero columns of A_loc (Armax), which bounds the size of the scalable condensed list */
1709   ierr = PetscTableCreate(2*a_loc->rmax,aN,&ta);CHKERRQ(ierr);
1710   for (row=0; row<am; row++) {
1711     nz = ai[row+1] - ai[row];
1712     for (j=0; j<nz; j++) {
1713       col = j + aj + ai[row];
1714       ierr = PetscTableAdd(ta,*col+1,1,INSERT_VALUES);CHKERRQ(ierr); /* key must be >0 */
1715     }
1716   }
1717   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1718   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1719 
1720   for (i=0; i<pon; i++) {
1721     pnz = poti[i+1] - poti[i];
1722     ptJ = potj + poti[i];
1723     for (j=0; j<pnz; j++) {
1724       row  = ptJ[j]; /* row of A_loc == col of Pot */
1725       anz  = ai[row+1] - ai[row];
1726       Jptr = aj + ai[row];
1727       /* add non-zero cols of AP into the sorted linked list lnk */
1728       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1729     }
1730     nnz = lnk[0];
1731 
1732     /* If free space is not available, double the total space in the list */
1733     if (current_space->local_remaining<nnz) {
1734       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1735       nspacedouble++;
1736     }
1737 
1738     /* copy data into free space, then initialize lnk */
1739     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1740 
1741     current_space->array           += nnz;
1742     current_space->local_used      += nnz;
1743     current_space->local_remaining -= nnz;
1744 
1745     coi[i+1] = coi[i] + nnz;
1746   }
1747 
1748   ierr = PetscMalloc1(coi[pon]+1,&coj);CHKERRQ(ierr);
1749   ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
1750   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); /* must destroy to get a new one for C */
1751 
1752   afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1753   if (afill_tmp > afill) afill = afill_tmp;
1754 
1755   /* send j-array (coj) of Co to other processors */
1756   /*----------------------------------------------*/
1757   /* determine row ownership */
1758   ierr = PetscNew(&merge);CHKERRQ(ierr);
1759   ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1760 
1761   merge->rowmap->n  = pn;
1762   merge->rowmap->bs = 1;
1763 
1764   ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
1765   owners = merge->rowmap->range;
1766 
1767   /* determine the number of messages to send, their lengths */
1768   ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr);
1769   ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr);
1770 
1771   len_s        = merge->len_s;
1772   merge->nsend = 0;
1773 
1774   ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);
1775   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1776 
1777   proc = 0;
1778   for (i=0; i<pon; i++) {
1779     while (prmap[i] >= owners[proc+1]) proc++;
1780     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
1781     len_s[proc] += coi[i+1] - coi[i];
1782   }
1783 
1784   len          = 0; /* total length of buf_s[], the concatenation of all buf_si[] */
1785   owners_co[0] = 0;
1786   for (proc=0; proc<size; proc++) {
1787     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1788     if (len_si[proc]) {
1789       merge->nsend++;
1790       len_si[proc] = 2*(len_si[proc] + 1);
1791       len         += len_si[proc];
1792     }
1793   }
1794 
1795   /* determine the number and length of messages to receive for coi and coj  */
1796   ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1797   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
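       /* Standard PETSc handshake: each process learns how many messages it will receive
          (merge->nrecv), from which ranks (merge->id_r), and the lengths of both the coj
          messages (merge->len_r) and the i-structure messages (len_ri) */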
1798 
1799   /* post the Irecv and Isend of coj */
1800   ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
1801   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
1802   ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
1803   for (proc=0, k=0; proc<size; proc++) {
1804     if (!len_s[proc]) continue;
1805     i    = owners_co[proc];
1806     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
1807     k++;
1808   }
1809 
1810   /* receives and sends of coj are complete */
1811   ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr);
1812   for (i=0; i<merge->nrecv; i++) {
1813     PetscMPIInt icompleted;
1814     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1815   }
1816   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1817   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1818 
1819   /* add received column indices into table to update Armax */
1820   for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1821     Jptr = buf_rj[k];
1822     for (j=0; j<merge->len_r[k]; j++) {
1823       ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
1824     }
1825   }
1826   ierr = PetscTableGetCount(ta,&Armax);CHKERRQ(ierr);
1827 
1828   /* send and recv coi */
1829   /*-------------------*/
1830   ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
1831   ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
1832   ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
1833   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1834   for (proc=0,k=0; proc<size; proc++) {
1835     if (!len_s[proc]) continue;
1836     /* form outgoing message for i-structure:
1837          buf_si[0]:                 nrows to be sent
1838                [1:nrows]:           row index (local to the destination process)
1839                [nrows+1:2*nrows+1]: i-structure index
1840     */
1841     /*-------------------------------------------*/
1842     nrows       = len_si[proc]/2 - 1;
1843     buf_si_i    = buf_si + nrows+1;
1844     buf_si[0]   = nrows;
1845     buf_si_i[0] = 0;
1846     nrows       = 0;
1847     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1848       nzi               = coi[i+1] - coi[i];
1849       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1850       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1851       nrows++;
1852     }
1853     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
1854     k++;
1855     buf_si += len_si[proc];
1856   }
1857   i = merge->nrecv;
1858   while (i--) {
1859     PetscMPIInt icompleted;
1860     ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
1861   }
1862   ierr = PetscFree(rwaits);CHKERRQ(ierr);
1863   if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
1864   ierr = PetscFree(len_si);CHKERRQ(ierr);
1865   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1866   ierr = PetscFree(swaits);CHKERRQ(ierr);
1867   ierr = PetscFree(sstatus);CHKERRQ(ierr);
1868   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1869 
1870   /* compute the local portion of C (mpi mat) */
1871   /*------------------------------------------*/
1872   /* allocate bi array and free space for accumulating nonzero column info */
1873   ierr  = PetscMalloc1(pn+1,&bi);CHKERRQ(ierr);
1874   bi[0] = 0;
1875 
1876   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1877   nnz           = fill*(pdti[pn] + poti[pon] + ai[am]);
1878   ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
1879   current_space = free_space;
1880 
1881   ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
1882   for (k=0; k<merge->nrecv; k++) {
1883     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1884     nrows       = *buf_ri_k[k];
1885     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1886     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1887   }
1888 
1889   ierr = PetscLLCondensedCreate_Scalable(Armax,&lnk);CHKERRQ(ierr);
1890   ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
1891   rmax = 0;
1892   for (i=0; i<pn; i++) {
1893     /* add pdt[i,:]*AP into lnk */
1894     pnz = pdti[i+1] - pdti[i];
1895     ptJ = pdtj + pdti[i];
1896     for (j=0; j<pnz; j++) {
1897       row  = ptJ[j];  /* row of AP == col of Pt */
1898       anz  = ai[row+1] - ai[row];
1899       Jptr = aj + ai[row];
1900       /* add non-zero cols of AP into the sorted linked list lnk */
1901       ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
1902     }
1903 
1904     /* add received col data into lnk */
1905     for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1906       if (i == *nextrow[k]) { /* i-th row */
1907         nzi  = *(nextci[k]+1) - *nextci[k];
1908         Jptr = buf_rj[k] + *nextci[k];
1909         ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
1910         nextrow[k]++; nextci[k]++;
1911       }
1912     }
1913     nnz = lnk[0];
1914 
1915     /* if free space is not available, make more free space */
1916     if (current_space->local_remaining<nnz) {
1917       ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
1918       nspacedouble++;
1919     }
1920     /* copy data into free space, then initialize lnk */
1921     ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
1922     ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
1923 
1924     current_space->array           += nnz;
1925     current_space->local_used      += nnz;
1926     current_space->local_remaining -= nnz;
1927 
1928     bi[i+1] = bi[i] + nnz;
1929     if (nnz > rmax) rmax = nnz;
1930   }
1931   ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
1932 
1933   ierr      = PetscMalloc1(bi[pn]+1,&bj);CHKERRQ(ierr);
1934   ierr      = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1935   afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1936   if (afill_tmp > afill) afill = afill_tmp;
1937   ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
1938   ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);
1939 
1940   ierr = MatDestroy(&POt);CHKERRQ(ierr);
1941   ierr = MatDestroy(&PDt);CHKERRQ(ierr);
1942 
1943   /* create symbolic parallel matrix Cmpi (TODO: why can it not be assembled in the numeric part?) */
1944   /*----------------------------------------------------------------------------------*/
1945   ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr);
1946 
1947   ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
1948   ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1949   ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr);
1950   ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
1951   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1952   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1953   ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
1954   for (i=0; i<pn; i++) {
1955     row  = i + rstart;
1956     nnz  = bi[i+1] - bi[i];
1957     Jptr = bj + bi[i];
1958     ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
1959   }
1960   ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1961   ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1962   ierr = PetscFree(vals);CHKERRQ(ierr);
1963 
1964   merge->bi        = bi;
1965   merge->bj        = bj;
1966   merge->coi       = coi;
1967   merge->coj       = coj;
1968   merge->buf_ri    = buf_ri;
1969   merge->buf_rj    = buf_rj;
1970   merge->owners_co = owners_co;
1971   merge->destroy   = Cmpi->ops->destroy;
1972   merge->duplicate = Cmpi->ops->duplicate;
1973 
1974   Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1975   Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;
1976   Cmpi->ops->duplicate               = MatDuplicate_MPIAIJ_MatPtAP;
1977 
1978   /* attach the supporting struct to Cmpi for reuse */
1979   c = (Mat_MPIAIJ*)Cmpi->data;
1980 
1981   c->ptap     = ptap;
1982   ptap->api   = NULL;
1983   ptap->apj   = NULL;
1984   ptap->merge = merge;
1985   ptap->rmax  = rmax;
1986   ptap->apa   = NULL;
1987 
1988   *C = Cmpi;
1989 #if defined(PETSC_USE_INFO)
1990   if (bi[pn] != 0) {
1991     ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
1992     ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
1993   } else {
1994     ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
1995   }
1996 #endif
1997   PetscFunctionReturn(0);
1998 }
1999