xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision a1dbd1ada563686aa79d3c8a365b3c1540de08e7)
1 /*
2   Defines projective product routines where A is an AIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatPtAP"
13 /*@
14    MatPtAP - Creates the matrix projection C = P^T * A * P
15 
16    Collective on Mat
17 
18    Input Parameters:
19 +  A - the matrix
20 .  P - the projection matrix
21 .  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
22 -  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))
23 
24    Output Parameter:
25 .  C - the product matrix
26 
27    Notes:
28    Unless scall is MAT_REUSE_MATRIX, C will be created; in either case it must be destroyed by the user with MatDestroy().
29 
30    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
31    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.
32 
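   Example usage (a minimal sketch, assuming A and P are already assembled and have
   compatible dimensions):
.vb
      Mat C;
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
      ...  change the numerical values (not the nonzero pattern) of A  ...
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
      ierr = MatDestroy(C);CHKERRQ(ierr);
.ve
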
33    Level: intermediate
34 
35 .seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
36 @*/
37 PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
38 {
39   PetscErrorCode ierr;
40   PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
41   PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);
42 
43   PetscFunctionBegin;
44   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
45   PetscValidType(A,1);
46   MatPreallocated(A);
47   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
48   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
49   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
50   PetscValidType(P,2);
51   MatPreallocated(P);
52   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
53   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
54   PetscValidPointer(C,5);
55 
56   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
57 
58   if (fill <=0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);
59 
60   /* For now, we do not dispatch based on the type of A and P */
61   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
62   fA = A->ops->ptap;
63   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
64   fP = P->ops->ptap;
65   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
66   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
67 
68   ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
69   ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
75 EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat C);
76 
77 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
78 #undef __FUNCT__
79 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
80 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
81 {
82   PetscErrorCode       ierr;
83   Mat_MatMatMultMPI    *ptap;
84   PetscObjectContainer container;
85 
86   PetscFunctionBegin;
87   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
88   if (container) {
89     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
90     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
91     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
92     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
93     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
94 
95     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
96     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
97   }
98   ierr = PetscFree(ptap);CHKERRQ(ierr);
99 
100   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
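/*
   MatPtAP_MPIAIJ_MPIAIJ - driver for the parallel product C = P^T * A * P.

   For MAT_INITIAL_MATRIX it gathers P_oth (the rows of P that match the off-diagonal
   columns of the local A) and P_loc (the local rows of P), caches both on P inside the
   "MatPtAPMPI" container so that MAT_REUSE_MATRIX calls can refresh them in place, and
   then computes the symbolic product; both paths finish with the numeric product.
*/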
104 #undef __FUNCT__
105 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
106 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
107 {
108   PetscErrorCode       ierr;
109   Mat_MatMatMultMPI    *ptap;
110   PetscObjectContainer container;
111 
112   PetscFunctionBegin;
113   if (scall == MAT_INITIAL_MATRIX){
114     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
115     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
116 
117     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
118     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
119 
120     /* get P_loc by taking all local rows of P */
121     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
122 
123     /* attach the supporting struct to P for reuse */
124     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
125     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
126     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
127     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
128 
129     /* now, compute symbolic local P^T*A*P */
130     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
131     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
132   } else if (scall == MAT_REUSE_MATRIX){
133     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
134     if (container) {
135       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
136     } else {
137       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
138     }
139 
140     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
141     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
142 
143     /* get P_loc by taking all local rows of P */
144     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
145 
146   } else {
147     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
148   }
149   /* now, compute numeric local P^T*A*P */
150   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
151   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
152   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
153 
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
159 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
160 {
161   PetscErrorCode       ierr;
162   Mat                  P_loc,P_oth;
163   Mat_MatMatMultMPI    *ptap;
164   PetscObjectContainer container;
165   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
166   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
167   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
168   Mat_SeqAIJ           *p_loc,*p_oth;
169   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
170   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
171   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
172   PetscInt             aN=A->N,am=A->m,pN=P->N;
173   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
174   MatScalar            *ca;
175   PetscBT              lnkbt;
176   PetscInt             prstart,prend,nprow_loc,nprow_oth;
177   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
178 
179   MPI_Comm             comm=A->comm;
180   Mat                  B_mpi;
181   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
182   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
183   PetscInt             len,proc,*dnz,*onz,*owners;
184   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
185   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
186   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
187   MPI_Status           *status;
188   Mat_Merge_SeqsToMPI  *merge;
189 
190   PetscFunctionBegin;
191   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
192   if (container) {
193     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
194   } else {
195     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
196   }
197   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
198   /*--------------------------------------------------*/
199   /* get data from P_loc and P_oth */
200   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
201   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
202   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
203   prend = prstart + pm;
204 
205   /* get ij structure of P_loc^T */
206   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
207   ptJ=ptj;
208 
209   /* Allocate ci array, arrays for fill computation and */
210   /* free space for accumulating nonzero column info */
211   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
212   ci[0] = 0;
213 
214   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
215   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
216   ptasparserow_loc = ptadenserow_loc + aN;
217   ptadenserow_oth  = ptasparserow_loc + aN;
218   ptasparserow_oth = ptadenserow_oth + aN;
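  /* ptadenserow_loc/ptadenserow_oth flag the columns already present in the current row  */
  /* of Pt*A, while ptasparserow_loc/ptasparserow_oth record those columns so the flags   */
  /* can be reset cheaply after each row.                                                  */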
219 
220   /* create and initialize a linked list */
221   nlnk = pN+1;
222   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
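  /* lnk holds the sorted column indices of the row of C being assembled; lnkbt is a bit  */
  /* table used by PetscLLAdd() to test membership without traversing the list.           */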
223 
224   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
225   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
226   nnz           = adi[am] + aoi[am];
227   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);
228   current_space = free_space;
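  /* For example, with fill=1.0 this reserves roughly nnz(A_loc)*pN/aN entries, i.e. the  */
  /* local nnz of A rescaled from the column dimension of A to the column dimension of P. */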
229 
230   /* determine symbolic info for each row of C: */
231   for (i=0;i<pN;i++) {
232     ptnzi  = pti[i+1] - pti[i];
233     nprow_loc = 0; nprow_oth = 0;
234     /* i-th row of symbolic P_loc^T*A_loc: */
235     for (j=0;j<ptnzi;j++) {
236       arow = *ptJ++;
237       /* diagonal portion of A */
238       anzj = adi[arow+1] - adi[arow];
239       ajj  = adj + adi[arow];
240       for (k=0;k<anzj;k++) {
241         col = ajj[k]+prstart;
242         if (!ptadenserow_loc[col]) {
243           ptadenserow_loc[col]    = -1;
244           ptasparserow_loc[nprow_loc++] = col;
245         }
246       }
247       /* off-diagonal portion of A */
248       anzj = aoi[arow+1] - aoi[arow];
249       ajj  = aoj + aoi[arow];
250       for (k=0;k<anzj;k++) {
251         col = a->garray[ajj[k]];  /* global col */
252         if (col < cstart){
253           col = ajj[k];
254         } else if (col >= cend){
255           col = ajj[k] + pm;
256         } else {
257           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
258         }
259         if (!ptadenserow_oth[col]) {
260           ptadenserow_oth[col]    = -1;
261           ptasparserow_oth[nprow_oth++] = col;
262         }
263       }
264     }
265 
266     /* using symbolic info of local PtA, determine symbolic info for row of C: */
267     cnzi   = 0;
268     /* rows of P_loc */
269     ptaj = ptasparserow_loc;
270     for (j=0; j<nprow_loc; j++) {
271       prow = *ptaj++;
272       prow -= prstart; /* rm */
273       pnzj = pi_loc[prow+1] - pi_loc[prow];
274       pjj  = pj_loc + pi_loc[prow];
275       /* add non-zero cols of P into the sorted linked list lnk */
276       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
277       cnzi += nlnk;
278     }
279     /* rows of P_oth */
280     ptaj = ptasparserow_oth;
281     for (j=0; j<nprow_oth; j++) {
282       prow = *ptaj++;
283       if (prow >= prend) prow -= pm; /* rm */
284       pnzj = pi_oth[prow+1] - pi_oth[prow];
285       pjj  = pj_oth + pi_oth[prow];
286       /* add non-zero cols of P into the sorted linked list lnk */
287       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
288       cnzi += nlnk;
289     }
290 
291     /* If free space is not available, make more free space */
292     /* Double the amount of total space in the list */
293     if (current_space->local_remaining<cnzi) {
294       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
295     }
296 
297     /* Copy data into free space, and zero out denserows */
298     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
299     current_space->array           += cnzi;
300     current_space->local_used      += cnzi;
301     current_space->local_remaining -= cnzi;
302 
303     for (j=0;j<nprow_loc; j++) {
304       ptadenserow_loc[ptasparserow_loc[j]] = 0;
305     }
306     for (j=0;j<nprow_oth; j++) {
307       ptadenserow_oth[ptasparserow_oth[j]] = 0;
308     }
309 
310     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
311     /*        For now, we will recompute what is needed. */
312     ci[i+1] = ci[i] + cnzi;
313   }
314   /* Clean up. */
315   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
316 
317   /* nnz is now stored in ci[pN], column indices are in the list of free space */
318   /* Allocate space for cj, initialize cj, and */
319   /* destroy list of free space and other temporary array(s) */
320   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
321   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
322   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
323   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
324 
325   /* Allocate space for ca */
326   /*
327   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
328   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
329   */
330 
331   /* add C_seq into mpi C              */
332   /*-----------------------------------*/
333   free_space=PETSC_NULL; current_space=PETSC_NULL;
334 
335   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
336   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
337 
338   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
339 
340 
341   /* determine row ownership */
342   /*---------------------------------------------------------*/
343   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
344   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
345   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
346   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
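  /* owners[proc] .. owners[proc+1]-1 are the global rows of C assigned to [proc]; rows   */
  /* of this process's C_seq that fall outside its own range are sent to their owners.    */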
347 
348   /* determine the number of messages to send, their lengths */
349   /*---------------------------------------------------------*/
350   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
351   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
352   len_s  = merge->len_s;
353   len = 0;  /* length of buf_si[] */
354   merge->nsend = 0;
355   for (proc=0; proc<size; proc++){
356     len_si[proc] = 0;
357     if (proc == rank){
358       len_s[proc] = 0;
359     } else {
360       len_si[proc] = owners[proc+1] - owners[proc] + 1;
361       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */
362     }
363     if (len_s[proc]) {
364       merge->nsend++;
365       nrows = 0;
366       for (i=owners[proc]; i<owners[proc+1]; i++){
367         if (ci[i+1] > ci[i]) nrows++;
368       }
369       len_si[proc] = 2*(nrows+1);
370       len += len_si[proc];
371     }
372   }
373 
374   /* determine the number and length of messages to receive for ij-structure */
375   /*-------------------------------------------------------------------------*/
376   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
377   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
378 
379   /* post the Irecv of j-structure */
380   /*-------------------------------*/
381   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
382   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
383 
384   /* post the Isend of j-structure */
385   /*--------------------------------*/
386   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
387   sj_waits = si_waits + merge->nsend;
388 
389   for (proc=0, k=0; proc<size; proc++){
390     if (!len_s[proc]) continue;
391     i = owners[proc];
392     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
393     k++;
394   }
395 
396   /* receives and sends of j-structure are complete */
397   /*------------------------------------------------*/
398   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
399   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
400   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
401 
402   /* send and recv i-structure */
403   /*---------------------------*/
404   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
405   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
406 
407   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
408   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
409   for (proc=0,k=0; proc<size; proc++){
410     if (!len_s[proc]) continue;
411     /* form outgoing message for i-structure:
412          buf_si[0]:                 nrows to be sent
413                [1:nrows]:           row index (global)
414                [nrows+1:2*nrows+1]: i-structure index
415     */
416     /*-------------------------------------------*/
417     nrows = len_si[proc]/2 - 1;
418     buf_si_i    = buf_si + nrows+1;
419     buf_si[0]   = nrows;
420     buf_si_i[0] = 0;
421     nrows = 0;
422     for (i=owners[proc]; i<owners[proc+1]; i++){
423       anzi = ci[i+1] - ci[i];
424       if (anzi) {
425         buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
426         buf_si[nrows+1] = i-owners[proc]; /* local row index */
427         nrows++;
428       }
429     }
430     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
431     k++;
432     buf_si += len_si[proc];
433   }
434 
435   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
436   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
437 
438   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
439   for (i=0; i<merge->nrecv; i++){
440     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
441   }
442 
443   ierr = PetscFree(len_si);CHKERRQ(ierr);
444   ierr = PetscFree(len_ri);CHKERRQ(ierr);
445   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
446   ierr = PetscFree(si_waits);CHKERRQ(ierr);
447   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
448   ierr = PetscFree(buf_s);CHKERRQ(ierr);
449   ierr = PetscFree(status);CHKERRQ(ierr);
450 
451   /* compute a local seq matrix in each processor */
452   /*----------------------------------------------*/
453   /* allocate bi array and free space for accumulating nonzero column info */
454   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
455   bi[0] = 0;
456 
457   /* create and initialize a linked list */
458   nlnk = pN+1;
459   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
460 
461   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
462   len = 0;
463   len  = ci[owners[rank+1]] - ci[owners[rank]];
464   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
465   current_space = free_space;
466 
467   /* determine symbolic info for each local row */
468   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
469   nextrow = buf_ri_k + merge->nrecv;
470   nextci  = nextrow + merge->nrecv;
471   for (k=0; k<merge->nrecv; k++){
472     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
473     nrows = *buf_ri_k[k];
474     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
475     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure  */
476   }
477 
478   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
479   len = 0;
480   for (i=0; i<pn; i++) {
481     bnzi   = 0;
482     /* add local non-zero cols of this proc's C_seq into lnk */
483     arow   = owners[rank] + i;
484     anzi   = ci[arow+1] - ci[arow];
485     cji    = cj + ci[arow];
486     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
487     bnzi += nlnk;
488     /* add received col data into lnk */
489     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
490       if (i == *nextrow[k]) { /* i-th row */
491         anzi = *(nextci[k]+1) - *nextci[k];
492         cji  = buf_rj[k] + *nextci[k];
493         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
494         bnzi += nlnk;
495         nextrow[k]++; nextci[k]++;
496       }
497     }
498     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
499 
500     /* if free space is not available, make more free space */
501     if (current_space->local_remaining<bnzi) {
502       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
503       nspacedouble++;
504     }
505     /* copy data into free space, then initialize lnk */
506     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
507     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
508 
509     current_space->array           += bnzi;
510     current_space->local_used      += bnzi;
511     current_space->local_remaining -= bnzi;
512 
513     bi[i+1] = bi[i] + bnzi;
514   }
515 
516   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
517 
518   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
519   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
520   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
521 
522   /* create symbolic parallel matrix B_mpi */
523   /*---------------------------------------*/
524   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
525   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
526   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
527   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
528 
529   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
530   B_mpi->assembled     = PETSC_FALSE;
531   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
532   merge->bi            = bi;
533   merge->bj            = bj;
534   merge->ci            = ci;
535   merge->cj            = cj;
536   merge->buf_ri        = buf_ri;
537   merge->buf_rj        = buf_rj;
538   merge->C_seq         = PETSC_NULL;
539 
540   /* attach the supporting struct to B_mpi for reuse */
541   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
542   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
543   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
544   *C = B_mpi;
545 
546   PetscFunctionReturn(0);
547 
548 #ifdef TOBEREMOVED
549   PetscErrorCode       ierr;
550   Mat                  C_seq;
551   Mat_MatMatMultMPI    *ptap;
552   PetscObjectContainer container;
553 
554   PetscFunctionBegin;
555 
556   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
557   if (container) {
558     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
559   } else {
560     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
561   }
562   /* compute C_seq = P_loc^T * A_loc * P_seq */
563   ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(A,ptap->B_loc,ptap->B_oth,ptap->brstart,fill,&C_seq);CHKERRQ(ierr);
564 
565   /* add C_seq into mpi C */
566   ierr = MatMerge_SeqsToMPISymbolic(A->comm,C_seq,P->n,P->n,C);CHKERRQ(ierr);
567 
568   PetscFunctionReturn(0);
569 #endif /* TOBEREMOVED */
570 }
571 
572 #undef __FUNCT__
573 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
574 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
575 {
576   PetscErrorCode       ierr;
577   Mat_Merge_SeqsToMPI  *merge;
578   Mat_MatMatMultMPI    *ptap;
579   PetscObjectContainer cont_merge,cont_ptap;
580 
581   PetscInt       flops=0;
582   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
583   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
584   Mat            C_seq;
585   Mat_SeqAIJ     *p_loc,*p_oth;
586   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
587   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
588   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
589   PetscInt       *cjj;
590   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj;
591   MatScalar      *pa_loc,*pA,*pa_oth;
592   PetscInt       am=A->m,cN=C->N;
593 
594   MPI_Comm             comm=C->comm;
595   PetscMPIInt          size,rank,taga,*len_s;
596   PetscInt             *owners;
597   PetscInt             proc;
598   PetscInt             **buf_ri,**buf_rj;
599   PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
600   PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
601   MPI_Request          *s_waits,*r_waits;
602   MPI_Status           *status;
603   MatScalar            **abuf_r,*ba_i;
604   Mat_SeqAIJ           *cseq;
605   PetscInt             *cseqi,*cseqj;
606   PetscInt             *cseqj_tmp;
607   MatScalar            *cseqa_tmp;
608 
609   PetscFunctionBegin;
610   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
611   if (cont_merge) {
612     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
613   } else {
614     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
615   }
616   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
617   if (cont_ptap) {
618     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
619   } else {
620     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
621   }
622   /* get data from symbolic products */
623   p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
624   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
625   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
626   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
627 
628   cseqi = merge->ci; cseqj=merge->cj;
629   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
630   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
631 
632   /* Allocate temporary array for storage of one row of A*P */
633   ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
634   ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
635   apj      = (PetscInt *)(apa + cN);
636   apjdense = apj + cN;
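  /* apa[] accumulates the values of row i of A*P by global column, apjdense[] marks the  */
  /* occupied columns, and apj[] lists them so the row can be sorted and later zeroed.     */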
637 
638   /* Clear old values in C_Seq */
639   ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
640 
641   for (i=0;i<am;i++) {
642     /* Form i-th sparse row of A*P */
643     apnzj = 0;
644     /* diagonal portion of A */
645     anzi  = adi[i+1] - adi[i];
646     for (j=0;j<anzi;j++) {
647       prow = *adj;
648       adj++;
649       pnzj = pi_loc[prow+1] - pi_loc[prow];
650       pjj  = pj_loc + pi_loc[prow];
651       paj  = pa_loc + pi_loc[prow];
652       for (k=0;k<pnzj;k++) {
653         if (!apjdense[pjj[k]]) {
654           apjdense[pjj[k]] = -1;
655           apj[apnzj++]     = pjj[k];
656         }
657         apa[pjj[k]] += (*ada)*paj[k];
658       }
659       flops += 2*pnzj;
660       ada++;
661     }
662     /* off-diagonal portion of A */
663     anzi  = aoi[i+1] - aoi[i];
664     for (j=0;j<anzi;j++) {
665       col = a->garray[*aoj];
666       if (col < cstart){
667         prow = *aoj;
668       } else if (col >= cend){
669         prow = *aoj;
670       } else {
671         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
672       }
673       aoj++;
674       pnzj = pi_oth[prow+1] - pi_oth[prow];
675       pjj  = pj_oth + pi_oth[prow];
676       paj  = pa_oth + pi_oth[prow];
677       for (k=0;k<pnzj;k++) {
678         if (!apjdense[pjj[k]]) {
679           apjdense[pjj[k]] = -1;
680           apj[apnzj++]     = pjj[k];
681         }
682         apa[pjj[k]] += (*aoa)*paj[k];
683       }
684       flops += 2*pnzj;
685       aoa++;
686     }
687     /* Sort the j index array for quick sparse axpy. */
688     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
689 
690     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
691     pnzi = pi_loc[i+1] - pi_loc[i];
692     for (j=0;j<pnzi;j++) {
693       nextap = 0;
694       crow   = *pJ++;
695       cjj    = cseqj + cseqi[crow];
696       caj    = cseqa + cseqi[crow];
697       /* Perform sparse axpy operation.  Note cjj includes apj. */
698       for (k=0;nextap<apnzj;k++) {
699         if (cjj[k]==apj[nextap]) {
700           caj[k] += (*pA)*apa[apj[nextap++]];
701         }
702       }
703       flops += 2*apnzj;
704       pA++;
705     }
706 
707     /* Zero the current row info for A*P */
708     for (j=0;j<apnzj;j++) {
709       apa[apj[j]]      = 0.;
710       apjdense[apj[j]] = 0;
711     }
712   }
713 
714   ierr = PetscFree(apa);CHKERRQ(ierr);
715   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
716 
717   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
718   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
719 
720   bi     = merge->bi;
721   bj     = merge->bj;
722   buf_ri = merge->buf_ri;
723   buf_rj = merge->buf_rj;
724 
725   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
726   ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
727 
728   /* send and recv matrix values */
729   /*-----------------------------*/
730   len_s  = merge->len_s;
731   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
732   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
733 
734   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
735   for (proc=0,k=0; proc<size; proc++){
736     if (!len_s[proc]) continue;
737     i = owners[proc];
738     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
739     k++;
740   }
741 
742   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
743   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
744   ierr = PetscFree(status);CHKERRQ(ierr);
745 
746   ierr = PetscFree(s_waits);CHKERRQ(ierr);
747   ierr = PetscFree(r_waits);CHKERRQ(ierr);
748 
749   /* insert mat values of mpimat */
750   /*----------------------------*/
751   ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
752   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
753   nextrow = buf_ri_k + merge->nrecv;
754   nextcseqi  = nextrow + merge->nrecv;
755 
756   for (k=0; k<merge->nrecv; k++){
757     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
758     nrows = *(buf_ri_k[k]);
759     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
760     nextcseqi[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure  */
761   }
762 
763   /* set values of ba */
764   for (i=0; i<C->m; i++) {
765     cseqrow = owners[rank] + i;
766     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
767     bnzi = bi[i+1] - bi[i];
768     ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
769 
770     /* add local non-zero vals of this proc's C_seq into ba */
771     cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow];
772     cseqj_tmp = cseqj + cseqi[cseqrow];
773     cseqa_tmp = cseqa + cseqi[cseqrow];
774     nextcseqj = 0;
775     for (j=0; nextcseqj<cseqnzi; j++){
776       if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
777         ba_i[j] += cseqa_tmp[nextcseqj++];
778       }
779     }
780 
781     /* add received vals into ba */
782     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
783       /* i-th row */
784       if (i == *nextrow[k]) {
785         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
786         cseqj_tmp   = buf_rj[k] + *(nextcseqi[k]);
787         cseqa_tmp   = abuf_r[k] + *(nextcseqi[k]);
788         nextcseqj = 0;
789         for (j=0; nextcseqj<cseqnzi; j++){
790           if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
791             ba_i[j] += cseqa_tmp[nextcseqj++];
792           }
793         }
794         nextrow[k]++; nextcseqi[k]++;
795       }
796     }
797     ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
798   }
799   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
800   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
801 
802   ierr = PetscFree(cseqa);CHKERRQ(ierr);
803   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
804   ierr = PetscFree(ba_i);CHKERRQ(ierr);
805   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
806 
807   PetscFunctionReturn(0);
808 }
809 
810 #ifdef TOBEREMOVED
811 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
812 #undef __FUNCT__
813 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
814 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
815 {
816   PetscErrorCode       ierr;
817   Mat_MatMatMultMPI    *ptap;
818   PetscObjectContainer container;
819 
820   PetscFunctionBegin;
821   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
822   if (container) {
823     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
824     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
825     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
826     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
827     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
828 
829     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
830     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
831   }
832   ierr = PetscFree(ptap);CHKERRQ(ierr);
833 
834   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
835   PetscFunctionReturn(0);
836 }
837 
838 #undef __FUNCT__
839 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
840 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
841 {
842   PetscErrorCode       ierr;
843   Mat_MatMatMultMPI    *ptap;
844   PetscObjectContainer container;
845 
846   PetscFunctionBegin;
847   if (scall == MAT_INITIAL_MATRIX){
848     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
849     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
850 
851     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
852     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
853 
854     /* get P_loc by taking all local rows of P */
855     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
856 
857     /* attach the supporting struct to P for reuse */
858     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
859     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
860     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
861     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
862 
863     /* now, compute symbolic local P^T*A*P */
864     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
865     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
866   } else if (scall == MAT_REUSE_MATRIX){
867     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
868     if (container) {
869       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
870     } else {
871       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
872     }
873     /* update P_oth */
874     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
875 
876   } else {
877     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
878   }
879   /* now, compute numeric local P^T*A*P */
880   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
881   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
882   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
883 
884   PetscFunctionReturn(0);
885 }
886 /* Set symbolic info for the local product C = P^T*A*P */
887 #define MatPtAPSymbolicLocal_Private(ptM,pti,ptj,ci,cj) 0;\
888 {\
889   /* allocate ci array, arrays for fill computation and */\
890   /* free space for accumulating nonzero column info */\
891   ierr = PetscMalloc((ptM+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);\
892   ci[0] = 0;\
893 \
894   /* set initial free space to be nnz(A) scaled by fill*P->N/PtM. */\
895   /* this should be reasonable if sparsity of PtAP is similar to that of A. */\
896   nnz           = adi[am] + aoi[am];\
897   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/ptM+1),&free_space);\
898   current_space = free_space;\
899 \
900   ptJ=ptj;\
901   for (i=0; i<ptM; i++) {\
902     ptnzi  = pti[i+1] - pti[i];\
903     nprow_loc = 0; nprow_oth = 0;\
904     /* i-th row of symbolic Pt*A: */\
905     for (j=0;j<ptnzi;j++) {\
906       arow = *ptJ++;\
907       /* diagonal portion of A */\
908       anzj = adi[arow+1] - adi[arow];\
909       ajj  = adj + adi[arow];\
910       for (k=0;k<anzj;k++) {\
911         col = ajj[k]+prstart;\
912         if (!ptadenserow_loc[col]) {\
913           ptadenserow_loc[col]    = -1;\
914           ptasparserow_loc[nprow_loc++] = col;\
915         }\
916       }\
917       /* off-diagonal portion of A */\
918       anzj = aoi[arow+1] - aoi[arow];\
919       ajj  = aoj + aoi[arow];\
920       for (k=0;k<anzj;k++) {\
921         col = a->garray[ajj[k]];  /* global col */\
922         if (col < cstart){\
923           col = ajj[k];\
924         } else if (col >= cend){\
925           col = ajj[k] + pm;\
926         } else {\
927           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");\
928         }\
929         if (!ptadenserow_oth[col]) {\
930           ptadenserow_oth[col]    = -1;\
931           ptasparserow_oth[nprow_oth++] = col;\
932         }\
933       }\
934     }\
935     /* determine symbolic info for row of C_seq: */\
936     cnzi   = 0;\
937     /* rows of P_loc */\
938     ptaj = ptasparserow_loc;\
939     for (j=0; j<nprow_loc; j++) {\
940       prow = *ptaj++; \
941       prow -= prstart; /* rm */\
942       pnzj = pi_loc[prow+1] - pi_loc[prow];\
943       pjj  = pj_loc + pi_loc[prow];\
944       /* add non-zero cols of P into the sorted linked list lnk */\
945       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
946       cnzi += nlnk;\
947     }\
948     /* rows of P_oth */\
949     ptaj = ptasparserow_oth;\
950     for (j=0; j<nprow_oth; j++) {\
951       prow = *ptaj++; \
952       if (prow >= prend) prow -= pm; /* rm */\
953       pnzj = pi_oth[prow+1] - pi_oth[prow];\
954       pjj  = pj_oth + pi_oth[prow];\
955       /* add non-zero cols of P into the sorted linked list lnk */\
956       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
957       cnzi += nlnk;\
958     }\
959    \
960     /* If free space is not available, make more free space */\
961     /* Double the amount of total space in the list */\
962     if (current_space->local_remaining<cnzi) {\
963       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);\
964     }\
965 \
966     /* Copy data into free space, and zero out denserows */\
967     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);\
968     current_space->array           += cnzi;\
969     current_space->local_used      += cnzi;\
970     current_space->local_remaining -= cnzi;\
971    \
972     for (j=0;j<nprow_loc; j++) {\
973       ptadenserow_loc[ptasparserow_loc[j]] = 0;\
974     }\
975     for (j=0;j<nprow_oth; j++) {\
976       ptadenserow_oth[ptasparserow_oth[j]] = 0;\
977     }\
978     \
979     /* Aside: Perhaps we should save the pta info for the numerical factorization. */\
980     /*        For now, we will recompute what is needed. */ \
981     ci[i+1] = ci[i] + cnzi;\
982   }\
983   /* nnz is now stored in ci[ptM], column indices are in the list of free space */\
984   /* Allocate space for cj, initialize cj, and */\
985   /* destroy list of free space and other temporary array(s) */\
986   ierr = PetscMalloc((ci[ptM]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);\
987   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);\
988 }
989 #undef __FUNCT__
990 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
991 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
992 {
993   PetscErrorCode       ierr;
994   Mat                  P_loc,P_oth;
995   Mat_MatMatMultMPI    *ptap;
996   PetscObjectContainer container;
997   FreeSpaceList        free_space=PETSC_NULL,current_space;
998   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
999   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,
1000                        *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1001   Mat_SeqAIJ           *p_loc,*p_oth;
1002   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj,*rmap=p->garray;
1003   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
1004   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
1005   PetscInt             aN=A->N,am=A->m,pN=P->N;
1006   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi,row;
1007   PetscBT              lnkbt;
1008   PetscInt             prstart,prend,nprow_loc,nprow_oth;
1009   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
1010 
1011   MPI_Comm             comm=A->comm;
1012   Mat                  B_mpi;
1013   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
1014   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
1015   PetscInt             len,proc,*dnz,*onz,*owners;
1016   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
1017   PetscInt             nrows,tnrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1018   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
1019   MPI_Status           *status;
1020   Mat_Merge_SeqsToMPI  *merge;
1021   PetscInt             *ptdi,*ptdj,*cdi,*cdj;
1022   PetscInt             *ptoi,*ptoj,*coi,*coj;
1023 
1024   PetscFunctionBegin;
1025   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
1026   if (container) {
1027     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
1028   } else {
1029     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1030   }
1031 
1032   /* determine row ownership of C */
1033   /*---------------------------------------------------------*/
1034   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1035   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1036   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
1037 
1038   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1039   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
1040   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
1041   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1042 
1043   /* get data from P_loc and P_oth */
1044   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
1045   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
1046   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
1047   prend = prstart + pm;
1048 
1049   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
1050   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
1051   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
1052   ptasparserow_loc = ptadenserow_loc + aN;
1053   ptadenserow_oth  = ptasparserow_loc + aN;
1054   ptasparserow_oth = ptadenserow_oth + aN;
1055 
1056   /* create and initialize a linked list */
1057   nlnk = pN+1;
1058   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1059 
1060   /* Pt = P_loc^T */
1061   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1062   ierr = MatPtAPSymbolicLocal_Private(pN,pti,ptj,ci,cj);CHKERRQ(ierr);
1063 
1064   /* Pt = Pd^T - diagonal portion of P_loc */
1065   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1066   ierr = MatPtAPSymbolicLocal_Private(pn,ptdi,ptdj,cdi,cdj);CHKERRQ(ierr);
1067   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1068 #ifdef NEW
1069   /* Pt = Po^T - off-diagonal portion of P_loc */
1070   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&ptoi,&ptoj);CHKERRQ(ierr);
1071   ierr = MatPtAPSymbolicLocal_Private(tnrows,ptoi,ptoj,coi,coj);CHKERRQ(ierr);
1072   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptoi,&ptoj);CHKERRQ(ierr);
1073 #endif
1074   /* clean up */
1075   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
1076   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1077   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1078 
1079   /* determine the number of messages to send, their lengths */
1080   /*---------------------------------------------------------*/
1081   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
1082   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1083   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
1084   len_s  = merge->len_s;
1085   len = 0;  /* max length of buf_si[] */
1086   merge->nsend = 0;
1087   tnrows = (p->B)->N; /* total num of rows to be sent to other processors */
1088   proc = 0;
1089   for (i=0; i<tnrows; i++){
1090     while (rmap[i] >= owners[proc+1]) proc++;
1091     len_si[proc]++;
1092   }
1093   for (proc=0; proc<size; proc++){
1094     len_s[proc] = 0;
1095     if (len_si[proc]){
1096       merge->nsend++;
1097       len_si[proc] = 2*(len_si[proc] + 1);
1098       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
1099       len += len_si[proc];
1100     }
1101   }
1102 
1103   /* determine the number and length of messages to receive for ij-structure */
1104   /*-------------------------------------------------------------------------*/
1105   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1106   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1107 
1108   /* post the Irecv of j-structure */
1109   /*-------------------------------*/
1110   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
1111   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
1112 
1113   /* post the Isend of j-structure */
1114   /*--------------------------------*/
1115   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
1116   sj_waits = si_waits + merge->nsend;
1117 
1118   for (proc=0, k=0; proc<size; proc++){
1119     if (!len_s[proc]) continue;
1120     i = owners[proc];
1121     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
1122     k++;
1123   }
1124 
1125   /* receives and sends of j-structure are complete */
1126   /*------------------------------------------------*/
1127   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1128   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
1129   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
1130 
1131   /* send and recv i-structure */
1132   /*---------------------------*/
1133   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
1134   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
1135 
1136   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
1137   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1138 
1139   j = 0; /* row index to be sent */
1140   for (proc=0,k=0; proc<size; proc++){
1141     if (!len_s[proc]) continue;
1142     /* form outgoing message for i-structure:
1143          buf_si[0]:                 nrows to be sent
1144                [1:nrows]:           row index (global)
1145                [nrows+1:2*nrows+1]: i-structure index
1146     */
1147     /*-------------------------------------------*/
1148     nrows = len_si[proc]/2 - 1;
1149     buf_si_i    = buf_si + nrows+1;
1150     buf_si[0]   = nrows;
1151     buf_si_i[0] = 0;
1152     for (i=0; i<nrows; i++){
1153       row = rmap[j++];
1154       anzi = ci[row+1] - ci[row];
1155       buf_si_i[i+1] = buf_si_i[i] + anzi; /* i-structure */
1156       buf_si[i+1]   = row - owners[proc]; /* local row index */
1157     }
1158     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
1159     k++;
1160     buf_si += len_si[proc];
1161   }
1162 
1163   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
1164   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
1165 
1166   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
1167   for (i=0; i<merge->nrecv; i++){
1168     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
1169   }
1170 
1171   ierr = PetscFree(len_si);CHKERRQ(ierr);
1172   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1173   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
1174   ierr = PetscFree(si_waits);CHKERRQ(ierr);
1175   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
1176   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1177   ierr = PetscFree(status);CHKERRQ(ierr);
1178 
1179   /* compute a local seq matrix in each processor */
1180   /*----------------------------------------------*/
1181   /* allocate bi array and free space for accumulating nonzero column info */
1182   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1183   bi[0] = 0;
1184 
1185   /* create and initialize a linked list */
1186   nlnk = pN+1;
1187   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1188 
1189   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
1190   len = 0;
1191   len  = ci[owners[rank+1]] - ci[owners[rank]];
1192   free_space=PETSC_NULL;
1193   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
1194   current_space = free_space;
1195 
1196   /* determine symbolic info for each local row of C */
1197   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1198   nextrow = buf_ri_k + merge->nrecv;
1199   nextci  = nextrow + merge->nrecv;
1200   for (k=0; k<merge->nrecv; k++){
1201     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1202     nrows = *buf_ri_k[k];
1203     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1204     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure  */
1205   }
1206 
1207   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
1208   len = 0;
1209   for (i=0; i<pn; i++) {
1210     bnzi = 0;
1211     /* add local non-zero cols of this proc's C_seq into lnk */
1212     anzi = cdi[i+1] - cdi[i];
1213     cji  = cdj + cdi[i];
1214     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1215     bnzi += nlnk;
1216     /* add received col data into lnk */
1217     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1218       if (i == *nextrow[k]) { /* i-th row */
1219         anzi = *(nextci[k]+1) - *nextci[k];
1220         cji  = buf_rj[k] + *nextci[k];
1221         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1222         bnzi += nlnk;
1223         nextrow[k]++; nextci[k]++;
1224       }
1225     }
1226     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
1227 
1228     /* if free space is not available, make more free space */
1229     if (current_space->local_remaining<bnzi) {
1230       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1231       nspacedouble++;
1232     }
1233     /* copy data into free space, then initialize lnk */
1234     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1235     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
1236 
1237     current_space->array           += bnzi;
1238     current_space->local_used      += bnzi;
1239     current_space->local_remaining -= bnzi;
1240 
1241     bi[i+1] = bi[i] + bnzi;
1242   }
1243 
1244   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1245 
1246   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1247   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1248   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1249 
1250   /* create symbolic parallel matrix B_mpi */
1251   /*---------------------------------------*/
1252   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
1253   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
1254   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
1255   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1256 
1257   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
1258   B_mpi->assembled     = PETSC_FALSE;
1259   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
1260   merge->bi            = bi;
1261   merge->bj            = bj;
1262   merge->ci            = ci;
1263   merge->cj            = cj;
1264   merge->buf_ri        = buf_ri;
1265   merge->buf_rj        = buf_rj;
1266   merge->C_seq         = PETSC_NULL;
1267 
1268   /* attach the supporting struct to B_mpi for reuse */
1269   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
1270   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
1271   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
1272   *C = B_mpi;
1273 
1274   ierr = PetscFree(cdi);CHKERRQ(ierr);
1275   ierr = PetscFree(cdj);CHKERRQ(ierr);
1276 #ifdef NEW
1277   ierr = PetscFree(coi);CHKERRQ(ierr);
1278   ierr = PetscFree(coj);CHKERRQ(ierr);
1279 #endif
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 #undef __FUNCT__
1284 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
1285 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1286 {
1287   PetscErrorCode       ierr;
1288   Mat_Merge_SeqsToMPI  *merge;
1289   Mat_MatMatMultMPI    *ptap;
1290   PetscObjectContainer cont_merge,cont_ptap;
1291   PetscInt             flops=0;
1292   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1293   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1294   Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data,*p_oth;
1295   Mat_SeqAIJ           *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
1296   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense;
1297   PetscInt             *pi_oth,*pj_oth,*pJ_d=pd->j,*pJ_o=po->j,*pjj;
1298   PetscInt             i,j,k,nzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1299   MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj,**abuf_r,*ba_i,*ca;
1300   MatScalar            *pA_d=pd->a,*pA_o=po->a,*pa_oth;
1301   PetscInt             am=A->m,cN=C->N,cm=C->m;
1302   MPI_Comm             comm=C->comm;
1303   PetscMPIInt          size,rank,taga,*len_s,**buf_ri,**buf_rj,**buf_ri_k,**nextrow,**nextcseqi;
1304   PetscInt             *owners,proc,nrows;
1305   PetscInt             *cjj,cseqnzi,*bj_i,*bi,*bj,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
1306   MPI_Request          *s_waits,*r_waits;
1307   MPI_Status           *status;
1308   PetscInt             *cseqi,*cseqj,col;
1309   PetscInt             stages[3];
1310 
1311   PetscFunctionBegin;
1312   ierr = PetscLogStageRegister(&stages[0],"NumAP_local");CHKERRQ(ierr);
1313   ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
1314   ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);
1315 
1316   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
1317   if (cont_merge) {
1318     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
1319   } else {
1320     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
1321   }
1322   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
1323   if (cont_ptap) {
1324     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
1325   } else {
1326     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1327   }
1328   /* get data from symbolic products */
1329   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
1330   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1331 
1332   cseqi = merge->ci; cseqj=merge->cj;
1333   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
1334   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
1335 
1336   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1337   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1338   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1339 
1340   /* Allocate temporary array for storage of one row of A*P and one row of C */
1341   ierr = PetscMalloc(2*cN*(sizeof(MatScalar)+sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1342   ierr = PetscMemzero(apa,2*cN*(sizeof(MatScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
1343   ca       = (MatScalar*)(apa + cN);
1344   apj      = (PetscInt *)(ca + cN);
1345   apjdense = (PetscInt *)(apj + cN);
1346 
1347   /* Clear old values in C */
1348   ierr = PetscMemzero(cd->a,cd->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1349   ierr = PetscMemzero(co->a,co->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1350 
1351   for (i=0;i<am;i++) {
1352     ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
1353     /* Form i-th sparse row of A*P */
1354     apnzj = 0;
1355     /* diagonal portion of A */
1356     nzi  = adi[i+1] - adi[i];
1357     for (j=0;j<nzi;j++) {
1358       prow = *adj++;
1359       /* diagonal portion of P */
1360       pnzj = pd->i[prow+1] - pd->i[prow];
1361       pjj  = pd->j + pd->i[prow]; /* local col index of P */
1362       paj  = pd->a + pd->i[prow];
1363       for (k=0;k<pnzj;k++) {
1364         col = *pjj + p->cstart; pjj++; /* global col index of P */
1365         if (!apjdense[col]) {
1366           apjdense[col] = -1;
1367           apj[apnzj++]  = col;
1368         }
1369         apa[col] += (*ada)*paj[k];
1370       }
1371       flops += 2*pnzj;
1372       /* off-diagonal portion of P */
1373       pnzj = po->i[prow+1] - po->i[prow];
1374       pjj  = po->j + po->i[prow]; /* local col index of P */
1375       paj  = po->a + po->i[prow];
1376       for (k=0;k<pnzj;k++) {
1377         col = p->garray[*pjj]; pjj++; /* global col index of P */
1378         if (!apjdense[col]) {
1379           apjdense[col] = -1;
1380           apj[apnzj++]  = col;
1381         }
1382         apa[col] += (*ada)*paj[k];
1383       }
1384       flops += 2*pnzj;
1385 
1386       ada++;
1387     }
1388     /* off-diagonal portion of A */
1389     nzi  = aoi[i+1] - aoi[i];
1390     for (j=0;j<nzi;j++) {
1391       prow = *aoj++;
1392       pnzj = pi_oth[prow+1] - pi_oth[prow];
1393       pjj  = pj_oth + pi_oth[prow];
1394       paj  = pa_oth + pi_oth[prow];
1395       for (k=0;k<pnzj;k++) {
1396         if (!apjdense[pjj[k]]) {
1397           apjdense[pjj[k]] = -1;
1398           apj[apnzj++]     = pjj[k];
1399         }
1400         apa[pjj[k]] += (*aoa)*paj[k];
1401       }
1402       flops += 2*pnzj;
1403       aoa++;
1404     }
1405     /* Sort the j index array for quick sparse axpy. */
1406     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1407 
1408     ierr = PetscLogStagePop();CHKERRQ(ierr);
1409     ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
1410     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
1411     /* diagonal portion of P -- gives matrix value of local C */
1412     pnzi = pd->i[i+1] - pd->i[i];
1413     for (j=0;j<pnzi;j++) {
1414       /* add value into C */
1415       for (k=0; k<apnzj; k++) ca[k] = (*pA_d)*apa[apj[k]];
1416       crow = (*pJ_d++) + owners[rank];
1417       ierr = MatSetValues(C,1,&crow,apnzj,apj,ca,ADD_VALUES);CHKERRQ(ierr);
1418       ierr = PetscMemzero(ca,apnzj*(sizeof(MatScalar)));CHKERRQ(ierr);
1419       pA_d++;
1420       flops += 2*apnzj;
1421     }
1422 
1423     /* off-diagonal portion of P -- gives matrix values to be sent to others */
1424     pnzi = po->i[i+1] - po->i[i];
1425     for (j=0;j<pnzi;j++) {
1426       crow   = p->garray[*pJ_o++];
1427       cjj    = cseqj + cseqi[crow];
1428       caj    = cseqa + cseqi[crow];
1429       /* add value into C_seq to be sent to other processors */
1430       nextap = 0;
1431       for (k=0;nextap<apnzj;k++) {
1432         if (cjj[k]==apj[nextap]) {
1433           caj[k] += (*pA_o)*apa[apj[nextap++]];
1434         }
1435       }
1436       flops += 2*apnzj;
1437       pA_o++;
1438     }
1439 
1440     /* Zero the current row info for A*P */
1441     for (j=0;j<apnzj;j++) {
1442       apa[apj[j]]      = 0.;
1443       apjdense[apj[j]] = 0;
1444     }
1445     ierr = PetscLogStagePop();CHKERRQ(ierr);
1446   }
1447 
1448   /* send and recv matrix values */
1449   /*-----------------------------*/
1450   ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
1451   bi     = merge->bi;
1452   bj     = merge->bj;
1453   buf_ri = merge->buf_ri;
1454   buf_rj = merge->buf_rj;
1455   len_s  = merge->len_s;
1456   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
1457   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1458 
1459   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
1460   for (proc=0,k=0; proc<size; proc++){
1461     if (!len_s[proc]) continue;
1462     i = owners[proc];
1463     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1464     k++;
1465   }
1466   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1467   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
1468   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
1469   ierr = PetscFree(status);CHKERRQ(ierr);
1470 
1471   ierr = PetscFree(s_waits);CHKERRQ(ierr);
1472   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1473   ierr = PetscFree(cseqa);CHKERRQ(ierr);
1474 
1475   /* insert mat values of mpimat */
1476   /*----------------------------*/
1477   ba_i = apa; /* rename, temporary array for storage of one row of C */
1478   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1479   nextrow = buf_ri_k + merge->nrecv;
1480   nextcseqi = nextrow + merge->nrecv;
1481 
1482   for (k=0; k<merge->nrecv; k++){
1483     buf_ri_k[k]  = buf_ri[k]; /* beginning of k-th received i-structure */
1484     nrows        = *(buf_ri_k[k]);
1485     nextrow[k]   = buf_ri_k[k]+1;  /* next row index of k-th received i-structure */
1486     nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1487   }
1488 
1489   /* add received local vals to C */
1490   for (i=0; i<cm; i++) {
1491     crow = owners[rank] + i;
1492     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
1493     bnzi = bi[i+1] - bi[i];
1494     nzi = 0;
1495     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1496       /* i-th row */
1497       if (i == *nextrow[k]) {
1498         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
1499         cseqj   = buf_rj[k] + *(nextcseqi[k]);
1500         cseqa   = abuf_r[k] + *(nextcseqi[k]);
1501         nextcseqj = 0;
1502         for (j=0; nextcseqj<cseqnzi; j++){
1503           if (*(bj_i + j) == cseqj[nextcseqj]){ /* bcol == cseqcol */
1504             ba_i[j] += cseqa[nextcseqj++];
1505             nzi++;
1506           }
1507         }
1508         nextrow[k]++; nextcseqi[k]++;
1509       }
1510     }
1511     if (nzi>0){
1512       ierr = MatSetValues(C,1,&crow,bnzi,bj_i,ba_i,ADD_VALUES);CHKERRQ(ierr);
1513       ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
1514       flops += 2*nzi;
1515     }
1516   }
1517   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1518   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1519   ierr = PetscLogStagePop();CHKERRQ(ierr);
1520 
1521   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1522   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1523   ierr = PetscFree(apa);CHKERRQ(ierr);
1524   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1525 
1526   PetscFunctionReturn(0);
1527 }
1528 #endif /* TOBEREMOVED */
1529 
1530 #undef __FUNCT__
1531 #define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
1532 PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
1533 {
1534   PetscErrorCode ierr;
1535 
1536   PetscFunctionBegin;
1537   if (scall == MAT_INITIAL_MATRIX){
1538     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1539     ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
1540     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1541   }
1542   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1543   ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
1544   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
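/*
   Illustration (not part of this file's implementation): a minimal usage sketch
   of the MAT_INITIAL_MATRIX/MAT_REUSE_MATRIX split handled by
   MatPtAP_SeqAIJ_SeqAIJ() above.  The first call builds both the structure and
   the values of C; later calls reuse the structure when only the values of A
   change.  SampleProjectionLoop and the fill estimate of 1.0 are hypothetical.
*/
#if 0
PetscErrorCode SampleProjectionLoop(Mat A,Mat P,PetscInt nsteps)
{
  PetscErrorCode ierr;
  Mat            C;
  PetscInt       step;

  PetscFunctionBegin;
  /* first projection: symbolic + numeric phases */
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
  for (step=1; step<nsteps; step++) {
    /* ... update the values of A here; its nonzero pattern must not change ... */
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr); /* numeric phase only */
  }
  ierr = MatDestroy(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
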
1548 /*
1549    MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P
1550 
1551    Collective on Mat
1552 
1553    Input Parameters:
1554 +  A - the matrix
1555 -  P - the projection matrix
1556 
1557    Output Parameters:
1558 .  C - the (i,j) structure of the product matrix
1559 
1560    Notes:
1561    C will be created and must be destroyed by the user with MatDestroy().
1562 
1563    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
1564    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
1565    this (i,j) structure by calling MatPtAPNumeric().
1566 
1567    Level: intermediate
1568 
1569 .seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
1570 */
1571 #undef __FUNCT__
1572 #define __FUNCT__ "MatPtAPSymbolic"
1573 PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
1574 {
1575   PetscErrorCode ierr;
1576   PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
1577   PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);
1578 
1579   PetscFunctionBegin;
1580 
1581   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1582   PetscValidType(A,1);
1583   MatPreallocated(A);
1584   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1585   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1586 
1587   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1588   PetscValidType(P,2);
1589   MatPreallocated(P);
1590   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1591   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1592 
1593   PetscValidPointer(C,3);
1594 
1595   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1596   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1597 
1598   /* For now, we do not dispatch based on the type of A and P */
1599   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1600   fA = A->ops->ptapsymbolic;
1601   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
1602   fP = P->ops->ptapsymbolic;
1603   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
1604   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1605 
1606   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1607   ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
1608   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1609 
1610   PetscFunctionReturn(0);
1611 }
1612 
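/*
   Illustration (not part of the library): driving the two phases explicitly.
   MatPtAPSymbolic() builds the (i,j) structure of C once; MatPtAPNumeric()
   (defined further below) refills the values whenever the entries of A change
   while its pattern stays fixed.  SampleSymbolicThenNumeric is a hypothetical
   helper and the fill estimate of 1.0 is an assumption.
*/
#if 0
PetscErrorCode SampleSymbolicThenNumeric(Mat A,Mat P,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic(A,P,1.0,C);CHKERRQ(ierr); /* structure of C only          */
  ierr = MatPtAPNumeric(A,P,*C);CHKERRQ(ierr);     /* fill in the numerical values */
  /* ... later, after only the values of A have changed ... */
  ierr = MatPtAPNumeric(A,P,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
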
1613 typedef struct {
1614   Mat    symAP;
1615 } Mat_PtAPstruct;
1616 
1617 EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);
1618 
1619 #undef __FUNCT__
1620 #define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
1621 PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
1622 {
1623   PetscErrorCode    ierr;
1624   Mat_PtAPstruct    *ptap=(Mat_PtAPstruct*)A->spptr;
1625 
1626   PetscFunctionBegin;
1627   ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
1628   ierr = PetscFree(ptap);CHKERRQ(ierr);
1629   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1630   PetscFunctionReturn(0);
1631 }
1632 
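/*
   A hedged sketch (hypothetical helper, not in this file) of how the
   Mat_PtAPstruct above would typically be installed on a product matrix:
   cache the symbolic A*P in spptr and override the destroy routine so that
   MatDestroy_SeqAIJ_PtAP() releases the cache together with C.
*/
#if 0
static PetscErrorCode SampleAttachPtAPInfo(Mat C,Mat symAP)
{
  PetscErrorCode ierr;
  Mat_PtAPstruct *ptap;

  PetscFunctionBegin;
  ierr = PetscNew(Mat_PtAPstruct,&ptap);CHKERRQ(ierr);
  ptap->symAP     = symAP;                   /* cached symbolic A*P   */
  C->spptr        = (void*)ptap;             /* hang the cache off C  */
  C->ops->destroy = MatDestroy_SeqAIJ_PtAP;  /* free the cache with C */
  PetscFunctionReturn(0);
}
#endif
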
1633 #undef __FUNCT__
1634 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
1635 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1636 {
1637   PetscErrorCode ierr;
1638   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1639   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
1640   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1641   PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
1642   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
1643   PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
1644   MatScalar      *ca;
1645   PetscBT        lnkbt;
1646 
1647   PetscFunctionBegin;
1648   /* Get ij structure of P^T */
1649   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1650   ptJ=ptj;
1651 
1652   /* Allocate ci array, arrays for fill computation and */
1653   /* free space for accumulating nonzero column info */
1654   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1655   ci[0] = 0;
1656 
1657   ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1658   ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1659   ptasparserow = ptadenserow  + an;
1660 
1661   /* create and initialize a linked list */
1662   nlnk = pn+1;
1663   ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1664 
1665   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1666   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1667   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1668   current_space = free_space;
1669 
1670   /* Determine symbolic info for each row of C: */
1671   for (i=0;i<pn;i++) {
1672     ptnzi  = pti[i+1] - pti[i];
1673     ptanzi = 0;
1674     /* Determine symbolic row of PtA: */
1675     for (j=0;j<ptnzi;j++) {
1676       arow = *ptJ++;
1677       anzj = ai[arow+1] - ai[arow];
1678       ajj  = aj + ai[arow];
1679       for (k=0;k<anzj;k++) {
1680         if (!ptadenserow[ajj[k]]) {
1681           ptadenserow[ajj[k]]    = -1;
1682           ptasparserow[ptanzi++] = ajj[k];
1683         }
1684       }
1685     }
1686     /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1687     ptaj = ptasparserow;
1688     cnzi   = 0;
1689     for (j=0;j<ptanzi;j++) {
1690       prow = *ptaj++;
1691       pnzj = pi[prow+1] - pi[prow];
1692       pjj  = pj + pi[prow];
1693       /* add non-zero cols of P into the sorted linked list lnk */
1694       ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1695       cnzi += nlnk;
1696     }
1697 
1698     /* If free space is not available, make more free space */
1699     /* Double the amount of total space in the list */
1700     if (current_space->local_remaining<cnzi) {
1701       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1702     }
1703 
1704     /* Copy data into free space, and zero out denserows */
1705     ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1706     current_space->array           += cnzi;
1707     current_space->local_used      += cnzi;
1708     current_space->local_remaining -= cnzi;
1709 
1710     for (j=0;j<ptanzi;j++) {
1711       ptadenserow[ptasparserow[j]] = 0;
1712     }
1713     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1714     /*        For now, we will recompute what is needed. */
1715     ci[i+1] = ci[i] + cnzi;
1716   }
1717   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
1718   /* Allocate space for cj, initialize cj, and */
1719   /* destroy list of free space and other temporary array(s) */
1720   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1721   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1722   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1723   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1724 
1725   /* Allocate space for ca */
1726   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1727   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1728 
1729   /* put together the new matrix */
1730   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1731 
1732   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1733   /* Since these are PETSc arrays, change flags to free them as necessary. */
1734   c = (Mat_SeqAIJ *)((*C)->data);
1735   c->freedata = PETSC_TRUE;
1736   c->nonew    = 0;
1737 
1738   /* Clean up. */
1739   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1740 
1741   PetscFunctionReturn(0);
1742 }
1743 
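/*
   Illustration (not used by the library): the dense-flag/sparse-list pattern
   employed above with ptadenserow/ptasparserow.  A dense flag array gives O(1)
   duplicate detection while a compact list records which entries were touched,
   so clearing costs only O(#nonzeros found) instead of O(row length).
   SampleMergeRowIndices and its CSR arguments (xi,xj) are hypothetical names.
*/
#if 0
static PetscInt SampleMergeRowIndices(PetscInt nrows,const PetscInt *rows,const PetscInt *xi,
                                      const PetscInt *xj,PetscInt *denseflag,PetscInt *sparselist)
{
  PetscInt r,k,col,nfound = 0;

  for (r=0; r<nrows; r++) {                 /* rows selected by one row of P^T       */
    for (k=xi[rows[r]]; k<xi[rows[r]+1]; k++) {
      col = xj[k];
      if (!denseflag[col]) {                /* first time this column is encountered */
        denseflag[col]       = -1;
        sparselist[nfound++] = col;
      }
    }
  }
  /* the caller later zeros denseflag via sparselist, exactly as the loops above do */
  return nfound;
}
#endif
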
1744 #include "src/mat/impls/maij/maij.h"
1745 EXTERN_C_BEGIN
1746 #undef __FUNCT__
1747 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
1748 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
1749 {
1750   /* This routine requires testing -- I don't think it works. */
1751   PetscErrorCode ierr;
1752   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1753   Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
1754   Mat            P=pp->AIJ;
1755   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
1756   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1757   PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
1758   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
1759   PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
1760   MatScalar      *ca;
1761 
1762   PetscFunctionBegin;
1763   /* Start timer */
1764   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1765 
1766   /* Get ij structure of P^T */
1767   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1768 
1769   /* Allocate ci array, arrays for fill computation and */
1770   /* free space for accumulating nonzero column info */
1771   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1772   ci[0] = 0;
1773 
1774   ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1775   ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1776   ptasparserow = ptadenserow  + an;
1777   denserow     = ptasparserow + an;
1778   sparserow    = denserow     + pn;
1779 
1780   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1781   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1782   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1783   current_space = free_space;
1784 
1785   /* Determine symbolic info for each row of C: */
1786   for (i=0;i<pn/ppdof;i++) {
1787     ptnzi  = pti[i+1] - pti[i];
1788     ptanzi = 0;
1789     ptJ    = ptj + pti[i];
1790     for (dof=0;dof<ppdof;dof++) {
1791     /* Determine symbolic row of PtA: */
1792       for (j=0;j<ptnzi;j++) {
1793         arow = ptJ[j] + dof;
1794         anzj = ai[arow+1] - ai[arow];
1795         ajj  = aj + ai[arow];
1796         for (k=0;k<anzj;k++) {
1797           if (!ptadenserow[ajj[k]]) {
1798             ptadenserow[ajj[k]]    = -1;
1799             ptasparserow[ptanzi++] = ajj[k];
1800           }
1801         }
1802       }
1803       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1804       ptaj = ptasparserow;
1805       cnzi   = 0;
1806       for (j=0;j<ptanzi;j++) {
1807         pdof = *ptaj%ppdof;   /* component index; use ppdof, not the loop index dof (which starts at zero) */
1808         prow = (*ptaj++)/ppdof;
1809         pnzj = pi[prow+1] - pi[prow];
1810         pjj  = pj + pi[prow];
1811         for (k=0;k<pnzj;k++) {
1812           if (!denserow[pjj[k]+pdof]) {
1813             denserow[pjj[k]+pdof] = -1;
1814             sparserow[cnzi++]     = pjj[k]+pdof;
1815           }
1816         }
1817       }
1818 
1819       /* sort sparserow */
1820       ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);
1821 
1822       /* If free space is not available, make more free space */
1823       /* Double the amount of total space in the list */
1824       if (current_space->local_remaining<cnzi) {
1825         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1826       }
1827 
1828       /* Copy data into free space, and zero out denserows */
1829       ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
1830       current_space->array           += cnzi;
1831       current_space->local_used      += cnzi;
1832       current_space->local_remaining -= cnzi;
1833 
1834       for (j=0;j<ptanzi;j++) {
1835         ptadenserow[ptasparserow[j]] = 0;
1836       }
1837       for (j=0;j<cnzi;j++) {
1838         denserow[sparserow[j]] = 0;
1839       }
1840       /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1841       /*        For now, we will recompute what is needed. */
1842       ci[i+1+dof] = ci[i+dof] + cnzi;
1843     }
1844   }
1845   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
1846   /* Allocate space for cj, initialize cj, and */
1847   /* destroy list of free space and other temporary array(s) */
1848   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1849   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1850   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1851 
1852   /* Allocate space for ca */
1853   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1854   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1855 
1856   /* put together the new matrix */
1857   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1858 
1859   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1860   /* Since these are PETSc arrays, change flags to free them as necessary. */
1861   c = (Mat_SeqAIJ *)((*C)->data);
1862   c->freedata = PETSC_TRUE;
1863   c->nonew    = 0;
1864 
1865   /* Clean up. */
1866   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1867 
1868   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1869   PetscFunctionReturn(0);
1870 }
1871 EXTERN_C_END
1872 
1873 /*
1874    MatPtAPNumeric - Computes the matrix projection C = P^T * A * P
1875 
1876    Collective on Mat
1877 
1878    Input Parameters:
1879 +  A - the matrix
1880 -  P - the projection matrix
1881 
1882    Output Parameters:
1883 .  C - the product matrix
1884 
1885    Notes:
1886    C must have been created by calling MatPtAPSymbolic() and must be destroyed by
1887    the user with MatDestroy().
1888 
1889    This routine is currently only implemented for pairs of AIJ matrices and classes
1890    which inherit from AIJ.  C will be of type MATAIJ.
1891 
1892    Level: intermediate
1893 
1894 .seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
1895 */
1896 #undef __FUNCT__
1897 #define __FUNCT__ "MatPtAPNumeric"
1898 PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
1899 {
1900   PetscErrorCode ierr;
1901   PetscErrorCode (*fA)(Mat,Mat,Mat);
1902   PetscErrorCode (*fP)(Mat,Mat,Mat);
1903 
1904   PetscFunctionBegin;
1905 
1906   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1907   PetscValidType(A,1);
1908   MatPreallocated(A);
1909   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1910   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1911 
1912   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1913   PetscValidType(P,2);
1914   MatPreallocated(P);
1915   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1916   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1917 
1918   PetscValidHeaderSpecific(C,MAT_COOKIE,3);
1919   PetscValidType(C,3);
1920   MatPreallocated(C);
1921   if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1922   if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1923 
1924   if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
1925   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1926   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1927   if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);
1928 
1929   /* For now, we do not dispatch based on the type of A and P */
1930   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1931   fA = A->ops->ptapnumeric;
1932   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
1933   fP = P->ops->ptapnumeric;
1934   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
1935   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1936 
1937   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1938   ierr = (*fA)(A,P,C);CHKERRQ(ierr);
1939   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1940 
1941   PetscFunctionReturn(0);
1942 }
1943 
1944 #undef __FUNCT__
1945 #define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
1946 PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
1947 {
1948   PetscErrorCode ierr;
1949   PetscInt       flops=0;
1950   Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
1951   Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
1952   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1953   PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
1954   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1955   PetscInt       am=A->M,cn=C->N,cm=C->M;
1956   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1957   MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;
1958 
1959   PetscFunctionBegin;
1960   /* Allocate temporary array for storage of one row of A*P */
1961   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1962   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1963 
1964   apj      = (PetscInt *)(apa + cn);
1965   apjdense = apj + cn;
1966 
1967   /* Clear old values in C */
1968   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1969 
1970   for (i=0;i<am;i++) {
1971     /* Form sparse row of A*P */
1972     anzi  = ai[i+1] - ai[i];
1973     apnzj = 0;
1974     for (j=0;j<anzi;j++) {
1975       prow = *aj++;
1976       pnzj = pi[prow+1] - pi[prow];
1977       pjj  = pj + pi[prow];
1978       paj  = pa + pi[prow];
1979       for (k=0;k<pnzj;k++) {
1980         if (!apjdense[pjj[k]]) {
1981           apjdense[pjj[k]] = -1;
1982           apj[apnzj++]     = pjj[k];
1983         }
1984         apa[pjj[k]] += (*aa)*paj[k];
1985       }
1986       flops += 2*pnzj;
1987       aa++;
1988     }
1989 
1990     /* Sort the j index array for quick sparse axpy. */
1991     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1992 
1993     /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
1994     pnzi = pi[i+1] - pi[i];
1995     for (j=0;j<pnzi;j++) {
1996       nextap = 0;
1997       crow   = *pJ++;
1998       cjj    = cj + ci[crow];
1999       caj    = ca + ci[crow];
2000       /* Perform sparse axpy operation.  Note cjj includes apj. */
2001       for (k=0;nextap<apnzj;k++) {
2002         if (cjj[k]==apj[nextap]) {
2003           caj[k] += (*pA)*apa[apj[nextap++]];
2004         }
2005       }
2006       flops += 2*apnzj;
2007       pA++;
2008     }
2009 
2010     /* Zero the current row info for A*P */
2011     for (j=0;j<apnzj;j++) {
2012       apa[apj[j]]      = 0.;
2013       apjdense[apj[j]] = 0;
2014     }
2015   }
2016 
2017   /* Assemble the final matrix and clean up */
2018   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2019   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2020   ierr = PetscFree(apa);CHKERRQ(ierr);
2021   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
2022 
2023   PetscFunctionReturn(0);
2024 }
2025 
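/*
   Illustration (hypothetical helper, not in this file): the sorted "sparse
   axpy" used in the outer-product loop above.  Because the column indices cjj
   of the target C row are a sorted superset of the sorted indices apj of the
   current A*P row, one forward scan over cjj places every contribution.
*/
#if 0
static void SampleSparseAXPY(PetscInt apnzj,const PetscInt *apj,const MatScalar *apa,
                             MatScalar alpha,const PetscInt *cjj,MatScalar *caj)
{
  PetscInt k,nextap = 0;

  for (k=0; nextap<apnzj; k++) {            /* cjj is sorted and contains apj */
    if (cjj[k] == apj[nextap]) caj[k] += alpha*apa[apj[nextap++]];
  }
}
#endif
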
2026 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
2027 static PetscEvent logkey_matptapnumeric_local = 0;
2028 #undef __FUNCT__
2029 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
2030 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
2031 {
2032   PetscErrorCode ierr;
2033   PetscInt       flops=0;
2034   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2035   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
2036   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
2037   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2038   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2039   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
2040   PetscInt       *pJ=pj_loc,*pjj;
2041   PetscInt       *ci=c->i,*cj=c->j,*cjj;
2042   PetscInt       am=A->m,cn=C->N,cm=C->M;
2043   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
2044   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
2045   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
2046 
2047   PetscFunctionBegin;
2048   if (!logkey_matptapnumeric_local) {
2049     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2050   }
2051   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2052 
2053   /* Allocate temporary array for storage of one row of A*P */
2054   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
2055   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
2056   apj      = (PetscInt *)(apa + cn);
2057   apjdense = apj + cn;
2058 
2059   /* Clear old values in C */
2060   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
2061 
2062   for (i=0;i<am;i++) {
2063     /* Form i-th sparse row of A*P */
2064     apnzj = 0;
2065     /* diagonal portion of A */
2066     anzi  = adi[i+1] - adi[i];
2067     for (j=0;j<anzi;j++) {
2068       prow = *adj;
2069       adj++;
2070       pnzj = pi_loc[prow+1] - pi_loc[prow];
2071       pjj  = pj_loc + pi_loc[prow];
2072       paj  = pa_loc + pi_loc[prow];
2073       for (k=0;k<pnzj;k++) {
2074         if (!apjdense[pjj[k]]) {
2075           apjdense[pjj[k]] = -1;
2076           apj[apnzj++]     = pjj[k];
2077         }
2078         apa[pjj[k]] += (*ada)*paj[k];
2079       }
2080       flops += 2*pnzj;
2081       ada++;
2082     }
2083     /* off-diagonal portion of A */
2084     anzi  = aoi[i+1] - aoi[i];
2085     for (j=0;j<anzi;j++) {
2086       col = a->garray[*aoj];
2087       if (col < cstart){
2088         prow = *aoj;
2089       } else if (col >= cend){
2090         prow = *aoj;
2091       } else {
2092         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2093       }
2094       aoj++;
2095       pnzj = pi_oth[prow+1] - pi_oth[prow];
2096       pjj  = pj_oth + pi_oth[prow];
2097       paj  = pa_oth + pi_oth[prow];
2098       for (k=0;k<pnzj;k++) {
2099         if (!apjdense[pjj[k]]) {
2100           apjdense[pjj[k]] = -1;
2101           apj[apnzj++]     = pjj[k];
2102         }
2103         apa[pjj[k]] += (*aoa)*paj[k];
2104       }
2105       flops += 2*pnzj;
2106       aoa++;
2107     }
2108     /* Sort the j index array for quick sparse axpy. */
2109     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
2110 
2111     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
2112     pnzi = pi_loc[i+1] - pi_loc[i];
2113     for (j=0;j<pnzi;j++) {
2114       nextap = 0;
2115       crow   = *pJ++;
2116       cjj    = cj + ci[crow];
2117       caj    = ca + ci[crow];
2118       /* Perform sparse axpy operation.  Note cjj includes apj. */
2119       for (k=0;nextap<apnzj;k++) {
2120         if (cjj[k]==apj[nextap]) {
2121           caj[k] += (*pA)*apa[apj[nextap++]];
2122         }
2123       }
2124       flops += 2*apnzj;
2125       pA++;
2126     }
2127 
2128     /* Zero the current row info for A*P */
2129     for (j=0;j<apnzj;j++) {
2130       apa[apj[j]]      = 0.;
2131       apjdense[apj[j]] = 0;
2132     }
2133   }
2134 
2135   /* Assemble the final matrix and clean up */
2136   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2137   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2138   ierr = PetscFree(apa);CHKERRQ(ierr);
2139   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
2140   ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2141   PetscFunctionReturn(0);
2142 }
2143 static PetscEvent logkey_matptapsymbolic_local = 0;
2144 #undef __FUNCT__
2145 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
2146 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
2147 {
2148   PetscErrorCode ierr;
2149   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
2150   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2151   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
2152   PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
2153   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
2154   PetscInt       *ci,*cj,*ptaj;
2155   PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
2156   PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
2157   PetscInt       pm=P_loc->m,nlnk,*lnk;
2158   MatScalar      *ca;
2159   PetscBT        lnkbt;
2160   PetscInt       prend,nprow_loc,nprow_oth;
2161   PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
2162   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2163   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2164 
2165   PetscFunctionBegin;
2166   if (!logkey_matptapsymbolic_local) {
2167     ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2168   }
2169   ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2170 
2171   prend = prstart + pm;
2172 
2173   /* get ij structure of P_loc^T */
2174   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2175   ptJ=ptj;
2176 
2177   /* Allocate ci array, arrays for fill computation and */
2178   /* free space for accumulating nonzero column info */
2179   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
2180   ci[0] = 0;
2181 
2182   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
2183   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
2184   ptasparserow_loc = ptadenserow_loc + aN;
2185   ptadenserow_oth  = ptasparserow_loc + aN;
2186   ptasparserow_oth = ptadenserow_oth + aN;
2187 
2188   /* create and initialize a linked list */
2189   nlnk = pN+1;
2190   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2191 
2192   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
2193   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
2194   nnz           = adi[am] + aoi[am];
2195   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
2196   current_space = free_space;
2197 
2198   /* determine symbolic info for each row of C: */
2199   for (i=0;i<pN;i++) {
2200     ptnzi  = pti[i+1] - pti[i];
2201     nprow_loc = 0; nprow_oth = 0;
2202     /* i-th row of symbolic P_loc^T*A_loc: */
2203     for (j=0;j<ptnzi;j++) {
2204       arow = *ptJ++;
2205       /* diagonal portion of A */
2206       anzj = adi[arow+1] - adi[arow];
2207       ajj  = adj + adi[arow];
2208       for (k=0;k<anzj;k++) {
2209         col = ajj[k]+prstart;
2210         if (!ptadenserow_loc[col]) {
2211           ptadenserow_loc[col]    = -1;
2212           ptasparserow_loc[nprow_loc++] = col;
2213         }
2214       }
2215       /* off-diagonal portion of A */
2216       anzj = aoi[arow+1] - aoi[arow];
2217       ajj  = aoj + aoi[arow];
2218       for (k=0;k<anzj;k++) {
2219         col = a->garray[ajj[k]];  /* global col */
2220         if (col < cstart){
2221           col = ajj[k];
2222         } else if (col >= cend){
2223           col = ajj[k] + pm;
2224         } else {
2225           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2226         }
2227         if (!ptadenserow_oth[col]) {
2228           ptadenserow_oth[col]    = -1;
2229           ptasparserow_oth[nprow_oth++] = col;
2230         }
2231       }
2232     }
2233 
2234     /* using symbolic info of local PtA, determine symbolic info for row of C: */
2235     cnzi   = 0;
2236     /* rows of P_loc */
2237     ptaj = ptasparserow_loc;
2238     for (j=0; j<nprow_loc; j++) {
2239       prow = *ptaj++;
2240       prow -= prstart; /* rm */
2241       pnzj = pi_loc[prow+1] - pi_loc[prow];
2242       pjj  = pj_loc + pi_loc[prow];
2243       /* add non-zero cols of P into the sorted linked list lnk */
2244       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2245       cnzi += nlnk;
2246     }
2247     /* rows of P_oth */
2248     ptaj = ptasparserow_oth;
2249     for (j=0; j<nprow_oth; j++) {
2250       prow = *ptaj++;
2251       if (prow >= prend) prow -= pm; /* rm */
2252       pnzj = pi_oth[prow+1] - pi_oth[prow];
2253       pjj  = pj_oth + pi_oth[prow];
2254       /* add non-zero cols of P into the sorted linked list lnk */
2255       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2256       cnzi += nlnk;
2257     }
2258 
2259     /* If free space is not available, make more free space */
2260     /* Double the amount of total space in the list */
2261     if (current_space->local_remaining<cnzi) {
2262       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
2263     }
2264 
2265     /* Copy data into free space, and zero out denserows */
2266     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
2267     current_space->array           += cnzi;
2268     current_space->local_used      += cnzi;
2269     current_space->local_remaining -= cnzi;
2270 
2271     for (j=0;j<nprow_loc; j++) {
2272       ptadenserow_loc[ptasparserow_loc[j]] = 0;
2273     }
2274     for (j=0;j<nprow_oth; j++) {
2275       ptadenserow_oth[ptasparserow_oth[j]] = 0;
2276     }
2277 
2278     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
2279     /*        For now, we will recompute what is needed. */
2280     ci[i+1] = ci[i] + cnzi;
2281   }
2282   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
2283   /* Allocate space for cj, initialize cj, and */
2284   /* destroy list of free space and other temporary array(s) */
2285   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
2286   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
2287   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
2288   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2289 
2290   /* Allocate space for ca */
2291   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
2292   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
2293 
2294   /* put together the new matrix */
2295   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);
2296 
2297   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
2298   /* Since these are PETSc arrays, change flags to free them as necessary. */
2299   c = (Mat_SeqAIJ *)((*C)->data);
2300   c->freedata = PETSC_TRUE;
2301   c->nonew    = 0;
2302 
2303   /* Clean up. */
2304   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2305 
2306   ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2307   PetscFunctionReturn(0);
2308 }
2309